Mercurial > dropbear
changeset 19:e1037a1e12e7 libtommath-orig
0.30 release of LibTomMath
author | Matt Johnston <matt@ucc.asn.au> |
---|---|
date | Tue, 15 Jun 2004 14:42:57 +0000 |
parents | 86e0b50a9b58 |
children | d29b64170cf0 |
files | bn.ilg bn.ind bn.pdf bn.tex poster.out poster.pdf poster.tex tommath.out tommath.pdf tommath.src tommath.tex |
diffstat | 10 files changed, 18976 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bn.ilg Tue Jun 15 14:42:57 2004 +0000 @@ -0,0 +1,6 @@ +This is makeindex, version 2.14 [02-Oct-2002] (kpathsea + Thai support). +Scanning input file bn.idx....done (79 entries accepted, 0 rejected). +Sorting entries....done (511 comparisons). +Generating output file bn.ind....done (82 lines written, 0 warnings). +Output written in bn.ind. +Transcript written in bn.ilg.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bn.ind Tue Jun 15 14:42:57 2004 +0000 @@ -0,0 +1,82 @@ +\begin{theindex} + + \item mp\_add, \hyperpage{25} + \item mp\_add\_d, \hyperpage{48} + \item mp\_and, \hyperpage{25} + \item mp\_clear, \hyperpage{7} + \item mp\_clear\_multi, \hyperpage{8} + \item mp\_cmp, \hyperpage{20} + \item mp\_cmp\_d, \hyperpage{21} + \item mp\_cmp\_mag, \hyperpage{19} + \item mp\_div, \hyperpage{26} + \item mp\_div\_2, \hyperpage{22} + \item mp\_div\_2d, \hyperpage{24} + \item mp\_div\_d, \hyperpage{48} + \item mp\_dr\_reduce, \hyperpage{36} + \item mp\_dr\_setup, \hyperpage{36} + \item MP\_EQ, \hyperpage{18} + \item mp\_error\_to\_string, \hyperpage{6} + \item mp\_expt\_d, \hyperpage{39} + \item mp\_exptmod, \hyperpage{39} + \item mp\_exteuclid, \hyperpage{47} + \item mp\_gcd, \hyperpage{47} + \item mp\_get\_int, \hyperpage{16} + \item mp\_grow, \hyperpage{12} + \item MP\_GT, \hyperpage{18} + \item mp\_init, \hyperpage{7} + \item mp\_init\_copy, \hyperpage{9} + \item mp\_init\_multi, \hyperpage{8} + \item mp\_init\_set, \hyperpage{17} + \item mp\_init\_set\_int, \hyperpage{17} + \item mp\_init\_size, \hyperpage{10} + \item mp\_int, \hyperpage{6} + \item mp\_invmod, \hyperpage{48} + \item mp\_jacobi, \hyperpage{48} + \item mp\_lcm, \hyperpage{47} + \item mp\_lshd, \hyperpage{24} + \item MP\_LT, \hyperpage{18} + \item MP\_MEM, \hyperpage{5} + \item mp\_mod, \hyperpage{31} + \item mp\_mod\_d, \hyperpage{48} + \item mp\_montgomery\_calc\_normalization, \hyperpage{34} + \item mp\_montgomery\_reduce, \hyperpage{33} + \item mp\_montgomery\_setup, \hyperpage{33} + \item mp\_mul, \hyperpage{27} + \item mp\_mul\_2, \hyperpage{22} + \item mp\_mul\_2d, \hyperpage{24} + \item mp\_mul\_d, \hyperpage{48} + \item mp\_n\_root, \hyperpage{40} + \item mp\_neg, \hyperpage{25} + \item MP\_NO, \hyperpage{5} + \item MP\_OKAY, \hyperpage{5} + \item mp\_or, \hyperpage{25} + \item mp\_prime\_fermat, \hyperpage{41} + \item mp\_prime\_is\_divisible, \hyperpage{41} + \item mp\_prime\_is\_prime, \hyperpage{42} + \item mp\_prime\_miller\_rabin, \hyperpage{41} + \item mp\_prime\_next\_prime, \hyperpage{42} + \item mp\_prime\_rabin\_miller\_trials, \hyperpage{42} + \item mp\_prime\_random, \hyperpage{43} + \item mp\_prime\_random\_ex, \hyperpage{43} + \item mp\_radix\_size, \hyperpage{45} + \item mp\_read\_radix, \hyperpage{45} + \item mp\_read\_unsigned\_bin, \hyperpage{46} + \item mp\_reduce, \hyperpage{32} + \item mp\_reduce\_2k, \hyperpage{37} + \item mp\_reduce\_2k\_setup, \hyperpage{37} + \item mp\_reduce\_setup, \hyperpage{32} + \item mp\_rshd, \hyperpage{24} + \item mp\_set, \hyperpage{15} + \item mp\_set\_int, \hyperpage{16} + \item mp\_shrink, \hyperpage{11} + \item mp\_sqr, \hyperpage{29} + \item mp\_sub, \hyperpage{25} + \item mp\_sub\_d, \hyperpage{48} + \item mp\_to\_unsigned\_bin, \hyperpage{46} + \item mp\_toradix, \hyperpage{45} + \item mp\_unsigned\_bin\_size, \hyperpage{46} + \item MP\_VAL, \hyperpage{5} + \item mp\_xor, \hyperpage{25} + \item MP\_YES, \hyperpage{5} + +\end{theindex}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bn.tex Tue Jun 15 14:42:57 2004 +0000 @@ -0,0 +1,1733 @@ +\documentclass[b5paper]{book} +\usepackage{hyperref} +\usepackage{makeidx} +\usepackage{amssymb} +\usepackage{color} +\usepackage{alltt} +\usepackage{graphicx} +\usepackage{layout} +\def\union{\cup} +\def\intersect{\cap} +\def\getsrandom{\stackrel{\rm R}{\gets}} +\def\cross{\times} +\def\cat{\hspace{0.5em} \| \hspace{0.5em}} +\def\catn{$\|$} +\def\divides{\hspace{0.3em} | \hspace{0.3em}} +\def\nequiv{\not\equiv} +\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}} +\def\lcm{{\rm lcm}} +\def\gcd{{\rm gcd}} +\def\log{{\rm log}} +\def\ord{{\rm ord}} +\def\abs{{\mathit abs}} +\def\rep{{\mathit rep}} +\def\mod{{\mathit\ mod\ }} +\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})} +\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor} +\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil} +\def\Or{{\rm\ or\ }} +\def\And{{\rm\ and\ }} +\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}} +\def\implies{\Rightarrow} +\def\undefined{{\rm ``undefined"}} +\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}} +\let\oldphi\phi +\def\phi{\varphi} +\def\Pr{{\rm Pr}} +\newcommand{\str}[1]{{\mathbf{#1}}} +\def\F{{\mathbb F}} +\def\N{{\mathbb N}} +\def\Z{{\mathbb Z}} +\def\R{{\mathbb R}} +\def\C{{\mathbb C}} +\def\Q{{\mathbb Q}} +\definecolor{DGray}{gray}{0.5} +\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}} +\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}} +\def\gap{\vspace{0.5ex}} +\makeindex +\begin{document} +\frontmatter +\pagestyle{empty} +\title{LibTomMath User Manual \\ v0.30} +\author{Tom St Denis \\ [email protected]} +\maketitle +This text, the library and the accompanying textbook are all hereby placed in the public domain. This book has been +formatted for B5 [176x250] paper using the \LaTeX{} {\em book} macro package. + +\vspace{10cm} + +\begin{flushright}Open Source. Open Academia. Open Minds. + +\mbox{ } + +Tom St Denis, + +Ontario, Canada +\end{flushright} + +\tableofcontents +\listoffigures +\mainmatter +\pagestyle{headings} +\chapter{Introduction} +\section{What is LibTomMath?} +LibTomMath is a library of source code which provides a series of efficient and carefully written functions for manipulating +large integer numbers. It was written in portable ISO C source code so that it will build on any platform with a conforming +C compiler. + +In a nutshell the library was written from scratch with verbose comments to help instruct computer science students how +to implement ``bignum'' math. However, the resulting code has proven to be very useful. It has been used by numerous +universities, commercial and open source software developers. It has been used on a variety of platforms ranging from +Linux and Windows based x86 to ARM based Gameboys and PPC based MacOS machines. + +\section{License} +As of the v0.25 the library source code has been placed in the public domain with every new release. As of the v0.28 +release the textbook ``Implementing Multiple Precision Arithmetic'' has been placed in the public domain with every new +release as well. This textbook is meant to compliment the project by providing a more solid walkthrough of the development +algorithms used in the library. + +Since both\footnote{Note that the MPI files under mtest/ are copyrighted by Michael Fromberger. They are not required to use LibTomMath.} are in the +public domain everyone is entitled to do with them as they see fit. 
+ +\section{Building LibTomMath} + +LibTomMath is meant to be very ``GCC friendly'' as it comes with a makefile well suited for GCC. However, the library will +also build in MSVC, Borland C out of the box. For any other ISO C compiler a makefile will have to be made by the end +developer. + +To build the library for GCC simply issue the + +\begin{alltt} +make +\end{alltt} + +command. This will build the library and archive the object files in ``libtommath.a''. Now you simply link against that +and include ``tommath.h'' within your programs. + +Alternatively to build with MSVC type + +\begin{alltt} +nmake -f makefile.msvc +\end{alltt} + +This will build the library and archive the object files in ``tommath.lib''. This has been tested with MSVC version 6.00 +with service pack 5. + +There is limited support for making a ``DLL'' in windows via the ``makefile.cygwin\_dll'' makefile. It requires Cygwin +to work with since it requires the auto-export/import functionality. The resulting DLL and imprt library ``libtomcrypt.dll.a'' +can be used to link LibTomMath dynamically to any Windows program using Cygwin. + +\subsection{Testing} +To build the library and the test harness type + +\begin{alltt} +make test +\end{alltt} + +This will build the library, ``test'' and ``mtest/mtest''. The ``test'' program will accept test vectors and verify the +results. ``mtest/mtest'' will generate test vectors using the MPI library by Michael Fromberger\footnote{A copy of MPI +is included in the package}. Simply pipe mtest into test using + +\begin{alltt} +mtest/mtest | test +\end{alltt} + +If you do not have a ``/dev/urandom'' style RNG source you will have to write your own PRNG and simply pipe that into +mtest. For example, if your PRNG program is called ``myprng'' simply invoke + +\begin{alltt} +myprng | mtest/mtest | test +\end{alltt} + +This will output a row of numbers that are increasing. Each column is a different test (such as addition, multiplication, etc) +that is being performed. The numbers represent how many times the test was invoked. If an error is detected the program +will exit with a dump of the relevent numbers it was working with. + +\section{Purpose of LibTomMath} +Unlike GNU MP (GMP) Library, LIP, OpenSSL or various other commercial kits (Miracl), LibTomMath was not written with +bleeding edge performance in mind. First and foremost LibTomMath was written to be entirely open. Not only is the +source code public domain (unlike various other GPL/etc licensed code), not only is the code freely downloadable but the +source code is also accessible for computer science students attempting to learn ``BigNum'' or multiple precision +arithmetic techniques. + +LibTomMath was written to be an instructive collection of source code. This is why there are many comments, only one +function per source file and often I use a ``middle-road'' approach where I don't cut corners for an extra 2\% speed +increase. + +Source code alone cannot really teach how the algorithms work which is why I also wrote a textbook that accompanies +the library (beat that!). + +So you may be thinking ``should I use LibTomMath?'' and the answer is a definite maybe. Let me tabulate what I think +are the pros and cons of LibTomMath by comparing it to the math routines from GnuPG\footnote{GnuPG v1.2.3 versus LibTomMath v0.28}. 
+ +\newpage\begin{figure}[here] +\begin{small} +\begin{center} +\begin{tabular}{|l|c|c|l|} +\hline \textbf{Criteria} & \textbf{Pro} & \textbf{Con} & \textbf{Notes} \\ +\hline Few lines of code per file & X & & GnuPG $ = 300.9$, LibTomMath $ = 76.04$ \\ +\hline Commented function prototypes & X && GnuPG function names are cryptic. \\ +\hline Speed && X & LibTomMath is slower. \\ +\hline Totally free & X & & GPL has unfavourable restrictions.\\ +\hline Large function base & X & & GnuPG is barebones. \\ +\hline Four modular reduction algorithms & X & & Faster modular exponentiation. \\ +\hline Portable & X & & GnuPG requires configuration to build. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{LibTomMath Valuation} +\end{figure} + +It may seem odd to compare LibTomMath to GnuPG since the math in GnuPG is only a small portion of the entire application. +However, LibTomMath was written with cryptography in mind. It provides essentially all of the functions a cryptosystem +would require when working with large integers. + +So it may feel tempting to just rip the math code out of GnuPG (or GnuMP where it was taken from originally) in your +own application but I think there are reasons not to. While LibTomMath is slower than libraries such as GnuMP it is +not normally significantly slower. On x86 machines the difference is normally a factor of two when performing modular +exponentiations. + +Essentially the only time you wouldn't use LibTomMath is when blazing speed is the primary concern. + +\chapter{Getting Started with LibTomMath} +\section{Building Programs} +In order to use LibTomMath you must include ``tommath.h'' and link against the appropriate library file (typically +libtommath.a). There is no library initialization required and the entire library is thread safe. + +\section{Return Codes} +There are three possible return codes a function may return. + +\index{MP\_OKAY}\index{MP\_YES}\index{MP\_NO}\index{MP\_VAL}\index{MP\_MEM} +\begin{figure}[here!] +\begin{center} +\begin{small} +\begin{tabular}{|l|l|} +\hline \textbf{Code} & \textbf{Meaning} \\ +\hline MP\_OKAY & The function succeeded. \\ +\hline MP\_VAL & The function input was invalid. \\ +\hline MP\_MEM & Heap memory exhausted. \\ +\hline &\\ +\hline MP\_YES & Response is yes. \\ +\hline MP\_NO & Response is no. \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Return Codes} +\end{figure} + +The last two codes listed are not actually ``return'ed'' by a function. They are placed in an integer (the caller must +provide the address of an integer it can store to) which the caller can access. To convert one of the three return codes +to a string use the following function. + +\index{mp\_error\_to\_string} +\begin{alltt} +char *mp_error_to_string(int code); +\end{alltt} + +This will return a pointer to a string which describes the given error code. It will not work for the return codes +MP\_YES and MP\_NO. + +\section{Data Types} +The basic ``multiple precision integer'' type is known as the ``mp\_int'' within LibTomMath. This data type is used to +organize all of the data required to manipulate the integer it represents. Within LibTomMath it has been prototyped +as the following. + +\index{mp\_int} +\begin{alltt} +typedef struct \{ + int used, alloc, sign; + mp_digit *dp; +\} mp_int; +\end{alltt} + +Where ``mp\_digit'' is a data type that represents individual digits of the integer. By default, an mp\_digit is the +ISO C ``unsigned long'' data type and each digit is $28-$bits long. 
The mp\_digit type can be configured to suit other +platforms by defining the appropriate macros. + +All LTM functions that use the mp\_int type will expect a pointer to mp\_int structure. You must allocate memory to +hold the structure itself by yourself (whether off stack or heap it doesn't matter). The very first thing that must be +done to use an mp\_int is that it must be initialized. + +\section{Function Organization} + +The arithmetic functions of the library are all organized to have the same style prototype. That is source operands +are passed on the left and the destination is on the right. For instance, + +\begin{alltt} +mp_add(&a, &b, &c); /* c = a + b */ +mp_mul(&a, &a, &c); /* c = a * a */ +mp_div(&a, &b, &c, &d); /* c = [a/b], d = a mod b */ +\end{alltt} + +Another feature of the way the functions have been implemented is that source operands can be destination operands as well. +For instance, + +\begin{alltt} +mp_add(&a, &b, &b); /* b = a + b */ +mp_div(&a, &b, &a, &c); /* a = [a/b], c = a mod b */ +\end{alltt} + +This allows operands to be re-used which can make programming simpler. + +\section{Initialization} +\subsection{Single Initialization} +A single mp\_int can be initialized with the ``mp\_init'' function. + +\index{mp\_init} +\begin{alltt} +int mp_init (mp_int * a); +\end{alltt} + +This function expects a pointer to an mp\_int structure and will initialize the members of the structure so the mp\_int +represents the default integer which is zero. If the functions returns MP\_OKAY then the mp\_int is ready to be used +by the other LibTomMath functions. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* use the number */ + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +\subsection{Single Free} +When you are finished with an mp\_int it is ideal to return the heap it used back to the system. The following function +provides this functionality. + +\index{mp\_clear} +\begin{alltt} +void mp_clear (mp_int * a); +\end{alltt} + +The function expects a pointer to a previously initialized mp\_int structure and frees the heap it uses. It sets the +pointer\footnote{The ``dp'' member.} within the mp\_int to \textbf{NULL} which is used to prevent double free situations. +Is is legal to call mp\_clear() twice on the same mp\_int in a row. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* use the number */ + + /* We're done with it. */ + mp_clear(&number); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +\subsection{Multiple Initializations} +Certain algorithms require more than one large integer. In these instances it is ideal to initialize all of the mp\_int +variables in an ``all or nothing'' fashion. That is, they are either all initialized successfully or they are all +not initialized. + +The mp\_init\_multi() function provides this functionality. + +\index{mp\_init\_multi} \index{mp\_clear\_multi} +\begin{alltt} +int mp_init_multi(mp_int *mp, ...); +\end{alltt} + +It accepts a \textbf{NULL} terminated list of pointers to mp\_int structures. It will attempt to initialize them all +at once. 
If the function returns MP\_OKAY then all of the mp\_int variables are ready to use, otherwise none of them +are available for use. A complementary mp\_clear\_multi() function allows multiple mp\_int variables to be free'd +from the heap at the same time. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int num1, num2, num3; + int result; + + if ((result = mp_init_multi(&num1, + &num2, + &num3, NULL)) != MP\_OKAY) \{ + printf("Error initializing the numbers. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* use the numbers */ + + /* We're done with them. */ + mp_clear_multi(&num1, &num2, &num3, NULL); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +\subsection{Other Initializers} +To initialized and make a copy of an mp\_int the mp\_init\_copy() function has been provided. + +\index{mp\_init\_copy} +\begin{alltt} +int mp_init_copy (mp_int * a, mp_int * b); +\end{alltt} + +This function will initialize $a$ and make it a copy of $b$ if all goes well. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int num1, num2; + int result; + + /* initialize and do work on num1 ... */ + + /* We want a copy of num1 in num2 now */ + if ((result = mp_init_copy(&num2, &num1)) != MP_OKAY) \{ + printf("Error initializing the copy. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* now num2 is ready and contains a copy of num1 */ + + /* We're done with them. */ + mp_clear_multi(&num1, &num2, NULL); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +Another less common initializer is mp\_init\_size() which allows the user to initialize an mp\_int with a given +default number of digits. By default, all initializers allocate \textbf{MP\_PREC} digits. This function lets +you override this behaviour. + +\index{mp\_init\_size} +\begin{alltt} +int mp_init_size (mp_int * a, int size); +\end{alltt} + +The $size$ parameter must be greater than zero. If the function succeeds the mp\_int $a$ will be initialized +to have $size$ digits (which are all initially zero). + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + /* we need a 60-digit number */ + if ((result = mp_init_size(&number, 60)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* use the number */ + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +\section{Maintenance Functions} + +\subsection{Reducing Memory Usage} +When an mp\_int is in a state where it won't be changed again\footnote{A Diffie-Hellman modulus for instance.} excess +digits can be removed to return memory to the heap with the mp\_shrink() function. + +\index{mp\_shrink} +\begin{alltt} +int mp_shrink (mp_int * a); +\end{alltt} + +This will remove excess digits of the mp\_int $a$. If the operation fails the mp\_int should be intact without the +excess digits being removed. Note that you can use a shrunk mp\_int in further computations, however, such operations +will require heap operations which can be slow. It is not ideal to shrink mp\_int variables that you will further +modify in the system (unless you are seriously low on memory). + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* use the number [e.g. pre-computation] */ + + /* We're done with it for now. 
*/ + if ((result = mp_shrink(&number)) != MP_OKAY) \{ + printf("Error shrinking the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* use it .... */ + + + /* we're done with it. */ + mp_clear(&number); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +\subsection{Adding additional digits} + +Within the mp\_int structure are two parameters which control the limitations of the array of digits that represent +the integer the mp\_int is meant to equal. The \textit{used} parameter dictates how many digits are significant, that is, +contribute to the value of the mp\_int. The \textit{alloc} parameter dictates how many digits are currently available in +the array. If you need to perform an operation that requires more digits you will have to mp\_grow() the mp\_int to +your desired size. + +\index{mp\_grow} +\begin{alltt} +int mp_grow (mp_int * a, int size); +\end{alltt} + +This will grow the array of digits of $a$ to $size$. If the \textit{alloc} parameter is already bigger than +$size$ the function will not do anything. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* use the number */ + + /* We need to add 20 digits to the number */ + if ((result = mp_grow(&number, number.alloc + 20)) != MP_OKAY) \{ + printf("Error growing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + + /* use the number */ + + /* we're done with it. */ + mp_clear(&number); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +\chapter{Basic Operations} +\section{Small Constants} +Setting mp\_ints to small constants is a relatively common operation. To accomodate these instances there are two +small constant assignment functions. The first function is used to set a single digit constant while the second sets +an ISO C style ``unsigned long'' constant. The reason for both functions is efficiency. Setting a single digit is quick but the +domain of a digit can change (it's always at least $0 \ldots 127$). + +\subsection{Single Digit} + +Setting a single digit can be accomplished with the following function. + +\index{mp\_set} +\begin{alltt} +void mp_set (mp_int * a, mp_digit b); +\end{alltt} + +This will zero the contents of $a$ and make it represent an integer equal to the value of $b$. Note that this +function has a return type of \textbf{void}. It cannot cause an error so it is safe to assume the function +succeeded. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* set the number to 5 */ + mp_set(&number, 5); + + /* we're done with it. */ + mp_clear(&number); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +\subsection{Long Constants} + +To set a constant that is the size of an ISO C ``unsigned long'' and larger than a single digit the following function +can be used. + +\index{mp\_set\_int} +\begin{alltt} +int mp_set_int (mp_int * a, unsigned long b); +\end{alltt} + +This will assign the value of the 32-bit variable $b$ to the mp\_int $a$. Unlike mp\_set() this function will always +accept a 32-bit input regardless of the size of a single digit. However, since the value may span several digits +this function can fail if it runs out of heap memory. 
+ +To get the ``unsigned long'' copy of an mp\_int the following function can be used. + +\index{mp\_get\_int} +\begin{alltt} +unsigned long mp_get_int (mp_int * a); +\end{alltt} + +This will return the 32 least significant bits of the mp\_int $a$. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* set the number to 654321 (note this is bigger than 127) */ + if ((result = mp_set_int(&number, 654321)) != MP_OKAY) \{ + printf("Error setting the value of the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + printf("number == \%lu", mp_get_int(&number)); + + /* we're done with it. */ + mp_clear(&number); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +This should output the following if the program succeeds. + +\begin{alltt} +number == 654321 +\end{alltt} + +\subsection{Initialize and Setting Constants} +To both initialize and set small constants the following two functions are available. +\index{mp\_init\_set} \index{mp\_init\_set\_int} +\begin{alltt} +int mp_init_set (mp_int * a, mp_digit b); +int mp_init_set_int (mp_int * a, unsigned long b); +\end{alltt} + +Both functions work like the previous counterparts except they first mp\_init $a$ before setting the values. + +\begin{alltt} +int main(void) +\{ + mp_int number1, number2; + int result; + + /* initialize and set a single digit */ + if ((result = mp_init_set(&number1, 100)) != MP_OKAY) \{ + printf("Error setting number1: \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* initialize and set a long */ + if ((result = mp_init_set_int(&number2, 1023)) != MP_OKAY) \{ + printf("Error setting number2: \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* display */ + printf("Number1, Number2 == \%lu, \%lu", + mp_get_int(&number1), mp_get_int(&number2)); + + /* clear */ + mp_clear_multi(&number1, &number2, NULL); + + return EXIT_SUCCESS; +\} +\end{alltt} + +If this program succeeds it shall output. +\begin{alltt} +Number1, Number2 == 100, 1023 +\end{alltt} + +\section{Comparisons} + +Comparisons in LibTomMath are always performed in a ``left to right'' fashion. There are three possible return codes +for any comparison. + +\index{MP\_GT} \index{MP\_EQ} \index{MP\_LT} +\begin{figure}[here] +\begin{center} +\begin{tabular}{|c|c|} +\hline \textbf{Result Code} & \textbf{Meaning} \\ +\hline MP\_GT & $a > b$ \\ +\hline MP\_EQ & $a = b$ \\ +\hline MP\_LT & $a < b$ \\ +\hline +\end{tabular} +\end{center} +\caption{Comparison Codes for $a, b$} +\label{fig:CMP} +\end{figure} + +In figure \ref{fig:CMP} two integers $a$ and $b$ are being compared. In this case $a$ is said to be ``to the left'' of +$b$. + +\subsection{Unsigned comparison} + +An unsigned comparison considers only the digits themselves and not the associated \textit{sign} flag of the +mp\_int structures. This is analogous to an absolute comparison. The function mp\_cmp\_mag() will compare two +mp\_int variables based on their digits only. + +\index{mp\_cmp\_mag} +\begin{alltt} +int mp_cmp(mp_int * a, mp_int * b); +\end{alltt} +This will compare $a$ to $b$ placing $a$ to the left of $b$. This function cannot fail and will return one of the +three compare codes listed in figure \ref{fig:CMP}. 
+ +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number1, number2; + int result; + + if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{ + printf("Error initializing the numbers. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* set the number1 to 5 */ + mp_set(&number1, 5); + + /* set the number2 to -6 */ + mp_set(&number2, 6); + if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{ + printf("Error negating number2. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + switch(mp_cmp_mag(&number1, &number2)) \{ + case MP_GT: printf("|number1| > |number2|"); break; + case MP_EQ: printf("|number1| = |number2|"); break; + case MP_LT: printf("|number1| < |number2|"); break; + \} + + /* we're done with it. */ + mp_clear_multi(&number1, &number2, NULL); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes +successfully it should print the following. + +\begin{alltt} +|number1| < |number2| +\end{alltt} + +This is because $\vert -6 \vert = 6$ and obviously $5 < 6$. + +\subsection{Signed comparison} + +To compare two mp\_int variables based on their signed value the mp\_cmp() function is provided. + +\index{mp\_cmp} +\begin{alltt} +int mp_cmp(mp_int * a, mp_int * b); +\end{alltt} + +This will compare $a$ to the left of $b$. It will first compare the signs of the two mp\_int variables. If they +differ it will return immediately based on their signs. If the signs are equal then it will compare the digits +individually. This function will return one of the compare conditions codes listed in figure \ref{fig:CMP}. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number1, number2; + int result; + + if ((result = mp_init_multi(&number1, &number2, NULL)) != MP_OKAY) \{ + printf("Error initializing the numbers. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* set the number1 to 5 */ + mp_set(&number1, 5); + + /* set the number2 to -6 */ + mp_set(&number2, 6); + if ((result = mp_neg(&number2, &number2)) != MP_OKAY) \{ + printf("Error negating number2. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + switch(mp_cmp(&number1, &number2)) \{ + case MP_GT: printf("number1 > number2"); break; + case MP_EQ: printf("number1 = number2"); break; + case MP_LT: printf("number1 < number2"); break; + \} + + /* we're done with it. */ + mp_clear_multi(&number1, &number2, NULL); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +If this program\footnote{This function uses the mp\_neg() function which is discussed in section \ref{sec:NEG}.} completes +successfully it should print the following. + +\begin{alltt} +number1 > number2 +\end{alltt} + +\subsection{Single Digit} + +To compare a single digit against an mp\_int the following function has been provided. + +\index{mp\_cmp\_d} +\begin{alltt} +int mp_cmp_d(mp_int * a, mp_digit b); +\end{alltt} + +This will compare $a$ to the left of $b$ using a signed comparison. Note that it will always treat $b$ as +positive. This function is rather handy when you have to compare against small values such as $1$ (which often +comes up in cryptography). The function cannot fail and will return one of the tree compare condition codes +listed in figure \ref{fig:CMP}. + + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. 
\%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* set the number to 5 */ + mp_set(&number, 5); + + switch(mp_cmp_d(&number, 7)) \{ + case MP_GT: printf("number > 7"); break; + case MP_EQ: printf("number = 7"); break; + case MP_LT: printf("number < 7"); break; + \} + + /* we're done with it. */ + mp_clear(&number); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +If this program functions properly it will print out the following. + +\begin{alltt} +number < 7 +\end{alltt} + +\section{Logical Operations} + +Logical operations are operations that can be performed either with simple shifts or boolean operators such as +AND, XOR and OR directly. These operations are very quick. + +\subsection{Multiplication by two} + +Multiplications and divisions by any power of two can be performed with quick logical shifts either left or +right depending on the operation. + +When multiplying or dividing by two a special case routine can be used which are as follows. +\index{mp\_mul\_2} \index{mp\_div\_2} +\begin{alltt} +int mp_mul_2(mp_int * a, mp_int * b); +int mp_div_2(mp_int * a, mp_int * b); +\end{alltt} + +The former will assign twice $a$ to $b$ while the latter will assign half $a$ to $b$. These functions are fast +since the shift counts and maskes are hardcoded into the routines. + +\begin{small} \begin{alltt} +int main(void) +\{ + mp_int number; + int result; + + if ((result = mp_init(&number)) != MP_OKAY) \{ + printf("Error initializing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* set the number to 5 */ + mp_set(&number, 5); + + /* multiply by two */ + if ((result = mp\_mul\_2(&number, &number)) != MP_OKAY) \{ + printf("Error multiplying the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + switch(mp_cmp_d(&number, 7)) \{ + case MP_GT: printf("2*number > 7"); break; + case MP_EQ: printf("2*number = 7"); break; + case MP_LT: printf("2*number < 7"); break; + \} + + /* now divide by two */ + if ((result = mp\_div\_2(&number, &number)) != MP_OKAY) \{ + printf("Error dividing the number. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + switch(mp_cmp_d(&number, 7)) \{ + case MP_GT: printf("2*number/2 > 7"); break; + case MP_EQ: printf("2*number/2 = 7"); break; + case MP_LT: printf("2*number/2 < 7"); break; + \} + + /* we're done with it. */ + mp_clear(&number); + + return EXIT_SUCCESS; +\} +\end{alltt} \end{small} + +If this program is successful it will print out the following text. + +\begin{alltt} +2*number > 7 +2*number/2 < 7 +\end{alltt} + +Since $10 > 7$ and $5 < 7$. To multiply by a power of two the following function can be used. + +\index{mp\_mul\_2d} +\begin{alltt} +int mp_mul_2d(mp_int * a, int b, mp_int * c); +\end{alltt} + +This will multiply $a$ by $2^b$ and store the result in ``c''. If the value of $b$ is less than or equal to +zero the function will copy $a$ to ``c'' without performing any further actions. + +To divide by a power of two use the following. + +\index{mp\_div\_2d} +\begin{alltt} +int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d); +\end{alltt} +Which will divide $a$ by $2^b$, store the quotient in ``c'' and the remainder in ``d'. If $b \le 0$ then the +function simply copies $a$ over to ``c'' and zeroes $d$. The variable $d$ may be passed as a \textbf{NULL} +value to signal that the remainder is not desired. 
+ +\subsection{Polynomial Basis Operations} + +Strictly speaking the organization of the integers within the mp\_int structures is what is known as a +``polynomial basis''. This simply means a field element is stored by divisions of a radix. For example, if +$f(x) = \sum_{i=0}^{k} y_ix^k$ for any vector $\vec y$ then the array of digits in $\vec y$ are said to be +the polynomial basis representation of $z$ if $f(\beta) = z$ for a given radix $\beta$. + +To multiply by the polynomial $g(x) = x$ all you have todo is shift the digits of the basis left one place. The +following function provides this operation. + +\index{mp\_lshd} +\begin{alltt} +int mp_lshd (mp_int * a, int b); +\end{alltt} + +This will multiply $a$ in place by $x^b$ which is equivalent to shifting the digits left $b$ places and inserting zeroes +in the least significant digits. Similarly to divide by a power of $x$ the following function is provided. + +\index{mp\_rshd} +\begin{alltt} +void mp_rshd (mp_int * a, int b) +\end{alltt} +This will divide $a$ in place by $x^b$ and discard the remainder. This function cannot fail as it performs the operations +in place and no new digits are required to complete it. + +\subsection{AND, OR and XOR Operations} + +While AND, OR and XOR operations are not typical ``bignum functions'' they can be useful in several instances. The +three functions are prototyped as follows. + +\index{mp\_or} \index{mp\_and} \index{mp\_xor} +\begin{alltt} +int mp_or (mp_int * a, mp_int * b, mp_int * c); +int mp_and (mp_int * a, mp_int * b, mp_int * c); +int mp_xor (mp_int * a, mp_int * b, mp_int * c); +\end{alltt} + +Which compute $c = a \odot b$ where $\odot$ is one of OR, AND or XOR. + +\section{Addition and Subtraction} + +To compute an addition or subtraction the following two functions can be used. + +\index{mp\_add} \index{mp\_sub} +\begin{alltt} +int mp_add (mp_int * a, mp_int * b, mp_int * c); +int mp_sub (mp_int * a, mp_int * b, mp_int * c) +\end{alltt} + +Which perform $c = a \odot b$ where $\odot$ is one of signed addition or subtraction. The operations are fully sign +aware. + +\section{Sign Manipulation} +\subsection{Negation} +\label{sec:NEG} +Simple integer negation can be performed with the following. + +\index{mp\_neg} +\begin{alltt} +int mp_neg (mp_int * a, mp_int * b); +\end{alltt} + +Which assigns $-a$ to $b$. + +\subsection{Absolute} +Simple integer absolutes can be performed with the following. + +\index{mp\_neg} +\begin{alltt} +int mp_abs (mp_int * a, mp_int * b); +\end{alltt} + +Which assigns $\vert a \vert$ to $b$. + +\section{Integer Division and Remainder} +To perform a complete and general integer division with remainder use the following function. + +\index{mp\_div} +\begin{alltt} +int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d); +\end{alltt} + +This divides $a$ by $b$ and stores the quotient in $c$ and $d$. The signed quotient is computed such that +$bc + d = a$. Note that either of $c$ or $d$ can be set to \textbf{NULL} if their value is not required. If +$b$ is zero the function returns \textbf{MP\_VAL}. + + +\chapter{Multiplication and Squaring} +\section{Multiplication} +A full signed integer multiplication can be performed with the following. +\index{mp\_mul} +\begin{alltt} +int mp_mul (mp_int * a, mp_int * b, mp_int * c); +\end{alltt} +Which assigns the full signed product $ab$ to $c$. This function actually breaks into one of four cases which are +specific multiplication routines optimized for given parameters. 
First there are the Toom-Cook multiplications which +should only be used with very large inputs. This is followed by the Karatsuba multiplications which are for moderate +sized inputs. Then followed by the Comba and baseline multipliers. + +Fortunately for the developer you don't really need to know this unless you really want to fine tune the system. mp\_mul() +will determine on its own\footnote{Some tweaking may be required.} what routine to use automatically when it is called. + +\begin{alltt} +int main(void) +\{ + mp_int number1, number2; + int result; + + /* Initialize the numbers */ + if ((result = mp_init_multi(&number1, + &number2, NULL)) != MP_OKAY) \{ + printf("Error initializing the numbers. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* set the terms */ + if ((result = mp_set_int(&number, 257)) != MP_OKAY) \{ + printf("Error setting number1. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + if ((result = mp_set_int(&number2, 1023)) != MP_OKAY) \{ + printf("Error setting number2. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* multiply them */ + if ((result = mp_mul(&number1, &number2, + &number1)) != MP_OKAY) \{ + printf("Error multiplying terms. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* display */ + printf("number1 * number2 == \%lu", mp_get_int(&number1)); + + /* free terms and return */ + mp_clear_multi(&number1, &number2, NULL); + + return EXIT_SUCCESS; +\} +\end{alltt} + +If this program succeeds it shall output the following. + +\begin{alltt} +number1 * number2 == 262911 +\end{alltt} + +\section{Squaring} +Since squaring can be performed faster than multiplication it is performed it's own function instead of just using +mp\_mul(). + +\index{mp\_sqr} +\begin{alltt} +int mp_sqr (mp_int * a, mp_int * b); +\end{alltt} + +Will square $a$ and store it in $b$. Like the case of multiplication there are four different squaring +algorithms all which can be called from mp\_sqr(). It is ideal to use mp\_sqr over mp\_mul when squaring terms. + +\section{Tuning Polynomial Basis Routines} + +Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that +the Comba and baseline algorithms use. At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectfully they require +considerably less work. For example, a 10000-digit multiplication would take roughly 724,000 single precision +multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor +of 138). + +So why not always use Karatsuba or Toom-Cook? The simple answer is that they have so much overhead that they're not +actually faster than Comba until you hit distinct ``cutoff'' points. For Karatsuba with the default configuration, +GCC 3.3.1 and an Athlon XP processor the cutoff point is roughly 110 digits (about 70 for the Intel P4). That is, at +110 digits Karatsuba and Comba multiplications just about break even and for 110+ digits Karatsuba is faster. + +Toom-Cook has incredible overhead and is probably only useful for very large inputs. So far no known cutoff points +exist and for the most part I just set the cutoff points very high to make sure they're not called. + +A demo program in the ``etc/'' directory of the project called ``tune.c'' can be used to find the cutoff points. 
This +can be built with GCC as follows + +\begin{alltt} +make XXX +\end{alltt} +Where ``XXX'' is one of the following entries from the table \ref{fig:tuning}. + +\begin{figure}[here] +\begin{center} +\begin{small} +\begin{tabular}{|l|l|} +\hline \textbf{Value of XXX} & \textbf{Meaning} \\ +\hline tune & Builds portable tuning application \\ +\hline tune86 & Builds x86 (pentium and up) program for COFF \\ +\hline tune86c & Builds x86 program for Cygwin \\ +\hline tune86l & Builds x86 program for Linux (ELF format) \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Build Names for Tuning Programs} +\label{fig:tuning} +\end{figure} + +When the program is running it will output a series of measurements for different cutoff points. It will first find +good Karatsuba squaring and multiplication points. Then it proceeds to find Toom-Cook points. Note that the Toom-Cook +tuning takes a very long time as the cutoff points are likely to be very high. + +\chapter{Modular Reduction} + +Modular reduction is process of taking the remainder of one quantity divided by another. Expressed +as (\ref{eqn:mod}) the modular reduction is equivalent to the remainder of $b$ divided by $c$. + +\begin{equation} +a \equiv b \mbox{ (mod }c\mbox{)} +\label{eqn:mod} +\end{equation} + +Of particular interest to cryptography are reductions where $b$ is limited to the range $0 \le b < c^2$ since particularly +fast reduction algorithms can be written for the limited range. + +Note that one of the four optimized reduction algorithms are automatically chosen in the modular exponentiation +algorithm mp\_exptmod when an appropriate modulus is detected. + +\section{Straight Division} +In order to effect an arbitrary modular reduction the following algorithm is provided. + +\index{mp\_mod} +\begin{alltt} +int mp_mod(mp_int *a, mp_int *b, mp_int *c); +\end{alltt} + +This reduces $a$ modulo $b$ and stores the result in $c$. The sign of $c$ shall agree with the sign +of $b$. This algorithm accepts an input $a$ of any range and is not limited by $0 \le a < b^2$. + +\section{Barrett Reduction} + +Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve +a decent speedup over straight division. First a $mu$ value must be precomputed with the following function. + +\index{mp\_reduce\_setup} +\begin{alltt} +int mp_reduce_setup(mp_int *a, mp_int *b); +\end{alltt} + +Given a modulus in $b$ this produces the required $mu$ value in $a$. For any given modulus this only has to +be computed once. Modular reduction can now be performed with the following. + +\index{mp\_reduce} +\begin{alltt} +int mp_reduce(mp_int *a, mp_int *b, mp_int *c); +\end{alltt} + +This will reduce $a$ in place modulo $b$ with the precomputed $mu$ value in $c$. $a$ must be in the range +$0 \le a < b^2$. + +\begin{alltt} +int main(void) +\{ + mp_int a, b, c, mu; + int result; + + /* initialize a,b to desired values, mp_init mu, + * c and set c to 1...we want to compute a^3 mod b + */ + + /* get mu value */ + if ((result = mp_reduce_setup(&mu, b)) != MP_OKAY) \{ + printf("Error getting mu. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* square a to get c = a^2 */ + if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{ + printf("Error squaring. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* now reduce `c' modulo b */ + if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{ + printf("Error reducing. 
\%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* multiply a to get c = a^3 */ + if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{ + printf("Error reducing. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* now reduce `c' modulo b */ + if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{ + printf("Error reducing. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* c now equals a^3 mod b */ + + return EXIT_SUCCESS; +\} +\end{alltt} + +This program will calculate $a^3 \mbox{ mod }b$ if all the functions succeed. + +\section{Montgomery Reduction} + +Montgomery is a specialized reduction algorithm for any odd moduli. Like Barrett reduction a pre--computation +step is required. This is accomplished with the following. + +\index{mp\_montgomery\_setup} +\begin{alltt} +int mp_montgomery_setup(mp_int *a, mp_digit *mp); +\end{alltt} + +For the given odd moduli $a$ the precomputation value is placed in $mp$. The reduction is computed with the +following. + +\index{mp\_montgomery\_reduce} +\begin{alltt} +int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); +\end{alltt} +This reduces $a$ in place modulo $m$ with the pre--computed value $mp$. $a$ must be in the range +$0 \le a < b^2$. + +Montgomery reduction is faster than Barrett reduction for moduli smaller than the ``comba'' limit. With the default +setup for instance, the limit is $127$ digits ($3556$--bits). Note that this function is not limited to +$127$ digits just that it falls back to a baseline algorithm after that point. + +An important observation is that this reduction does not return $a \mbox{ mod }m$ but $aR^{-1} \mbox{ mod }m$ +where $R = \beta^n$, $n$ is the n number of digits in $m$ and $\beta$ is radix used (default is $2^{28}$). + +To quickly calculate $R$ the following function was provided. + +\index{mp\_montgomery\_calc\_normalization} +\begin{alltt} +int mp_montgomery_calc_normalization(mp_int *a, mp_int *b); +\end{alltt} +Which calculates $a = R$ for the odd moduli $b$ without using multiplication or division. + +The normal modus operandi for Montgomery reductions is to normalize the integers before entering the system. For +example, to calculate $a^3 \mbox { mod }b$ using Montgomery reduction the value of $a$ can be normalized by +multiplying it by $R$. Consider the following code snippet. + +\begin{alltt} +int main(void) +\{ + mp_int a, b, c, R; + mp_digit mp; + int result; + + /* initialize a,b to desired values, + * mp_init R, c and set c to 1.... + */ + + /* get normalization */ + if ((result = mp_montgomery_calc_normalization(&R, b)) != MP_OKAY) \{ + printf("Error getting norm. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* get mp value */ + if ((result = mp_montgomery_setup(&c, &mp)) != MP_OKAY) \{ + printf("Error setting up montgomery. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* normalize `a' so now a is equal to aR */ + if ((result = mp_mulmod(&a, &R, &b, &a)) != MP_OKAY) \{ + printf("Error computing aR. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* square a to get c = a^2R^2 */ + if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{ + printf("Error squaring. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* now reduce `c' back down to c = a^2R^2 * R^-1 == a^2R */ + if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{ + printf("Error reducing. 
\%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* multiply a to get c = a^3R^2 */ + if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{ + printf("Error reducing. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* now reduce `c' back down to c = a^3R^2 * R^-1 == a^3R */ + if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{ + printf("Error reducing. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* now reduce (again) `c' back down to c = a^3R * R^-1 == a^3 */ + if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{ + printf("Error reducing. \%s", + mp_error_to_string(result)); + return EXIT_FAILURE; + \} + + /* c now equals a^3 mod b */ + + return EXIT_SUCCESS; +\} +\end{alltt} + +This particular example does not look too efficient but it demonstrates the point of the algorithm. By +normalizing the inputs the reduced results are always of the form $aR$ for some variable $a$. This allows +a single final reduction to correct for the normalization and the fast reduction used within the algorithm. + +For more details consider examining the file \textit{bn\_mp\_exptmod\_fast.c}. + +\section{Restricted Dimminished Radix} + +``Dimminished Radix'' reduction refers to reduction with respect to moduli that are ameniable to simple +digit shifting and small multiplications. In this case the ``restricted'' variant refers to moduli of the +form $\beta^k - p$ for some $k \ge 0$ and $0 < p < \beta$ where $\beta$ is the radix (default to $2^{28}$). + +As in the case of Montgomery reduction there is a pre--computation phase required for a given modulus. + +\index{mp\_dr\_setup} +\begin{alltt} +void mp_dr_setup(mp_int *a, mp_digit *d); +\end{alltt} + +This computes the value required for the modulus $a$ and stores it in $d$. This function cannot fail +and does not return any error codes. After the pre--computation a reduction can be performed with the +following. + +\index{mp\_dr\_reduce} +\begin{alltt} +int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp); +\end{alltt} + +This reduces $a$ in place modulo $b$ with the pre--computed value $mp$. $b$ must be of a restricted +dimminished radix form and $a$ must be in the range $0 \le a < b^2$. Dimminished radix reductions are +much faster than both Barrett and Montgomery reductions as they have a much lower asymtotic running time. + +Since the moduli are restricted this algorithm is not particularly useful for something like Rabin, RSA or +BBS cryptographic purposes. This reduction algorithm is useful for Diffie-Hellman and ECC where fixed +primes are acceptable. + +Note that unlike Montgomery reduction there is no normalization process. The result of this function is +equal to the correct residue. + +\section{Unrestricted Dimminshed Radix} + +Unrestricted reductions work much like the restricted counterparts except in this case the moduli is of the +form $2^k - p$ for $0 < p < \beta$. In this sense the unrestricted reductions are more flexible as they +can be applied to a wider range of numbers. + +\index{mp\_reduce\_2k\_setup} +\begin{alltt} +int mp_reduce_2k_setup(mp_int *a, mp_digit *d); +\end{alltt} + +This will compute the required $d$ value for the given moduli $a$. + +\index{mp\_reduce\_2k} +\begin{alltt} +int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d); +\end{alltt} + +This will reduce $a$ in place modulo $n$ with the pre--computed value $d$. From my experience this routine is +slower than mp\_dr\_reduce but faster for most moduli sizes than the Montgomery reduction. 
+ +\chapter{Exponentiation} +\section{Single Digit Exponentiation} +\index{mp\_expt\_d} +\begin{alltt} +int mp_expt_d (mp_int * a, mp_digit b, mp_int * c) +\end{alltt} +This computes $c = a^b$ using a simple binary left-to-right algorithm. It is faster than repeated multiplications by +$a$ for all values of $b$ greater than three. + +\section{Modular Exponentiation} +\index{mp\_exptmod} +\begin{alltt} +int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) +\end{alltt} +This computes $Y \equiv G^X \mbox{ (mod }P\mbox{)}$ using a variable width sliding window algorithm. This function +will automatically detect the fastest modular reduction technique to use during the operation. For negative values of +$X$ the operation is performed as $Y \equiv (G^{-1} \mbox{ mod }P)^{\vert X \vert} \mbox{ (mod }P\mbox{)}$ provided that +$gcd(G, P) = 1$. + +This function is actually a shell around the two internal exponentiation functions. This routine will automatically +detect when Barrett, Montgomery, Restricted and Unrestricted Dimminished Radix based exponentiation can be used. Generally +moduli of the a ``restricted dimminished radix'' form lead to the fastest modular exponentiations. Followed by Montgomery +and the other two algorithms. + +\section{Root Finding} +\index{mp\_n\_root} +\begin{alltt} +int mp_n_root (mp_int * a, mp_digit b, mp_int * c) +\end{alltt} +This computes $c = a^{1/b}$ such that $c^b \le a$ and $(c+1)^b > a$. The implementation of this function is not +ideal for values of $b$ greater than three. It will work but become very slow. So unless you are working with very small +numbers (less than 1000 bits) I'd avoid $b > 3$ situations. Will return a positive root only for even roots and return +a root with the sign of the input for odd roots. For example, performing $4^{1/2}$ will return $2$ whereas $(-8)^{1/3}$ +will return $-2$. + +This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly. Since +the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large +values of $b$. If particularly large roots are required then a factor method could be used instead. For example, +$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$. + +\chapter{Prime Numbers} +\section{Trial Division} +\index{mp\_prime\_is\_divisible} +\begin{alltt} +int mp_prime_is_divisible (mp_int * a, int *result) +\end{alltt} +This will attempt to evenly divide $a$ by a list of primes\footnote{Default is the first 256 primes.} and store the +outcome in ``result''. That is if $result = 0$ then $a$ is not divisible by the primes, otherwise it is. Note that +if the function does not return \textbf{MP\_OKAY} the value in ``result'' should be considered undefined\footnote{Currently +the default is to set it to zero first.}. + +\section{Fermat Test} +\index{mp\_prime\_fermat} +\begin{alltt} +int mp_prime_fermat (mp_int * a, mp_int * b, int *result) +\end{alltt} +Performs a Fermat primality test to the base $b$. That is it computes $b^a \mbox{ mod }a$ and tests whether the value is +equal to $b$ or not. If the values are equal then $a$ is probably prime and $result$ is set to one. Otherwise $result$ +is set to zero. + +\section{Miller-Rabin Test} +\index{mp\_prime\_miller\_rabin} +\begin{alltt} +int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result) +\end{alltt} +Performs a Miller-Rabin test to the base $b$ of $a$. 
This test is much stronger than the Fermat test and is very hard to +fool (besides with Carmichael numbers). If $a$ passes the test (therefore is probably prime) $result$ is set to one. +Otherwise $result$ is set to zero. + +Note that is suggested that you use the Miller-Rabin test instead of the Fermat test since all of the failures of +Miller-Rabin are a subset of the failures of the Fermat test. + +\subsection{Required Number of Tests} +Generally to ensure a number is very likely to be prime you have to perform the Miller-Rabin with at least a half-dozen +or so unique bases. However, it has been proven that the probability of failure goes down as the size of the input goes up. +This is why a simple function has been provided to help out. + +\index{mp\_prime\_rabin\_miller\_trials} +\begin{alltt} +int mp_prime_rabin_miller_trials(int size) +\end{alltt} +This returns the number of trials required for a $2^{-96}$ (or lower) probability of failure for a given ``size'' expressed +in bits. This comes in handy specially since larger numbers are slower to test. For example, a 512-bit number would +require ten tests whereas a 1024-bit number would only require four tests. + +You should always still perform a trial division before a Miller-Rabin test though. + +\section{Primality Testing} +\index{mp\_prime\_is\_prime} +\begin{alltt} +int mp_prime_is_prime (mp_int * a, int t, int *result) +\end{alltt} +This will perform a trial division followed by $t$ rounds of Miller-Rabin tests on $a$ and store the result in $result$. +If $a$ passes all of the tests $result$ is set to one, otherwise it is set to zero. Note that $t$ is bounded by +$1 \le t < PRIME\_SIZE$ where $PRIME\_SIZE$ is the number of primes in the prime number table (by default this is $256$). + +\section{Next Prime} +\index{mp\_prime\_next\_prime} +\begin{alltt} +int mp_prime_next_prime(mp_int *a, int t, int bbs_style) +\end{alltt} +This finds the next prime after $a$ that passes mp\_prime\_is\_prime() with $t$ tests. Set $bbs\_style$ to one if you +want only the next prime congruent to $3 \mbox{ mod } 4$, otherwise set it to zero to find any next prime. + +\section{Random Primes} +\index{mp\_prime\_random} +\begin{alltt} +int mp_prime_random(mp_int *a, int t, int size, int bbs, + ltm_prime_callback cb, void *dat) +\end{alltt} +This will find a prime greater than $256^{size}$ which can be ``bbs\_style'' or not depending on $bbs$ and must pass +$t$ rounds of tests. The ``ltm\_prime\_callback'' is a typedef for + +\begin{alltt} +typedef int ltm_prime_callback(unsigned char *dst, int len, void *dat); +\end{alltt} + +Which is a function that must read $len$ bytes (and return the amount stored) into $dst$. The $dat$ variable is simply +copied from the original input. It can be used to pass RNG context data to the callback. The function +mp\_prime\_random() is more suitable for generating primes which must be secret (as in the case of RSA) since there +is no skew on the least significant bits. + +\textit{Note:} As of v0.30 of the LibTomMath library this function has been deprecated. It is still available +but users are encouraged to use the new mp\_prime\_random\_ex() function instead. + +\subsection{Extended Generation} +\index{mp\_prime\_random\_ex} +\begin{alltt} +int mp_prime_random_ex(mp_int *a, int t, + int size, int flags, + ltm_prime_callback cb, void *dat); +\end{alltt} +This will generate a prime in $a$ using $t$ tests of the primality testing algorithms. The variable $size$ +specifies the bit length of the prime desired. 
The variable $flags$ specifies one of several options available +(see fig. \ref{fig:primeopts}) which can be OR'ed together. The callback parameters are used as in +mp\_prime\_random(). + +\begin{figure}[here] +\begin{center} +\begin{small} +\begin{tabular}{|r|l|} +\hline \textbf{Flag} & \textbf{Meaning} \\ +\hline LTM\_PRIME\_BBS & Make the prime congruent to $3$ modulo $4$ \\ +\hline LTM\_PRIME\_SAFE & Make a prime $p$ such that $(p - 1)/2$ is also prime. \\ + & This option implies LTM\_PRIME\_BBS as well. \\ +\hline LTM\_PRIME\_2MSB\_OFF & Makes sure that the bit adjacent to the most significant bit \\ + & Is forced to zero. \\ +\hline LTM\_PRIME\_2MSB\_ON & Makes sure that the bit adjacent to the most significant bit \\ + & Is forced to one. \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Primality Generation Options} +\label{fig:primeopts} +\end{figure} + +\chapter{Input and Output} +\section{ASCII Conversions} +\subsection{To ASCII} +\index{mp\_toradix} +\begin{alltt} +int mp_toradix (mp_int * a, char *str, int radix); +\end{alltt} +This still store $a$ in ``str'' as a base-``radix'' string of ASCII chars. This function appends a NUL character +to terminate the string. Valid values of ``radix'' line in the range $[2, 64]$. To determine the size (exact) required +by the conversion before storing any data use the following function. + +\index{mp\_radix\_size} +\begin{alltt} +int mp_radix_size (mp_int * a, int radix, int *size) +\end{alltt} +This stores in ``size'' the number of characters (including space for the NUL terminator) required. Upon error this +function returns an error code and ``size'' will be zero. + +\subsection{From ASCII} +\index{mp\_read\_radix} +\begin{alltt} +int mp_read_radix (mp_int * a, char *str, int radix); +\end{alltt} +This will read the base-``radix'' NUL terminated string from ``str'' into $a$. It will stop reading when it reads a +character it does not recognize (which happens to include th NUL char... imagine that...). A single leading $-$ sign +can be used to denote a negative number. + +\section{Binary Conversions} + +Converting an mp\_int to and from binary is another keen idea. + +\index{mp\_unsigned\_bin\_size} +\begin{alltt} +int mp_unsigned_bin_size(mp_int *a); +\end{alltt} + +This will return the number of bytes (octets) required to store the unsigned copy of the integer $a$. + +\index{mp\_to\_unsigned\_bin} +\begin{alltt} +int mp_to_unsigned_bin(mp_int *a, unsigned char *b); +\end{alltt} +This will store $a$ into the buffer $b$ in big--endian format. Fortunately this is exactly what DER (or is it ASN?) +requires. It does not store the sign of the integer. + +\index{mp\_read\_unsigned\_bin} +\begin{alltt} +int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c); +\end{alltt} +This will read in an unsigned big--endian array of bytes (octets) from $b$ of length $c$ into $a$. The resulting +integer $a$ will always be positive. + +For those who acknowledge the existence of negative numbers (heretic!) there are ``signed'' versions of the +previous functions. + +\begin{alltt} +int mp_signed_bin_size(mp_int *a); +int mp_read_signed_bin(mp_int *a, unsigned char *b, int c); +int mp_to_signed_bin(mp_int *a, unsigned char *b); +\end{alltt} +They operate essentially the same as the unsigned copies except they prefix the data with zero or non--zero +byte depending on the sign. If the sign is zpos (e.g. not negative) the prefix is zero, otherwise the prefix +is non--zero. 
+ +\chapter{Algebraic Functions} +\section{Extended Euclidean Algorithm} +\index{mp\_exteuclid} +\begin{alltt} +int mp_exteuclid(mp_int *a, mp_int *b, + mp_int *U1, mp_int *U2, mp_int *U3); +\end{alltt} + +This finds the triple U1/U2/U3 using the Extended Euclidean algorithm such that the following equation holds. + +\begin{equation} +a \cdot U1 + b \cdot U2 = U3 +\end{equation} + +Any of the U1/U2/U3 paramters can be set to \textbf{NULL} if they are not desired. + +\section{Greatest Common Divisor} +\index{mp\_gcd} +\begin{alltt} +int mp_gcd (mp_int * a, mp_int * b, mp_int * c) +\end{alltt} +This will compute the greatest common divisor of $a$ and $b$ and store it in $c$. + +\section{Least Common Multiple} +\index{mp\_lcm} +\begin{alltt} +int mp_lcm (mp_int * a, mp_int * b, mp_int * c) +\end{alltt} +This will compute the least common multiple of $a$ and $b$ and store it in $c$. + +\section{Jacobi Symbol} +\index{mp\_jacobi} +\begin{alltt} +int mp_jacobi (mp_int * a, mp_int * p, int *c) +\end{alltt} +This will compute the Jacobi symbol for $a$ with respect to $p$. If $p$ is prime this essentially computes the Legendre +symbol. The result is stored in $c$ and can take on one of three values $\lbrace -1, 0, 1 \rbrace$. If $p$ is prime +then the result will be $-1$ when $a$ is not a quadratic residue modulo $p$. The result will be $0$ if $a$ divides $p$ +and the result will be $1$ if $a$ is a quadratic residue modulo $p$. + +\section{Modular Inverse} +\index{mp\_invmod} +\begin{alltt} +int mp_invmod (mp_int * a, mp_int * b, mp_int * c) +\end{alltt} +Computes the multiplicative inverse of $a$ modulo $b$ and stores the result in $c$ such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$. + +\section{Single Digit Functions} + +For those using small numbers (\textit{snicker snicker}) there are several ``helper'' functions + +\index{mp\_add\_d} \index{mp\_sub\_d} \index{mp\_mul\_d} \index{mp\_div\_d} \index{mp\_mod\_d} +\begin{alltt} +int mp_add_d(mp_int *a, mp_digit b, mp_int *c); +int mp_sub_d(mp_int *a, mp_digit b, mp_int *c); +int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); +int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); +int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c); +\end{alltt} + +These work like the full mp\_int capable variants except the second parameter $b$ is a mp\_digit. These +functions fairly handy if you have to work with relatively small numbers since you will not have to allocate +an entire mp\_int to store a number like $1$ or $2$. + +\input{bn.ind} + +\end{document}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/poster.tex Tue Jun 15 14:42:57 2004 +0000 @@ -0,0 +1,35 @@ +\documentclass[landscape,11pt]{article} +\usepackage{amsmath, amssymb} +\usepackage{hyperref} +\begin{document} +\hspace*{-3in} +\begin{tabular}{llllll} +$c = a + b$ & {\tt mp\_add(\&a, \&b, \&c)} & $b = 2a$ & {\tt mp\_mul\_2(\&a, \&b)} & \\ +$c = a - b$ & {\tt mp\_sub(\&a, \&b, \&c)} & $b = a/2$ & {\tt mp\_div\_2(\&a, \&b)} & \\ +$c = ab $ & {\tt mp\_mul(\&a, \&b, \&c)} & $c = 2^ba$ & {\tt mp\_mul\_2d(\&a, b, \&c)} \\ +$b = a^2 $ & {\tt mp\_sqr(\&a, \&b)} & $c = a/2^b, d = a \mod 2^b$ & {\tt mp\_div\_2d(\&a, b, \&c, \&d)} \\ +$c = \lfloor a/b \rfloor, d = a \mod b$ & {\tt mp\_div(\&a, \&b, \&c, \&d)} & $c = a \mod 2^b $ & {\tt mp\_mod\_2d(\&a, b, \&c)} \\ + && \\ +$a = b $ & {\tt mp\_set\_int(\&a, b)} & $c = a \vee b$ & {\tt mp\_or(\&a, \&b, \&c)} \\ +$b = a $ & {\tt mp\_copy(\&a, \&b)} & $c = a \wedge b$ & {\tt mp\_and(\&a, \&b, \&c)} \\ + && $c = a \oplus b$ & {\tt mp\_xor(\&a, \&b, \&c)} \\ + & \\ +$b = -a $ & {\tt mp\_neg(\&a, \&b)} & $d = a + b \mod c$ & {\tt mp\_addmod(\&a, \&b, \&c, \&d)} \\ +$b = |a| $ & {\tt mp\_abs(\&a, \&b)} & $d = a - b \mod c$ & {\tt mp\_submod(\&a, \&b, \&c, \&d)} \\ + && $d = ab \mod c$ & {\tt mp\_mulmod(\&a, \&b, \&c, \&d)} \\ +Compare $a$ and $b$ & {\tt mp\_cmp(\&a, \&b)} & $c = a^2 \mod b$ & {\tt mp\_sqrmod(\&a, \&b, \&c)} \\ +Is Zero? & {\tt mp\_iszero(\&a)} & $c = a^{-1} \mod b$ & {\tt mp\_invmod(\&a, \&b, \&c)} \\ +Is Even? & {\tt mp\_iseven(\&a)} & $d = a^b \mod c$ & {\tt mp\_exptmod(\&a, \&b, \&c, \&d)} \\ +Is Odd ? & {\tt mp\_isodd(\&a)} \\ +&\\ +$\vert \vert a \vert \vert$ & {\tt mp\_unsigned\_bin\_size(\&a)} & $res$ = 1 if $a$ prime to $t$ rounds? & {\tt mp\_prime\_is\_prime(\&a, t, \&res)} \\ +$buf \leftarrow a$ & {\tt mp\_to\_unsigned\_bin(\&a, buf)} & Next prime after $a$ to $t$ rounds. & {\tt mp\_prime\_next\_prime(\&a, t, bbs\_style)} \\ +$a \leftarrow buf[0..len-1]$ & {\tt mp\_read\_unsigned\_bin(\&a, buf, len)} \\ +&\\ +$b = \sqrt{a}$ & {\tt mp\_sqrt(\&a, \&b)} & $c = \mbox{gcd}(a, b)$ & {\tt mp\_gcd(\&a, \&b, \&c)} \\ +$c = a^{1/b}$ & {\tt mp\_n\_root(\&a, b, \&c)} & $c = \mbox{lcm}(a, b)$ & {\tt mp\_lcm(\&a, \&b, \&c)} \\ +&\\ +Greater Than & MP\_GT & Equal To & MP\_EQ \\ +Less Than & MP\_LT & Bits per digit & DIGIT\_BIT \\ +\end{tabular} +\end{document}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tommath.out Tue Jun 15 14:42:57 2004 +0000 @@ -0,0 +1,139 @@ +\BOOKMARK [0][-]{chapter.1}{Introduction}{} +\BOOKMARK [1][-]{section.1.1}{Multiple Precision Arithmetic}{chapter.1} +\BOOKMARK [2][-]{subsection.1.1.1}{What is Multiple Precision Arithmetic?}{section.1.1} +\BOOKMARK [2][-]{subsection.1.1.2}{The Need for Multiple Precision Arithmetic}{section.1.1} +\BOOKMARK [2][-]{subsection.1.1.3}{Benefits of Multiple Precision Arithmetic}{section.1.1} +\BOOKMARK [1][-]{section.1.2}{Purpose of This Text}{chapter.1} +\BOOKMARK [1][-]{section.1.3}{Discussion and Notation}{chapter.1} +\BOOKMARK [2][-]{subsection.1.3.1}{Notation}{section.1.3} +\BOOKMARK [2][-]{subsection.1.3.2}{Precision Notation}{section.1.3} +\BOOKMARK [2][-]{subsection.1.3.3}{Algorithm Inputs and Outputs}{section.1.3} +\BOOKMARK [2][-]{subsection.1.3.4}{Mathematical Expressions}{section.1.3} +\BOOKMARK [2][-]{subsection.1.3.5}{Work Effort}{section.1.3} +\BOOKMARK [1][-]{section.1.4}{Exercises}{chapter.1} +\BOOKMARK [1][-]{section.1.5}{Introduction to LibTomMath}{chapter.1} +\BOOKMARK [2][-]{subsection.1.5.1}{What is LibTomMath?}{section.1.5} +\BOOKMARK [2][-]{subsection.1.5.2}{Goals of LibTomMath}{section.1.5} +\BOOKMARK [1][-]{section.1.6}{Choice of LibTomMath}{chapter.1} +\BOOKMARK [2][-]{subsection.1.6.1}{Code Base}{section.1.6} +\BOOKMARK [2][-]{subsection.1.6.2}{API Simplicity}{section.1.6} +\BOOKMARK [2][-]{subsection.1.6.3}{Optimizations}{section.1.6} +\BOOKMARK [2][-]{subsection.1.6.4}{Portability and Stability}{section.1.6} +\BOOKMARK [2][-]{subsection.1.6.5}{Choice}{section.1.6} +\BOOKMARK [0][-]{chapter.2}{Getting Started}{} +\BOOKMARK [1][-]{section.2.1}{Library Basics}{chapter.2} +\BOOKMARK [1][-]{section.2.2}{What is a Multiple Precision Integer?}{chapter.2} +\BOOKMARK [2][-]{subsection.2.2.1}{The mp\137int Structure}{section.2.2} +\BOOKMARK [1][-]{section.2.3}{Argument Passing}{chapter.2} +\BOOKMARK [1][-]{section.2.4}{Return Values}{chapter.2} +\BOOKMARK [1][-]{section.2.5}{Initialization and Clearing}{chapter.2} +\BOOKMARK [2][-]{subsection.2.5.1}{Initializing an mp\137int}{section.2.5} +\BOOKMARK [2][-]{subsection.2.5.2}{Clearing an mp\137int}{section.2.5} +\BOOKMARK [1][-]{section.2.6}{Maintenance Algorithms}{chapter.2} +\BOOKMARK [2][-]{subsection.2.6.1}{Augmenting an mp\137int's Precision}{section.2.6} +\BOOKMARK [2][-]{subsection.2.6.2}{Initializing Variable Precision mp\137ints}{section.2.6} +\BOOKMARK [2][-]{subsection.2.6.3}{Multiple Integer Initializations and Clearings}{section.2.6} +\BOOKMARK [2][-]{subsection.2.6.4}{Clamping Excess Digits}{section.2.6} +\BOOKMARK [0][-]{chapter.3}{Basic Operations}{} +\BOOKMARK [1][-]{section.3.1}{Introduction}{chapter.3} +\BOOKMARK [1][-]{section.3.2}{Assigning Values to mp\137int Structures}{chapter.3} +\BOOKMARK [2][-]{subsection.3.2.1}{Copying an mp\137int}{section.3.2} +\BOOKMARK [2][-]{subsection.3.2.2}{Creating a Clone}{section.3.2} +\BOOKMARK [1][-]{section.3.3}{Zeroing an Integer}{chapter.3} +\BOOKMARK [1][-]{section.3.4}{Sign Manipulation}{chapter.3} +\BOOKMARK [2][-]{subsection.3.4.1}{Absolute Value}{section.3.4} +\BOOKMARK [2][-]{subsection.3.4.2}{Integer Negation}{section.3.4} +\BOOKMARK [1][-]{section.3.5}{Small Constants}{chapter.3} +\BOOKMARK [2][-]{subsection.3.5.1}{Setting Small Constants}{section.3.5} +\BOOKMARK [2][-]{subsection.3.5.2}{Setting Large Constants}{section.3.5} +\BOOKMARK [1][-]{section.3.6}{Comparisons}{chapter.3} +\BOOKMARK [2][-]{subsection.3.6.1}{Unsigned 
Comparisions}{section.3.6} +\BOOKMARK [2][-]{subsection.3.6.2}{Signed Comparisons}{section.3.6} +\BOOKMARK [0][-]{chapter.4}{Basic Arithmetic}{} +\BOOKMARK [1][-]{section.4.1}{Introduction}{chapter.4} +\BOOKMARK [1][-]{section.4.2}{Addition and Subtraction}{chapter.4} +\BOOKMARK [2][-]{subsection.4.2.1}{Low Level Addition}{section.4.2} +\BOOKMARK [2][-]{subsection.4.2.2}{Low Level Subtraction}{section.4.2} +\BOOKMARK [2][-]{subsection.4.2.3}{High Level Addition}{section.4.2} +\BOOKMARK [2][-]{subsection.4.2.4}{High Level Subtraction}{section.4.2} +\BOOKMARK [1][-]{section.4.3}{Bit and Digit Shifting}{chapter.4} +\BOOKMARK [2][-]{subsection.4.3.1}{Multiplication by Two}{section.4.3} +\BOOKMARK [2][-]{subsection.4.3.2}{Division by Two}{section.4.3} +\BOOKMARK [1][-]{section.4.4}{Polynomial Basis Operations}{chapter.4} +\BOOKMARK [2][-]{subsection.4.4.1}{Multiplication by x}{section.4.4} +\BOOKMARK [2][-]{subsection.4.4.2}{Division by x}{section.4.4} +\BOOKMARK [1][-]{section.4.5}{Powers of Two}{chapter.4} +\BOOKMARK [2][-]{subsection.4.5.1}{Multiplication by Power of Two}{section.4.5} +\BOOKMARK [2][-]{subsection.4.5.2}{Division by Power of Two}{section.4.5} +\BOOKMARK [2][-]{subsection.4.5.3}{Remainder of Division by Power of Two}{section.4.5} +\BOOKMARK [0][-]{chapter.5}{Multiplication and Squaring}{} +\BOOKMARK [1][-]{section.5.1}{The Multipliers}{chapter.5} +\BOOKMARK [1][-]{section.5.2}{Multiplication}{chapter.5} +\BOOKMARK [2][-]{subsection.5.2.1}{The Baseline Multiplication}{section.5.2} +\BOOKMARK [2][-]{subsection.5.2.2}{Faster Multiplication by the ``Comba'' Method}{section.5.2} +\BOOKMARK [2][-]{subsection.5.2.3}{Polynomial Basis Multiplication}{section.5.2} +\BOOKMARK [2][-]{subsection.5.2.4}{Karatsuba Multiplication}{section.5.2} +\BOOKMARK [2][-]{subsection.5.2.5}{Toom-Cook 3-Way Multiplication}{section.5.2} +\BOOKMARK [2][-]{subsection.5.2.6}{Signed Multiplication}{section.5.2} +\BOOKMARK [1][-]{section.5.3}{Squaring}{chapter.5} +\BOOKMARK [2][-]{subsection.5.3.1}{The Baseline Squaring Algorithm}{section.5.3} +\BOOKMARK [2][-]{subsection.5.3.2}{Faster Squaring by the ``Comba'' Method}{section.5.3} +\BOOKMARK [2][-]{subsection.5.3.3}{Polynomial Basis Squaring}{section.5.3} +\BOOKMARK [2][-]{subsection.5.3.4}{Karatsuba Squaring}{section.5.3} +\BOOKMARK [2][-]{subsection.5.3.5}{Toom-Cook Squaring}{section.5.3} +\BOOKMARK [2][-]{subsection.5.3.6}{High Level Squaring}{section.5.3} +\BOOKMARK [0][-]{chapter.6}{Modular Reduction}{} +\BOOKMARK [1][-]{section.6.1}{Basics of Modular Reduction}{chapter.6} +\BOOKMARK [1][-]{section.6.2}{The Barrett Reduction}{chapter.6} +\BOOKMARK [2][-]{subsection.6.2.1}{Fixed Point Arithmetic}{section.6.2} +\BOOKMARK [2][-]{subsection.6.2.2}{Choosing a Radix Point}{section.6.2} +\BOOKMARK [2][-]{subsection.6.2.3}{Trimming the Quotient}{section.6.2} +\BOOKMARK [2][-]{subsection.6.2.4}{Trimming the Residue}{section.6.2} +\BOOKMARK [2][-]{subsection.6.2.5}{The Barrett Algorithm}{section.6.2} +\BOOKMARK [2][-]{subsection.6.2.6}{The Barrett Setup Algorithm}{section.6.2} +\BOOKMARK [1][-]{section.6.3}{The Montgomery Reduction}{chapter.6} +\BOOKMARK [2][-]{subsection.6.3.1}{Digit Based Montgomery Reduction}{section.6.3} +\BOOKMARK [2][-]{subsection.6.3.2}{Baseline Montgomery Reduction}{section.6.3} +\BOOKMARK [2][-]{subsection.6.3.3}{Faster ``Comba'' Montgomery Reduction}{section.6.3} +\BOOKMARK [2][-]{subsection.6.3.4}{Montgomery Setup}{section.6.3} +\BOOKMARK [1][-]{section.6.4}{The Diminished Radix Algorithm}{chapter.6} +\BOOKMARK 
[2][-]{subsection.6.4.1}{Choice of Moduli}{section.6.4} +\BOOKMARK [2][-]{subsection.6.4.2}{Choice of k}{section.6.4} +\BOOKMARK [2][-]{subsection.6.4.3}{Restricted Diminished Radix Reduction}{section.6.4} +\BOOKMARK [2][-]{subsection.6.4.4}{Unrestricted Diminished Radix Reduction}{section.6.4} +\BOOKMARK [1][-]{section.6.5}{Algorithm Comparison}{chapter.6} +\BOOKMARK [0][-]{chapter.7}{Exponentiation}{} +\BOOKMARK [1][-]{section.7.1}{Exponentiation Basics}{chapter.7} +\BOOKMARK [2][-]{subsection.7.1.1}{Single Digit Exponentiation}{section.7.1} +\BOOKMARK [1][-]{section.7.2}{k-ary Exponentiation}{chapter.7} +\BOOKMARK [2][-]{subsection.7.2.1}{Optimal Values of k}{section.7.2} +\BOOKMARK [2][-]{subsection.7.2.2}{Sliding-Window Exponentiation}{section.7.2} +\BOOKMARK [1][-]{section.7.3}{Modular Exponentiation}{chapter.7} +\BOOKMARK [2][-]{subsection.7.3.1}{Barrett Modular Exponentiation}{section.7.3} +\BOOKMARK [1][-]{section.7.4}{Quick Power of Two}{chapter.7} +\BOOKMARK [0][-]{chapter.8}{Higher Level Algorithms}{} +\BOOKMARK [1][-]{section.8.1}{Integer Division with Remainder}{chapter.8} +\BOOKMARK [2][-]{subsection.8.1.1}{Quotient Estimation}{section.8.1} +\BOOKMARK [2][-]{subsection.8.1.2}{Normalized Integers}{section.8.1} +\BOOKMARK [2][-]{subsection.8.1.3}{Radix- Division with Remainder}{section.8.1} +\BOOKMARK [1][-]{section.8.2}{Single Digit Helpers}{chapter.8} +\BOOKMARK [2][-]{subsection.8.2.1}{Single Digit Addition and Subtraction}{section.8.2} +\BOOKMARK [2][-]{subsection.8.2.2}{Single Digit Multiplication}{section.8.2} +\BOOKMARK [2][-]{subsection.8.2.3}{Single Digit Division}{section.8.2} +\BOOKMARK [2][-]{subsection.8.2.4}{Single Digit Root Extraction}{section.8.2} +\BOOKMARK [1][-]{section.8.3}{Random Number Generation}{chapter.8} +\BOOKMARK [1][-]{section.8.4}{Formatted Representations}{chapter.8} +\BOOKMARK [2][-]{subsection.8.4.1}{Reading Radix-n Input}{section.8.4} +\BOOKMARK [2][-]{subsection.8.4.2}{Generating Radix-n Output}{section.8.4} +\BOOKMARK [0][-]{chapter.9}{Number Theoretic Algorithms}{} +\BOOKMARK [1][-]{section.9.1}{Greatest Common Divisor}{chapter.9} +\BOOKMARK [2][-]{subsection.9.1.1}{Complete Greatest Common Divisor}{section.9.1} +\BOOKMARK [1][-]{section.9.2}{Least Common Multiple}{chapter.9} +\BOOKMARK [1][-]{section.9.3}{Jacobi Symbol Computation}{chapter.9} +\BOOKMARK [2][-]{subsection.9.3.1}{Jacobi Symbol}{section.9.3} +\BOOKMARK [1][-]{section.9.4}{Modular Inverse}{chapter.9} +\BOOKMARK [2][-]{subsection.9.4.1}{General Case}{section.9.4} +\BOOKMARK [1][-]{section.9.5}{Primality Tests}{chapter.9} +\BOOKMARK [2][-]{subsection.9.5.1}{Trial Division}{section.9.5} +\BOOKMARK [2][-]{subsection.9.5.2}{The Fermat Test}{section.9.5} +\BOOKMARK [2][-]{subsection.9.5.3}{The Miller-Rabin Test}{section.9.5}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tommath.src Tue Jun 15 14:42:57 2004 +0000 @@ -0,0 +1,6287 @@ +\documentclass[b5paper]{book} +\usepackage{hyperref} +\usepackage{makeidx} +\usepackage{amssymb} +\usepackage{color} +\usepackage{alltt} +\usepackage{graphicx} +\usepackage{layout} +\def\union{\cup} +\def\intersect{\cap} +\def\getsrandom{\stackrel{\rm R}{\gets}} +\def\cross{\times} +\def\cat{\hspace{0.5em} \| \hspace{0.5em}} +\def\catn{$\|$} +\def\divides{\hspace{0.3em} | \hspace{0.3em}} +\def\nequiv{\not\equiv} +\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}} +\def\lcm{{\rm lcm}} +\def\gcd{{\rm gcd}} +\def\log{{\rm log}} +\def\ord{{\rm ord}} +\def\abs{{\mathit abs}} +\def\rep{{\mathit rep}} +\def\mod{{\mathit\ mod\ }} +\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})} +\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor} +\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil} +\def\Or{{\rm\ or\ }} +\def\And{{\rm\ and\ }} +\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}} +\def\implies{\Rightarrow} +\def\undefined{{\rm ``undefined"}} +\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}} +\let\oldphi\phi +\def\phi{\varphi} +\def\Pr{{\rm Pr}} +\newcommand{\str}[1]{{\mathbf{#1}}} +\def\F{{\mathbb F}} +\def\N{{\mathbb N}} +\def\Z{{\mathbb Z}} +\def\R{{\mathbb R}} +\def\C{{\mathbb C}} +\def\Q{{\mathbb Q}} +\definecolor{DGray}{gray}{0.5} +\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}} +\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}} +\def\gap{\vspace{0.5ex}} +\makeindex +\begin{document} +\frontmatter +\pagestyle{empty} +\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition } +\author{\mbox{ +%\begin{small} +\begin{tabular}{c} +Tom St Denis \\ +Algonquin College \\ +\\ +Mads Rasmussen \\ +Open Communications Security \\ +\\ +Greg Rose \\ +QUALCOMM Australia \\ +\end{tabular} +%\end{small} +} +} +\maketitle +This text has been placed in the public domain. This text corresponds to the v0.30 release of the +LibTomMath project. + +\begin{alltt} +Tom St Denis +111 Banning Rd +Ottawa, Ontario +K2L 1C3 +Canada + +Phone: 1-613-836-3160 +Email: [email protected] +\end{alltt} + +This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} +{\em book} macro package and the Perl {\em booker} package. + +\tableofcontents +\listoffigures +\chapter*{Prefaces to the Draft Edition} +I started this text in April 2003 to complement my LibTomMath library. That is, explain how to implement the functions +contained in LibTomMath. The goal is to have a textbook that any Computer Science student can use when implementing their +own multiple precision arithmetic. The plan I wanted to follow was flesh out all the +ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time. Chance +would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the +text. + +Choosing to not waste any time I dove right into the project even before my spring semester was finished. I wrote a bit +off and on at first. The moment my exams were finished I jumped into long 12 to 16 hour days. The result after only +a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted +to read it. I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara. 
So far I have +managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain +rewarding. + +Now we are past December 2003. By this time I had pictured that I would have at least finished my second draft of the text. +Currently I am far off from this goal. I've done partial re-writes of chapters one, two and three but they are not even +finished yet. I haven't given up on the project, only had some setbacks. First O'Reilly declined to publish the text then +Addison-Wesley and Greg is tried another which I don't know the name of. However, at this point I want to focus my energy +onto finishing the book not securing a contract. + +So why am I writing this text? It seems like a lot of work right? Most certainly it is a lot of work writing a textbook. +Even the simplest introductory material has to be lined with references and figures. A lot of the text has to be re-written +from point form to prose form to ensure an easier read. Why am I doing all this work for free then? Simple. My philosophy +is quite simply ``Open Source. Open Academia. Open Minds'' which means that to achieve a goal of open minds, that is, +people willing to accept new ideas and explore the unknown you have to make available material they can access freely +without hinderance. + +I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come +to depend upon. I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their +software. Several educational institutions use it as a matter of course and many freelance developers use it as +part of their projects. To further my contributions I started the LibTomMath project in December 2002 aimed at providing +multiple precision arithmetic routines that students could learn from. That is write routines that are not only easy +to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C. + +The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in. In the end, when all is +said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic. + +At this time I feel I should share a little information about myself. The most common question I was asked at +Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended. The unfortunate +truth is that I neither teach at or attend a school of academic reputation. I'm currently at Algonquin College which +is what I'd like to call ``somewhat academic but mostly vocational'' college. In otherwords, job training. + +I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen +computer science fields, a few fields of mathematics and some English). I look forward to teaching someday but I am +still far off from that goal. + +Now it would be improper for me to not introduce the rest of the texts co-authors. While they are only contributing +corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out +in the text so far. Greg has always been there for me. He has tracked my LibTom projects since their inception and even +sent cheques to help pay tuition from time to time. His background has provided a wonderful source to bounce ideas off +of and improve the quality of my writing. 
Mads is another fellow who has just ``been there''. I don't even recall what +his interest in the LibTom projects is but I'm definitely glad he has been around. His ability to catch logical errors +in my written English have saved me on several occasions to say the least. + +What to expect next? Well this is still a rough draft. I've only had the chance to update a few chapters. However, I've +been getting the feeling that people are starting to use my text and I owe them some updated material. My current tenative +plan is to edit one chapter every two weeks starting January 4th. It seems insane but my lower course load at college +should provide ample time. By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many +people who will take it. + +\begin{flushright} Tom St Denis \end{flushright} + +\newpage +I found the opportunity to work with Tom appealing for several reasons, not only could I broaden my own horizons, but also +contribute to educate others facing the problem of having to handle big number mathematical calculations. + +This book is Tom's child and he has been caring and fostering the project ever since the beginning with a clear mind of +how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about +the layout and language used. + +I hold a masters degree in cryptography from the University of Southern Denmark and have always been interested in the +practical aspects of cryptography. + +Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a +great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up +multiple precision calculations is often very important since we deal with outdated machine architecture where modular +reductions, for example, become painfully slow. + +This text is for people who stop and wonder when first examining algorithms such as RSA for the first time and asks +themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?'' + +\begin{flushright} +Mads Rasmussen + +S\~{a}o Paulo - SP + +Brazil +\end{flushright} + +\newpage +It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about +Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not +really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once. + +At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the +sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real +contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity. +Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake. + +When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully, +and at least review all of it, and perhaps write some bits too. 
There's still a long way to go with it, and I have watched a number of close +friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort, +and I'm pleased to be involved with it. + +\begin{flushright} +Greg Rose, Sydney, Australia, June 2003. +\end{flushright} + +\mainmatter +\pagestyle{headings} +\chapter{Introduction} +\section{Multiple Precision Arithmetic} + +\subsection{What is Multiple Precision Arithmetic?} +When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively +raise or lower the precision of the numbers we are dealing with. For example, in decimal we almost immediate can +reason that $7$ times $6$ is $42$. However, $42$ has two digits of precision as opposed to one digit we started with. +Further multiplications of say $3$ result in a larger precision result $126$. In these few examples we have multiple +precisions for the numbers we are working with. Despite the various levels of precision a single subset\footnote{With the occasional optimization.} + of algorithms can be designed to accomodate them. + +By way of comparison a fixed or single precision operation would lose precision on various operations. For example, in +the decimal system with fixed precision $6 \cdot 7 = 2$. + +Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in +schools to manually add, subtract, multiply and divide. + +\subsection{The Need for Multiple Precision Arithmetic} +The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation +of public-key cryptography algorithms. Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require +integers of significant magnitude to resist known cryptanalytic attacks. For example, at the time of this writing a +typical RSA modulus would be at least greater than $10^{309}$. However, modern programming languages such as ISO C \cite{ISOC} and +Java \cite{JAVA} only provide instrinsic support for integers which are relatively small and single precision. + +\begin{figure}[!here] +\begin{center} +\begin{tabular}{|r|c|} +\hline \textbf{Data Type} & \textbf{Range} \\ +\hline char & $-128 \ldots 127$ \\ +\hline short & $-32768 \ldots 32767$ \\ +\hline long & $-2147483648 \ldots 2147483647$ \\ +\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\ +\hline +\end{tabular} +\end{center} +\caption{Typical Data Types for the C Programming Language} +\label{fig:ISOC} +\end{figure} + +The largest data type guaranteed to be provided by the ISO C programming +language\footnote{As per the ISO C standard. However, each compiler vendor is allowed to augment the precision as they +see fit.} can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is +insufficient to accomodate the magnitude required for the problem at hand. An RSA modulus of magnitude $10^{19}$ could be +trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer, +rendering any protocol based on the algorithm insecure. Multiple precision algorithms solve this very problem by +extending the range of representable integers while using single precision data types. + +Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic +primitives. 
Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in +various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient. In fact, several +major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and +deployment of efficient algorithms. + +However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines. +Another auxiliary use of multiple precision integers is high precision floating point data types. +The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$. +Numbers are given in the form $n = q \cdot b^e \cdot -1^s$ where $b = 2$ is the most common base for IEEE. Since IEEE +floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small +(\textit{23, 48 and 64 bits}). The mantissa is merely an integer and a multiple precision integer could be used to create +a mantissa of much larger precision than hardware alone can efficiently support. This approach could be useful where +scientific applications must minimize the total output error over long calculations. + +Another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$). +In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}. + +\subsection{Benefits of Multiple Precision Arithmetic} +\index{precision} +The benefit of multiple precision representations over single or fixed precision representations is that +no precision is lost while representing the result of an operation which requires excess precision. For example, +the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully. A multiple +precision algorithm would augment the precision of the destination to accomodate the result while a single precision system +would truncate excess bits to maintain a fixed level of precision. + +It is possible to implement algorithms which require large integers with fixed precision algorithms. For example, elliptic +curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum +size the system will ever need. Such an approach can lead to vastly simpler algorithms which can accomodate the +integers required even if the host platform cannot natively accomodate them\footnote{For example, the average smartcard +processor has an 8 bit accumulator.}. However, as efficient as such an approach may be, the resulting source code is not +normally very flexible. It cannot, at runtime, accomodate inputs of higher magnitude than the designer anticipated. + +Multiple precision algorithms have the most overhead of any style of arithmetic. For the the most part the +overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved +platforms. However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the +inputs. That is, the same algorithms based on multiple precision integers can accomodate any reasonable size input +without the designer's explicit forethought. This leads to lower cost of ownership for the code as it only has to +be written and tested once. 
+ +\section{Purpose of This Text} +The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms. +That is to not only explain a limited subset of the core theory behind the algorithms but also the various ``house keeping'' +elements that are neglected by authors of other texts on the subject. Several well reknowned texts \cite{TAOCPV2,HAC} +give considerably detailed explanations of the theoretical aspects of algorithms and often very little information +regarding the practical implementation aspects. + +In most cases how an algorithm is explained and how it is actually implemented are two very different concepts. For +example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple +algorithm for performing multiple precision integer addition. However, the description lacks any discussion concerning +the fact that the two integer inputs may be of differing magnitudes. As a result the implementation is not as simple +as the text would lead people to believe. Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not +discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}). + +Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers +and fast modular inversion, which we consider practical oversights. These optimal algorithms are vital to achieve +any form of useful performance in non-trivial applications. + +To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer +package. As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used +to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field +tested and work very well. The LibTomMath library is freely available on the Internet for all uses and this text +discusses a very large portion of the inner workings of the library. + +The algorithms that are presented will always include at least one ``pseudo-code'' description followed +by the actual C source code that implements the algorithm. The pseudo-code can be used to implement the same +algorithm in other programming languages as the reader sees fit. + +This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch. Showing +the reader how the algorithms fit together as well as where to start on various taskings. + +\section{Discussion and Notation} +\subsection{Notation} +A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$ and represent +the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$. The elements of the array $x$ are said to be the radix $\beta$ digits +of the integer. For example, $x = (1,2,3)_{10}$ would represent the integer +$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$. + +\index{mp\_int} +The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well +as auxilary data required to manipulate the data. These additional members are discussed further in section +\ref{sec:MPINT}. For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be +synonymous. When an algorithm is specified to accept an mp\_int variable it is assumed the various auxliary data members +are present as well. 
An expression of the type \textit{variablename.item} implies that it should evaluate to the +member named ``item'' of the variable. For example, a string of characters may have a member ``length'' which would +evaluate to the number of characters in the string. If the string $a$ equals ``hello'' then it follows that +$a.length = 5$. + +For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used +to solve a given problem. When an algorithm is described as accepting an integer input it is assumed the input is +a plain integer with no additional multiple-precision members. That is, algorithms that use integers as opposed to +mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management. These +algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple +precision algorithm to solve the same problem. + +\subsection{Precision Notation} +For the purposes of this text a single precision variable must be able to represent integers in the range +$0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range +$0 \le x < q \beta^2$. The variable $\beta$ represents the radix of a single digit of a multiple precision integer and +must be of the form $q^p$ for $q, p \in \Z^+$. The extra radix-$q$ factor allows additions and subtractions to proceed +without truncation of the carry. Since all modern computers are binary, it is assumed that $q$ is two, for all intents +and purposes. + +\index{mp\_digit} \index{mp\_word} +Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent +a single precision integer type, while, the data type \textbf{mp\_word} will represent a double precision integer type. In +several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words. +For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to +the $j$'th digit of a double precision array. Whenever an expression is to be assigned to a double precision +variable it is assumed that all single precision variables are promoted to double precision during the evaluation. +Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single +precision data type. + +For example, if $\beta = 10^2$ a single precision data type may represent a value in the +range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$. Let +$a = 23$ and $b = 49$ represent two single precision variables. The single precision product shall be written +as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$. +In this particular case, $\hat c = 1127$ and $c = 127$. The most significant digit of the product would not fit +in a single precision data type and as a result $c \ne \hat c$. + +\subsection{Algorithm Inputs and Outputs} +Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision +as indicated. The only exception to this rule is when variables have been indicated to be of type mp\_int. This +distinction is important as scalars are often used as array indicies and various other counters. 
+ +\subsection{Mathematical Expressions} +The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression +itself. For example, $\lfloor 5.7 \rfloor = 5$. Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression +rounded to an integer not less than the expression itself. For example, $\lceil 5.1 \rceil = 6$. Typically when +the $/$ division symbol is used the intention is to perform an integer division with truncation. For example, +$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity. When an expression is written as a +fraction a real value division is implied, for example ${5 \over 2} = 2.5$. + +The norm of a multiple precision integer, for example, $\vert \vert x \vert \vert$ will be used to represent the number of digits in the representation +of the integer. For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$. + +\subsection{Work Effort} +\index{big-Oh} +To measure the efficiency of the specified algorithms, a modified big-Oh notation is used. In this system all +single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}. +That is a single precision addition, multiplication and division are assumed to take the same time to +complete. While this is generally not true in practice, it will simplify the discussions considerably. + +Some algorithms have slight advantages over others which is why some constants will not be removed in +the notation. For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a +baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work. In standard big-Oh notation these +would both be said to be equivalent to $O(n^2)$. However, +in the context of the this text this is not the case as the magnitude of the inputs will typically be rather small. As a +result small constant factors in the work effort will make an observable difference in algorithm efficiency. + +All of the algorithms presented in this text have a polynomial time work level. That is, of the form +$O(n^k)$ for $n, k \in \Z^{+}$. This will help make useful comparisons in terms of the speed of the algorithms and how +various optimizations will help pay off in the long run. + +\section{Exercises} +Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to +the discussion at hand. These exercises are not designed to be prize winning problems, but instead to be thought +provoking. Wherever possible the problems are forward minded, stating problems that will be answered in subsequent +chapters. The reader is encouraged to finish the exercises as they appear to get a better understanding of the +subject material. + +That being said, the problems are designed to affirm knowledge of a particular subject matter. Students in particular +are encouraged to verify they can answer the problems correctly before moving on. + +Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of +the problem. However, unlike \cite{TAOCPV2} the problems do not get nearly as hard. The scoring of these +exercises ranges from one (the easiest) to five (the hardest). The following table sumarizes the +scoring system used. 
+ +\begin{figure}[here] +\begin{center} +\begin{small} +\begin{tabular}{|c|l|} +\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a manner of \\ + & minutes to solve. Usually does not involve much computer time \\ + & to solve. \\ +\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\ + & time usage. Usually requires a program to be written to \\ + & solve the problem. \\ +\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\ + & of work. Usually involves trivial research and development of \\ + & new theory from the perspective of a student. \\ +\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\ + & of work and research, the solution to which will demonstrate \\ + & a higher mastery of the subject matter. \\ +\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\ + & novice to solve. Solutions to these problems will demonstrate a \\ + & complete mastery of the given subject. \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Exercise Scoring System} +\end{figure} + +Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or +devising new theory. These problems are quick tests to see if the material is understood. Problems at the second level +are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer. These +two levels are essentially entry level questions. + +Problems at the third level are meant to be a bit more difficult than the first two levels. The answer is often +fairly obvious but arriving at an exacting solution requires some thought and skill. These problems will almost always +involve devising a new algorithm or implementing a variation of another algorithm previously presented. Readers who can +answer these questions will feel comfortable with the concepts behind the topic at hand. + +Problems at the fourth level are meant to be similar to those of the level three questions except they will require +additional research to be completed. The reader will most likely not know the answer right away, nor will the text provide +the exact details of the answer until a subsequent chapter. + +Problems at the fifth level are meant to be the hardest +problems relative to all the other problems in the chapter. People who can correctly answer fifth level problems have a +mastery of the subject matter at hand. + +Often problems will be tied together. The purpose of this is to start a chain of thought that will be discussed in future chapters. The reader +is encouraged to answer the follow-up problems and try to draw the relevance of problems. + +\section{Introduction to LibTomMath} + +\subsection{What is LibTomMath?} +LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C. By portable it +is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on +any given platform. + +The library has been successfully tested under numerous operating systems including Unix\footnote{All of these +trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such +as the Gameboy Advance. 
The library is designed to contain enough functionality to be able to develop applications such +as public key cryptosystems and still maintain a relatively small footprint. + +\subsection{Goals of LibTomMath} + +Libraries which obtain the most efficiency are rarely written in a high level programming language such as C. However, +even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the +library. Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM +processors. Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window +exponentiation and Montgomery reduction have been provided to make the library more efficient. + +Even with the nearly optimal and specialized algorithms that have been included the Application Programing Interface +(\textit{API}) has been kept as simple as possible. Often generic place holder routines will make use of specialized +algorithms automatically without the developer's specific attention. One such example is the generic multiplication +algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication +based on the magnitude of the inputs and the configuration of the library. + +Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project. Ideally the library should +be source compatible with another popular library which makes it more attractive for developers to use. In this case the +MPI library was used as a API template for all the basic functions. MPI was chosen because it is another library that fits +in the same niche as LibTomMath. Even though LibTomMath uses MPI as the template for the function names and argument +passing conventions, it has been written from scratch by Tom St Denis. + +The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum'' +library exists which can be used to teach computer science students how to perform fast and reliable multiple precision +integer arithmetic. To this end the source code has been given quite a few comments and algorithm discussion points. + +\section{Choice of LibTomMath} +LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but +for more worthy reasons. Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL +\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for +reasons that will be explained in the following sub-sections. + +\subsection{Code Base} +The LibTomMath code base is all portable ISO C source code. This means that there are no platform dependent conditional +segments of code littered throughout the source. This clean and uncluttered approach to the library means that a +developer can more readily discern the true intent of a given section of source code without trying to keep track of +what conditional code will be used. + +The code base of LibTomMath is well organized. Each function is in its own separate source code file +which allows the reader to find a given function very quickly. On average there are $76$ lines of code per source +file which makes the source very easily to follow. By comparison MPI and LIP are single file projects making code tracing +very hard. GMP has many conditional code segments which also hinder tracing. 
+ +When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.} + which is fairly small compared to GMP (over $250$KiB). LibTomMath is slightly larger than MPI (which compiles to about +$50$KiB) but LibTomMath is also much faster and more complete than MPI. + +\subsection{API Simplicity} +LibTomMath is designed after the MPI library and shares the API design. Quite often programs that use MPI will build +with LibTomMath without change. The function names correlate directly to the action they perform. Almost all of the +functions share the same parameter passing convention. The learning curve is fairly shallow with the API provided +which is an extremely valuable benefit for the student and developer alike. + +The LIP library is an example of a library with an API that is awkward to work with. LIP uses function names that are often ``compressed'' to +illegible short hand. LibTomMath does not share this characteristic. + +The GMP library also does not return error codes. Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors +are signaled to the host application. This happens to be the fastest approach but definitely not the most versatile. In +effect a math error (i.e. invalid input, heap error, etc) can cause a program to stop functioning which is definitely +undersireable in many situations. + +\subsection{Optimizations} +While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does +feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring. GMP +and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations. GMP lacks a few +of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP +only had Barrett and Montgomery modular reduction algorithms.}. + +LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular +exponentiation. In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually +slower than the best libraries such as GMP and OpenSSL by only a small factor. + +\subsection{Portability and Stability} +LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler +(\textit{GCC}). This means that without changes the library will build without configuration or setting up any +variables. LIP and MPI will build ``out of the box'' as well but have numerous known bugs. Most notably the author of +MPI has recently stopped working on his library and LIP has long since been discontinued. + +GMP requires a configuration script to run and will not build out of the box. GMP and LibTomMath are still in active +development and are very stable across a variety of platforms. + +\subsection{Choice} +LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for +the case study of this text. Various source files from the LibTomMath project will be included within the text. However, +the reader is encouraged to download their own copy of the library to actually be able to work with the library. 
+ +\chapter{Getting Started} +\section{Library Basics} +The trick to writing any useful library of source code is to build a solid foundation and work outwards from it. First, +a problem along with allowable solution parameters should be identified and analyzed. In this particular case the +inability to accomodate multiple precision integers is the problem. Futhermore, the solution must be written +as portable source code that is reasonably efficient across several different computer platforms. + +After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion. +That is, to implement the lowest level dependencies first and work towards the most abstract functions last. For example, +before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm. +By building outwards from a base foundation instead of using a parallel design methodology the resulting project is +highly modular. Being highly modular is a desirable property of any project as it often means the resulting product +has a small footprint and updates are easy to perform. + +Usually when I start a project I will begin with the header file. I define the data types I think I will need and +prototype the initial functions that are not dependent on other functions (within the library). After I +implement these base functions I prototype more dependent functions and implement them. The process repeats until +I implement all of the functions I require. For example, in the case of LibTomMath I implemented functions such as +mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod(). As an example as to +why this design works note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the +dependent function mp\_exptmod() was written. Adding the new multiplication algorithms did not require changes to the +mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development +for new algorithms. This methodology allows new algorithms to be tested in a complete framework with relative ease. + +FIGU,design_process,Design Flow of the First Few Original LibTomMath Functions. + +Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing +the source code. For example, one day I may audit the multipliers and the next day the polynomial basis functions. + +It only makes sense to begin the text with the preliminary data types and support algorithms required as well. +This chapter discusses the core algorithms of the library which are the dependents for every other algorithm. + +\section{What is a Multiple Precision Integer?} +Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot +be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is +to use fixed precision data types to create and manipulate multiple precision integers which may represent values +that are very large. + +As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits. In the decimal system +the largest single digit value is $9$. However, by concatenating digits together larger numbers may be represented. Newly prepended digits +(\textit{to the left}) are said to be in a different power of ten column. 
That is, the number $123$ can be described as having a $1$ in the hundreds +column, $2$ in the tens column and $3$ in the ones column. Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$. Computer based +multiple precision arithmetic is essentially the same concept. Larger integers are represented by adjoining fixed +precision computer words with the exception that a different radix is used. + +What most people probably do not think about explicitly are the various other attributes that describe a multiple precision +integer. For example, the integer $154_{10}$ has two immediately obvious properties. First, the integer is positive, +that is the sign of this particular integer is positive as opposed to negative. Second, the integer has three digits in +its representation. There is an additional property that the integer posesses that does not concern pencil-and-paper +arithmetic. The third property is how many digits placeholders are available to hold the integer. + +The human analogy of this third property is ensuring there is enough space on the paper to write the integer. For example, +if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left. +Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer +will not exceed the allowed boundaries. These three properties make up what is known as a multiple precision +integer or mp\_int for short. + +\subsection{The mp\_int Structure} +\label{sec:MPINT} +The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer. The ISO C standard does not provide for +any such data type but it does provide for making composite data types known as structures. The following is the structure definition +used within LibTomMath. + +\index{mp\_int} +\begin{verbatim} +typedef struct { + int used, alloc, sign; + mp_digit *dp; +} mp_int; +\end{verbatim} + +The mp\_int structure can be broken down as follows. + +\begin{enumerate} +\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent +a given integer. The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count. + +\item The \textbf{alloc} parameter denotes how +many digits are available in the array to use by functions before it has to increase in size. When the \textbf{used} count +of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the +array to accommodate the precision of the result. + +\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple +precision integer. It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits. The array is maintained in a least +significant digit order. As a pencil and paper analogy the array is organized such that the right most digits are stored +first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array. For example, +if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then +it would represent the integer $a + b\beta + c\beta^2 + \ldots$ + +\index{MP\_ZPOS} \index{MP\_NEG} +\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}). 
+\end{enumerate} + +\subsubsection{Valid mp\_int Structures} +Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency. +The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy(). + +\begin{enumerate} +\item The value of \textbf{alloc} may not be less than one. That is \textbf{dp} always points to a previously allocated +array of digits. +\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero. +\item The value of \textbf{used} implies the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero. That is, +leading zero digits in the most significant positions must be trimmed. + \begin{enumerate} + \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero. + \end{enumerate} +\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero; +this represents the mp\_int value of zero. +\end{enumerate} + +\section{Argument Passing} +A convention of argument passing must be adopted early on in the development of any library. Making the function +prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity. +In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int +structures. That means that the source (input) operands are placed on the left and the destination (output) on the right. +Consider the following examples. + +\begin{verbatim} + mp_mul(&a, &b, &c); /* c = a * b */ + mp_add(&a, &b, &a); /* a = a + b */ + mp_sqr(&a, &b); /* b = a * a */ +\end{verbatim} + +The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the +functions and make sense of them. For example, the first function would read ``multiply a and b and store in c''. + +Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order +of assignment expressions. That is, the destination (output) is on the left and arguments (inputs) are on the right. In +truth, it is entirely a matter of preference. In the case of LibTomMath the convention from the MPI library has been +adopted. + +Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a +destination. For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$. This is an important +feature to implement since it allows the calling functions to cut down on the number of variables it must maintain. +However, to implement this feature specific care has to be given to ensure the destination is not modified before the +source is fully read. + +\section{Return Values} +A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them +to the caller. By catching runtime errors a library can be guaranteed to prevent undefined behaviour. However, the end +developer can still manage to cause a library to crash. For example, by passing an invalid pointer an application may +fault by dereferencing memory not owned by the application. + +In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for +instance) and memory allocation errors. 
It will not check that the mp\_int passed to any function is valid nor +will it check pointers for validity. Any function that can cause a runtime error will return an error code as an +\textbf{int} data type with one of the following values. + +\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM} +\begin{center} +\begin{tabular}{|l|l|} +\hline \textbf{Value} & \textbf{Meaning} \\ +\hline \textbf{MP\_OKAY} & The function was successful \\ +\hline \textbf{MP\_VAL} & One of the input value(s) was invalid \\ +\hline \textbf{MP\_MEM} & The function ran out of heap memory \\ +\hline +\end{tabular} +\end{center} + +When an error is detected within a function it should free any memory it allocated, often during the initialization of +temporary mp\_ints, and return as soon as possible. The goal is to leave the system in the same state it was when the +function was called. Error checking with this style of API is fairly simple. + +\begin{verbatim} + int err; + if ((err = mp_add(&a, &b, &c)) != MP_OKAY) { + printf("Error: %s\n", mp_error_to_string(err)); + exit(EXIT_FAILURE); + } +\end{verbatim} + +The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use. Not all errors are fatal +and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases. + +\section{Initialization and Clearing} +The logical starting point when actually writing multiple precision integer functions is the initialization and +clearing of the mp\_int structures. These two algorithms will be used by the majority of the higher level algorithms. + +Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of +the integer. Often it is optimal to allocate a sufficiently large pre-set number of digits even though +the initial integer will represent zero. If only a single digit were allocated quite a few subsequent re-allocations +would occur when operations are performed on the integers. There is a tradeoff between how many default digits to allocate +and how many re-allocations are tolerable. Obviously allocating an excessive amount of digits initially will waste +memory and become unmanageable. + +If the memory for the digits has been successfully allocated then the rest of the members of the structure must +be initialized. Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set +to zero. The \textbf{used} count set to zero and \textbf{sign} set to \textbf{MP\_ZPOS}. + +\subsection{Initializing an mp\_int} +An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the +structure are set to valid values. The mp\_init algorithm will perform such an action. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Allocate memory and initialize $a$ to a known valid mp\_int state. \\ +\hline \\ +1. Allocate memory for \textbf{MP\_PREC} digits. \\ +2. If the allocation failed return(\textit{MP\_MEM}) \\ +3. for $n$ from $0$ to $MP\_PREC - 1$ do \\ +\hspace{3mm}3.1 $a_n \leftarrow 0$\\ +4. $a.sign \leftarrow MP\_ZPOS$\\ +5. $a.used \leftarrow 0$\\ +6. $a.alloc \leftarrow MP\_PREC$\\ +7. 
Return(\textit{MP\_OKAY})\\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init} +\end{figure} + +\textbf{Algorithm mp\_init.} +The \textbf{MP\_PREC} name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} +used to dictate the minimum precision of allocated mp\_int integers. Ideally, it is at least equal to $32$ since for most +purposes that will be more than enough. + +Memory for the default number of digits is allocated first. If the allocation fails the algorithm returns immediately +with the \textbf{MP\_MEM} error code. If the allocation succeeds the remaining members of the mp\_int structure +must be initialized to reflect the default initial state. + +The allocated digits are all set to zero (step three) to ensure they are in a known state. The \textbf{sign}, \textbf{used} +and \textbf{alloc} are subsequently initialized to represent the zero integer. By step seven the algorithm returns a success +code and the mp\_int $a$ has been successfully initialized to a valid state representing the integer zero. + +\textbf{Remark.} +This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally +when the ``to'' keyword is placed between two expressions. For example, ``for $a$ from $b$ to $c$ do'' means that +a subsequent expression (or body of expressions) are to be evaluated upto $c - b$ times so long as $b \le c$. In each +iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$. If $b > c$ occured +the loop would not iterate. By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate +decrementally. + +EXAM,bn_mp_init.c + +One immediate observation of this initializtion function is that it does not return a pointer to a mp\_int structure. It +is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack. The +call to mp\_init() is used only to initialize the members of the structure to a known default state. + +Before any of the other members of the structure are initialized memory from the application heap is allocated with +the calloc() function (line @22,calloc@). The size of the allocated memory is large enough to hold \textbf{MP\_PREC} +mp\_digit variables. The calloc() function is used instead\footnote{calloc() will allocate memory in the same +manner as malloc() except that it also sets the contents to zero upon successfully allocating the memory.} of malloc() +since digits have to be set to zero for the function to finish correctly. The \textbf{OPT\_CAST} token is a macro +definition which will turn into a cast from void * to mp\_digit * for C++ compilers. It is not required for C compilers. + +After the memory has been successfully allocated the remainder of the members are initialized +(lines @29,used@ through @31,sign@) to their respective default states. At this point the algorithm has succeeded and +a success code is returned to the calling function. + +If this function returns \textbf{MP\_OKAY} it is safe to assume the mp\_int structure has been properly initialized and +is safe to use with other functions within the library. + +\subsection{Clearing an mp\_int} +When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be +returned to the application's memory pool with the mp\_clear algorithm. 
+ +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_clear}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. The memory for $a$ is freed for reuse. \\ +\hline \\ +1. If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\ +2. for $n$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}2.1 $a_n \leftarrow 0$ \\ +3. Free the memory allocated for the digits of $a$. \\ +4. $a.used \leftarrow 0$ \\ +5. $a.alloc \leftarrow 0$ \\ +6. $a.sign \leftarrow MP\_ZPOS$ \\ +7. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_clear} +\end{figure} + +\textbf{Algorithm mp\_clear.} +This algorithm releases the memory allocated for an mp\_int back into the memory pool for reuse. It is designed +such that a given mp\_int structure can be cleared multiple times between initializations without attempting to +free the memory twice\footnote{In ISO C for example, calling free() twice on the same memory block causes undefinied +behaviour.}. + +The first step determines if the mp\_int structure has been marked as free already. If it has, the algorithm returns +success immediately as no further actions are required. Otherwise, the algorithm will proceed to put the structure +in a known empty and otherwise invalid state. First the digits of the mp\_int are set to zero. The memory that has been allocated for the +digits is then freed. The \textbf{used} and \textbf{alloc} counts are both set to zero and the \textbf{sign} set to +\textbf{MP\_ZPOS}. This known fixed state for cleared mp\_int structures will make debuging easier for the end +developer. That is, if they spot (via their debugger) an mp\_int they are using that is in this state it will be +obvious that they erroneously and prematurely cleared the mp\_int structure. + +Note that once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm +with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear. + +EXAM,bn_mp_clear.c + +The ``if'' statement (line @21,a->dp != NULL@) prevents the heap from being corrupted if a user double-frees an +mp\_int. This is because once the memory is freed the pointer is set to \textbf{NULL} (line @30,NULL@). + +Without the check, code that accidentally calls mp\_clear twice for a given mp\_int structure would try to free the memory +allocated for the digits twice. This may cause some C libraries to signal a fault. By setting the pointer to +\textbf{NULL} it helps debug code that may inadvertently free the mp\_int before it is truly not needed, because attempts +to reference digits should fail immediately. The allocated digits are set to zero before being freed (line @24,memset@). +This is ideal for cryptographic situations where the integer that the mp\_int represents might need to be kept a secret. + +\section{Maintenance Algorithms} + +The previous sections describes how to initialize and clear an mp\_int structure. To further support operations +that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be +able to augment the precision of an mp\_int and +initialize mp\_ints with differing initial conditions. + +These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level +algorithms such as addition, multiplication and modular exponentiation. 
+ +\subsection{Augmenting an mp\_int's Precision} +When storing a value in an mp\_int structure, a sufficient number of digits must be available to accomodate the entire +result of an operation without loss of precision. Quite often the size of the array given by the \textbf{alloc} member +is large enough to simply increase the \textbf{used} digit count. However, when the size of the array is too small it +must be re-sized appropriately to accomodate the result. The mp\_grow algorithm will provide this functionality. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_grow}. \\ +\textbf{Input}. An mp\_int $a$ and an integer $b$. \\ +\textbf{Output}. $a$ is expanded to accomodate $b$ digits. \\ +\hline \\ +1. if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\ +2. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ +3. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ +4. Re-Allocate the array of digits $a$ to size $v$ \\ +5. If the allocation failed then return(\textit{MP\_MEM}). \\ +6. for n from a.alloc to $v - 1$ do \\ +\hspace{+3mm}6.1 $a_n \leftarrow 0$ \\ +7. $a.alloc \leftarrow v$ \\ +8. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_grow} +\end{figure} + +\textbf{Algorithm mp\_grow.} +It is ideal to prevent re-allocations from being performed if they are not required (step one). This is useful to +prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow. + +The requested digit count is padded up to next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three). +This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values. + +It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact. This is much +akin to how the \textit{realloc} function from the standard C library works. Since the newly allocated digits are +assumed to contain undefined values they are initially set to zero. + +EXAM,bn_mp_grow.c + +The first step is to see if we actually need to perform a re-allocation at all (line @24,a->alloc < size@). If a reallocation +must occur the digit count is padded upwards to help prevent many trivial reallocations (line @28,size@). Next the reallocation is performed +and the return of realloc() is stored in a temporary pointer named $tmp$ (line @36,realloc@). The return is stored in a temporary +instead of $a.dp$ to prevent the code from losing the original pointer in case the reallocation fails. Had the return been stored +in $a.dp$ instead there would be no way to reclaim the heap originally used. + +If the reallocation fails the function will return \textbf{MP\_MEM} (line @39,return@), otherwise, the value of $tmp$ is assigned +to the pointer $a.dp$ and the function continues. A simple for loop from line @48,a->alloc@ to line @50,}@ will zero all digits +that were above the old \textbf{alloc} limit to make sure the integer is in a known state. + +\subsection{Initializing Variable Precision mp\_ints} +Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size +of input mp\_ints to a given algorithm. The purpose of algorithm mp\_init\_size is similar to mp\_init except that it +will allocate \textit{at least} a specified number of digits. + +\begin{figure}[here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_size}. \\ +\textbf{Input}. 
An mp\_int $a$ and the requested number of digits $b$. \\ +\textbf{Output}. $a$ is initialized to hold at least $b$ digits. \\ +\hline \\ +1. $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\ +2. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ +3. Allocate $v$ digits. \\ +4. for $n$ from $0$ to $v - 1$ do \\ +\hspace{3mm}4.1 $a_n \leftarrow 0$ \\ +5. $a.sign \leftarrow MP\_ZPOS$\\ +6. $a.used \leftarrow 0$\\ +7. $a.alloc \leftarrow v$\\ +8. Return(\textit{MP\_OKAY})\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_init\_size} +\end{figure} + +\textbf{Algorithm mp\_init\_size.} +This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of +digits allocated can be controlled by the second input argument $b$. The input size is padded upwards so it is a +multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits. This padding is used to prevent trivial +allocations from becoming a bottleneck in the rest of the algorithms. + +Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero. This +particular algorithm is useful if it is known ahead of time the approximate size of the input. If the approximation is +correct no further memory re-allocations are required to work with the mp\_int. + +EXAM,bn_mp_init_size.c + +The number of digits $b$ requested is padded (line @22,MP_PREC@) by first augmenting it to the next multiple of +\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result. If the memory can be successfully allocated the +mp\_int is placed in a default state representing the integer zero. Otherwise, the error code \textbf{MP\_MEM} will be +returned (line @27,return@). + +The digits are allocated and set to zero at the same time with the calloc() function (line @25,calloc@). The +\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set +to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines @29,used@, @30,alloc@ and @31,sign@). If the function +returns succesfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the +functions to work with. + +\subsection{Multiple Integer Initializations and Clearings} +Occasionally a function will require a series of mp\_int data types to be made available simultaneously. +The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single +statement. It is essentially a shortcut to multiple initializations. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_multi}. \\ +\textbf{Input}. Variable length array $V_k$ of mp\_int variables of length $k$. \\ +\textbf{Output}. The array is initialized such that each mp\_int of $V_k$ is ready to use. \\ +\hline \\ +1. for $n$ from 0 to $k - 1$ do \\ +\hspace{+3mm}1.1. Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\ +\hspace{+3mm}1.2. If initialization failed then do \\ +\hspace{+6mm}1.2.1. for $j$ from $0$ to $n$ do \\ +\hspace{+9mm}1.2.1.1. Free the mp\_int $V_j$ (\textit{mp\_clear}) \\ +\hspace{+6mm}1.2.2. Return(\textit{MP\_MEM}) \\ +2. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_multi} +\end{figure} + +\textbf{Algorithm mp\_init\_multi.} +The algorithm will initialize the array of mp\_int variables one at a time. 
If a runtime error has been detected +(\textit{step 1.2}) all of the previously initialized variables are cleared. The goal is an ``all or nothing'' +initialization which allows for quick recovery from runtime errors. + +EXAM,bn_mp_init_multi.c + +This function intializes a variable length list of mp\_int structure pointers. However, instead of having the mp\_int +structures in an actual C array they are simply passed as arguments to the function. This function makes use of the +``...'' argument syntax of the C programming language. The list is terminated with a final \textbf{NULL} argument +appended on the right. + +The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function. A count +$n$ of succesfully initialized mp\_int structures is maintained (line @47,n++@) such that if a failure does occur, +the algorithm can backtrack and free the previously initialized structures (lines @27,if@ to @46,}@). + + +\subsection{Clamping Excess Digits} +When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of +the function instead of checking during the computation. For example, a multiplication of a $i$ digit number by a +$j$ digit produces a result of at most $i + j$ digits. It is entirely possible that the result is $i + j - 1$ +though, with no final carry into the last position. However, suppose the destination had to be first expanded +(\textit{via mp\_grow}) to accomodate $i + j - 1$ digits than further expanded to accomodate the final carry. +That would be a considerable waste of time since heap operations are relatively slow. + +The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function +terminates. This way a single heap operation (\textit{at most}) is required. However, if the result was not checked +there would be an excess high order zero digit. + +For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$. The leading zero digit +will not contribute to the precision of the result. In fact, through subsequent operations more leading zero digits would +accumulate to the point the size of the integer would be prohibitive. As a result even though the precision is very +low the representation is excessively large. + +The mp\_clamp algorithm is designed to solve this very problem. It will trim high-order zeros by decrementing the +\textbf{used} count until a non-zero most significant digit is found. Also in this system, zero is considered to be a +positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to +\textbf{MP\_ZPOS}. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_clamp}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Any excess leading zero digits of $a$ are removed \\ +\hline \\ +1. while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\ +\hspace{+3mm}1.1 $a.used \leftarrow a.used - 1$ \\ +2. if $a.used = 0$ then do \\ +\hspace{+3mm}2.1 $a.sign \leftarrow MP\_ZPOS$ \\ +\hline \\ +\end{tabular} +\end{center} +\caption{Algorithm mp\_clamp} +\end{figure} + +\textbf{Algorithm mp\_clamp.} +As can be expected this algorithm is very simple. The loop on step one is expected to iterate only once or twice at +the most. For example, this will happen in cases where there is not a carry to fill the last position. 
Step two fixes the sign for +when all of the digits are zero to ensure that the mp\_int is valid at all times. + +EXAM,bn_mp_clamp.c + +Note on line @27,while@ how to test for the \textbf{used} count is made on the left of the \&\& operator. In the C programming +language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails. This is +important since if the \textbf{used} is zero the test on the right would fetch below the array. That is obviously +undesirable. The parenthesis on line @28,a->used@ is used to make sure the \textbf{used} count is decremented and not +the pointer ``a''. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\ + & \\ +$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations. \\ + & \\ +$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\ + & encryption when $\beta = 2^{28}$. \\ + & \\ +$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp. What does it prevent? \\ + & \\ +$\left [ 1 \right ]$ & Give an example of when the algorithm mp\_init\_copy might be useful. \\ + & \\ +\end{tabular} + + +%%% +% CHAPTER FOUR +%%% + +\chapter{Basic Operations} + +\section{Introduction} +In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining +mp\_int structures. This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low +level basis of the entire library. While these algorithm are relatively trivial it is important to understand how they +work before proceeding since these algorithms will be used almost intrinsically in the following chapters. + +The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of +mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures +represent. + +\section{Assigning Values to mp\_int Structures} +\subsection{Copying an mp\_int} +Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making +a copy for the purposes of this text. The copy of the mp\_int will be a separate entity that represents the same +value as the mp\_int it was copied from. The mp\_copy algorithm provides this functionality. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_copy}. \\ +\textbf{Input}. An mp\_int $a$ and $b$. \\ +\textbf{Output}. Store a copy of $a$ in $b$. \\ +\hline \\ +1. If $b.alloc < a.used$ then grow $b$ to $a.used$ digits. (\textit{mp\_grow}) \\ +2. for $n$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}2.1 $b_{n} \leftarrow a_{n}$ \\ +3. for $n$ from $a.used$ to $b.used - 1$ do \\ +\hspace{3mm}3.1 $b_{n} \leftarrow 0$ \\ +4. $b.used \leftarrow a.used$ \\ +5. $b.sign \leftarrow a.sign$ \\ +6. return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_copy} +\end{figure} + +\textbf{Algorithm mp\_copy.} +This algorithm copies the mp\_int $a$ such that upon succesful termination of the algorithm the mp\_int $b$ will +represent the same integer as the mp\_int $a$. The mp\_int $b$ shall be a complete and distinct copy of the +mp\_int $a$ meaing that the mp\_int $a$ can be modified and it shall not affect the value of the mp\_int $b$. 
+ +If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow +algorithm. The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (step two +and three). The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of +$b$. + +\textbf{Remark.} This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the +text. The error return codes of other algorithms are not explicitly checked in the pseudo-code presented. For example, in +step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded. Text space is +limited so it is assumed that if a algorithm fails it will clear all temporarily allocated mp\_ints and return +the error code itself. However, the C code presented will demonstrate all of the error handling logic required to +implement the pseudo-code. + +EXAM,bn_mp_copy.c + +Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output +mp\_int structures passed to a function are one and the same. For this case it is optimal to return immediately without +copying digits (line @24,a == b@). + +The mp\_int $b$ must have enough digits to accomodate the used digits of the mp\_int $a$. If $b.alloc$ is less than +$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines @29,alloc@ to @33,}@). In order to +simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits +of the mp\_ints $a$ and $b$ respectively. These aliases (lines @42,tmpa@ and @45,tmpb@) allow the compiler to access the digits without first dereferencing the +mp\_int pointers and then subsequently the pointer to the digits. + +After the aliases are established the digits from $a$ are copied into $b$ (lines @48,for@ to @50,}@) and then the excess +digits of $b$ are set to zero (lines @53,for@ to @55,}@). Both ``for'' loops make use of the pointer aliases and in +fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits. This optimization +allows the alias to stay in a machine register fairly easy between the two loops. + +\textbf{Remarks.} The use of pointer aliases is an implementation methodology first introduced in this function that will +be used considerably in other functions. Technically, a pointer alias is simply a short hand alias used to lower the +number of pointer dereferencing operations required to access data. For example, a for loop may resemble + +\begin{alltt} +for (x = 0; x < 100; x++) \{ + a->num[4]->dp[x] = 0; +\} +\end{alltt} + +This could be re-written using aliases as + +\begin{alltt} +mp_digit *tmpa; +a = a->num[4]->dp; +for (x = 0; x < 100; x++) \{ + *a++ = 0; +\} +\end{alltt} + +In this case an alias is used to access the +array of digits within an mp\_int structure directly. It may seem that a pointer alias is strictly not required +as a compiler may optimize out the redundant pointer operations. However, there are two dominant reasons to use aliases. + +The first reason is that most compilers will not effectively optimize pointer arithmetic. For example, some optimizations +may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC). Also some optimizations may +work for GCC and not MSVC. As such it is ideal to find a common ground for as many compilers as possible. 
Pointer +aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code +stands a better chance of being faster. + +The second reason is that pointer aliases often can make an algorithm simpler to read. Consider the first ``for'' +loop of the function mp\_copy() re-written to not use pointer aliases. + +\begin{alltt} + /* copy all the digits */ + for (n = 0; n < a->used; n++) \{ + b->dp[n] = a->dp[n]; + \} +\end{alltt} + +Whether this code is harder to read depends strongly on the individual. However, it is quantifiably slightly more +complicated as there are four variables within the statement instead of just two. + +\subsubsection{Nested Statements} +Another commonly used technique in the source routines is that certain sections of code are nested. This is used in +particular with the pointer aliases to highlight code phases. For example, a Comba multiplier (discussed in chapter six) +will typically have three different phases. First the temporaries are initialized, then the columns calculated and +finally the carries are propagated. In this example the middle column production phase will typically be nested as it +uses temporary variables and aliases the most. + +The nesting also simplies the source code as variables that are nested are only valid for their scope. As a result +the various temporary variables required do not propagate into other sections of code. + + +\subsection{Creating a Clone} +Another common operation is to make a local temporary copy of an mp\_int argument. To initialize an mp\_int +and then copy another existing mp\_int into the newly intialized mp\_int will be known as creating a clone. This is +useful within functions that need to modify an argument but do not wish to actually modify the original copy. The +mp\_init\_copy algorithm has been designed to help perform this task. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_init\_copy}. \\ +\textbf{Input}. An mp\_int $a$ and $b$\\ +\textbf{Output}. $a$ is initialized to be a copy of $b$. \\ +\hline \\ +1. Init $a$. (\textit{mp\_init}) \\ +2. Copy $b$ to $a$. (\textit{mp\_copy}) \\ +3. Return the status of the copy operation. \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_init\_copy} +\end{figure} + +\textbf{Algorithm mp\_init\_copy.} +This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it. As +such this algorithm will perform two operations in one step. + +EXAM,bn_mp_init_copy.c + +This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}. Note that +\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call +and \textbf{a} will be left intact. + +\section{Zeroing an Integer} +Reseting an mp\_int to the default state is a common step in many algorithms. The mp\_zero algorithm will be the algorithm used to +perform this task. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_zero}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Zero the contents of $a$ \\ +\hline \\ +1. $a.used \leftarrow 0$ \\ +2. $a.sign \leftarrow$ MP\_ZPOS \\ +3. for $n$ from 0 to $a.alloc - 1$ do \\ +\hspace{3mm}3.1 $a_n \leftarrow 0$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_zero} +\end{figure} + +\textbf{Algorithm mp\_zero.} +This algorithm simply resets a mp\_int to the default state. 
+ +EXAM,bn_mp_zero.c + +After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the +\textbf{sign} variable is set to \textbf{MP\_ZPOS}. + +\section{Sign Manipulation} +\subsection{Absolute Value} +With the mp\_int representation of an integer, calculating the absolute value is trivial. The mp\_abs algorithm will compute +the absolute value of an mp\_int. + +\newpage\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_abs}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Computes $b = \vert a \vert$ \\ +\hline \\ +1. Copy $a$ to $b$. (\textit{mp\_copy}) \\ +2. If the copy failed return(\textit{MP\_MEM}). \\ +3. $b.sign \leftarrow MP\_ZPOS$ \\ +4. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_abs} +\end{figure} + +\textbf{Algorithm mp\_abs.} +This algorithm computes the absolute of an mp\_int input. First it copies $a$ over $b$. This is an example of an +algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful. This allows, +for instance, the developer to pass the same mp\_int as the source and destination to this function without addition +logic to handle it. + +EXAM,bn_mp_abs.c + +\subsection{Integer Negation} +With the mp\_int representation of an integer, calculating the negation is also trivial. The mp\_neg algorithm will compute +the negative of an mp\_int input. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_neg}. \\ +\textbf{Input}. An mp\_int $a$ \\ +\textbf{Output}. Computes $b = -a$ \\ +\hline \\ +1. Copy $a$ to $b$. (\textit{mp\_copy}) \\ +2. If the copy failed return(\textit{MP\_MEM}). \\ +3. If $a.used = 0$ then return(\textit{MP\_OKAY}). \\ +4. If $a.sign = MP\_ZPOS$ then do \\ +\hspace{3mm}4.1 $b.sign = MP\_NEG$. \\ +5. else do \\ +\hspace{3mm}5.1 $b.sign = MP\_ZPOS$. \\ +6. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_neg} +\end{figure} + +\textbf{Algorithm mp\_neg.} +This algorithm computes the negation of an input. First it copies $a$ over $b$. If $a$ has no used digits then +the algorithm returns immediately. Otherwise it flips the sign flag and stores the result in $b$. Note that if +$a$ had no digits then it must be positive by definition. Had step three been omitted then the algorithm would return +zero as negative. + +EXAM,bn_mp_neg.c + +\section{Small Constants} +\subsection{Setting Small Constants} +Often a mp\_int must be set to a relatively small value such as $1$ or $2$. For these cases the mp\_set algorithm is useful. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_set}. \\ +\textbf{Input}. An mp\_int $a$ and a digit $b$ \\ +\textbf{Output}. Make $a$ equivalent to $b$ \\ +\hline \\ +1. Zero $a$ (\textit{mp\_zero}). \\ +2. $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\ +3. $a.used \leftarrow \left \lbrace \begin{array}{ll} + 1 & \mbox{if }a_0 > 0 \\ + 0 & \mbox{if }a_0 = 0 + \end{array} \right .$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_set} +\end{figure} + +\textbf{Algorithm mp\_set.} +This algorithm sets a mp\_int to a small single digit value. Step number 1 ensures that the integer is reset to the default state. The +single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly. + +EXAM,bn_mp_set.c + +Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign. 
Line @22,MP_MASK@ copies the digit +into the least significant location. Note the usage of a new constant \textbf{MP\_MASK}. This constant is used to quickly +reduce an integer modulo $\beta$. Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with +$MP\_MASK = 2^k - 1$ to perform the reduction. Finally line @23,a->used@ will set the \textbf{used} member with respect to the +digit actually set. This function will always make the integer positive. + +One important limitation of this function is that it will only set one digit. The size of a digit is not fixed, meaning source that uses +this function should take that into account. Only trivially small constants can be set using this function. + +\subsection{Setting Large Constants} +To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal. It accepts a ``long'' +data type as input and will always treat it as a 32-bit integer. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_set\_int}. \\ +\textbf{Input}. An mp\_int $a$ and a ``long'' integer $b$ \\ +\textbf{Output}. Make $a$ equivalent to $b$ \\ +\hline \\ +1. Zero $a$ (\textit{mp\_zero}) \\ +2. for $n$ from 0 to 7 do \\ +\hspace{3mm}2.1 $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\ +\hspace{3mm}2.2 $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\ +\hspace{3mm}2.3 $a_0 \leftarrow a_0 + u$ \\ +\hspace{3mm}2.4 $a.used \leftarrow a.used + 1$ \\ +3. Clamp excess used digits (\textit{mp\_clamp}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_set\_int} +\end{figure} + +\textbf{Algorithm mp\_set\_int.} +The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the +mp\_int. Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions. In step 2.2 the +next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is +incremented to reflect the addition. The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have +zero digits used and the newly added four bits would be ignored. + +Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp. + +EXAM,bn_mp_set_int.c + +This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes. The weird +addition on line @38,a->used@ ensures that the newly added in bits are added to the number of digits. While it may not +seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@ +as well as the call to mp\_clamp() on line @40,mp_clamp@. Both functions will clamp excess leading digits which keeps +the number of used digits low. + +\section{Comparisons} +\subsection{Unsigned Comparisions} +Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers. For example, +to compare $1,234$ to $1,264$ the digits are extracted by their positions. That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$ +to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude +positions. 
If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater. + +The first comparision routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two +mp\_int variables alone. It will ignore the sign of the two inputs. Such a function is useful when an absolute comparison is required or if the +signs are known to agree in advance. + +To facilitate working with the results of the comparison functions three constants are required. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{|r|l|} +\hline \textbf{Constant} & \textbf{Meaning} \\ +\hline \textbf{MP\_GT} & Greater Than \\ +\hline \textbf{MP\_EQ} & Equal To \\ +\hline \textbf{MP\_LT} & Less Than \\ +\hline +\end{tabular} +\end{center} +\caption{Comparison Return Codes} +\end{figure} + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp\_mag}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$. \\ +\textbf{Output}. Unsigned comparison results ($a$ to the left of $b$). \\ +\hline \\ +1. If $a.used > b.used$ then return(\textit{MP\_GT}) \\ +2. If $a.used < b.used$ then return(\textit{MP\_LT}) \\ +3. for n from $a.used - 1$ to 0 do \\ +\hspace{+3mm}3.1 if $a_n > b_n$ then return(\textit{MP\_GT}) \\ +\hspace{+3mm}3.2 if $a_n < b_n$ then return(\textit{MP\_LT}) \\ +4. Return(\textit{MP\_EQ}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_cmp\_mag} +\end{figure} + +\textbf{Algorithm mp\_cmp\_mag.} +By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return +\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$. The first two steps compare the number of digits used in both $a$ and $b$. +Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is. +If both have the same number of digits than the actual digits themselves must be compared starting at the leading digit. + +By step three both inputs must have the same number of digits so its safe to start from either $a.used - 1$ or $b.used - 1$ and count down to +the zero'th digit. If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}. + +EXAM,bn_mp_cmp_mag.c + +The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs. These two are performed before all of the digits +are compared since it is a very cheap test to perform and can potentially save considerable time. The implementation given is also not valid +without those two statements. $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the +array of digits. + +\subsection{Signed Comparisons} +Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}). Based on an unsigned magnitude +comparison a trivial signed comparison algorithm can be written. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. Signed Comparison Results ($a$ to the left of $b$) \\ +\hline \\ +1. if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\ +2. if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\ +3. 
if $a.sign = MP\_NEG$ then \\ +\hspace{+3mm}3.1 Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\ +4 Otherwise \\ +\hspace{+3mm}4.1 Return the unsigned comparison of $a$ and $b$ \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_cmp} +\end{figure} + +\textbf{Algorithm mp\_cmp.} +The first two steps compare the signs of the two inputs. If the signs do not agree then it can return right away with the appropriate +comparison code. When the signs are equal the digits of the inputs must be compared to determine the correct result. In step +three the unsigned comparision flips the order of the arguments since they are both negative. For instance, if $-a > -b$ then +$\vert a \vert < \vert b \vert$. Step number four will compare the two when they are both positive. + +EXAM,bn_mp_cmp.c + +The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison. If the signs are not the equal then which ever +has the positive sign is larger. At line @30,if@, the inputs are compared based on magnitudes. If the signs were both negative then +the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}). Otherwise, the signs are assumed to +be both positive and a forward direction unsigned comparison is performed. + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\ + & \\ +$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits \\ + & of two random digits (of equal magnitude) before a difference is found. \\ + & \\ +$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based \\ + & on the observations made in the previous problem. \\ + & +\end{tabular} + +\chapter{Basic Arithmetic} +\section{Introduction} +At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been +established. The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms. These +algorithms make use of the lower level algorithms and are the cruicial building block for the multiplication algorithms. It is very important +that these algorithms are highly optimized. On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms +which easily places them at $O(n^2)$ or even $O(n^3)$ work levels. + +MARK,SHIFTS +All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right +logical shifts respectively. A logical shift is analogous to sliding the decimal point of radix-10 representations. For example, the real +number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the the decimal two places to the right (\textit{multiplying by $\beta^2 = 10^2$}). +Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two. +For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$. + +One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed +from the number. For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$. However, with a logical shift the +result is $110_2$. 
+ +\section{Addition and Subtraction} +In common twos complement fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus. For example, with 32-bit integers +$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$ since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$. +As a result subtraction can be performed with a trivial series of logical operations and an addition. + +However, in multiple precision arithmetic negative numbers are not represented in the same way. Instead a sign flag is used to keep track of the +sign of the integer. As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or +subtraction algorithms with the sign fixed up appropriately. + +The lower level algorithms will add or subtract integers without regard to the sign flag. That is they will add or subtract the magnitude of +the integers respectively. + +\subsection{Low Level Addition} +An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers. That is to add the +trailing digits first and propagate the resulting carry upwards. Since this is a lower level algorithm the name will have a ``s\_'' prefix. +Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely. + +\newpage +\begin{figure}[!here] +\begin{center} +\begin{small} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_add}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. The unsigned addition $c = \vert a \vert + \vert b \vert$. \\ +\hline \\ +1. if $a.used > b.used$ then \\ +\hspace{+3mm}1.1 $min \leftarrow b.used$ \\ +\hspace{+3mm}1.2 $max \leftarrow a.used$ \\ +\hspace{+3mm}1.3 $x \leftarrow a$ \\ +2. else \\ +\hspace{+3mm}2.1 $min \leftarrow a.used$ \\ +\hspace{+3mm}2.2 $max \leftarrow b.used$ \\ +\hspace{+3mm}2.3 $x \leftarrow b$ \\ +3. If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\ +4. $oldused \leftarrow c.used$ \\ +5. $c.used \leftarrow max + 1$ \\ +6. $u \leftarrow 0$ \\ +7. for $n$ from $0$ to $min - 1$ do \\ +\hspace{+3mm}7.1 $c_n \leftarrow a_n + b_n + u$ \\ +\hspace{+3mm}7.2 $u \leftarrow c_n >> lg(\beta)$ \\ +\hspace{+3mm}7.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +8. if $min \ne max$ then do \\ +\hspace{+3mm}8.1 for $n$ from $min$ to $max - 1$ do \\ +\hspace{+6mm}8.1.1 $c_n \leftarrow x_n + u$ \\ +\hspace{+6mm}8.1.2 $u \leftarrow c_n >> lg(\beta)$ \\ +\hspace{+6mm}8.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +9. $c_{max} \leftarrow u$ \\ +10. if $olduse > max$ then \\ +\hspace{+3mm}10.1 for $n$ from $max + 1$ to $oldused - 1$ do \\ +\hspace{+6mm}10.1.1 $c_n \leftarrow 0$ \\ +11. Clamp excess digits in $c$. (\textit{mp\_clamp}) \\ +12. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Algorithm s\_mp\_add} +\end{figure} + +\textbf{Algorithm s\_mp\_add.} +This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes. +Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}. Even the +MIX pseudo machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes. 
+ +The first thing that has to be accomplished is to sort out which of the two inputs is the largest. The addition logic +will simply add all of the smallest input to the largest input and store that first part of the result in the +destination. Then it will apply a simpler addition loop to excess digits of the larger input. + +The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two +inputs. The variable $x$ will be an mp\_int alias for the largest input or the second input $b$ if they have the +same number of digits. After the inputs are sorted the destination $c$ is grown as required to accomodate the sum +of the two inputs. The original \textbf{used} count of $c$ is copied and set to the new used count. + +At this point the first addition loop will go through as many digit positions that both inputs have. The carry +variable $\mu$ is set to zero outside the loop. Inside the loop an ``addition'' step requires three statements to produce +one digit of the summand. First +two digits from $a$ and $b$ are added together along with the carry $\mu$. The carry of this step is extracted and stored +in $\mu$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$. + +Now all of the digit positions that both inputs have in common have been exhausted. If $min \ne max$ then $x$ is an alias +for one of the inputs that has more digits. A simplified addition loop is then used to essentially copy the remaining digits +and the carry to the destination. + +The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are zeroed which completes the addition. + + +EXAM,bn_s_mp_add.c + +Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables. Note that $x$ is a pointer to a +mp\_int assigned to the largest input, in effect it is a local alias. Lines @37,init@ to @42,}@ ensure that the destination is grown to +accomodate the result of the addition. + +Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style. The three aliases that are on +lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively. These aliases are used to ensure the +compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int. + +The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the +implementation. The initial addition loop begins on line @66,for@ and ends on line @75,}@. Similarly the conditional addition loop +begins on line @81,for@ and ends on line @90,}@. The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@. +Note the ``++'' operator on the same line. After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful +for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero. + +\subsection{Low Level Subtraction} +The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm. The principle difference is that the +unsigned subtraction algorithm requires the result to be positive. That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must +be met for this algorithm to function properly. Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly. 
+This algorithm, as will be shown, can be used to create functional signed addition and subtraction algorithms.
+
+MARK,GAMMA
+
+For this algorithm a new variable is required to make the description simpler. Recall from section 1.3.1 that an mp\_digit must be able to represent
+the range $0 \le x < 2\beta$ for the algorithms to work correctly. However, it is allowable that an mp\_digit represent a larger range of values. For
+this algorithm we will assume that the variable $\gamma$ represents the number of bits available in an
+mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).
+
+For example, the default for LibTomMath is to use an ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$. In ISO C an ``unsigned long''
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sub}. \\
+\textbf{Input}. Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
+\textbf{Output}. The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
+\hline \\
+1. $min \leftarrow b.used$ \\
+2. $max \leftarrow a.used$ \\
+3. If $c.alloc < max$ then grow $c$ to hold at least $max$ digits. (\textit{mp\_grow}) \\
+4. $oldused \leftarrow c.used$ \\
+5. $c.used \leftarrow max$ \\
+6. $u \leftarrow 0$ \\
+7. for $n$ from $0$ to $min - 1$ do \\
+\hspace{3mm}7.1 $c_n \leftarrow a_n - b_n - u$ \\
+\hspace{3mm}7.2 $u \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{3mm}7.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8. if $min < max$ then do \\
+\hspace{3mm}8.1 for $n$ from $min$ to $max - 1$ do \\
+\hspace{6mm}8.1.1 $c_n \leftarrow a_n - u$ \\
+\hspace{6mm}8.1.2 $u \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{6mm}8.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9. if $oldused > max$ then do \\
+\hspace{3mm}9.1 for $n$ from $max$ to $oldused - 1$ do \\
+\hspace{6mm}9.1.1 $c_n \leftarrow 0$ \\
+10. Clamp excess digits of $c$. (\textit{mp\_clamp}). \\
+11. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_sub}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sub.}
+This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive. That is when
+passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly. This
+algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well. As was the case
+with algorithm s\_mp\_add, both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
+
+The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$. Steps 1 and 2
+set the $min$ and $max$ variables. Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at
+most $max$ digits in length as opposed to $max + 1$. Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and
+set to the maximal count for the operation.
+
+The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision
+subtraction is used instead.
+Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction
+loops. Under the assumption that twos complement single precision arithmetic is used this will successfully extract the desired carry.
+
+For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$. The least significant bit will force a carry upwards to
+the third bit which will be set to zero after the borrow. After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain. When the
+third bit of $0101_2$ is subtracted from the result it will cause another carry. In this case though the carry will be forced to propagate all the
+way to the most significant bit.
+
+Recall that $\beta < 2^{\gamma}$. This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most
+significant bit. Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
+is needed is a single zero or one bit for the carry. Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the
+carry. This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.
+
+If $b$ has a smaller magnitude than $a$ then step 8 will force the carry and copy operation to propagate through the larger input $a$ into $c$. Step
+9 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
+
+EXAM,bn_s_mp_sub.c
+
+Lines @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs. In reality the $min$ and $max$ variables are only aliases
+used to make the source code easier to read. Again the pointer alias optimization is used within this algorithm. Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for
+$a$, $b$ and $c$ respectively.
+
+The first subtraction loop occurs on lines @47,u = 0@ through @61,}@. The theory behind the subtraction loop is exactly the same as that for
+the addition loop. As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry
+(\textit{see line @57, >>@}). The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND
+the least significant bit. The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
+occurs from subtraction. This method requires two relatively cheap operations to extract the carry. The other method is to simply
+shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation. This optimization only works on
+twos complement machines which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through
+$a$ and copy the result to $c$.
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
+constructed. This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data
+types.
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign}
+flag.
+A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}. Two mp\_ints $a$ and $b$ \\
+\textbf{Output}. The signed addition $c = a + b$. \\
+\hline \\
+1. if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
+2. else do \\
+\hspace{3mm}2.1 if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1 $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2 $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2 else do \\
+\hspace{6mm}2.2.1 $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2 $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables. There is no reference algorithm to draw upon from
+either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations. The algorithm is fairly
+straightforward but restricted since subtraction can only produce positive results.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\label{fig:AddChart}
+\end{figure}
+
+Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three
+specific cases need to be handled. The return codes of the unsigned operations at steps 1.2, 2.1.2 and 2.2.2 are
+forwarded to step three to check for errors. This simplifies the description of the algorithm considerably and best
+follows how the implementation was actually achieved.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed. Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits. The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
+
+For example, consider performing $-a + a$ with algorithm mp\_add. By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$. However, since the sign is set first and only then is the unsigned addition performed, the subsequent usage of algorithm mp\_clamp
+within algorithm s\_mp\_add will force $-0$ to become $0$.
+
+EXAM,bn_mp_add.c
+
+The source code follows the algorithm fairly closely. The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass the result of the unsigned operations forward.
+Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}. The observation is that this algorithm will succeed or fail only if the lower
+level functions do so. Returning their return code is sufficient.
+
+\subsection{High Level Subtraction}
+The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sub}. \\
+\textbf{Input}. Two mp\_ints $a$ and $b$ \\
+\textbf{Output}. The signed subtraction $c = a - b$. \\
+\hline \\
+1. if $a.sign \ne b.sign$ then do \\
+\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
+2. else do \\
+\hspace{3mm}2.1 if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1 $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.1.2 $c \leftarrow \vert a \vert - \vert b \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2 else do \\
+\hspace{6mm}2.2.1 $c.sign \leftarrow \left \lbrace \begin{array}{ll}
+                  MP\_ZPOS & \mbox{if }a.sign = MP\_NEG \\
+                  MP\_NEG  & \mbox{otherwise} \\
+                  \end{array} \right .$ \\
+\hspace{6mm}2.2.2 $c \leftarrow \vert b \vert - \vert a \vert$ \\
+3. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_sub}
+\end{figure}
+
+\textbf{Algorithm mp\_sub.}
+This algorithm performs the signed subtraction of two inputs. Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or
+\cite{HAC}. Also this algorithm is restricted by algorithm s\_mp\_sub. Figure~\ref{fig:SubChart} lists the eight possible inputs and
+the operations required.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $-$ & No & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & No & $c = a + b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline $-$ & $-$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Subtraction Guide Chart}
+\label{fig:SubChart}
+\end{figure}
+
+Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction. That is to prevent the
+algorithm from producing $-a - (-a) = -0$ as a result.
+
+EXAM,bn_mp_sub.c
+
+Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function. On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a
+``greater than or equal to'' comparison.
+
+\section{Bit and Digit Shifting}
+MARK,POLY
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.
+This notation arises within discussions of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.
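+
+As a quick illustration of the notation, the sketch below evaluates $y = f(\beta)$ for a short digit array with Horner's rule. The function
+and variable names are hypothetical and the digits are kept small enough that a native integer can hold the result.
+
+\begin{small}
+\begin{verbatim}
+/* Sketch: evaluate y = f(beta) for a short digit array (Horner's rule) */
+static unsigned long poly_eval(const unsigned int *a, int used,
+                               unsigned long beta)
+{
+   unsigned long y = 0;
+   int           i;
+   for (i = used - 1; i >= 0; i--) {
+      y = y * beta + a[i];        /* fold in the next lower coefficient */
+   }
+   return y;
+}
+
+/* Example: the digits {9, 8} with beta = 10 represent f(x) = 8x + 9,
+   so poly_eval() returns f(10) = 89.                                  */
+\end{verbatim}
+\end{small}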
+
+In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established. That is to shift
+the digits left or right as well as to shift individual bits of the digits left and right. It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient
+operation to perform. A single precision logical shift left is sufficient to multiply a single digit by two.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}. One mp\_int $a$ \\
+\textbf{Output}. $b = 2a$. \\
+\hline \\
+1. If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits. (\textit{mp\_grow}) \\
+2. $oldused \leftarrow b.used$ \\
+3. $b.used \leftarrow a.used$ \\
+4. $r \leftarrow 0$ \\
+5. for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1 $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}5.2 $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.3 $r \leftarrow rr$ \\
+6. If $r \ne 0$ then do \\
+\hspace{3mm}6.1 $b_{n + 1} \leftarrow r$ \\
+\hspace{3mm}6.2 $b.used \leftarrow b.used + 1$ \\
+7. If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1 for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1 $b_n \leftarrow 0$ \\
+8. $b.sign \leftarrow a.sign$ \\
+9. Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply an mp\_int by two provided $\beta$ is a power of two. Neither \cite{TAOCPV2} nor \cite{HAC} describes such
+an algorithm despite the fact that it arises often in other algorithms. The algorithm is set up much like the lower level algorithm s\_mp\_add since
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.
+
+Step 1 grows the destination as required to accommodate the maximum number of \textbf{used} digits in the result. The initial \textbf{used} count
+is set to $a.used$ at step 3. Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 5 is an optimized implementation of the addition loop for this specific case. That is, since the two values being added together
+are the same there is no need to perform two reads from the digits of $a$. Step 5.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration. Step 5.2 calculates the $n$'th digit of the result as a single precision shift of $a_n$ plus
+the previous carry. Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$. An iteration of the addition loop is finished by
+forwarding the carry to the next iteration.
+
+Step 6 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.
+Step 7 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
+
+EXAM,bn_mp_mul_2.c
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input. The only noteworthy difference
+is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling.
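+
+The heart of the routine can be reduced to a few lines. The following is a simplified standalone sketch of the doubling loop from steps 5
+and 6, again using stand-in type and constant names rather than the library's own headers.
+
+\begin{small}
+\begin{verbatim}
+/* Sketch of the doubling loop (steps 5 and 6), assuming 28-bit digits */
+typedef unsigned long mp_digit;           /* stand-in digit type       */
+#define DIGIT_BIT 28                      /* stand-in value of lg(beta)*/
+#define MP_MASK ((((mp_digit)1) << DIGIT_BIT) - ((mp_digit)1))
+
+static int double_digits(const mp_digit *a, mp_digit *b, int used)
+{
+   mp_digit r = 0, rr;
+   int      n;
+   for (n = 0; n < used; n++) {
+      rr   = a[n] >> (DIGIT_BIT - 1);      /* step 5.1: next carry     */
+      b[n] = ((a[n] << 1) | r) & MP_MASK;  /* step 5.2: double, add r  */
+      r    = rr;                           /* step 5.3: forward carry  */
+   }
+   if (r != 0) {
+      b[used++] = r;                       /* step 6: final carry      */
+   }
+   return used;                            /* the new digit count      */
+}
+\end{verbatim}
+\end{small}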
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}. One mp\_int $a$ \\
+\textbf{Output}. $b = a/2$. \\
+\hline \\
+1. If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits. (\textit{mp\_grow}) \\
+2. If the reallocation failed return(\textit{MP\_MEM}). \\
+3. $oldused \leftarrow b.used$ \\
+4. $b.used \leftarrow a.used$ \\
+5. $r \leftarrow 0$ \\
+6. for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1 $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2 $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3 $r \leftarrow rr$ \\
+7. If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1 for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1 $b_n \leftarrow 0$ \\
+8. $b.sign \leftarrow a.sign$ \\
+9. Clamp excess digits of $b$. (\textit{mp\_clamp}) \\
+10. Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2.}
+This algorithm will divide an mp\_int by two using logical shifts to the right. Like mp\_mul\_2 it uses a modified low level addition
+core as the basis of the algorithm. Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit. The algorithm
+could be written to work from the trailing digit to the leading digit; however, it would have to stop one short of $a.used - 1$ digits to prevent
+reading past the end of the array of digits.
+
+Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the
+least significant bit not the most significant bit.
+
+EXAM,bn_mp_div_2.c
+
+\section{Polynomial Basis Operations}
+Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$. Such a representation is also known as
+the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single
+place. The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
+division and Karatsuba multiplication.
+
+Converting from an array of digits to polynomial basis is very simple. Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
+$y = \sum_{i=0}^{2} a_i \beta^i$. Simply replace $\beta$ with $x$ and the expression is in polynomial basis. For example, $f(x) = 8x + 9$ is the
+polynomial basis representation for $89$ using radix ten. That is, $f(10) = 8(10) + 9 = 89$.
+
+\subsection{Multiplication by $x$}
+
+Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one
+degree. In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$. From a scalar basis point of view multiplying by $x$ is equivalent to
+multiplying by the integer $\beta$.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lshd}. \\
+\textbf{Input}. One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}. $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
+\hline \\
+1. If $b \le 0$ then return(\textit{MP\_OKAY}). \\
+2. If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits. (\textit{mp\_grow}). \\
+3. If the reallocation failed return(\textit{MP\_MEM}). \\
+4. $a.used \leftarrow a.used + b$ \\
+5. $i \leftarrow a.used - 1$ \\
+6. $j \leftarrow a.used - 1 - b$ \\
+7. for $n$ from $a.used - 1$ to $b$ do \\
+\hspace{3mm}7.1 $a_{i} \leftarrow a_{j}$ \\
+\hspace{3mm}7.2 $i \leftarrow i - 1$ \\
+\hspace{3mm}7.3 $j \leftarrow j - 1$ \\
+8. for $n$ from 0 to $b - 1$ do \\
+\hspace{3mm}8.1 $a_n \leftarrow 0$ \\
+9. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lshd}
+\end{figure}
+
+\textbf{Algorithm mp\_lshd.}
+This algorithm multiplies an mp\_int by the $b$'th power of $x$. This is equivalent to multiplying by $\beta^b$. The algorithm differs
+from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location. The
+motivation behind this change is the way this function is typically used. Algorithms such as mp\_add store the result in an optionally
+different third mp\_int because the original inputs are often still required. Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
+typically used on values where the original value is no longer required. The algorithm will return success immediately if
+$b \le 0$ since the rest of the algorithm is only valid when $b > 0$.
+
+First the destination $a$ is grown as required to accommodate the result. The counters $i$ and $j$ are used to form a \textit{sliding window} over
+the digits of $a$ of length $b$. The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).
+The loop on step 7 copies the digit from the tail to the head. In each iteration the window is moved down one digit. The last loop on
+step 8 sets the lower $b$ digits to zero.
+
+\newpage
+FIGU,sliding_window,Sliding Window Movement
+
+EXAM,bn_mp_lshd.c
+
+The if statement on line @24,if@ ensures that the $b$ variable is greater than zero. The \textbf{used} count is incremented by $b$ before
+the copy loop begins. This eliminates the need for an additional variable in the for loop. The variable $top$ on line @42,top@ is an alias
+for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge. The aliases form a window of exactly $b$ digits
+over the input.
+
+\subsection{Division by $x$}
+
+Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rshd}. \\
+\textbf{Input}. One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}. $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
+\hline \\
+1. If $b \le 0$ then return. \\
+2. If $a.used \le b$ then do \\
+\hspace{3mm}2.1 Zero $a$. (\textit{mp\_zero}). \\
+\hspace{3mm}2.2 Return. \\
+3. $i \leftarrow 0$ \\
+4. $j \leftarrow b$ \\
+5. for $n$ from 0 to $a.used - b - 1$ do \\
+\hspace{3mm}5.1 $a_i \leftarrow a_j$ \\
+\hspace{3mm}5.2 $i \leftarrow i + 1$ \\
+\hspace{3mm}5.3 $j \leftarrow j + 1$ \\
+6. for $n$ from $a.used - b$ to $a.used - 1$ do \\
+\hspace{3mm}6.1 $a_n \leftarrow 0$ \\
+7. $a.used \leftarrow a.used - b$ \\
+8. Return. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rshd}
+\end{figure}
+
+\textbf{Algorithm mp\_rshd.}
+This algorithm divides the input in place by the $b$'th power of $x$. It is analogous to dividing by $\beta^b$ but much quicker since
+it does not require single precision division. This algorithm does not actually return an error code as it cannot fail.
+
+If the input $b$ is less than one the algorithm quickly returns without performing any work. If the \textbf{used} count is less than or equal
+to the shift count $b$ then it will simply zero the input and return.
+
+After the trivial cases of inputs have been handled the sliding window is set up. Much like the case of algorithm mp\_lshd a sliding window that
+is $b$ digits wide is used to copy the digits. Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.
+Also the digits are copied from the leading to the trailing edge.
+
+Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
+
+EXAM,bn_mp_rshd.c
+
+The only noteworthy element of this routine is the lack of a return type.
+
+-- Will update later to give it a return type...Tom
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist, algorithms for moving the ``in between'' distances are required. For
+example, an algorithm that can quickly multiply by $2^k$ for any $k$ without using a full multiplier would prove useful. Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$, a mixture of whole digit shifting and partial digit shifting is employed.
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}. One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}. $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1. $c \leftarrow a$. (\textit{mp\_copy}) \\
+2. If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3. If the reallocation failed return(\textit{MP\_MEM}). \\
+4. If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1 $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
+\hspace{3mm}4.2 If step 4.1 failed return(\textit{MP\_MEM}). \\
+5. $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6. If $d \ne 0$ then do \\
+\hspace{3mm}6.1 $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2 $r \leftarrow 0$ \\
+\hspace{3mm}6.3 for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1 $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2 $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3 $r \leftarrow rr$ \\
+\hspace{3mm}6.4 If $r > 0$ then do \\
+\hspace{6mm}6.4.1 $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2 $c.used \leftarrow c.used + 1$ \\
+7. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$. The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remaining multiplicand is less than
+$\beta$.
+For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$
+left.
+
+After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform. Step 5 calculates the number of remaining shifts
+required. If it is non-zero a modified shift loop is used to calculate the remaining product.
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.
+
+This algorithm is loosely measured as an $O(2n)$ algorithm which means that if the input is $n$ digits then it takes $2n$ ``time'' to
+complete. It is possible to optimize this algorithm down to an $O(n)$ algorithm at the cost of making it slightly harder to follow.
+
+EXAM,bn_mp_mul_2d.c
+
+Notes to be revised when code is updated. -- Tom
+
+\subsection{Division by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2d}. \\
+\textbf{Input}. One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}. $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1. If $b \le 0$ then do \\
+\hspace{3mm}1.1 $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}1.2 $d \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.3 Return(\textit{MP\_OKAY}). \\
+2. $c \leftarrow a$ \\
+3. $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+4. If $b \ge lg(\beta)$ then do \\
+\hspace{3mm}4.1 $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
+5. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6. If $k \ne 0$ then do \\
+\hspace{3mm}6.1 $mask \leftarrow 2^k$ \\
+\hspace{3mm}6.2 $r \leftarrow 0$ \\
+\hspace{3mm}6.3 for $n$ from $c.used - 1$ to $0$ do \\
+\hspace{6mm}6.3.1 $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2 $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
+\hspace{6mm}6.3.3 $r \leftarrow rr$ \\
+7. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\
+8. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2d.}
+This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder. The algorithm is designed much like algorithm
+mp\_mul\_2d by first using whole digit shifts then single precision shifts. This algorithm will also produce the remainder of the division
+by using algorithm mp\_mod\_2d.
+
+EXAM,bn_mp_div_2d.c
+
+The implementation of algorithm mp\_div\_2d is slightly different from what the algorithm specifies. The remainder $d$ may be optionally
+ignored by passing \textbf{NULL} as the pointer to the mp\_int variable. The temporary mp\_int variable $t$ is used to hold the
+result of the remainder operation until the end. This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
+the quotient is obtained.
+
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. (-- Fix this paragraph up later, Tom).
+
+\subsection{Remainder of Division by Power of Two}
+
+The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.
+This algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mod\_2d}. \\
+\textbf{Input}. One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}. $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1. If $b \le 0$ then do \\
+\hspace{3mm}1.1 $c \leftarrow 0$ (\textit{mp\_zero}) \\
+\hspace{3mm}1.2 Return(\textit{MP\_OKAY}). \\
+2. If $b > a.used \cdot lg(\beta)$ then do \\
+\hspace{3mm}2.1 $c \leftarrow a$ (\textit{mp\_copy}) \\
+\hspace{3mm}2.2 Return the result of step 2.1. \\
+3. $c \leftarrow a$ \\
+4. If step 3 failed return(\textit{MP\_MEM}). \\
+5. for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used - 1$ do \\
+\hspace{3mm}5.1 $c_n \leftarrow 0$ \\
+6. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+7. $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\
+9. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$. First if $b$ is less than or equal to zero the
+result is set to zero. If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns. Otherwise, $a$
+is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
+
+EXAM,bn_mp_mod_2d.c
+
+-- Add comments later, Tom.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+                      & in $O(n)$ time. \\
+                      &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low Hamming \\
+                      & weight values such as $3$, $5$ and $9$. Extend it to handle all values \\
+                      & up to $64$ with a Hamming weight less than three. \\
+                      &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+                      & $2^k - 1$ as well. \\
+                      &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+                      & any $n$-bit input. Note that the time of addition is ignored in the \\
+                      & calculation. \\
+                      & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$. Again ignore \\
+                      & the cost of addition. \\
+                      & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
+                      & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+                      & calculating the result of a signed comparison. \\
+                      &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of
+algorithms of any multiple precision integer package. The set of multiplier algorithms includes integer multiplication, squaring and modular reduction
+where in each of the algorithms single precision multiplication is the dominant operation performed.
+This chapter will discuss integer multiplication
+and squaring, leaving modular reductions for the subsequent chapter.
+
+The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular
+exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$. During a modular
+exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions,
+35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision
+multiplications.
+
+For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied
+against every digit of the other multiplicand. Traditional long-hand multiplication is based on this process; while the techniques can differ the
+overall algorithm used is essentially the same. Only ``recently'' have faster algorithms been studied. First, Karatsuba multiplication was discovered in
+1962. This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.
+This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based solutions.
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\label{sec:basemult}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm that school children are taught. The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision
+multiplications are required. More specifically for an $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required. To
+simplify most discussions, it will be assumed that the inputs have a comparable number of digits.
+
+The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be
+used. This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible. One important
+facet of this algorithm is that it has been modified to only produce a certain amount of output digits as resolution. The importance of this
+modification will become evident during the discussion of Barrett modular reduction. Recall that for an $n$ and $m$ digit input the product
+will be at most $n + m$ digits. Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.
+
+Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}. We shall now extend the variable set to
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}. This implies that $2^{\alpha} > 2 \cdot \beta^2$. The
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}).
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
+\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+1. If min$(a.used, b.used) < \delta$ then do \\
+\hspace{3mm}1.1 Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}). \\
+\hspace{3mm}1.2 Return the result of step 1.1 \\
+\\
+Allocate and initialize a temporary mp\_int. \\
+2. Init $t$ to be of size $digs$ \\
+3. If step 2 failed return(\textit{MP\_MEM}). \\
+4. $t.used \leftarrow digs$ \\
+\\
+Compute the product. \\
+5. for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}5.1 $u \leftarrow 0$ \\
+\hspace{3mm}5.2 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}5.3 If $pb < 1$ then goto step 6. \\
+\hspace{3mm}5.4 for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}5.4.1 $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
+\hspace{6mm}5.4.2 $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.5 if $ix + pb < digs$ then do \\
+\hspace{6mm}5.5.1 $t_{ix + pb} \leftarrow u$ \\
+6. Clamp excess digits of $t$. \\
+7. Swap $c$ with $t$ \\
+8. Clear $t$ \\
+9. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_mul\_digs.}
+This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits. While it may seem
+a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent
+algorithm. The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.
+Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the
+inputs.
+
+The first thing this algorithm checks for is whether a Comba multiplier can be used. If the minimum digit count of either
+input is less than $\delta$, then the Comba method may be used instead. After the Comba method is ruled out, the baseline algorithm begins. A
+temporary mp\_int variable $t$ is used to hold the intermediate result of the product. This allows the algorithm to be used to
+compute products when either $a = c$ or $b = c$ without overwriting the inputs.
+
+All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output. The $pb$ variable
+is given the count of digits to read from $b$ inside the nested loop. If $pb < 1$ then no more output digits can be produced and the algorithm
+will exit the loop. The best way to think of the loops is as a series of $pb \times 1$ multiplications. That is, in each pass of the
+innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.
+
+For example, consider multiplying $576$ by $241$. That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
+visualized in the following diagram.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|l|}
+\hline && & 5 & 7 & 6 & \\
+\hline $\times$&& & 2 & 4 & 1 & \\
+\hline &&&&&&\\
+ && & 5 & 7 & 6 & $10^0(1)(576)$ \\
+ &2 & 3 & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
+ 1 & 3 & 8 & 8 & 1 & 6 & $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Long-Hand Multiplication Diagram}
+\end{figure}
+
+Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate
+count. That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
+
+Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable. The multiplication on that step
+is assumed to be a double wide output single precision multiplication. That is, two single precision variables are multiplied to produce a
+double precision result. The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
+5.4.1 is propagated through the nested loop. If the carry was not propagated immediately it would overflow the single precision digit
+$t_{ix+iy}$ and the result would be lost.
+
+At step 5.5 the nested loop is finished and any carry that was left over should be forwarded. The carry does not have to be added to the $ix+pb$'th
+digit since that digit is assumed to be zero at this point. However, if $ix + pb \ge digs$ the carry is not set as it would make the result
+exceed the precision requested.
+
+EXAM,bn_s_mp_mul_digs.c
+
+Lines @31,if@ to @35,}@ determine if the Comba method can be used first. The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
+the number of digits of output is less than \textbf{MP\_WARRAY}. This new constant is used to control
+the stack usage in the Comba routines. By default it is set to $\delta$ but can be reduced when memory is at a premium.
+
+Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@. Note how all of the
+variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$. That is to ensure that double precision operations
+are used instead of single precision. The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour. At the outset it looks like
+the compiler will have to use a double precision multiplication to produce the result required. Such an operation would be horribly slow on most
+processors and drag this to a crawl. However, GCC is smart enough to realize that double wide output single precision multipliers can be used. For
+example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.
+
+\subsection{Faster Multiplication by the ``Comba'' Method}
+MARK,COMBA
+
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards. This
+makes the nested loop very sequential and hard to unroll and implement in parallel. The ``Comba'' \cite{COMBA} method is named after the little known
+(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested
+carry fixup operations. As an interesting aside it seems that Paul Barrett described a similar technique in
+his 1986 paper \cite{BARRETT}, written five years before.
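+
+The phrase ``nested carry fixup'' is easiest to appreciate in code. The fragment below contrasts the two inner loops in miniature; all
+names are stand-ins, and the point is only that the baseline loop chains every iteration through the carry $u$ while the Comba loop does not.
+
+\begin{small}
+\begin{verbatim}
+/* Illustrative contrast of the two inner loops (names are stand-ins) */
+typedef unsigned long      mp_digit;     /* stand-in single precision  */
+typedef unsigned long long mp_word;      /* stand-in double precision  */
+#define DIGIT_BIT 28
+#define MP_MASK ((((mp_digit)1) << DIGIT_BIT) - ((mp_digit)1))
+
+/* baseline: the carry u serializes the loop iterations               */
+static void base_inner(const mp_digit *a, const mp_digit *b,
+                       mp_digit *t, int ix, int pb)
+{
+   mp_digit u = 0;
+   int      iy;
+   for (iy = 0; iy < pb; iy++) {
+      mp_word r  = (mp_word)t[ix + iy] + (mp_word)a[ix] * b[iy] + u;
+      t[ix + iy] = (mp_digit)(r & MP_MASK);
+      u          = (mp_digit)(r >> DIGIT_BIT);
+   }
+   t[ix + pb] = u;
+}
+
+/* Comba: columns accumulate independently, carries are fixed later   */
+static void comba_inner(const mp_digit *a, const mp_digit *b,
+                        mp_word *W, int ix, int pb)
+{
+   int iy;
+   for (iy = 0; iy < pb; iy++) {
+      W[ix + iy] += (mp_word)a[ix] * b[iy];
+   }
+}
+\end{verbatim}
+\end{small}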
+
+At the heart of the Comba technique is once again the long-hand algorithm. Except in this case a slight twist is placed on how
+the columns of the result are produced. In the standard long-hand algorithm rows of products are produced then added together to form the
+final result. In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other. That is at the $O(n^2)$ level a
+simple multiplication and addition step is performed. The carries of the columns are propagated after the nested loop to reduce the amount
+of work required. Succinctly the first step of the algorithm is to compute the product vector $\vec x$ as follows.
+
+\begin{equation}
+\vec x_n = \sum_{i+j = n} a_ib_j
+\end{equation}
+
+Where $\vec x_n$ is the $n$'th column of the output vector. Consider the following example which computes the vector $\vec x$ for the multiplication
+of $576$ and $241$.
+
+\newpage\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+ \hline & & 5 & 7 & 6 & First Input\\
+ \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline & & $1 \cdot 5 = 5$ & $1 \cdot 7 = 7$ & $1 \cdot 6 = 6$ & First pass \\
+ & $4 \cdot 5 = 20$ & $4 \cdot 7+5=33$ & $4 \cdot 6+7=31$ & 6 & Second pass \\
+ $2 \cdot 5 = 10$ & $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31 & 6 & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.
+Now the columns must be fixed by propagating the carry upwards. The resultant vector will have one extra dimension over the input vector which is
+equivalent to adding a leading zero digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Comba Fixup}. \\
+\textbf{Input}. Vector $\vec x$ of dimension $k$ \\
+\textbf{Output}. Vector $\vec x$ such that the carries have been propagated. \\
+\hline \\
+1. for $n$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
+\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
+2. Return($\vec x$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Comba Fixup}
+\end{figure}
+
+With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$. In this case
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded. If the algorithm is correct and, as will be demonstrated shortly, more
+efficient than the baseline algorithm, why not simply always use it?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output
+independently. A serious obstacle arises if a carry is lost due to lack of precision before the algorithm has a chance to fix
+the carries. For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications. If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost.
+For any $m$ and $n$ digit inputs the maximum weight of any column is
+min$(m, n)$ which is fairly obvious.
+
+The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used. Recall
+from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision. Given these
+two quantities we must not violate the following
+
+\begin{equation}
+k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
+\end{equation}
+
+Which reduces to
+
+\begin{equation}
+k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit. By further re-arrangement of the equation the final solution is
+found.
+
+\begin{equation}
+k < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
+\end{equation}
+
+The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$ which means that $k$ is bounded by $k < 257$. In this configuration
+the smaller input may not have more than $256$ digits if the Comba method is to be used. This is quite satisfactory for most applications since
+$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which is much larger than most public key cryptographic algorithms require.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
+\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+1. If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
+2. If step 1 failed return(\textit{MP\_MEM}).\\
+\\
+Zero the temporary array $\hat W$. \\
+3. for $n$ from $0$ to $digs - 1$ do \\
+\hspace{3mm}3.1 $\hat W_n \leftarrow 0$ \\
+\\
+Compute the columns. \\
+4. for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}4.1 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}4.2 If $pb < 1$ then goto step 5. \\
+\hspace{3mm}4.3 for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}4.3.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+\\
+Propagate the carries upwards. \\
+5. $oldused \leftarrow c.used$ \\
+6. $c.used \leftarrow digs$ \\
+7. If $digs > 1$ then do \\
+\hspace{3mm}7.1. for $ix$ from $1$ to $digs - 1$ do \\
+\hspace{6mm}7.1.1 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
+\hspace{6mm}7.1.2 $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
+8. else do \\
+\hspace{3mm}8.1 $ix \leftarrow 0$ \\
+9. $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Zero excess digits. \\
+10. If $digs < oldused$ then do \\
+\hspace{3mm}10.1 for $n$ from $digs$ to $oldused - 1$ do \\
+\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\
+11. Clamp excessive digits of $c$. (\textit{mp\_clamp}) \\
+12. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_mul\_digs}
+\label{fig:COMBAMULT}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. The algorithm
+essentially performs the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
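+
+A compressed sketch of the column computation (\textit{step 4}) and the carry propagation (\textit{step 7}) may help tie the algorithm
+together before the detailed discussion. As with the earlier sketches the types and names are stand-ins rather than the library's own, the
+inputs are assumed to both have $n$ digits, and the column count is assumed to stay within the bound derived above.
+
+\begin{small}
+\begin{verbatim}
+/* Sketch of Comba multiplication: columns first, carries afterwards  */
+typedef unsigned long      mp_digit;     /* stand-in single precision */
+typedef unsigned long long mp_word;      /* stand-in double precision */
+#define DIGIT_BIT 28
+#define MP_MASK ((((mp_digit)1) << DIGIT_BIT) - ((mp_digit)1))
+
+/* c must have room for 2n digits, W for 2n columns, and n >= 1       */
+static void comba_mul(const mp_digit *a, const mp_digit *b,
+                      mp_digit *c, mp_word *W, int n)
+{
+   int ix, iy;
+   for (ix = 0; ix < 2 * n; ix++) {
+      W[ix] = 0;                         /* step 3: zero the columns  */
+   }
+   for (ix = 0; ix < n; ix++) {          /* step 4: compute columns   */
+      for (iy = 0; iy < n; iy++) {
+         W[ix + iy] += (mp_word)a[ix] * b[iy];
+      }
+   }
+   for (ix = 1; ix < 2 * n; ix++) {      /* step 7: propagate carries */
+      W[ix]    += W[ix - 1] >> DIGIT_BIT;
+      c[ix - 1] = (mp_digit)(W[ix - 1] & MP_MASK);
+   }
+   c[2 * n - 1] = (mp_digit)(W[2 * n - 1] & MP_MASK);
+}
+\end{verbatim}
+\end{small}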
+
+The array $\hat W$ is meant to be on the stack when the algorithm is used. The size of the array does not change which is ideal. Note also that
+unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.
+
+The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm. The lack of
+a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions. Now that each
+iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+
+To measure the benefits of the Comba method over the baseline method consider the number of operations that are required. If the
+cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require
+$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers. The Comba method requires only $O(pn^2 + qn)$ time; however, in practice
+the speed increase is actually much greater. With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
+and addition operations in the nested loop in parallel.
+
+EXAM,bn_fast_s_mp_mul_digs.c
+
+The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
+implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop.
+In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.
+
+The inner loop on lines @83,for@, @84,mp_word@ and @85,}@ is where the algorithm will spend the majority of the time, which is why it has been
+stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}. On x86 processors the multiplication and additions amount to at the
+very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three
+(\textit{one load, one store, one multiply-add}). For both of the x86 and ARMv4 processors the GCC compiler does a good job of unrolling the loop
+and scheduling the instructions so there are very few dependency stalls.
+
+In theory the difference between the baseline and Comba algorithms is a mere $O(qn)$ time difference. However, in the $O(n^2)$ nested loop of the
+baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next
+digit. As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
+be simultaneously used.
+
+\subsection{Polynomial Basis Multiplication}
+To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication. In the following algorithms
+the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and
+$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required. In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
+
+The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$. The coefficients $w_i$ will
+directly yield the desired product when $\beta$ is substituted for $x$.
+The direct solution to solve for the $2n + 1$ coefficients
+requires $O(n^2)$ time and would in practice be slower than the Comba technique.
+
+However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown
+coefficients. This means that by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with
+Gaussian elimination. This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in
+effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.
+
+The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible. However, since
+$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place. The benefit of this technique stems from the
+fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively. As a result finding the $2n + 1$ relations required
+by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
+
+When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$. The $\zeta_0$ term
+is simply the product $W(0) = w_0 = a_0 \cdot b_0$. The $\zeta_1$ term is the product
+$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$. The third point $\zeta_{\infty}$ is less obvious but rather
+simple to explain. The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.
+The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$. Note that the
+points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
+
+If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points}
+$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ for small values of $q$. The term ``mirror point'' stems from the fact that
+$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$. For
+example, when $n = 2$ and $q = 1$ the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
+
+\begin{eqnarray}
+\zeta_{2} = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
+16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
+\end{eqnarray}
+
+Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts. For example, when $n = 2$ the
+polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$. This technique of polynomial evaluation is known as Horner's method.
+
+As a general rule, when the inputs are split into $n$ parts each there are $2n - 1$ multiplications. Each multiplication is of
+multiplicands that have $n$ times fewer digits than the inputs. The asymptotic running time of this algorithm is
+$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}). Figure~\ref{fig:exponent}
+summarizes the exponents for various values of $n$.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Split into $n$ Parts} & \textbf{Exponent} & \textbf{Notes}\\
+\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
+\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
+\hline $4$ & $1.403677461$ &\\
+\hline $5$ & $1.365212389$ &\\
+\hline $10$ & $1.278753601$ &\\
+\hline $100$ & $1.149426538$ &\\
+\hline $1000$ & $1.100270931$ &\\
+\hline $10000$ & $1.075252070$ &\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
+\label{fig:exponent}
+\end{figure}
+
+At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$. However, the overhead
+of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
+numbers.
+
+\subsubsection{Cutoff Point}
+The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach. However,
+the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved. This makes the
+polynomial basis approach more costly to use with small inputs.
+
+Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}). There exists a
+point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and
+when $m > y$ the Comba methods are slower than the polynomial basis algorithms.
+
+The exact location of $y$ depends on several key architectural elements of the computer platform in question.
+
+\begin{enumerate}
+\item The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc. For example
+on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$. The higher the ratio in favour of multiplication the lower
+the cutoff point $y$ will be.
+
+\item The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}). Generally speaking as the number of splits
+grows the complexity grows substantially. Ideally solving the system will only involve addition, subtraction and shifting of integers. This
+directly reflects on the ratio previously mentioned.
+
+\item To a lesser extent memory bandwidth and function call overheads. Provided the values are in the processor cache this is less of an
+influence over the cutoff point.
+
+\end{enumerate}
+
+A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met. For example, if the point
+is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster. Finding the cutoff points is fairly simple when
+a high resolution timer is available.
+
+\subsection{Karatsuba Multiplication}
+Karatsuba \cite{KARA} multiplication, when originally proposed in 1962, was among the first set of algorithms to break the $O(n^2)$ barrier for
+general purpose multiplication. Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with
+light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent. 
+
+\begin{equation}
+f(x) \cdot g(x) = acx^2 + \left ((ac + bd) - (a - b)(c - d) \right )x + bd
+\end{equation}
+
+Using the observation that $ac$ and $bd$ could be re-used, only three half sized multiplications would be required to produce the product. Applying
+this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique. It turns
+out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points
+$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$. Consider the resultant system of equations.
+
+\begin{center}
+\begin{tabular}{rcrcrcrc}
+$\zeta_{0}$ & $=$ & & & & & $w_0$ \\
+$-\zeta_{-1}$ & $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
+$\zeta_{\infty}$ & $=$ & $w_2$ & & & & \\
+\end{tabular}
+\end{center}
+
+By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for. The simplicity
+of this system of equations has made Karatsuba fairly popular. In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
+making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman. It is worth noting that the point
+$\zeta_1$ could be substituted for $-\zeta_{-1}$. In this case the first and third row are subtracted instead of added to the second row.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
+\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
+\hline \\
+1. Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
+2. If step 1 failed then return(\textit{MP\_MEM}). \\
+\\
+Split the input. e.g. $a = x1 \cdot \beta^B + x0$ \\
+3. $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
+4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5. $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
+6. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+7. $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
+\\
+Calculate the three products. \\
+8. $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
+9. $x1y1 \leftarrow x1 \cdot y1$ \\
+10. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+11. $x0 \leftarrow y1 - y0$ \\
+12. $t1 \leftarrow t1 \cdot x0$ \\
+\\
+Calculate the middle term. \\
+13. $x0 \leftarrow x0y0 + x1y1$ \\
+14. $t1 \leftarrow x0 - t1$ \\
+\\
+Calculate the final product. \\
+15. $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
+16. $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
+17. $t1 \leftarrow x0y0 + t1$ \\
+18. $c \leftarrow t1 + x1y1$ \\
+19. Clear all of the temporary variables. \\
+20. Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_mul.}
+This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm. It is loosely based on the description
+from Knuth \cite[pp. 294-295]{TAOCPV2}.
+
+\index{radix point}
+In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen. The radix point chosen must
+be used for both of the inputs meaning that it must be smaller than the smallest input. 
Step 3 chooses the radix point $B$ as half of the
+smallest input \textbf{used} count. After the radix point is chosen the inputs are split into lower and upper halves. Step 4 and 5
+compute the lower halves. Step 6 and 7 compute the upper halves.
+
+After the halves have been computed the three intermediate half-size products must be computed. Step 8 and 9 compute the trivial products
+$x0 \cdot y0$ and $x1 \cdot y1$. The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed. By using $x0$ instead
+of an additional temporary variable, the algorithm can avoid an additional memory allocation operation.
+
+The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
+
+EXAM,bn_mp_karatsuba_mul.c
+
+The new coding element in this routine, not seen in previous routines, is the usage of goto statements. The conventional
+wisdom is that goto statements should be avoided. This is generally true; however, when every single function call can fail, it makes sense
+to handle error recovery with a single piece of code. Lines @61,if@ to @75,if@ handle initializing all of the temporary variables
+required. Note how each of the if statements goes to a different label in case of failure. This allows the routine to correctly free only
+the temporaries that have been successfully allocated so far.
+
+The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large. This saves the
+additional reallocations that would otherwise have been necessary. Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
+number of digits for the next section of code.
+
+The first algebraic portion of the algorithm is to split the two inputs into their halves. However, instead of using mp\_mod\_2d and mp\_rshd
+to extract the halves, the respective code has been placed inline within the body of the function. To initialize the halves, the \textbf{used} and
+\textbf{sign} members are copied first. The first for loop on line @98,for@ copies the lower halves. Since they are both the same magnitude it
+is simpler to calculate both lower halves in a single loop. The for loops on lines @104,for@ and @109,for@ calculate the upper halves $x1$ and
+$y1$ respectively.
+
+By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
+
+When line @152,err@ is reached, the algorithm has completed successfully. The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
+the same code that handles errors can be used to clear the temporary variables and return.
+
+\subsection{Toom-Cook $3$-Way Multiplication}
+Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points are
+chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce. Here, the points $\zeta_{0}$,
+$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients
+of $W(x)$.
+
+With the five relations that Toom-Cook specifies, the following system of equations is formed. 
+
+\begin{center}
+\begin{tabular}{rcrcrcrcrcr}
+$\zeta_0$ & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$ \\
+$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$ \\
+$\zeta_1$ & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$ \\
+$\zeta_2$ & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$ \\
+$\zeta_{\infty}$ & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$ \\
+\end{tabular}
+\end{center}
+
+A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
+of two, two divisions by three and one multiplication by three. All of these $19$ sub-operations require less than quadratic time, meaning that
+the algorithm can be faster than a baseline multiplication. However, the greater complexity of this algorithm places the cutoff point
+(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul}. \\
+\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}. $c \leftarrow a \cdot b $ \\
+\hline \\
+Split $a$ and $b$ into three pieces. E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
+1. $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
+2. $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+3. $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+4. $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+5. $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+6. $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+7. $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+\\
+Find the five equations for $w_0, w_1, ..., w_4$. \\
+8. $w_0 \leftarrow a_0 \cdot b_0$ \\
+9. $w_4 \leftarrow a_2 \cdot b_2$ \\
+10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
+11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
+13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
+14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
+15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
+16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
+\\
+Continued on the next page.\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
+\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}. $c \leftarrow a \cdot b $ \\
+\hline \\
+Now solve the system of equations. \\
+18. $w_1 \leftarrow w_1 - w_4$, $w_3 \leftarrow w_3 - w_0$ \\
+19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
+20. 
$w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
+21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
+23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
+24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
+\\
+Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
+26. for $n$ from $1$ to $4$ do \\
+\hspace{3mm}26.1 $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
+27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
+28. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul (continued)}
+\end{figure}
+
+\textbf{Algorithm mp\_toom\_mul.}
+This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach. Compared to the Karatsuba multiplication, this
+algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead. In this
+description, several statements have been compounded to save space. The intention is that the statements are executed from left to right across
+any given step.
+
+The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively. From these smaller
+integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
+
+The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively. The relations $w_1, w_2$ and $w_3$ correspond
+to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{1}$ and $\zeta_{2}$ respectively. These are found using logical shifts to independently find
+$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
+
+After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients
+$w_1, w_2$ and $w_3$ to be isolated. Steps 18 through 25 perform the system reduction required as previously described. Each step of
+the reduction represents the comparable matrix operation that would be performed had the system been solved by hand. For example, step 18 indicates
+that row $4$ must be subtracted from row $1$ and simultaneously row $0$ subtracted from row $3$.
+
+Once the coefficients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known. By substituting $\beta^{k}$ for $x$, the integer
+result $a \cdot b$ is produced.
+
+EXAM,bn_mp_toom_mul.c
+
+\subsection{Signed Multiplication}
+Now that algorithms to handle multiplications of every useful dimension have been developed, a rather simple finishing touch is required. So far all
+of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul}. \\
+\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}. $c \leftarrow a \cdot b$ \\
+\hline \\
+1. If $a.sign = b.sign$ then \\
+\hspace{3mm}1.1 $sign = MP\_ZPOS$ \\
+2. else \\
+\hspace{3mm}2.1 $sign = MP\_ZNEG$ \\
+3. 
If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then \\
+\hspace{3mm}3.1 $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
+4. else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
+\hspace{3mm}4.1 $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
+5. else \\
+\hspace{3mm}5.1 $digs \leftarrow a.used + b.used + 1$ \\
+\hspace{3mm}5.2 If $digs < MP\_ARRAY$ and min$(a.used, b.used) \le \delta$ then \\
+\hspace{6mm}5.2.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs. \\
+\hspace{3mm}5.3 else \\
+\hspace{6mm}5.3.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs. \\
+6. $c.sign \leftarrow sign$ \\
+7. Return the result of the unsigned multiplication performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_mul.}
+This algorithm performs the signed multiplication of two inputs. It will make use of whichever of the four unsigned multiplication algorithms
+is appropriate for the size of the inputs. The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
+s\_mp\_mul\_digs will clear it.
+
+EXAM,bn_mp_mul.c
+
+The implementation is rather simplistic and is not particularly noteworthy. Line @22,?@ computes the sign of the result using the ``?''
+operator from the C programming language. Line @37,<<@ computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.
+
+\section{Squaring}
+\label{sec:basesquare}
+
+Squaring is a special case of multiplication where both multiplicands are equal. At first it may seem like there is no significant optimization
+available but in fact there is. Consider the multiplication of $576$ against $241$. In total there will be nine single precision multiplications
+performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot 6$, $2 \cdot 7$ and $2 \cdot 5$. Now consider
+the multiplication of $123$ against $123$. The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$,
+$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$. On closer inspection some of the products are equivalent. For example, $3 \cdot 2 = 2 \cdot 3$
+and $3 \cdot 1 = 1 \cdot 3$.
+
+For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
+required for multiplication. The following diagram gives an example of the operations required.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{ccccc|c}
+&&1&2&3&\\
+$\times$ &&1&2&3&\\
+\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
+ & $2 \cdot 1$ & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
+ $1 \cdot 1$ & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
+\end{tabular}
+\end{center}
+\caption{Squaring Optimization Diagram}
+\end{figure}
+
+MARK,SQUARE
+Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious. For the purposes of this discussion let $x$
+represent the number being squared. The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.
+
+The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product. Every non-square term of a column will
+appear twice, hence the name ``double product''. Every odd column is made up entirely of double products. 
In fact every column is made up of double +products and at most one square (\textit{see the exercise section}). + +The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, +occurs at column $2k + 1$. For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. +Column two of row one is a square and column three is the first unique column. + +\subsection{The Baseline Squaring Algorithm} +The baseline squaring algorithm is meant to be a catch-all squaring algorithm. It will handle any of the input sizes that the faster routines +will not handle. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_sqr}. \\ +\textbf{Input}. mp\_int $a$ \\ +\textbf{Output}. $b \leftarrow a^2$ \\ +\hline \\ +1. Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits. (\textit{mp\_init\_size}) \\ +2. If step 1 failed return(\textit{MP\_MEM}) \\ +3. $t.used \leftarrow 2 \cdot a.used + 1$ \\ +4. For $ix$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}Calculate the square. \\ +\hspace{3mm}4.1 $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\ +\hspace{3mm}4.2 $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}Calculate the double products after the square. \\ +\hspace{3mm}4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}4.4 For $iy$ from $ix + 1$ to $a.used - 1$ do \\ +\hspace{6mm}4.4.1 $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\ +\hspace{6mm}4.4.2 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}4.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}Set the last carry. \\ +\hspace{3mm}4.5 While $u > 0$ do \\ +\hspace{6mm}4.5.1 $iy \leftarrow iy + 1$ \\ +\hspace{6mm}4.5.2 $\hat r \leftarrow t_{ix + iy} + u$ \\ +\hspace{6mm}4.5.3 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}4.5.4 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +5. Clamp excess digits of $t$. (\textit{mp\_clamp}) \\ +6. Exchange $b$ and $t$. \\ +7. Clear $t$ (\textit{mp\_clear}) \\ +8. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm s\_mp\_sqr} +\end{figure} + +\textbf{Algorithm s\_mp\_sqr.} +This algorithm computes the square of an input using the three observations on squaring. It is based fairly faithfully on algorithm 14.16 of HAC +\cite[pp.596-597]{HAC}. Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring. This allows the +destination mp\_int to be the same as the source mp\_int. + +The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while +the inner loop computes the columns of the partial result. Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate +the carry and compute the double products. + +The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this +very algorithm. The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that +when it is multiplied by two, it can be properly represented by a mp\_word. 
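+
+To make the row and column structure of the baseline squaring concrete, consider the following minimal sketch of the same steps with $\beta = 10$. The function demo\_sqr and its use of machine integers are hypothetical stand-ins for the mp\_digit and mp\_word arithmetic of the real routine.
+
+\begin{verbatim}
+#include <stdio.h>
+
+/* Hypothetical sketch of the s_mp_sqr structure with beta = 10.
+   The real routine uses mp_digit/mp_word types and a full radix. */
+#define BETA 10UL
+
+static void demo_sqr(const unsigned long *a, int used, unsigned long *t)
+{
+   int ix, iy;
+   unsigned long r, u;
+
+   for (ix = 0; ix < 2 * used + 1; ix++) t[ix] = 0;
+   for (ix = 0; ix < used; ix++) {
+      /* steps 4.1 and 4.2: the square term of row ix */
+      r = t[2 * ix] + a[ix] * a[ix];
+      t[2 * ix] = r % BETA;
+      u = r / BETA;                     /* step 4.3: carry */
+      /* step 4.4: the double products of row ix */
+      for (iy = ix + 1; iy < used; iy++) {
+         r = 2 * a[ix] * a[iy] + t[ix + iy] + u;
+         t[ix + iy] = r % BETA;
+         u = r / BETA;
+      }
+      /* step 4.5: ripple the remaining carry upwards */
+      for (iy = ix + used; u > 0; iy++) {
+         r = t[iy] + u;
+         t[iy] = r % BETA;
+         u = r / BETA;
+      }
+   }
+}
+
+int main(void)
+{
+   unsigned long a[3] = { 3, 2, 1 };   /* 123, least significant first */
+   unsigned long t[7];
+   int i;
+   demo_sqr(a, 3, t);
+   for (i = 6; i >= 0; i--) printf("%lu", t[i]);   /* prints 0015129 */
+   printf("\n");
+   return 0;
+}
+\end{verbatim}
+
+Tracing the sketch with $123$ performs only the six unique digit products identified earlier and prints the digits of $123^2 = 15129$ (with two leading zeros).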
+
+Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial
+results calculated so far. This involves expensive carry propagation which will be eliminated in the next algorithm.
+
+EXAM,bn_s_mp_sqr.c
+
+Inside the outer loop (\textit{see line @32,for@}) the square term is calculated on line @35,r =@. Line @42,>>@ extracts the carry from the square
+term. Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines @45,tmpx@ and @48,tmpt@ respectively. The doubling is performed using two
+additions (\textit{see line @57,r + r@}) since it is usually faster than shifting, if not at least as fast.
+
+\subsection{Faster Squaring by the ``Comba'' Method}
+A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop. Squaring has an additional
+drawback that it must double the product inside the inner loop as well. As for multiplication, the Comba technique can be used to eliminate these
+performance hazards.
+
+The first obvious solution is to make an array of mp\_words which will hold all of the columns. This will indeed eliminate all of the carry
+propagation operations from the inner loop. However, the inner product must still be doubled $O(n^2)$ times. The solution stems from the simple fact
+that $2a + 2b + 2c = 2(a + b + c)$. That is, the sum of all of the double products is equal to double the sum of all the products. For example,
+$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.
+
+However, we cannot simply double all of the columns, since the squares appear only once per row. The most practical solution is to have two mp\_word
+arrays. One array will hold the squares and the other array will hold the double products. With both arrays the doubling and carry propagation can be
+moved to an $O(n)$ work level outside the $O(n^2)$ level.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
+\textbf{Input}. mp\_int $a$ \\
+\textbf{Output}. $b \leftarrow a^2$ \\
+\hline \\
+Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+1. If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits. (\textit{mp\_grow}). \\
+2. If step 1 failed return(\textit{MP\_MEM}). \\
+3. for $ix$ from $0$ to $2a.used + 1$ do \\
+\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\
+\hspace{3mm}3.2 $\hat {X}_{ix} \leftarrow 0$ \\
+4. for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}Compute the square.\\
+\hspace{3mm}4.1 $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
+\\
+\hspace{3mm}Compute the double products.\\
+\hspace{3mm}4.2 for $iy$ from $ix + 1$ to $a.used - 1$ do \\
+\hspace{6mm}4.2.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
+5. $oldused \leftarrow b.used$ \\
+6. $b.used \leftarrow 2a.used + 1$ \\
+\\
+Double the products and propagate the carries simultaneously. \\
+7. $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
+8. for $ix$ from $1$ to $2a.used$ do \\
+\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
+\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
+\hspace{3mm}8.3 $b_{ix-1} \leftarrow \hat W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
+9. $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
+10. 
if $2a.used + 1 < oldused$ then do \\
+\hspace{3mm}10.1 for $ix$ from $2a.used + 1$ to $oldused$ do \\
+\hspace{6mm}10.1.1 $b_{ix} \leftarrow 0$ \\
+11. Clamp excess digits from $b$. (\textit{mp\_clamp}) \\
+12. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_sqr.}
+This algorithm computes the square of an input using the Comba technique. It is designed to be a replacement for algorithm s\_mp\_sqr when
+the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.
+
+This routine requires two arrays of mp\_words to be placed on the stack. The first array $\hat W$ will hold the double products and the second
+array $\hat X$ will hold the squares. Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most
+processors to simply make it a full size array.
+
+The loop on step 3 will zero the two arrays to prepare them for the squaring step. Step 4.1 computes the square of each digit. Note how
+it simply assigns the value into the $\hat X$ array. The nested loop on step 4.2 accumulates the products to be doubled. This loop
+computes the sum of the products for each column. They are not doubled until later.
+
+After the squaring loop, the products stored in $\hat W$ must be doubled and the carries propagated forwards. It makes sense to do both
+operations at the same time. The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
+squares in place.
+
+EXAM,bn_fast_s_mp_sqr.c
+
+\subsection{Polynomial Basis Squaring}
+The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring. The minor exception
+is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$. Instead of performing $2n + 1$
+multiplications to find the $\zeta$ relations, squaring operations are performed.
+
+\subsection{Karatsuba Squaring}
+Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.
+Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial. The Karatsuba equation can be modified to square a
+number with the following equation.
+
+\begin{equation}
+h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
+\end{equation}
+
+Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$. As in
+Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of
+$O \left ( n^{lg(3)} \right )$.
+
+If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm
+instead? The answer to this arises from the cutoff point for squaring. As in multiplication there exists a cutoff point, at which the
+time required for a Comba based squaring and a Karatsuba based squaring meet. Due to the overhead inherent in the Karatsuba method, the cutoff
+point is fairly high. For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.
+
+Consider squaring a 200 digit number with this technique. It will be split into two 100 digit halves which are subsequently squared. 
+
+The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm. If Karatsuba multiplication
+were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
+\textbf{Input}. mp\_int $a$ \\
+\textbf{Output}. $b \leftarrow a^2$ \\
+\hline \\
+1. Initialize the following temporary mp\_ints: $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
+2. If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
+\\
+Split the input. e.g. $a = x1\beta^B + x0$ \\
+3. $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
+4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+5. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Calculate the three squares. \\
+6. $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
+7. $x1x1 \leftarrow x1^2$ \\
+8. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
+9. $t1 \leftarrow t1^2$ \\
+\\
+Compute the middle term. \\
+10. $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
+11. $t1 \leftarrow t2 - t1$ \\
+\\
+Compute final product. \\
+12. $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
+13. $x1x1 \leftarrow x1x1\beta^{2B}$ \\
+14. $t1 \leftarrow t1 + x0x0$ \\
+15. $b \leftarrow t1 + x1x1$ \\
+16. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_sqr.}
+This algorithm computes the square of an input $a$ using the Karatsuba technique. This algorithm is very similar to the Karatsuba based
+multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
+
+The radix point for squaring is simply placed exactly in the middle of the digits when the input has an even number of digits, otherwise it is
+placed just below the middle. Step 3, 4 and 5 compute the two halves required using $B$
+as the radix point. The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
+
+By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
+Now if $5n$ single precision additions and a squaring of $n$-digits are faster than multiplying two $n$-digit numbers and doubling then
+this method is faster. Assuming no further recursions occur, the difference can be estimated with the following inequality.
+
+Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
+machine clock cycles.}.
+
+\begin{equation}
+5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
+\end{equation}
+
+For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$. This implies that the following inequality should hold.
+\begin{center}
+\begin{tabular}{rcl}
+${5n \over 3} + 3n^2 + 3n$ & $<$ & ${n \over 3} + 6n^2$ \\
+${5 \over 3} + 3n + 3$ & $<$ & ${1 \over 3} + 6n$ \\
+${13 \over 9}$ & $<$ & $n$ \\
+\end{tabular}
+\end{center}
+
+This results in a cutoff point around $n = 2$. As a consequence it is actually faster to compute the middle term the ``long way'' on processors
+where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication. 
On
+the Intel P4 processor this ratio is 1:29 making this method even more beneficial. The only common exception is the ARMv4 processor which has a
+ratio of 1:7. } than simpler operations such as addition.
+
+EXAM,bn_mp_karatsuba_sqr.c
+
+This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul. It uses the same inline style to copy and
+shift the input into the two halves. The loop from line @54,{@ to line @70,}@ has been modified since only one input exists. The \textbf{used}
+count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin. At this point $x1$ and $x0$ are valid equivalents
+to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.
+
+By inlining the copy and shift operations the cutoff point for Karatsuba squaring can be lowered. On the Athlon the cutoff point
+is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}). On slower processors such as the Intel P4
+it is actually below the Comba limit (\textit{at 110 digits}).
+
+This routine uses the same error trap coding style as mp\_karatsuba\_mul. As each temporary variable is initialized, any error is redirected to
+the error trap higher up. If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and the temporary variables are
+cleared normally.
+
+\subsection{Toom-Cook Squaring}
+The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
+instead of multiplications to find the five relations. The reader is encouraged to read the description of the latter algorithm and try to
+derive their own Toom-Cook squaring algorithm.
+
+\subsection{High Level Squaring}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sqr}. \\
+\textbf{Input}. mp\_int $a$ \\
+\textbf{Output}. $b \leftarrow a^2$ \\
+\hline \\
+1. If $a.used \ge TOOM\_SQR\_CUTOFF$ then \\
+\hspace{3mm}1.1 $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
+2. else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
+\hspace{3mm}2.1 $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
+3. else \\
+\hspace{3mm}3.1 $digs \leftarrow 2 \cdot a.used + 1$ \\
+\hspace{3mm}3.2 If $digs < MP\_ARRAY$ and $a.used \le \delta$ then \\
+\hspace{6mm}3.2.1 $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr. \\
+\hspace{3mm}3.3 else \\
+\hspace{6mm}3.3.1 $b \leftarrow a^2$ using algorithm s\_mp\_sqr. \\
+4. $b.sign \leftarrow MP\_ZPOS$ \\
+5. Return the result of the unsigned squaring performed. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_sqr}
+\end{figure}
+
+\textbf{Algorithm mp\_sqr.}
+This algorithm computes the square of the input using one of four different algorithms. If the input is very large and has at least
+\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used. If
+neither of the polynomial basis algorithms can be used then either the Comba or baseline algorithm is used.
+
+EXAM,bn_mp_sqr.c
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
+ & that have different numbers of digits in Karatsuba multiplication. 
\\
+ & \\
+$\left [ 3 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
+ & of double products and at most one square is stated. Prove this statement. \\
+ & \\
+$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
+ & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
+ & \\
+$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
+ & \\
+$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
+ & \\
+$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
+ & required for equation $6.7$ to be true. \\
+ & \\
+\end{tabular}
+
+\chapter{Modular Reduction}
+MARK,REDUCTION
+\section{Basics of Modular Reduction}
+\index{modular residue}
+Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms,
+such as factoring. Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set. A number $a$ is said to be \textit{reduced}
+modulo another number $b$ by finding the remainder of the division $a/b$. Full integer division with remainder is a topic to be covered
+in~\ref{sec:division}.
+
+Modular reduction is equivalent to solving for $r$ in the following equation. $a = bq + r$ where $q = \lfloor a/b \rfloor$. The result
+$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$. In other vernacular $r$ is known as the
+``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
+other forms of residues.
+
+Modular reductions are normally used to create either finite groups, rings or fields. The most common usage for performance driven modular reductions
+is in modular exponentiation algorithms. That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible. This operation is used in the
+RSA and Diffie-Hellman public key algorithms, for example. Modular multiplication and squaring also appear as fundamental operations in
+Elliptic Curve cryptographic algorithms. As will be discussed in the subsequent chapter there exist fast algorithms for computing modular
+exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications. These algorithms will produce partial results in the
+range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms. They have also been used to create redundancy check
+algorithms known as CRCs, error correction codes such as Reed-Solomon, and to solve a variety of number theoretic problems.
+
+\section{The Barrett Reduction}
+The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
+division. Barrett's observation was that the residue $c$ of $a$ modulo $b$ is equal to
+
+\begin{equation}
+c = a - b \cdot \lfloor a/b \rfloor
+\end{equation}
+
+Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper
+targeted the DSP56K processor.} intuition would suggest replacing $a/b$ with a multiplication by the reciprocal. However,
+DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types. 
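+
+Before developing that optimization, a minimal single precision sketch may help show where the next few subsections are headed: division by a fixed $b$ is emulated with one multiplication by a precomputed reciprocal and a right shift, followed by a small fix up. The variable names are hypothetical and 64-bit machine integers stand in for mp\_ints.
+
+\begin{verbatim}
+#include <stdio.h>
+
+/* Emulate floor(a/b) for a fixed b with a precomputed
+   reciprocal mu = floor(2^q / b) where 2^q > b^2. */
+int main(void)
+{
+   unsigned long long b  = 1179677ULL;
+   int                q  = 41;                /* 2^41 > b^2 */
+   unsigned long long mu = (1ULL << q) / b;   /* computed once */
+   unsigned long long a  = 180388626447ULL;   /* 0 <= a < b^2 */
+
+   unsigned long long quot = (a * mu) >> q;   /* approximates floor(a/b) */
+   unsigned long long c    = a - b * quot;    /* candidate residue */
+   while (c >= b) {
+      c -= b;          /* the approximate quotient may be slightly low */
+   }
+   printf("%llu\n", c);                       /* prints 677346 */
+   return 0;
+}
+\end{verbatim}
+
+Even at machine word size the pattern is visible: one multiplication and a shift replace the division, at the cost of a reciprocal computed once per modulus.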
+
+Turning this idea into a practical multiple precision algorithm requires another common optimization.
+
+\subsection{Fixed Point Arithmetic}
+The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers. Fixed
+point arithmetic became very popular as it greatly sped up the ``3d-shooter'' genre of games in the mid 1990s when floating point units were
+fairly slow if not unavailable. The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into a $p$-bit
+integer part and a $q$-bit fraction part (\textit{where $p+q = k$}).
+
+In this system a $k$-bit integer $n$ would actually represent $n/2^q$. For example, with $q = 4$ the integer $n = 37$ would actually represent the
+value $2.3125$. To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by
+moving the implied decimal point back to where it should be. For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted
+to fixed point first by multiplying by $2^q$. Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the
+fixed point representation of $5$. The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$.
+
+This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication
+of two fixed point numbers. Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal. If $2^q$ is
+equivalent to one then $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic. Using this fact dividing an integer
+$a$ by another integer $b$ can be achieved with the following expression.
+
+\begin{equation}
+\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+The precision of the division is proportional to the value of $q$. If the divisor $b$ is used frequently, as is the case with
+modular exponentiation, pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift. Both operations
+are considerably faster than division on most processors.
+
+Consider dividing $19$ by $5$. The correct result is $\lfloor 19/5 \rfloor = 3$. With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
+leads to a product of $19$ which when divided by $2^q$ produces $2$. However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
+the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct. The value of $2^q$ must be close to or ideally
+larger than the dividend. In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach
+to work correctly. Plugging this form of division into the original equation the following modular residue equation arises.
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor
+\end{equation}
+
+Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol. Using the $\mu$
+variable also helps reinforce the idea that it is meant to be computed once and re-used. 
+
+\begin{equation}
+c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
+\end{equation}
+
+Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one. In the context of Barrett
+reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough
+precision.
+
+Let $n$ represent the number of digits in $b$. This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and
+another $n^2$ single precision multiplications to find the residue. In total $3n^2$ single precision multiplications are required to
+reduce the number.
+
+For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$. Consider reducing
+$a = 180388626447$ modulo $b$ using the above reduction equation. The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
+By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
+
+\subsection{Choosing a Radix Point}
+Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications. If that were the best
+that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$.
+See~\ref{sec:division} for further details.} might as well be used in its place. The key to optimizing the reduction is to reduce the precision of
+the initial multiplication that finds the quotient.
+
+Let $a$ represent the number of which the residue is sought. Let $b$ represent the modulus used to find the residue. Let $m$ represent
+the number of digits in $b$. For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if
+two $m$-digit numbers have been multiplied. Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer. Digits below the
+$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$. Another way to
+express this is by re-writing $a$ as two parts. If $a' \equiv a \mbox{ (mod }\beta^{m-1}\mbox{)}$ and $a'' = a - a'$ then
+${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$. Since $a'$ is bound to be less than $b$ the quotient
+is bound by $0 \le {a' \over b} < 1$.
+
+Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero. However, if the digits
+``might as well be zero'' they might as well not be there in the first place. Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
+with the irrelevant digits trimmed. Now the modular reduction is trimmed to the almost equivalent equation
+
+\begin{equation}
+c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
+\end{equation}
+
+Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the
+exponent on the divisor, when added to the amount $q_0$ was shifted by, equals $2m$. If the optimization had not been performed the divisor
+would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two. 
The original fixed point quotient can be off
+by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevant digits have been trimmed the quotient
+can be off by an additional value of one for a total of at most two. This implies that
+$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. By first subtracting $b$ times the quotient and then conditionally subtracting
+$b$ once or twice the residue is found.
+
+The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
+precision multiplications, ignoring the subtractions required. In total $2m^2 + m$ single precision multiplications are required to find the residue.
+This is considerably faster than the original attempt.
+
+For example, let $\beta = 10$ represent the radix of the digits. Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$
+represent the value of which the residue is desired. In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.
+With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$. The quotient is then
+$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$. By subtracting $9993b$ from $a$ the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$
+is found.
+
+\subsection{Trimming the Quotient}
+So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications. As
+it stands now the algorithm is already fairly fast compared to a full integer division algorithm. However, there is still room for
+optimization.
+
+After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
+half of the product. It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision
+multiplications. If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.
+In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.
+
+The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number. Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
+multiplications would be required. Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
+of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.
+
+\subsection{Trimming the Residue}
+After the quotient has been calculated it is used to reduce the input. As previously noted the algorithm is not exact and it can be off by a small
+multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. If $b$ is $m$ digits then the
+result of the reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
+implicitly zero.
+
+The next optimization arises from this very fact. Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
+$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed. 
Similarly the value of $a$ can
+be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifies the subtraction as well. A multiplication that produces
+only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.
+
+With both optimizations in place the algorithm is the algorithm Barrett proposed. It requires $m^2 + 2m - 1$ single precision multiplications which
+is considerably faster than the straightforward $3m^2$ method.
+
+\subsection{The Barrett Algorithm}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce}. \\
+\textbf{Input}. mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\
+\textbf{Output}. $a \mbox{ (mod }b\mbox{)}$ \\
+\hline \\
+Let $m$ represent the number of digits in $b$. \\
+1. Make a copy of $a$ and store it in $q$. (\textit{mp\_init\_copy}) \\
+2. $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
+\\
+Produce the quotient. \\
+3. $q \leftarrow q \cdot \mu$ (\textit{note: only produce digits at or above $m-1$}) \\
+4. $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
+\\
+Subtract the multiple of modulus from the input. \\
+5. $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+6. $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
+7. $a \leftarrow a - q$ (\textit{mp\_sub}) \\
+\\
+Add $\beta^{m+1}$ if a carry occurred. \\
+8. If $a < 0$ then (\textit{mp\_cmp\_d}) \\
+\hspace{3mm}8.1 $q \leftarrow 1$ (\textit{mp\_set}) \\
+\hspace{3mm}8.2 $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
+\hspace{3mm}8.3 $a \leftarrow a + q$ \\
+\\
+Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
+9. While $a \ge b$ do (\textit{mp\_cmp}) \\
+\hspace{3mm}9.1 $a \leftarrow a - b$ \\
+10. Clear $q$. \\
+11. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce.}
+This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm. It is loosely based on algorithm 14.42 of HAC
+\cite[pp. 602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}. The algorithm has several restrictions and assumptions which must
+be adhered to for the algorithm to work.
+
+First the modulus $b$ is assumed to be positive and greater than one. If the modulus were less than or equal to one then subtracting
+a multiple of it would either accomplish nothing or actually enlarge the input. The input $a$ must be in the range $0 \le a < b^2$ in order
+for the quotient to have enough precision. If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem.
+Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish. The value of $\mu$ is passed as an argument to this
+algorithm and is assumed to be calculated and stored before the algorithm is used.
+
+Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position. An algorithm called
+$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task. The algorithm is based on $s\_mp\_mul\_digs$ except that
+instead of stopping at a given level of precision it starts at a given level of precision. 
This optimization can only be used if the number
+of digits in $b$ is very much smaller than $\beta$.
+
+While it is known that
+$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied
+``borrow'' from the higher digits might leave a negative result. After the multiple of the modulus has been subtracted from $a$ the residue must be
+fixed up in case it is negative. The value $\beta^{m+1}$ must be added to the residue to make it positive again.
+
+The while loop at step 9 will subtract $b$ until the residue is less than $b$. If the algorithm is performed correctly this step is
+performed at most twice, and on average once. However, if $a \ge b^2$ then it will iterate substantially more times than it should.
+
+EXAM,bn_mp_reduce.c
+
+The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up. This essentially halves
+the number of single precision multiplications required. However, the optimization is only safe if $\beta$ is much larger than the number of digits
+in the modulus. In the source code this is evaluated on lines @36,if@ to @44,}@ where algorithm s\_mp\_mul\_high\_digs is used when it is
+safe to do so.
+
+\subsection{The Barrett Setup Algorithm}
+In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance. Ideally this value should be computed once and stored for
+future use so that the Barrett algorithm can be used without delay.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_setup}. \\
+\textbf{Input}. mp\_int $a$ ($a > 1$) \\
+\textbf{Output}. $\mu \leftarrow \lfloor \beta^{2m}/a \rfloor$ \\
+\hline \\
+1. $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot m}$ (\textit{mp\_2expt}) \\
+2. $\mu \leftarrow \lfloor \mu / a \rfloor$ (\textit{mp\_div}) \\
+3. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_setup.}
+This algorithm computes the reciprocal $\mu$ required for Barrett reduction. First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot m}$ which
+is equivalent and much faster. The final value is computed by taking the integer quotient of $\lfloor \mu / a \rfloor$.
+
+EXAM,bn_mp_reduce_setup.c
+
+This simple routine calculates the reciprocal $\mu$ required by Barrett reduction. Note the extended usage of algorithm mp\_div where the variable
+which would receive the remainder is passed as NULL. As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the
+remainder to be passed as NULL meaning to ignore the value.
+
+\section{The Montgomery Reduction}
+Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting
+form of reduction in common use. It computes a modular residue which is not actually equal to the residue of the input but instead equal to the
+residue multiplied by a constant. However, as perplexing as this may sound the algorithm is relatively simple and very efficient.
+
+Throughout this entire section the variable $n$ will represent the modulus used to form the residue. As will be discussed shortly the value of
+$n$ must be odd. The variable $x$ will represent the quantity of which the residue is sought. Similar to the Barrett algorithm the input
+is restricted to $0 \le x < n^2$. 
To begin the description some simple number theory facts must be established. + +\textbf{Fact 1.} Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$. Another way +to explain this is that $n$ is (\textit{or multiples of $n$ are}) congruent to zero modulo $n$. Adding zero will not change the value of the residue. + +\textbf{Fact 2.} If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$. Actually +this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to +multiplication by $k^{-1}$ modulo $n$. + +From these two simple facts the following simple algorithm can be derived. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{Montgomery Reduction}. \\ +\textbf{Input}. Integer $x$, $n$ and $k$ \\ +\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\ +\hline \\ +1. for $t$ from $1$ to $k$ do \\ +\hspace{3mm}1.1 If $x$ is odd then \\ +\hspace{6mm}1.1.1 $x \leftarrow x + n$ \\ +\hspace{3mm}1.2 $x \leftarrow x/2$ \\ +2. Return $x$. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm Montgomery Reduction} +\end{figure} + +The algorithm reduces the input one bit at a time using the two congruencies stated previously. Inside the loop $n$, which is odd, is +added to $x$ if $x$ is odd. This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two. Since +$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$. Let $r$ represent the +final result of the Montgomery algorithm. If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to +$0 \le r < \lfloor x/2^k \rfloor + n$. As a result at most a single subtraction is required to get the residue desired. + +\begin{figure}[here] +\begin{small} +\begin{center} +\begin{tabular}{|c|l|} +\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\ +\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\ +\hline $2$ & $x/2 = 1453$ \\ +\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\ +\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\ +\hline $5$ & $x/2 = 278$ \\ +\hline $6$ & $x/2 = 139$ \\ +\hline $7$ & $x + n = 396$, $x/2 = 198$ \\ +\hline $8$ & $x/2 = 99$ \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Example of Montgomery Reduction (I)} +\label{fig:MONT1} +\end{figure} + +Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$. The result of the algorithm $r = 99$ is +congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$. When $r$ is multiplied by $2^8$ modulo $257$ the correct residue +$r \equiv 158$ is produced. + +Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$. The current algorithm requires $2k^2$ single precision shifts +and $k^2$ single precision additions. At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful. +Fortunately there exists an alternative representation of the algorithm. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\ +\textbf{Input}. Integer $x$, $n$ and $k$ \\ +\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\ +\hline \\ +1. 
for $t$ from $0$ to $k - 1$ do \\ +\hspace{3mm}1.1 If the $t$'th bit of $x$ is one then \\ +\hspace{6mm}1.1.1 $x \leftarrow x + 2^tn$ \\ +2. Return $x/2^k$. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm Montgomery Reduction (modified I)} +\end{figure} + +This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2. The number of single +precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement. + +\begin{figure}[here] +\begin{small} +\begin{center} +\begin{tabular}{|c|l|r|} +\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\ +\hline -- & $5555$ & $1010110110011$ \\ +\hline $1$ & $x + 2^{0}n = 5812$ & $1011010110100$ \\ +\hline $2$ & $5812$ & $1011010110100$ \\ +\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\ +\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\ +\hline $5$ & $8896$ & $10001011000000$ \\ +\hline $6$ & $8896$ & $10001011000000$ \\ +\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\ +\hline $8$ & $25344$ & $110001100000000$ \\ +\hline -- & $x/2^k = 99$ & \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Example of Montgomery Reduction (II)} +\label{fig:MONT2} +\end{figure} + +Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. +With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the +loop. Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed. In those iterations the $t$'th bit of $x$ is +zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero. + +\subsection{Digit Based Montgomery Reduction} +Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis. Consider the +previous algorithm re-written to compute the Montgomery reduction in this new fashion. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\ +\textbf{Input}. Integer $x$, $n$ and $k$ \\ +\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ +\hline \\ +1. for $t$ from $0$ to $k - 1$ do \\ +\hspace{3mm}1.1 $x \leftarrow x + \mu n \beta^t$ \\ +2. Return $x/\beta^k$. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm Montgomery Reduction (modified II)} +\end{figure} + +The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue. If the first digit of +the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit. This +problem breaks down to solving the following congruency. + +\begin{center} +\begin{tabular}{rcl} +$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\ +$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\ +$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\ +\end{tabular} +\end{center} + +In each iteration of the loop on step 1 a new value of $\mu$ must be calculated. The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used +extensively in this algorithm and should be precomputed. Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$. + +For example, let $\beta = 10$ represent the radix. 
Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$. Let $x = 33$ +represent the value to reduce. + +\newpage\begin{figure} +\begin{center} +\begin{tabular}{|c|c|c|} +\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\ +\hline -- & $33$ & --\\ +\hline $0$ & $33 + \mu n = 50$ & $1$ \\ +\hline $1$ & $50 + \mu n \beta = 900$ & $5$ \\ +\hline +\end{tabular} +\end{center} +\caption{Example of Montgomery Reduction} +\end{figure} + +The final result $900$ is then divided by $\beta^k$ to produce the final result $9$. The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ +which implies the result is not the modular residue of $x$ modulo $n$. However, recall that the residue is actually multiplied by $\beta^{-k}$ in +the algorithm. To get the true residue the value must be multiplied by $\beta^k$. In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and +the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$. + +\subsection{Baseline Montgomery Reduction} +The baseline Montgomery reduction algorithm will produce the residue for any size input. It is designed to be a catch-all algororithm for +Montgomery reductions. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\ +\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\ +\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\ +\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ +\hline \\ +1. $digs \leftarrow 2n.used + 1$ \\ +2. If $digs < MP\_ARRAY$ and $m.used < \delta$ then \\ +\hspace{3mm}2.1 Use algorithm fast\_mp\_montgomery\_reduce instead. \\ +\\ +Setup $x$ for the reduction. \\ +3. If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\ +4. $x.used \leftarrow digs$ \\ +\\ +Eliminate the lower $k$ digits. \\ +5. For $ix$ from $0$ to $k - 1$ do \\ +\hspace{3mm}5.1 $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}5.2 $u \leftarrow 0$ \\ +\hspace{3mm}5.3 For $iy$ from $0$ to $k - 1$ do \\ +\hspace{6mm}5.3.1 $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\ +\hspace{6mm}5.3.2 $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}5.3.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}5.4 While $u > 0$ do \\ +\hspace{6mm}5.4.1 $iy \leftarrow iy + 1$ \\ +\hspace{6mm}5.4.2 $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\ +\hspace{6mm}5.4.3 $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\ +\hspace{6mm}5.4.4 $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\ +\\ +Divide by $\beta^k$ and fix up as required. \\ +6. $x \leftarrow \lfloor x / \beta^k \rfloor$ \\ +7. If $x \ge n$ then \\ +\hspace{3mm}7.1 $x \leftarrow x - n$ \\ +8. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_montgomery\_reduce} +\end{figure} + +\textbf{Algorithm mp\_montgomery\_reduce.} +This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm. The algorithm is loosely based +on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop. The +restrictions on this algorithm are fairly easy to adapt to. First $0 \le x < n^2$ bounds the input to numbers in the same range as +for the Barrett algorithm. Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$. 
$\rho$ must be calculated in +advance of this algorithm. Finally the variable $k$ is fixed and a pseudonym for $n.used$. + +Step 2 decides whether a faster Montgomery algorithm can be used. It is based on the Comba technique meaning that there are limits on +the size of the input. This algorithm is discussed in ~COMBARED~. + +Step 5 is the main reduction loop of the algorithm. The value of $\mu$ is calculated once per iteration in the outer loop. The inner loop +calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits. Both the addition and +multiplication are performed in the same loop to save time and memory. Step 5.4 will handle any additional carries that escape the inner loop. + +Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications +in the inner loop. In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision +multiplications. + +EXAM,bn_mp_montgomery_reduce.c + +This is the baseline implementation of the Montgomery reduction algorithm. Lines @30,digs@ to @35,}@ determine if the Comba based +routine can be used instead. Line @47,mu@ computes the value of $\mu$ for that particular iteration of the outer loop. + +The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop. The alias $tmpx$ refers to the $ix$'th digit of $x$ and +the alias $tmpn$ refers to the modulus $n$. + +\subsection{Faster ``Comba'' Montgomery Reduction} +MARK,COMBARED + +The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial +nature of the inner loop. The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba +technique. The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates +a $k \times 1$ product $k$ times. + +The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$. This means the +carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit. The solution as it turns out is very simple. +Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry. + +With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases +the speed of the algorithm. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\ +\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\ +\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\ +\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ +\hline \\ +Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\ +1. if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\ +Copy the digits of $x$ into the array $\hat W$ \\ +2. For $ix$ from $0$ to $x.used - 1$ do \\ +\hspace{3mm}2.1 $\hat W_{ix} \leftarrow x_{ix}$ \\ +3. For $ix$ from $x.used$ to $2n.used - 1$ do \\ +\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\ +Elimiate the lower $k$ digits. \\ +4. 
for $ix$ from $0$ to $n.used - 1$ do \\ +\hspace{3mm}4.1 $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}4.2 For $iy$ from $0$ to $n.used - 1$ do \\ +\hspace{6mm}4.2.1 $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\ +\hspace{3mm}4.3 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\ +Propagate carries upwards. \\ +5. for $ix$ from $n.used$ to $2n.used + 1$ do \\ +\hspace{3mm}5.1 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\ +Shift right and reduce modulo $\beta$ simultaneously. \\ +6. for $ix$ from $0$ to $n.used + 1$ do \\ +\hspace{3mm}6.1 $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\ +Zero excess digits and fixup $x$. \\ +7. if $x.used > n.used + 1$ then do \\ +\hspace{3mm}7.1 for $ix$ from $n.used + 1$ to $x.used - 1$ do \\ +\hspace{6mm}7.1.1 $x_{ix} \leftarrow 0$ \\ +8. $x.used \leftarrow n.used + 1$ \\ +9. Clamp excessive digits of $x$. \\ +10. If $x \ge n$ then \\ +\hspace{3mm}10.1 $x \leftarrow x - n$ \\ +11. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm fast\_mp\_montgomery\_reduce} +\end{figure} + +\textbf{Algorithm fast\_mp\_montgomery\_reduce.} +This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique. It is on most computer platforms significantly +faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}). The algorithm has the same restrictions +on the input as the baseline reduction algorithm. An additional two restrictions are imposed on this algorithm. The number of digits $k$ in the +the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $n < \delta$. When $\beta = 2^{28}$ this algorithm can be used to reduce modulo +a modulus of at most $3,556$ bits in length. + +As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product. It is initially filled with the +contents of $x$ with the excess digits zeroed. The reduction loop is very similar the to the baseline loop at heart. The multiplication on step +4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$. Some multipliers such +as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce. By performing +a single precision multiplication instead half the amount of time is spent. + +Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work. That is what step +4.3 will do. In effect over the $n.used$ iterations of the outer loop the $n.used$'th lower columns all have the their carries propagated forwards. Note +how the upper bits of those same words are not reduced modulo $\beta$. This is because those values will be discarded shortly and there is no +point. + +Step 5 will propagate the remainder of the carries upwards. On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are +stored in the destination $x$. + +EXAM,bn_fast_mp_montgomery_reduce.c + +The $\hat W$ array is first filled with digits of $x$ on line @49,for@ then the rest of the digits are zeroed on line @54,for@. Both loops share +the same alias variables to make the code easier to read. + +The value of $\mu$ is calculated in an interesting fashion. 
First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit. This +forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision. Line @101,>>@ fixes the carry +for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$. + +The for loop on line @113,for@ propagates the rest of the carries upwards through the columns. The for loop on line @126,for@ reduces the columns +modulo $\beta$ and shifts them $k$ places at the same time. The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th +digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$. + +\subsection{Montgomery Setup} +To calculate the variable $\rho$ a relatively simple algorithm will be required. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_montgomery\_setup}. \\ +\textbf{Input}. mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\ +\textbf{Output}. $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\ +\hline \\ +1. $b \leftarrow n_0$ \\ +2. If $b$ is even return(\textit{MP\_VAL}) \\ +3. $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\ +4. for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\ +\hspace{3mm}4.1 $x \leftarrow x \cdot (2 - bx)$ \\ +5. $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\ +6. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_montgomery\_setup} +\end{figure} + +\textbf{Algorithm mp\_montgomery\_setup.} +This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms. It uses a very interesting trick +to calculate $1/n_0$ when $\beta$ is a power of two. + +EXAM,bn_mp_montgomery_setup.c + +This source code computes the value of $\rho$ required to perform Montgomery reduction. It has been modified to avoid performing excess +multiplications when $\beta$ is not the default 28-bits. + +\section{The Diminished Radix Algorithm} +The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett +or Montgomery methods for certain forms of moduli. The technique is based on the following simple congruence. + +\begin{equation} +(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)} +\end{equation} + +This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive. It used the fact that if $n = 2^{31}$ and $k=1$ that +then a x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift. The proof +of the above equation is very simple. First write $x$ in the product form. + +\begin{equation} +x = qn + r +\end{equation} + +Now reduce both sides modulo $(n - k)$. + +\begin{equation} +x \equiv qk + r \mbox{ (mod }(n-k)\mbox{)} +\end{equation} + +The variable $n$ reduces modulo $n - k$ to $k$. By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ +into the equation the original congruence is reproduced, thus concluding the proof. The following algorithm is based on this observation. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{Diminished Radix Reduction}. \\ +\textbf{Input}. Integer $x$, $n$, $k$ \\ +\textbf{Output}. $x \mbox{ mod } (n - k)$ \\ +\hline \\ +1. $q \leftarrow \lfloor x / n \rfloor$ \\ +2. $q \leftarrow k \cdot q$ \\ +3. $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\ +4. 
$x \leftarrow x + q$ \\ +5. If $x \ge (n - k)$ then \\ +\hspace{3mm}5.1 $x \leftarrow x - (n - k)$ \\ +\hspace{3mm}5.2 Goto step 1. \\ +6. Return $x$ \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm Diminished Radix Reduction} +\label{fig:DR} +\end{figure} + +This algorithm will reduce $x$ modulo $n - k$ and return the residue. If $0 \le x < (n - k)^2$ then the algorithm will loop almost always +once or twice and occasionally three times. For simplicity sake the value of $x$ is bounded by the following simple polynomial. + +\begin{equation} +0 \le x < n^2 + k^2 - 2nk +\end{equation} + +The true bound is $0 \le x < (n - k - 1)^2$ but this has quite a few more terms. The value of $q$ after step 1 is bounded by the following. + +\begin{equation} +q < n - 2k - k^2/n +\end{equation} + +Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero. The value of $x$ after step 3 is bounded trivially as +$0 \le x < n$. By step four the sum $x + q$ is bounded by + +\begin{equation} +0 \le q + x < (k + 1)n - 2k^2 - 1 +\end{equation} + +With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3. After the second pass it is highly unlike that the +sum in step 4 will exceed $n - k$. In practice fewer than three passes of the algorithm are required to reduce virtually every input in the +range $0 \le x < (n - k - 1)^2$. + +\begin{figure} +\begin{small} +\begin{center} +\begin{tabular}{|l|} +\hline +$x = 123456789, n = 256, k = 3$ \\ +\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\ +$q \leftarrow q*k = 1446759$ \\ +$x \leftarrow x \mbox{ mod } n = 21$ \\ +$x \leftarrow x + q = 1446780$ \\ +$x \leftarrow x - (n - k) = 1446527$ \\ +\hline +$q \leftarrow \lfloor x/n \rfloor = 5650$ \\ +$q \leftarrow q*k = 16950$ \\ +$x \leftarrow x \mbox{ mod } n = 127$ \\ +$x \leftarrow x + q = 17077$ \\ +$x \leftarrow x - (n - k) = 16824$ \\ +\hline +$q \leftarrow \lfloor x/n \rfloor = 65$ \\ +$q \leftarrow q*k = 195$ \\ +$x \leftarrow x \mbox{ mod } n = 184$ \\ +$x \leftarrow x + q = 379$ \\ +$x \leftarrow x - (n - k) = 126$ \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Example Diminished Radix Reduction} +\label{fig:EXDR} +\end{figure} + +Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$. Note that even while $x$ +is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast. In this case only +three passes were required to find the residue $x \equiv 126$. + + +\subsection{Choice of Moduli} +On the surface this algorithm looks like a very expensive algorithm. It requires a couple of subtractions followed by multiplication and other +modular reductions. The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen. + +Division in general is a very expensive operation to perform. The one exception is when the division is by a power of the radix of representation used. +Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right. Similarly division +by two (\textit{or powers of two}) is very simple for binary computers to perform. It would therefore seem logical to choose $n$ of the form $2^p$ +which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits. 
+ +However, there is one operation related to division of power of twos that is even faster than this. If $n = \beta^p$ then the division may be +performed by moving whole digits to the right $p$ places. In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$. +Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$. + +Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted +modulus'' will refer to a modulus of the form $2^p - k$. The word ``restricted'' in this case refers to the fact that it is based on the +$2^p$ logic except $p$ must be a multiple of $lg(\beta)$. + +\subsection{Choice of $k$} +Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$ +in step 2 is the most expensive operation. Fortunately the choice of $k$ is not terribly limited. For all intents and purposes it might +as well be a single digit. The smaller the value of $k$ is the faster the algorithm will be. + +\subsection{Restricted Diminished Radix Reduction} +The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$. This algorithm can reduce +an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}. The implementation +of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition +of $x$ and $q$. The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular +exponentiations are performed. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_dr\_reduce}. \\ +\textbf{Input}. mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\ +\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\ +\textbf{Output}. $x \mbox{ mod } n$ \\ +\hline \\ +1. $m \leftarrow n.used$ \\ +2. If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\ +3. $\mu \leftarrow 0$ \\ +4. for $i$ from $0$ to $m - 1$ do \\ +\hspace{3mm}4.1 $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\ +\hspace{3mm}4.2 $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}4.3 $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +5. $x_{m} \leftarrow \mu$ \\ +6. for $i$ from $m + 1$ to $x.used - 1$ do \\ +\hspace{3mm}6.1 $x_{i} \leftarrow 0$ \\ +7. Clamp excess digits of $x$. \\ +8. If $x \ge n$ then \\ +\hspace{3mm}8.1 $x \leftarrow x - n$ \\ +\hspace{3mm}8.2 Goto step 3. \\ +9. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_dr\_reduce} +\end{figure} + +\textbf{Algorithm mp\_dr\_reduce.} +This algorithm will perform the Dimished Radix reduction of $x$ modulo $n$. It has similar restrictions to that of the Barrett reduction +with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$. + +This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization. The division by $\beta^m$, multiplication by $k$ +and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4. 
The division by $\beta^m$ is emulated by accessing +the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position. After the loop the $m$'th +digit is set to the carry and the upper digits are zeroed. Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happend to +$x$ before the addition of the multiple of the upper half. + +At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required. First $n$ is subtracted from $x$ and then the algorithm resumes +at step 3. + +EXAM,bn_mp_dr_reduce.c + +The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$. The label on line @49,top:@ is where +the algorithm will resume if further reduction passes are required. In theory it could be placed at the top of the function however, the size of +the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time. + +The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits. By reading digits from $x$ offset by $m$ digits +a division by $\beta^m$ can be simulated virtually for free. The loop on line @61,for@ performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11}) +in this algorithm. + +By line @68,mu@ the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed. Similarly by line @71,for@ the +same pointer will point to the $m+1$'th digit where the zeroes will be placed. + +Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required. +With the same logic at line @82,sub@ the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used +as well. Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code +does not need to be checked. + +\subsubsection{Setup} +To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required. This algorithm is not really complicated but provided for +completeness. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_dr\_setup}. \\ +\textbf{Input}. mp\_int $n$ \\ +\textbf{Output}. $k = \beta - n_0$ \\ +\hline \\ +1. $k \leftarrow \beta - n_0$ \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_dr\_setup} +\end{figure} + +EXAM,bn_mp_dr_setup.c + +\subsubsection{Modulus Detection} +Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus. An integer is said to be +of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\ +\textbf{Input}. mp\_int $n$ \\ +\textbf{Output}. $1$ if $n$ is in D.R form, $0$ otherwise \\ +\hline +1. If $n.used < 2$ then return($0$). \\ +2. for $ix$ from $1$ to $n.used - 1$ do \\ +\hspace{3mm}2.1 If $n_{ix} \ne \beta - 1$ return($0$). \\ +3. Return($1$). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_dr\_is\_modulus} +\end{figure} + +\textbf{Algorithm mp\_dr\_is\_modulus.} +This algorithm determines if a value is in Diminished Radix form. 
Step 1 rejects obvious cases where fewer than two digits are +in the mp\_int. Step 2 tests all but the first digit to see if they are equal to $\beta - 1$. If the algorithm manages to get to +step 3 then $n$ must be of Diminished Radix form. + +EXAM,bn_mp_dr_is_modulus.c + +\subsection{Unrestricted Diminished Radix Reduction} +The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$. This algorithm +is a straightforward adaptation of algorithm~\ref{fig:DR}. + +In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead. However, this new +algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_reduce\_2k}. \\ +\textbf{Input}. mp\_int $a$ and $n$. mp\_digit $k$ \\ +\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\ +\textbf{Output}. $a \mbox{ (mod }n\mbox{)}$ \\ +\hline +1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\ +2. While $a \ge n$ do \\ +\hspace{3mm}2.1 $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\ +\hspace{3mm}2.2 $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\ +\hspace{3mm}2.3 $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\ +\hspace{3mm}2.4 $a \leftarrow a - q$ (\textit{s\_mp\_sub}) \\ +\hspace{3mm}2.5 If $a \ge n$ then do \\ +\hspace{6mm}2.5.1 $a \leftarrow a - n$ \\ +3. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_reduce\_2k} +\end{figure} + +\textbf{Algorithm mp\_reduce\_2k.} +This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$. Division by $2^p$ is emulated with a right +shift which makes the algorithm fairly inexpensive to use. + +EXAM,bn_mp_reduce_2k.c + +The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$. The call to mp\_div\_2d +on line @31,mp_div_2d@ calculates both the quotient $q$ and the remainder $a$ required. By doing both in a single function call the code size +is kept fairly small. The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without +any multiplications. + +The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are +positive. By using the unsigned versions the overhead is kept to a minimum. + +\subsubsection{Unrestricted Setup} +To setup this reduction algorithm the value of $k = 2^p - n$ is required. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\ +\textbf{Input}. mp\_int $n$ \\ +\textbf{Output}. $k = 2^p - n$ \\ +\hline +1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\ +2. $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\ +3. $x \leftarrow x - n$ (\textit{mp\_sub}) \\ +4. $k \leftarrow x_0$ \\ +5. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_reduce\_2k\_setup} +\end{figure} + +\textbf{Algorithm mp\_reduce\_2k\_setup.} +This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k. By making a temporary variable $x$ equal to $2^p$ a subtraction +is sufficient to solve for $k$. 
Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$. + +EXAM,bn_mp_reduce_2k_setup.c + +\subsubsection{Unrestricted Detection} +An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true. + +\begin{enumerate} +\item The number has only one digit. +\item The number has more than one digit and every bit from the $\beta$'th to the most significant is one. +\end{enumerate} + +If either condition is true than there is a power of two $2^p$ such that $0 < 2^p - n < \beta$. If the input is only +one digit than it will always be of the correct form. Otherwise all of the bits above the first digit must be one. This arises from the fact +that there will be value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most +significant bit. The resulting sum will be a power of two. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\ +\textbf{Input}. mp\_int $n$ \\ +\textbf{Output}. $1$ if of proper form, $0$ otherwise \\ +\hline +1. If $n.used = 0$ then return($0$). \\ +2. If $n.used = 1$ then return($1$). \\ +3. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\ +4. for $x$ from $lg(\beta)$ to $p$ do \\ +\hspace{3mm}4.1 If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$ of $n$ is zero then return($0$). \\ +5. Return($1$). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_reduce\_is\_2k} +\end{figure} + +\textbf{Algorithm mp\_reduce\_is\_2k.} +This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly. + +EXAM,bn_mp_reduce_is_2k.c + + + +\section{Algorithm Comparison} +So far three very different algorithms for modular reduction have been discussed. Each of the algorithms have their own strengths and weaknesses +that makes having such a selection very useful. The following table sumarizes the three algorithms along with comparisons of work factors. Since +all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table. + +\begin{center} +\begin{small} +\begin{tabular}{|c|c|c|c|c|c|} +\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\ +\hline Barrett & $m^2 + 2m - 1$ & None & $79$ & $1087$ & $4223$ \\ +\hline Montgomery & $m^2 + m$ & $n$ must be odd & $72$ & $1056$ & $4160$ \\ +\hline D.R. & $2m$ & $n = \beta^m - k$ & $16$ & $64$ & $128$ \\ +\hline +\end{tabular} +\end{small} +\end{center} + +In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete. However, in practice since Montgomery +reduction can be written as a single function with the Comba technique it is much faster. Barrett reduction suffers from the overhead of +calling the half precision multipliers, addition and division by $\beta$ algorithms. + +For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice. The one set of algorithms where Diminished Radix reduction truly +shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}. In these algorithms +primes of the form $\beta^m - k$ can be found and shared amongst users. These primes will allow the Diminished Radix algorithm to be used in +modular exponentiation to greatly speed up the operation. 
+ + + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\ + & calculates the correct value of $\rho$. \\ + & \\ +$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly. \\ + & \\ +$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\ + & (\textit{figure~\ref{fig:DR}}) terminates. Also prove the probability that it will \\ + & terminate within $1 \le k \le 10$ iterations. \\ + & \\ +\end{tabular} + + +\chapter{Exponentiation} +Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$. A variant of exponentiation, computed +in a finite field or ring, is called modular exponentiation. This latter style of operation is typically used in public key +cryptosystems such as RSA and Diffie-Hellman. The ability to quickly compute modular exponentiations is of great benefit to any +such cryptosystem and many methods have been sought to speed it up. + +\section{Exponentiation Basics} +A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired. However, as $b$ grows in size +the number of multiplications becomes prohibitive. Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature +with a $1024$-bit key. Such a calculation could never be completed as it would take simply far too long. + +Fortunately there is a very simple algorithm based on the laws of exponents. Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which +are two trivial relationships between the base and the exponent. Let $b_i$ represent the $i$'th bit of $b$ starting from the least +significant bit. If $b$ is a $k$-bit integer than the following equation is true. + +\begin{equation} +a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i} +\end{equation} + +By taking the base $a$ logarithm of both sides of the equation the following equation is the result. + +\begin{equation} +b = \sum_{i=0}^{k-1}2^i \cdot b_i +\end{equation} + +The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to +$a^{2^{i+1}}$. This observation forms the basis of essentially all fast exponentiation algorithms. It requires $k$ squarings and on average +$k \over 2$ multiplications to compute the result. This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times. + +While this current method is a considerable speed up there are further improvements to be made. For example, the $a^{2^i}$ term does not need to +be computed in an auxilary variable. Consider the following equivalent algorithm. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{Left to Right Exponentiation}. \\ +\textbf{Input}. Integer $a$, $b$ and $k$ \\ +\textbf{Output}. $c = a^b$ \\ +\hline \\ +1. $c \leftarrow 1$ \\ +2. for $i$ from $k - 1$ to $0$ do \\ +\hspace{3mm}2.1 $c \leftarrow c^2$ \\ +\hspace{3mm}2.2 $c \leftarrow c \cdot a^{b_i}$ \\ +3. Return $c$. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Left to Right Exponentiation} +\label{fig:LTOR} +\end{figure} + +This algorithm starts from the most significant bit and works towards the least significant bit. When the $i$'th bit of $b$ is set $a$ is +multiplied against the current product. 
In each iteration the product is squared which doubles the exponent of the individual terms of the +product. + +For example, let $b = 101100_2 \equiv 44_{10}$. The following chart demonstrates the actions of the algorithm. + +\newpage\begin{figure} +\begin{center} +\begin{tabular}{|c|c|} +\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\ +\hline - & $1$ \\ +\hline $5$ & $a$ \\ +\hline $4$ & $a^2$ \\ +\hline $3$ & $a^4 \cdot a$ \\ +\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\ +\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\ +\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\ +\hline +\end{tabular} +\end{center} +\caption{Example of Left to Right Exponentiation} +\end{figure} + +When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal $a^{44}$ which is the desired exponentiation. This particular algorithm is +called ``Left to Right'' because it reads the exponent in that order. All of the exponentiation algorithms that will be presented are of this nature. + +\subsection{Single Digit Exponentiation} +The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit. It is intended +to be used when a small power of an input is required (\textit{e.g. $a^5$}). It is faster than simply multiplying $b - 1$ times for all values of +$b$ that are greater than three. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_expt\_d}. \\ +\textbf{Input}. mp\_int $a$ and mp\_digit $b$ \\ +\textbf{Output}. $c = a^b$ \\ +\hline \\ +1. $g \leftarrow a$ (\textit{mp\_init\_copy}) \\ +2. $c \leftarrow 1$ (\textit{mp\_set}) \\ +3. for $x$ from 1 to $lg(\beta)$ do \\ +\hspace{3mm}3.1 $c \leftarrow c^2$ (\textit{mp\_sqr}) \\ +\hspace{3mm}3.2 If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\ +\hspace{6mm}3.2.1 $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\ +\hspace{3mm}3.3 $b \leftarrow b << 1$ \\ +4. Clear $g$. \\ +5. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_expt\_d} +\end{figure} + +\textbf{Algorithm mp\_expt\_d.} +This algorithm computes the value of $a$ raised to the power of a single digit $b$. It uses the left to right exponentiation algorithm to +quickly compute the exponentiation. It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the +exponent is a fixed width. + +A copy of $a$ is made first to allow destination variable $c$ be the same as the source variable $a$. The result is set to the initial value of +$1$ in the subsequent step. + +Inside the loop the exponent is read from the most significant bit first down to the least significant bit. First $c$ is invariably squared +on step 3.1. In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$. The value +of $b$ is shifted left one bit to make the next bit down from the most signficant bit the new most significant bit. In effect each +iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location. + +EXAM,bn_mp_expt_d.c + +Line @29,mp_set@ sets the initial value of the result to $1$. Next the loop on line @31,for@ steps through each bit of the exponent starting from +the most significant down towards the least significant. The invariant squaring operation placed on line @333,mp_sqr@ is performed first. After +the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set. 
The shift on line +@47,<<@ moves all of the bits of the exponent upwards towards the most significant location. + +\section{$k$-ary Exponentiation} +When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor +slower than squaring. Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$. Suppose instead it referred to +the $i$'th $k$-bit digit of the exponent of $b$. For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY} +computes the same exponentiation. A group of $k$ bits from the exponent is called a \textit{window}. That is it is a small window on only a +portion of the entire exponent. Consider the following modification to the basic left to right exponentiation algorithm. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{$k$-ary Exponentiation}. \\ +\textbf{Input}. Integer $a$, $b$, $k$ and $t$ \\ +\textbf{Output}. $c = a^b$ \\ +\hline \\ +1. $c \leftarrow 1$ \\ +2. for $i$ from $t - 1$ to $0$ do \\ +\hspace{3mm}2.1 $c \leftarrow c^{2^k} $ \\ +\hspace{3mm}2.2 Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\ +\hspace{3mm}2.3 $c \leftarrow c \cdot a^g$ \\ +3. Return $c$. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{$k$-ary Exponentiation} +\label{fig:KARY} +\end{figure} + +The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times. If the values of $a^g$ for $0 < g < 2^k$ have been +precomputed this algorithm requires only $t$ multiplications and $tk$ squarings. The table can be generated with $2^{k - 1} - 1$ squarings and +$2^{k - 1} + 1$ multiplications. This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$. +However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}. + +Suppose $k = 4$ and $t = 100$. This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation. The +original algorithm would on average have required $200$ multiplications and $400$ squrings to compute the same value. The total number of squarings +has increased slightly but the number of multiplications has nearly halved. + +\subsection{Optimal Values of $k$} +An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$. The simplest +approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result. Table~\ref{fig:OPTK} lists optimal values of $k$ +for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}. 
+ +\begin{figure}[here] +\begin{center} +\begin{small} +\begin{tabular}{|c|c|c|c|c|c|} +\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\ +\hline $16$ & $2$ & $27$ & $24$ \\ +\hline $32$ & $3$ & $49$ & $48$ \\ +\hline $64$ & $3$ & $92$ & $96$ \\ +\hline $128$ & $4$ & $175$ & $192$ \\ +\hline $256$ & $4$ & $335$ & $384$ \\ +\hline $512$ & $5$ & $645$ & $768$ \\ +\hline $1024$ & $6$ & $1257$ & $1536$ \\ +\hline $2048$ & $6$ & $2452$ & $3072$ \\ +\hline $4096$ & $7$ & $4808$ & $6144$ \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Optimal Values of $k$ for $k$-ary Exponentiation} +\label{fig:OPTK} +\end{figure} + +\subsection{Sliding-Window Exponentiation} +A simple modification to the previous algorithm is only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$. Essentially +this is a table for all values of $g$ where the most significant bit of $g$ is a one. However, in order for this to be allowed in the +algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided. + +Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}. + +\begin{figure}[here] +\begin{center} +\begin{small} +\begin{tabular}{|c|c|c|c|c|c|} +\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\ +\hline $16$ & $3$ & $24$ & $27$ \\ +\hline $32$ & $3$ & $45$ & $49$ \\ +\hline $64$ & $4$ & $87$ & $92$ \\ +\hline $128$ & $4$ & $167$ & $175$ \\ +\hline $256$ & $5$ & $322$ & $335$ \\ +\hline $512$ & $6$ & $628$ & $645$ \\ +\hline $1024$ & $6$ & $1225$ & $1257$ \\ +\hline $2048$ & $7$ & $2403$ & $2452$ \\ +\hline $4096$ & $8$ & $4735$ & $4808$ \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Optimal Values of $k$ for Sliding Window Exponentiation} +\label{fig:OPTK2} +\end{figure} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\ +\textbf{Input}. Integer $a$, $b$, $k$ and $t$ \\ +\textbf{Output}. $c = a^b$ \\ +\hline \\ +1. $c \leftarrow 1$ \\ +2. for $i$ from $t - 1$ to $0$ do \\ +\hspace{3mm}2.1 If the $i$'th bit of $b$ is a zero then \\ +\hspace{6mm}2.1.1 $c \leftarrow c^2$ \\ +\hspace{3mm}2.2 else do \\ +\hspace{6mm}2.2.1 $c \leftarrow c^{2^k}$ \\ +\hspace{6mm}2.2.2 Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\ +\hspace{6mm}2.2.3 $c \leftarrow c \cdot a^g$ \\ +\hspace{6mm}2.2.4 $i \leftarrow i - k$ \\ +3. Return $c$. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Sliding Window $k$-ary Exponentiation} +\end{figure} + +Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent. While this +algorithm requires the same number of squarings it can potentially have fewer multiplications. The pre-computed table $a^g$ is also half +the size as the previous table. + +Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms. The first algorithm will divide the exponent up as +the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$. The second algorithm will break the +exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$. 
The single digit $0$ in the second representation are where +a single squaring took place instead of a squaring and multiplication. In total the first method requires $10$ multiplications and $18$ +squarings. The second method requires $8$ multiplications and $18$ squarings. + +In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster. + +\section{Modular Exponentiation} + +Modular exponentiation is essentially computing the power of a base within a finite field or ring. For example, computing +$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation. Instead of first computing $a^b$ and then reducing it +modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation. + +This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using +one of the algorithms presented in ~REDUCTION~. + +Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first. This algorithm +will allow the exponent $b$ to be negative which is computed as $c \equiv \left (1 / a \right )^{\vert b \vert} \mbox{(mod }d\mbox{)}$. The +value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see \ref{sec;modinv}}). If no inverse exists the algorithm +terminates with an error. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_exptmod}. \\ +\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ +\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ +\hline \\ +1. If $c.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\ +2. If $b.sign = MP\_NEG$ then \\ +\hspace{3mm}2.1 $g' \leftarrow g^{-1} \mbox{ (mod }c\mbox{)}$ \\ +\hspace{3mm}2.2 $x' \leftarrow \vert x \vert$ \\ +\hspace{3mm}2.3 Compute $d \equiv g'^{x'} \mbox{ (mod }c\mbox{)}$ via recursion. \\ +3. if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\ +\hspace{3mm}3.1 Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\ +4. else \\ +\hspace{3mm}4.1 Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_exptmod} +\end{figure} + +\textbf{Algorithm mp\_exptmod.} +The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod. It is a sliding window $k$-ary algorithm +which uses Barrett reduction to reduce the product modulo $p$. The second algorithm mp\_exptmod\_fast performs the same operation +except it uses either Montgomery or Diminished Radix reduction. The two latter reduction algorithms are clumped in the same exponentiation +algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}). + +EXAM,bn_mp_exptmod.c + +In order to keep the algorithms in a known state the first step on line @29,if@ is to reject any negative modulus as input. If the exponent is +negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$. The temporary variable $tmpG$ is assigned +the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$. The algorithm will recuse with these new values with a positive +exponent. + +If the exponent is positive the algorithm resumes the exponentiation. Line @63,dr_@ determines if the modulus is of the restricted Diminished Radix +form. 
If it is not line @65,reduce@ attempts to determine if it is of a unrestricted Diminished Radix form. The integer $dr$ will take on one +of three values. + +\begin{enumerate} +\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form. +\item $dr = 1$ means that the modulus is of restricted Diminished Radix form. +\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form. +\end{enumerate} + +Line @69,if@ determines if the fast modular exponentiation algorithm can be used. It is allowed if $dr \ne 0$ or if the modulus is odd. Otherwise, +the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction. + +\subsection{Barrett Modular Exponentiation} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_exptmod}. \\ +\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ +\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ +\hline \\ +1. $k \leftarrow lg(x)$ \\ +2. $winsize \leftarrow \left \lbrace \begin{array}{ll} + 2 & \mbox{if }k \le 7 \\ + 3 & \mbox{if }7 < k \le 36 \\ + 4 & \mbox{if }36 < k \le 140 \\ + 5 & \mbox{if }140 < k \le 450 \\ + 6 & \mbox{if }450 < k \le 1303 \\ + 7 & \mbox{if }1303 < k \le 3529 \\ + 8 & \mbox{if }3529 < k \\ + \end{array} \right .$ \\ +3. Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\ +4. Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\ +5. $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\ +\\ +Setup the table of small powers of $g$. First find $g^{2^{winsize}}$ and then all multiples of it. \\ +6. $k \leftarrow 2^{winsize - 1}$ \\ +7. $M_{k} \leftarrow M_1$ \\ +8. for $ix$ from 0 to $winsize - 2$ do \\ +\hspace{3mm}8.1 $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr}) \\ +\hspace{3mm}8.2 $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\ +9. for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\ +\hspace{3mm}9.1 $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\ +\hspace{3mm}9.2 $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\ +10. $res \leftarrow 1$ \\ +\\ +Start Sliding Window. \\ +11. $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\ +12. Loop \\ +\hspace{3mm}12.1 $bitcnt \leftarrow bitcnt - 1$ \\ +\hspace{3mm}12.2 If $bitcnt = 0$ then do \\ +\hspace{6mm}12.2.1 If $digidx = -1$ goto step 13. \\ +\hspace{6mm}12.2.2 $buf \leftarrow x_{digidx}$ \\ +\hspace{6mm}12.2.3 $digidx \leftarrow digidx - 1$ \\ +\hspace{6mm}12.2.4 $bitcnt \leftarrow lg(\beta)$ \\ +Continued on next page. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm s\_mp\_exptmod} +\end{figure} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\ +\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ +\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ +\hline \\ +\hspace{3mm}12.3 $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\ +\hspace{3mm}12.4 $buf \leftarrow buf << 1$ \\ +\hspace{3mm}12.5 if $mode = 0$ and $y = 0$ then goto step 12. \\ +\hspace{3mm}12.6 if $mode = 1$ and $y = 0$ then do \\ +\hspace{6mm}12.6.1 $res \leftarrow res^2$ \\ +\hspace{6mm}12.6.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ +\hspace{6mm}12.6.3 Goto step 12. 
\\
+\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
+\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
+\hspace{3mm}12.9  $mode \leftarrow 2$ \\
+\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
+\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
+\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize - 1$ do \\
+\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
+\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
+\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}Reset the window. \\
+\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
+\\
+No more windows left.  Check for residual bits of exponent. \\
+13.  If $mode = 2$ and $bitcpy > 0$ then do \\
+\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
+\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
+\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
+\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
+\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
+\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+14.  $y \leftarrow res$ \\
+15.  Clear $res$, $\mu$ and the $M$ array. \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod (continued)}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_exptmod.}
+This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
+algorithm to keep the product small throughout the algorithm.
+
+The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the
+larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
+table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.
+
+After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
+the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$ 
+times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
+
+Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
+\begin{enumerate}
+\item The variable $mode$ dictates how the bits of the exponent are interpreted.  
+\begin{enumerate}
+   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply 
+         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.  
+   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits 
+         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.  
+   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
+         downwards.
+\end{enumerate}
+\item The variable $bitcnt$ indicates how many bits remain to be read in the current digit of the exponent.  When it reaches zero a new digit
+      is fetched from the exponent.
+\item The variable $buf$ holds the currently read digit of the exponent. 
+\item The variable $digidx$ is an index into the exponent's digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
+\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
+      the appropriate operations performed.
+\item The variable $bitbuf$ holds the current bits of the window being formed.  
+\end{enumerate}
+
+All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
+inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
+read and if there are no digits left then the loop terminates.  
+
+After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
+upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to
+trailing edges the entire exponent is read from most significant bit to least significant bit.
+
+At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the 
+algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Steps 12.6 and 12.7 
+through 12.10 handle the two cases of $mode = 1$ and $mode = 2$ respectively.  
+
+FIGU,expt_state,Sliding Window State Diagram
+
+By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then 
+a Left-to-Right algorithm is used to process the remaining few bits.  
+
+EXAM,bn_s_mp_exptmod.c
+
+Lines @26,if@ through @40,}@ determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
+on line @32,if@ the value of $x$ is already known to be greater than $140$.  
+
+The conditional piece of code beginning on line @42,ifdef@ allows the window size to be restricted to five bits.  This logic is used to ensure
+the table of precomputed powers of $G$ remains relatively small.  
+
+The for loop on line @49,for@ initializes the $M$ array while lines @59,mp_init@ and @62,mp_reduce@ compute the value of $\mu$ required for 
+Barrett reduction.  
+
+-- More later.
+
+\section{Quick Power of Two}
+Calculating $b = 2^a$ can be performed much more quickly than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
+equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_2expt}. \\
+\textbf{Input}.   integer $b$ \\
+\textbf{Output}.  $a \leftarrow 2^b$ \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
+3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
+4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
+5.  Return(\textit{MP\_OKAY}). 
\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_2expt}
+\end{figure}
+
+\textbf{Algorithm mp\_2expt.}
+This algorithm computes $a = 2^b$ by first zeroing $a$, growing it to hold $\lfloor b / lg(\beta) \rfloor + 1$ digits and then setting the
+single appropriate bit of the leading digit.
+
+EXAM,bn_mp_2expt.c
+
+\chapter{Higher Level Algorithms}
+
+This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
+routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.  
+
+The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
+for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the second operand for a variety of 
+operations.  These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how 
+to manipulate various representations of integers.  For example, converting from an mp\_int to a string of characters.  
+
+\section{Integer Division with Remainder}
+\label{sec:division}
+
+Aside from modular exponentiation, integer division is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
+the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables 
+will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and 
+let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
+\textbf{Input}.   integer $x$ and $y$ \\
+\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
+\hline \\
+1.  $q \leftarrow 0$ \\
+2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
+3.  for $t$ from $n$ down to $0$ do \\
+\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
+\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
+\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
+4.  $r \leftarrow y$ \\
+5.  Return($q, r$) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Radix-$\beta$ Integer Division}
+\label{fig:raddiv}
+\end{figure}
+
+As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught whose 
+reasons for existing are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.
+
+To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and 
+simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual 
+method used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits, and quickly estimate a quotient.  By only using 
+leading digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ 
+quickly arises as a possible solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger 
+than $y$.  
+
+As a result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 841$.
+
+Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder 
+$y = 841 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the 
+remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since 
+$237 \cdot 23 + 20 = 5471$ is true.  
+
+\subsection{Quotient Estimation}
+\label{sec:divest}
+As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
+digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically 
+speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ digits of the 
+dividend and divisor are zero.  
+
+The estimate may be off by a few values in either direction but in general it is fairly accurate.  A simplification \cite[pp. 271]{TAOCPV2}
+of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particular when $t = 1$.  The estimate 
+using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$ 
+represent the most significant digits of the dividend and divisor respectively.
+
+\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to 
+$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
+The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other 
+cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequality
+$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of 
+inequalities will prove the hypothesis.
+
+\begin{equation}
+y - \hat k x \le y - \hat k x_s\beta^s
+\end{equation}
+
+This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.  
+
+\begin{equation}
+y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^t + \beta^s)
+\end{equation}
+
+By simplifying the previous inequality the following inequality is formed.
+
+\begin{equation}
+y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
+\end{equation}
+
+Subsequently,
+
+\begin{equation}
+y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s < x_s\beta^s \le x
+\end{equation}
+
+Which proves that $y - \hat kx \le x$ and by consequence $\hat k \ge k$ which concludes the proof.  \textbf{QED}
+
+
+\subsection{Normalized Integers}
+For the purposes of division a normalized input is when the divisor's leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
+$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
+remainder.  
The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will 
+lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.  
+
+\begin{equation}
+{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta}
+\end{equation}
+
+At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.  
+
+\subsection{Radix-$\beta$ Division with Remainder}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div}. \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+1.  If $b = 0$ return(\textit{MP\_VAL}). \\
+2.  If $\vert a \vert < \vert b \vert$ then do \\
+\hspace{3mm}2.1  $d \leftarrow a$ \\
+\hspace{3mm}2.2  $c \leftarrow 0$ \\
+\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
+\\
+Setup the quotient to receive the digits. \\
+3.  Grow $q$ to $a.used + 2$ digits. \\
+4.  $q \leftarrow 0$ \\
+5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
+6.  $sign \leftarrow \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS & \mbox{if }a.sign = b.sign \\
+                              MP\_NEG  & \mbox{otherwise} \\
+                           \end{array} \right .$ \\
+\\
+Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
+7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
+8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
+\\
+Find the leading digit of the quotient. \\
+9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
+10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
+11.  While ($x \ge y$) do \\
+\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
+\hspace{3mm}11.2  $x \leftarrow x - y$ \\
+12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
+\\
+Continued on the next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div} (continued). \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+Now find the remaining digits of the quotient. \\
+13.  for $i$ from $n$ down to $(t + 1)$ do \\
+\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
+\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
+\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
+\hspace{3mm}13.3  else \\
+\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
+\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
+\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
+\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
+\\
+Fixup quotient estimation. \\
+\hspace{3mm}13.5  Loop \\
+\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
+\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
+\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
+\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
+\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. 
\\
+\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
+\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
+\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
+\hspace{6mm}13.10  t$1 \leftarrow y$ \\
+\hspace{6mm}13.11  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{6mm}13.12  $x \leftarrow x + $ t$1$ \\
+\hspace{6mm}13.13  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\\
+Finalize the result. \\
+14.  Clamp excess digits of $q$ \\
+15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
+16.  $x.sign \leftarrow a.sign$ \\
+17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
+18.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div (continued)}
+\end{figure}
+\textbf{Algorithm mp\_div.}
+This algorithm will calculate the quotient and remainder of an integer division given a dividend and divisor.  The algorithm is a signed 
+division and will produce a fully qualified quotient and remainder.  
+
+First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend then the quotient is implicitly 
+zero and the remainder is the dividend.  
+
+After the first two trivial cases of inputs are handled the variable $q$ is set up to receive the digits of the quotient.  Two unsigned copies of 
+the divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are 
+positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.  
+This is performed by shifting both to the left by enough bits to get the desired normalization.  
+
+At this point the division algorithm can begin producing digits of the quotient.  Recall that the maximum value of the estimation used is 
+$2\beta - {2 \over \beta}$ which means that a digit of the quotient must first be produced by another means.  In this case $y$ is shifted 
+to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the 
+shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two 
+times to produce the desired leading digit of the quotient.  
+
+Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly 
+accurately approximate the true quotient digit.  The estimation can in theory produce an estimation as high as $2\beta - {2 \over \beta}$ but by 
+induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.  
+
+Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is 
+to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher 
+order approximation to adjust the quotient digit.  
+
+After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced 
+by optimizing Barrett reduction.}.  Steps 13.6 through 13.8 subtract the multiple of the divisor from the dividend (\textit{similar to step 3.3 of 
+algorithm~\ref{fig:raddiv}}) and then steps 13.9 through 13.13 add a multiple of the divisor back if the quotient was too large. 
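+
+To make the estimation concrete, the following is a minimal single-digit sketch in C of steps 13.2 through 13.5 (\textit{illustrative only, not 
+the library routine}).  It assumes a reduced radix of $\beta = 2^{16}$ so that the three-digit comparison fits in 64-bit arithmetic, assumes the 
+inputs are already normalized such that $x_i \le y_t$, and the digit names are hypothetical.
+
+\begin{verbatim}
+#include <stdint.h>
+
+#define BETA  ((uint64_t)1 << 16)   /* radix for this sketch only */
+
+/* Estimate one quotient digit from the three leading dividend digits
+ * (xi, xi1, xi2) and the two leading divisor digits (yt, yt1).       */
+static uint64_t estimate_digit(uint64_t xi, uint64_t xi1, uint64_t xi2,
+                               uint64_t yt, uint64_t yt1)
+{
+   uint64_t q;
+
+   /* steps 13.2 and 13.3: the estimate can never exceed beta - 1 */
+   if (xi == yt) {
+      q = BETA - 1;
+   } else {
+      q = (xi * BETA + xi1) / yt;
+   }
+
+   /* step 13.5: the estimate is never too small, so decrement while
+    * q*(yt*beta + yt1) exceeds the three leading dividend digits    */
+   while (q * (yt * BETA + yt1) > (xi * BETA + xi1) * BETA + xi2) {
+      --q;
+   }
+   return q;
+}
+\end{verbatim}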
+
+Now that the quotient has been determined, finalizing the result is a matter of clamping the quotient, fixing the sizes and de-normalizing the 
+remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 HAC \cite[pp. 598]{HAC} 
+is that when the estimations are being made (\textit{inside the loop on step 13.5}) the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie 
+outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
+respectively be replaced with a zero.  
+
+EXAM,bn_mp_div.c
+
+The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
+remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
+algorithm with only the quotient is 
+
+\begin{verbatim}
+mp_div(&a, &b, &c, NULL); /* c = [a/b] */
+\end{verbatim}
+
+Lines @37,if@ and @42,if@ handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor 
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line @76,neg@ determines the sign of 
+the quotient and line @77,sign@ ensures that both $x$ and $y$ are positive.  
+
+The number of bits in the leading digit is calculated on line @80,norm@.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
+exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting 
+them to the left by $lg(\beta) - 1 - k$ bits.
+
+Throughout, the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the 
+leading digit of the quotient.  The loop beginning on line @113,for@ will produce the remainder of the quotient digits.
+
+The conditional ``continue'' on line @114,if@ is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the 
+algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits 
+of $x$ above the $i$'th position must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.  
+
+Lines @142,t1@, @143,t1@ and @150,t2@ through @152,t2@ manually construct the high accuracy estimations by setting the digits of the two mp\_int 
+variables directly.  
+
+\section{Single Digit Helpers}
+
+This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of 
+the helper functions assume the single digit input is positive and will treat it as such.  
+
+\subsection{Single Digit Addition and Subtraction}
+
+Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction 
+algorithms.  As a result these algorithms are substantially simpler with a slight cost in performance.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = a + b$ \\
+\hline \\
+1. 
$t \leftarrow b$ (\textit{mp\_set}) \\
+2.  $c \leftarrow a + t$ \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_add\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_add\_d.}
+This algorithm initializes a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.  
+
+EXAM,bn_mp_add_d.c
+
+Clever use of the letter 't'.  
+
+\subsubsection{Subtraction}
+The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.  
+
+\subsection{Single Digit Multiplication}
+Single digit multiplication arises enough in division and radix conversion that it ought to be implemented as a special case of the baseline 
+multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands 
+only has one digit.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = ab$ \\
+\hline \\
+1.  $pa \leftarrow a.used$ \\
+2.  Grow $c$ to at least $pa + 1$ digits. \\
+3.  $oldused \leftarrow c.used$ \\
+4.  $c.used \leftarrow pa + 1$ \\
+5.  $c.sign \leftarrow a.sign$ \\
+6.  $\mu \leftarrow 0$ \\
+7.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
+\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+8.  $c_{pa} \leftarrow \mu$ \\
+9.  for $ix$ from $pa + 1$ to $oldused$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits of $c$. \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_d}
+\end{figure}
+\textbf{Algorithm mp\_mul\_d.}
+This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has minimal overhead.  
+Unlike the full multiplication algorithms this algorithm does not require any significant temporary storage or memory allocations.  
+
+EXAM,bn_mp_mul_d.c
+
+In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is 
+read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.  
+
+\subsection{Single Digit Division}
+Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  Since the 
+divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = \lfloor a / b \rfloor, d = a - cb$ \\
+\hline \\
+1.  If $b = 0$ then return(\textit{MP\_VAL}).\\
+2.  If $b = 3$ then use algorithm mp\_div\_3 instead. \\
+3.  Init $q$ to $a.used$ digits.  \\
+4.  $q.used \leftarrow a.used$ \\
+5.  $q.sign \leftarrow a.sign$ \\
+6.  $\hat w \leftarrow 0$ \\
+7. 
for $ix$ from $a.used - 1$ down to $0$ do \\
+\hspace{3mm}7.1  $\hat w \leftarrow \hat w \beta + a_{ix}$ \\
+\hspace{3mm}7.2  If $\hat w \ge b$ then \\
+\hspace{6mm}7.2.1  $t \leftarrow \lfloor \hat w / b \rfloor$ \\
+\hspace{6mm}7.2.2  $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}7.3  else\\
+\hspace{6mm}7.3.1  $t \leftarrow 0$ \\
+\hspace{3mm}7.4  $q_{ix} \leftarrow t$ \\
+8.  $d \leftarrow \hat w$ \\
+9.  Clamp excess digits of $q$. \\
+10.  $c \leftarrow q$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_d}
+\end{figure}
+\textbf{Algorithm mp\_div\_d.}
+This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach.  Essentially in every iteration of the 
+algorithm another digit of the dividend is reduced and another digit of the quotient is produced.  Provided $b < \beta$ the value of $\hat w$ 
+after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$.  
+
+If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3.  It replaces the division by three with 
+a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup.  In essence it is much like the Barrett reduction 
+from chapter seven.  
+
+EXAM,bn_mp_div_d.c
+
+Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to 
+indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d, to be created.  
+
+The division and remainder on lines @44,/@ and @45,%@ can often be replaced by a single division on most processors.  For example, the 32-bit x86 
+based processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC 
+compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.  
+
+\subsection{Single Digit Root Extraction}
+
+Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson 
+approximation (\ref{eqn:newton}) converge very quickly to a root for suitably well-behaved functions $f(x)$.  
+
+\begin{equation}
+x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
+\label{eqn:newton}
+\end{equation}
+
+In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is 
+simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers and not over a continuous domain 
+such as the real numbers.  As a result the root found can be above the true root by a few values and must be manually adjusted.  Ideally at the end 
+of the algorithm the $n$'th root $b$ of an integer $a$ will satisfy $b^n \le a$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_n\_root}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c^b \le a$ \\
+\hline \\
+1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  $sign \leftarrow a.sign$ \\
+3.  $a.sign \leftarrow MP\_ZPOS$ \\
+4.  t$2 \leftarrow 2$ \\
+5. 
Loop \\
+\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
+\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
+\hspace{3mm}5.3  t$2 \leftarrow $ t$3 \cdot $ t$1$ \\
+\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
+\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
+\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
+\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
+\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5. \\
+6.  Loop \\
+\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
+\hspace{3mm}6.2  If t$2 > a$ then \\
+\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
+\hspace{6mm}6.2.2  Goto step 6. \\
+7.  $a.sign \leftarrow sign$ \\
+8.  $c \leftarrow $ t$1$ \\
+9.  $c.sign \leftarrow sign$ \\
+10.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_n\_root}
+\end{figure}
+\textbf{Algorithm mp\_n\_root.}
+This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach.  It is partially optimized based on the observation 
+that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator.  That is, at first the denominator is calculated by finding 
+$x^{b - 1}$.  This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator.  This saves a total of $b - 1$ 
+multiplications by t$1$ inside the loop.  
+
+The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the 
+root.  Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$.  
+
+EXAM,bn_mp_n_root.c
+
+\section{Random Number Generation}
+
+Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.  
+Pollard-Rho factoring, for example, can make use of random values as starting points to find factors of a composite integer.  In this case the 
+algorithm presented is solely for simulations and not intended for cryptographic use.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rand}. \\
+\textbf{Input}.   An integer $b$ \\
+\textbf{Output}.  A pseudo-random number of $b$ digits \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
+3.  Pick a non-zero random digit $d$. \\
+4.  $a \leftarrow a + d$ \\
+5.  for $ix$ from 1 to $b - 1$ do \\
+\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
+\hspace{3mm}5.2  Pick a random digit $d$. \\
+\hspace{3mm}5.3  $a \leftarrow a + d$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rand}
+\end{figure}
+\textbf{Algorithm mp\_rand.}
+This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the 
+final result has at least $b$ digits.  It relies heavily on a third-party random number generator which should ideally generate all of 
+the integers from $0$ to $\beta - 1$ uniformly.  
+
+EXAM,bn_mp_rand.c
+
+\section{Formatted Representations}
+The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to 
+be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers 
+into a program. 
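+
+As a usage sketch of the two routines this section develops (\textit{error checking omitted for brevity}), the following short program reads 
+the decimal string ``114585'' into an mp\_int and writes it back out in hexadecimal.
+
+\begin{verbatim}
+#include <stdio.h>
+#include <tommath.h>
+
+int main(void)
+{
+   mp_int a;
+   char   buf[128];
+
+   mp_init(&a);
+   mp_read_radix(&a, "114585", 10);   /* radix-10 input  */
+   mp_toradix(&a, buf, 16);           /* radix-16 output */
+   printf("114585 == 0x%s\n", buf);
+   mp_clear(&a);
+   return 0;
+}
+\end{verbatim}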
+
+\subsection{Reading Radix-n Input}
+For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used to map the values from $0$ to $63$ to 
+printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the 
+map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme which is suitably chosen 
+such that the characters are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication 
+via non-binary mediums.  
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{cc|cc|cc|cc}
+\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
+\hline 
+0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
+4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
+8 & 8 & 9 & 9 & 10 & A & 11 & B \\
+12 & C & 13 & D & 14 & E & 15 & F \\
+16 & G & 17 & H & 18 & I & 19 & J \\
+20 & K & 21 & L & 22 & M & 23 & N \\
+24 & O & 25 & P & 26 & Q & 27 & R \\
+28 & S & 29 & T & 30 & U & 31 & V \\
+32 & W & 33 & X & 34 & Y & 35 & Z \\
+36 & a & 37 & b & 38 & c & 39 & d \\
+40 & e & 41 & f & 42 & g & 43 & h \\
+44 & i & 45 & j & 46 & k & 47 & l \\
+48 & m & 49 & n & 50 & o & 51 & p \\
+52 & q & 53 & r & 54 & s & 55 & t \\
+56 & u & 57 & v & 58 & w & 59 & x \\
+60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Lower ASCII Map}
+\label{fig:ASC}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_read\_radix}. \\
+\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
+\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  $ix \leftarrow 0$ \\
+3.  If $str_0 =$ ``-'' then do \\
+\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
+\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
+4.  else \\
+\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
+5.  $a \leftarrow 0$ \\
+6.  for $iy$ from $ix$ to $sn - 1$ do \\
+\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
+\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
+\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
+\hspace{3mm}6.4  $a \leftarrow a + y$ \\
+7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_read\_radix}
+\end{figure}
+\textbf{Algorithm mp\_read\_radix.}
+This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede 
+the string to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input 
+and will stop when it reads a character it cannot map.  This allows numbers to be embedded as part of larger input without any significant problem.  
+
+EXAM,bn_mp_read_radix.c
+
+\subsection{Generating Radix-$n$ Output}
+Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toradix}. \\
+\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
+\textbf{Output}.  The radix-$r$ representation of $a$ \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2. 
If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}). \\
+3.  $t \leftarrow a$ \\
+4.  $str \leftarrow$ ``'' \\
+5.  if $t.sign = MP\_NEG$ then \\
+\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
+\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
+6.  While ($t \ne 0$) do \\
+\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
+\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
+\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
+\hspace{3mm}6.4  $str \leftarrow str + y$ \\
+7.  If $str_0 = $``$-$'' then \\
+\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
+8.  Otherwise \\
+\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toradix}
+\end{figure}
+\textbf{Algorithm mp\_toradix.}
+This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing the 
+successive quotients $\lfloor a / r^k \rfloor$ modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in 
+each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions 
+are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order 
+(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.  
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
+\hline $1234$ & -- & -- \\
+\hline $123$  & $4$ & ``4'' \\
+\hline $12$   & $3$ & ``43'' \\
+\hline $1$    & $2$ & ``432'' \\
+\hline $0$    & $1$ & ``4321'' \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Algorithm mp\_toradix.}
+\label{fig:mpradix}
+\end{figure}
+
+EXAM,bn_mp_toradix.c
+
+\chapter{Number Theoretic Algorithms}
+This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi 
+symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm 
+and various Sieve based factoring algorithms.  
+
+\section{Greatest Common Divisor}
+The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$, is the largest integer $k$ that divides 
+both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur 
+simultaneously.  
+
+The most common approach (cite) is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qa + r = b$ 
+then $r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}1.2  $a \leftarrow b$ \\
+\hspace{3mm}1.3  $b \leftarrow r$ \\
+2.  Return($a$). 
\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (I)}
+\label{fig:gcd1}
+\end{figure}
+
+This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends to diminish rapidly.  However, divisions are 
+relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of 
+greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $b - a$.  
+In particular, we would like $b - a$ to decrease in magnitude which implies that $b \ge a$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}1.2  $b \leftarrow b - a$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (II)}
+\label{fig:gcd2}
+\end{figure}
+
+\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
+The algorithm in figure~\ref{fig:gcd2} will eventually terminate since, given $b \ge a$, the subtraction in step 1.2 produces a value less than $b$.  
+In other words in every iteration the tuple $\left < a, b \right >$ decreases in magnitude until eventually $a = b$.  Since both $a$ and $b$ are 
+always divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, in the 
+second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.  
+
+As a matter of practicality algorithm \ref{fig:gcd2} decreases far too slowly to be useful.  Especially if $b$ is much larger than $a$ such that 
+$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does 
+not divide the greatest common divisor but will divide $b - a$.  In this case ${{b - a} \over p}$ is also an integer and still divisible by 
+the greatest common divisor.  
+
+However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ that are in common can be removed from $a$ and $b$ first.  
+Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$.  \\
+\hline \\
+1.  $k \leftarrow 0$ \\
+2.  While $a$ and $b$ are both divisible by $p$ do \\
+\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
+\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
+3.  While $a$ is divisible by $p$ do \\
+\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+4.  While $b$ is divisible by $p$ do \\
+\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+5.  While ($b > 0$) do \\
+\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. 
\\
+\hspace{3mm}5.2  $b \leftarrow b - a$ \\
+\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
+\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+6.  Return($a \cdot p^k$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (III)}
+\label{fig:gcd3}
+\end{figure}
+
+This algorithm is based on algorithm~\ref{fig:gcd2} except it removes powers of $p$ before and inside the main loop to ensure the tuple 
+$\left < a, b \right >$ decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept 
+which will represent the common divisor $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This 
+means that $p$ can be safely divided out of the difference $b - a$ so long as the division leaves no remainder.  
+
+In particular the value of $p$ should be chosen such that the division on step 5.3.1 occurs often.  It also helps that division by $p$ is easy 
+to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by 
+step five both $a$ and $b$ are odd.  Therefore, the difference $b - a$ must be even which means that each iteration removes one bit from the 
+largest of the pair.  
+
+\subsection{Complete Greatest Common Divisor}
+The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly 
+and will produce the greatest common divisor.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_gcd}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The greatest common divisor $c = (a, b)$.  \\
+\hline \\
+1.  If $a = 0$ and $b \ne 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow b$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a \ne 0$ and $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow a$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  If $a = b = 0$ then \\
+\hspace{3mm}3.1  $c \leftarrow 1$ \\
+\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
+4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+5.  $k \leftarrow 0$ \\
+6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  While $v.used > 0$ \\
+\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+10.  $c \leftarrow u \cdot 2^k$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_gcd}
+\end{figure}
+\textbf{Algorithm mp\_gcd.}
+This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of 
+Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as 
+Algorithm B and in practice this appears to be true. 
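+
+Before walking through the multiple precision implementation, the overall flow of the algorithm can be summarized with a single-word sketch in 
+C using $p = 2$ (\textit{illustrative only, not part of the library}).
+
+\begin{verbatim}
+#include <stdint.h>
+
+static uint32_t binary_gcd(uint32_t u, uint32_t v)
+{
+   int k = 0;
+   uint32_t t;
+
+   /* steps one through three: trivial zero cases */
+   if (u == 0) return v;
+   if (v == 0) return u;
+
+   /* step six: strip common factors of two, counting them in k */
+   while (((u | v) & 1) == 0) {
+      u >>= 1; v >>= 1; ++k;
+   }
+
+   /* steps seven and eight: make both operands odd */
+   while ((u & 1) == 0) u >>= 1;
+   while ((v & 1) == 0) v >>= 1;
+
+   /* step nine: subtract the smaller from the larger; the difference
+    * is even so factors of two can be removed immediately            */
+   while (v != 0) {
+      if (u > v) { t = u; u = v; v = t; }
+      v -= u;
+      while (v != 0 && (v & 1) == 0) v >>= 1;
+   }
+
+   /* step ten: restore the common factors of two */
+   return u << k;
+}
+\end{verbatim}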
+
+The first three steps handle the cases where either one of or both inputs are zero.  If either input is zero the greatest common divisor is the 
+largest input or zero if they are both zero.  If the inputs are not trivial then $u$ and $v$ are assigned the absolute values of 
+$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.  
+
+Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step, two is no longer a 
+factor of the remaining greatest common divisor of $u$ and $v$ and can be safely divided out of either value whenever it is even.  Steps 
+seven and eight ensure that $u$ and $v$ respectively have no more factors of two.  At most only one of the while loops will iterate since 
+they cannot both be even.  
+
+By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to 
+or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any 
+factors of two from the difference $v$ to ensure that in the next iteration of the loop both are once again odd.  
+
+After $v = 0$ occurs the variable $u$ has the greatest common divisor of the pair $\left < u, v \right >$ just after step six.  The result 
+must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.  
+
+EXAM,bn_mp_gcd.c
+
+This function makes use of the macros mp\_iszero and mp\_iseven.  The former evaluates to $1$ if the input mp\_int is equivalent to the 
+integer zero otherwise it evaluates to $0$.  The latter evaluates to $1$ if the input mp\_int represents a non-zero even integer otherwise 
+it evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three 
+trivial cases of inputs are handled on lines @25,zero@ through @34,}@.  After those lines the inputs are assumed to be non-zero.  
+
+Lines @36,if@ and @40,if@ make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two 
+must be divided out of the two inputs.  The while loop on line @49,while@ iterates so long as both are even.  The local integer $k$ is used to 
+keep track of how many factors of $2$ are pulled out of both values.  It is assumed that the number of factors will not exceed the maximum 
+value of a C ``int'' data type\footnote{Strictly speaking no array in C may have more entries than are accessible by an ``int'' so this is not 
+a limitation.}.  
+
+At this point there are no more common factors of two in the two values.  The while loops on lines @60,while@ and @65,while@ remove any independent 
+factors of two such that both $u$ and $v$ are guaranteed to be an odd integer before hitting the main body of the algorithm.  The while loop 
+on line @71, while@ performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in 
+place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.  
+
+\section{Least Common Multiple}
+The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the 
+least common multiple is normally denoted as $[ a, b ]$ and numerically equivalent to ${{ab} \over {(a, b)}}$. 
For example, if $a = 2 \cdot 2 \cdot 3 = 12$ 
+and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is ${{12 \cdot 126} \over {(12, 126)}} = {{1512} \over 6} = 252$.  
+
+The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they 
+will collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on 
+Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{i.e., the greatest common divisor is one}).  
+Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then the maximal order of any unit of $\Z/n\Z$ will be 
+$[ p - 1, q - 1] $.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lcm}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The least common multiple $c = [a, b]$.  \\
+\hline \\
+1.  $c \leftarrow (a, b)$ \\
+2.  $t \leftarrow a \cdot b$ \\
+3.  $c \leftarrow \lfloor t / c \rfloor$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lcm}
+\end{figure}
+\textbf{Algorithm mp\_lcm.}
+This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by 
+dividing the product of the two inputs by their greatest common divisor.  
+
+EXAM,bn_mp_lcm.c
+
+\section{Jacobi Symbol Computation}
+To explain the Jacobi Symbol we shall first discuss the Legendre function\footnote{More commonly known as the Legendre symbol.} from which the 
+Jacobi symbol is defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically 
+it is equivalent to equation \ref{eqn:legendre}.  
+
+\begin{equation}
+a^{(p-1)/2} \equiv \begin{array}{rl}
+                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
+                              0  &  \mbox{if }p\mbox{ divides }a\mbox{.} \\
+                              1  &  \mbox{if }a\mbox{ is a quadratic residue}. 
+                              \end{array} \mbox{ (mod }p\mbox{)}
+\label{eqn:legendre}                              
+\end{equation}
+
+\textbf{Proof.} \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
+An integer $a$ is a quadratic residue if the following equation has a solution.
+
+\begin{equation}
+x^2 \equiv a \mbox{ (mod }p\mbox{)}
+\label{eqn:root}
+\end{equation}
+
+Consider the following equation.
+
+\begin{equation}
+0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
+\label{eqn:rooti}
+\end{equation}
+
+Whether equation \ref{eqn:root} has a solution or not equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$ 
+then the quantity in the braces must be zero.  By reduction,
+
+\begin{eqnarray}
+\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0 \nonumber \\
+\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
+x^2 \equiv a \mbox{ (mod }p\mbox{)} 
+\end{eqnarray}
+
+As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $p$ does not divide $a$ and $a$ 
+is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since 
+\begin{equation}
+0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
+\end{equation}
+One of the terms on the right hand side must be zero. 
\textbf{QED}
+
+\subsection{Jacobi Symbol}
+The Jacobi symbol is a generalization of the Legendre function to any odd modulus $p$ greater than two.  If $p = \prod_{i=0}^n p_i$ is the prime 
+factorization of $p$ then the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.  
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
+\end{equation}
+
+By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre function.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for 
+further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the 
+following are true.  
+
+\begin{enumerate}
+\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$. 
+\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$. 
+\item If $a \equiv b \mbox{ (mod }p\mbox{)}$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$. 
+\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.  
+\item $\left ( { a \over p} \right ) \equiv \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically 
+$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.  
+\end{enumerate}
+
+Using these facts if $a = 2^k \cdot a'$ then
+
+\begin{eqnarray}
+\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
+                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right ) 
+\label{eqn:jacobi}
+\end{eqnarray}
+
+By fact five, 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
+\end{equation}
+
+Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then 
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
+\end{equation}
+
+By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.  
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right )  \cdot (-1)^{(p-1)(a'-1)/4}
+\end{equation}
+
+The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.  The value of 
+$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the 
+factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the 
+Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_jacobi}. \\
+\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
+\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
+\hline \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow 0$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a = 1$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  $a' \leftarrow a$ \\
+4.  $k \leftarrow 0$ \\
+5. 
While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1 $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2 $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
+6. If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
+\hspace{3mm}6.1 $s \leftarrow 1$ \\
+7. else \\
+\hspace{3mm}7.1 $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
+\hspace{3mm}7.2 If $r = 1$ or $r = 7$ then \\
+\hspace{6mm}7.2.1 $s \leftarrow 1$ \\
+\hspace{3mm}7.3 else \\
+\hspace{6mm}7.3.1 $s \leftarrow -1$ \\
+8. If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
+\hspace{3mm}8.1 $s \leftarrow -s$ \\
+9. If $a' \ne 1$ then \\
+\hspace{3mm}9.1 $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
+\hspace{3mm}9.2 $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
+10. $c \leftarrow s$ \\
+11. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_jacobi}
+\end{figure}
+\textbf{Algorithm mp\_jacobi.}
+This algorithm computes the Jacobi symbol for an arbitrary positive integer $a$ with respect to an odd integer $p$ greater than two. The algorithm
+is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.
+
+Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively. Step five determines the number of factors of two in the
+input $a$. If $k$ is even then the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one. If $k$ is odd then the term evaluates to one
+if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$. After the $\left ( { 2 \over p } \right )^k$ term is handled
+the $(-1)^{(p-1)(a'-1)/4}$ term is computed and multiplied against the current product $s$. The latter term evaluates to one if both $p$ and $a'$
+are congruent to one modulo four, otherwise it evaluates to negative one.
+
+By step nine if $a'$ does not equal one a recursion is required. Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
+$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
+
+EXAM,bn_mp_jacobi.c
+
+As a matter of practicality the variable $a'$ as per the pseudo-code is represented by the variable $a1$ since the $'$ symbol is not a valid character in a C
+variable name.
+
+The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm. If the input is non-trivial the algorithm
+has to proceed to compute the Jacobi symbol. The variable $s$ is used to hold the current Jacobi product. Note that $s$ is merely a C ``int'' data type since
+the only values it may obtain are $-1$, $0$ and $1$.
+
+After a local copy of $a$ is made all of the factors of two are divided out and the total stored in $k$. Technically only the least significant
+bit of $k$ is required, however, it makes the algorithm simpler to follow to perform an addition. In practice an exclusive-or and addition have the same
+processor requirements and neither is faster than the other.
+
+Line @59, if@ through @70, }@ determines the value of $\left ( { 2 \over p } \right )^k$. If the least significant bit of $k$ is zero then
+$k$ is even and the value is one. Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight. The value of
+$(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against $s$ on lines @73, if@ through @75, }@.
+
+Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$. 
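+
+To illustrate the recursion consider computing $\left ( { 30 \over 7 } \right )$. Writing $30 = 2^1 \cdot 15$ gives $k = 1$ and $a' = 15$. Since
+$7 \equiv 7 \mbox{ (mod }8\mbox{)}$ the term $\left ( { 2 \over 7 } \right )$ equals $1$, and since $7 \equiv 15 \equiv 3 \mbox{ (mod }4\mbox{)}$ the sign
+is flipped to $s = -1$. The recursion then computes $\left ( { 7 \mbox{ mod } 15 \over 15 } \right ) = \left ( { 7 \over 15 } \right )$ which by the
+same process equals $-\left ( { 1 \over 7 } \right ) = -1$. The final product is therefore $(-1)(-1) = 1$, which agrees with the direct observation that
+$30 \equiv 2 \mbox{ (mod }7\mbox{)}$ is a quadratic residue ($4^2 \equiv 2 \mbox{ (mod }7\mbox{)}$).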
+
+\section{Modular Inverse}
+\label{sec:modinv}
+The modular inverse of a number actually refers to the modular multiplicative inverse. Essentially for any integer $a$ such that $(a, p) = 1$ there
+exists another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$. The integer $b$ is called the multiplicative inverse of $a$ which is
+denoted as $b = a^{-1}$. Technically speaking modular inversion is a well defined operation for any finite ring or field, not just for rings and
+fields of integers. However, the latter will be the matter of discussion here.
+
+The simplest approach is to compute the algebraic inverse of the input. That is to compute $b \equiv a^{\Phi(p) - 1} \mbox{ (mod }p\mbox{)}$. If $\Phi(p)$ is the
+order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$. The proof is trivial.
+
+\begin{equation}
+ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
+\end{equation}
+
+However, as simple as this approach may be it has two serious flaws. It requires that the value of $\Phi(p)$ be known, which, if $p$ is composite,
+requires knowledge of all of the prime factors of $p$. The approach is also very slow as the size of $p$ grows.
+
+A simpler approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear
+Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation
+
+\begin{equation}
+ab + pq = 1
+\end{equation}
+
+where $a$, $b$, $p$ and $q$ are all integers. If such a pair of integers $ \left < b, q \right >$ exists then $b$ is the multiplicative inverse of
+$a$ modulo $p$. The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.
+However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place. The
+binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine
+equation.
+
+\subsection{General Case}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_invmod}. \\
+\textbf{Input}. mp\_int $a$ and $b$, $(a, b) = 1$, $b \ge 2$, $0 < a < b$. \\
+\textbf{Output}. The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
+\hline \\
+1. If $b \le 0$ then return(\textit{MP\_VAL}). \\
+2. If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
+3. $x \leftarrow \vert a \vert, y \leftarrow b$ \\
+4. If $x_0 \equiv y_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
+5. $u \leftarrow x, v \leftarrow y, A \leftarrow 1, B \leftarrow 0, C \leftarrow 0, D \leftarrow 1$ \\
+6. While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1 $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.2 If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}6.2.1 $A \leftarrow A + y$ \\
+\hspace{6mm}6.2.2 $B \leftarrow B - x$ \\
+\hspace{3mm}6.3 $A \leftarrow \lfloor A / 2 \rfloor$ \\
+\hspace{3mm}6.4 $B \leftarrow \lfloor B / 2 \rfloor$ \\
+7. 
While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1 $v \leftarrow \lfloor v / 2 \rfloor$ \\
+\hspace{3mm}7.2 If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}7.2.1 $C \leftarrow C + y$ \\
+\hspace{6mm}7.2.2 $D \leftarrow D - x$ \\
+\hspace{3mm}7.3 $C \leftarrow \lfloor C / 2 \rfloor$ \\
+\hspace{3mm}7.4 $D \leftarrow \lfloor D / 2 \rfloor$ \\
+8. If $u \ge v$ then \\
+\hspace{3mm}8.1 $u \leftarrow u - v$ \\
+\hspace{3mm}8.2 $A \leftarrow A - C$ \\
+\hspace{3mm}8.3 $B \leftarrow B - D$ \\
+9. else \\
+\hspace{3mm}9.1 $v \leftarrow v - u$ \\
+\hspace{3mm}9.2 $C \leftarrow C - A$ \\
+\hspace{3mm}9.3 $D \leftarrow D - B$ \\
+10. If $u \ne 0$ goto step 6. \\
+11. If $v \ne 1$ return(\textit{MP\_VAL}). \\
+12. While $C \le 0$ do \\
+\hspace{3mm}12.1 $C \leftarrow C + b$ \\
+13. While $C \ge b$ do \\
+\hspace{3mm}13.1 $C \leftarrow C - b$ \\
+14. $c \leftarrow C$ \\
+15. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\end{figure}
+\textbf{Algorithm mp\_invmod.}
+This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$. This algorithm is a variation of the
+extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}. It has been modified to only compute the modular inverse and not a complete
+Diophantine solution.
+
+If $b \le 0$ then the modulus is invalid and MP\_VAL is returned. Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
+inverse for $a$ and the error is reported.
+
+The astute reader will observe that steps seven through nine are very similar to the binary greatest common divisor algorithm mp\_gcd. In this case
+the other variables to the Diophantine equation are solved. The algorithm terminates when $u = 0$ in which case the solution is
+
+\begin{equation}
+Ca + Db = v
+\end{equation}
+
+If $v$, the greatest common divisor of $a$ and $b$, is not equal to one then the algorithm will report an error as no inverse exists. Otherwise, $C$
+is the modular inverse of $a$. The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie
+within $1 \le a^{-1} < b$. Step numbers twelve and thirteen adjust the inverse until it is in range. If the original input $a$ is within $0 < a < b$
+then only a couple of additions or subtractions will be required to adjust the inverse.
+
+EXAM,bn_mp_invmod.c
+
+\subsubsection{Odd Moduli}
+
+When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse. In particular by attempting to solve
+the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.
+
+The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all steps involving either $A$ or $C$ removed. This
+optimization will halve the time required to compute the modular inverse.
+
+\section{Primality Tests}
+
+A non-zero integer $a$ is said to be prime if it is not divisible by any other integer excluding one and itself. For example, $a = 7$ is prime
+since the integers $2 \ldots 6$ do not evenly divide $a$. By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$.
+
+Prime numbers arise in cryptography considerably as they allow finite fields to be formed. The ability to determine quickly whether an integer is prime or
+not has been a vital subject in cryptography and number theory for considerable time. 
The algorithms that will be presented are all
+probabilistic algorithms in that when they report an integer is composite it must be composite. However, when the algorithms report an integer is
+prime the algorithm may be incorrect.
+
+As will be discussed it is possible to limit the probability of error so well that for practical purposes the probability of error might as
+well be zero. For the purposes of these discussions let $n$ represent the candidate integer of which the primality is in question.
+
+\subsection{Trial Division}
+
+Trial division means to attempt to evenly divide a candidate integer by small prime integers. If the candidate can be evenly divided it obviously
+cannot be prime. By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime. However, such a test
+would require a prohibitive amount of time as $n$ grows.
+
+Instead of dividing by every prime, a smaller, more manageable set of primes may be used instead. By performing trial division with only a subset
+of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove if a candidate is prime. However, often it can prove a candidate is not prime.
+
+The benefit of this test is that trial division by small values is fairly efficient, especially compared to the other algorithms that will be
+discussed shortly. The probability that this approach correctly identifies a composite candidate when tested with all primes up to $q$ is given by
+$1 - {1.12 \over \ln(q)}$. The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range
+$3 \le q \le 100$.
+
+At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly. At $q = 90$ further testing is generally not going to
+be of any practical use. In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate
+approximately $80\%$ of all candidate integers. The constant \textbf{PRIME\_SIZE} is equal to the number of primes in the test base. The
+array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
+\textbf{Input}. mp\_int $n$ \\
+\textbf{Output}. $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$. \\
+\hline \\
+1. for $ix$ from $0$ to $PRIME\_SIZE - 1$ do \\
+\hspace{3mm}1.1 $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
+\hspace{3mm}1.2 If $d = 0$ then \\
+\hspace{6mm}1.2.1 $c \leftarrow 1$ \\
+\hspace{6mm}1.2.2 Return(\textit{MP\_OKAY}). \\
+2. $c \leftarrow 0$ \\
+3. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_is\_divisible}
+\end{figure}
+\textbf{Algorithm mp\_prime\_is\_divisible.}
+This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.
+
+EXAM,bn_mp_prime_is_divisible.c
+
+The algorithm defaults to a return of $0$ in case an error occurs. The values in the prime table are all specified to be in the range of an
+mp\_digit. The table \_\_prime\_tab is defined in the following file.
+
+EXAM,bn_prime_tab.c
+
+Note that there are two possible tables. When an mp\_digit is 7 bits long only the primes up to $127$ may be included, otherwise the primes
+up to $1619$ are used. Note that the value of \textbf{PRIME\_SIZE} is a constant dependent on the size of an mp\_digit. 
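+
+The flow of the test is simple enough to sketch with native C types. The following fragment is a minimal illustration, assuming a hypothetical
+six entry table and a candidate small enough to fit in an ``unsigned long''; the real routine operates on mp\_ints through mp\_mod\_d instead.
+
+\begin{verbatim}
+#include <stdio.h>
+
+/* a tiny sample of the test base; the real table holds PRIME_SIZE primes */
+static const unsigned long prime_tab[] = { 2, 3, 5, 7, 11, 13 };
+
+/* return 1 if n is divisible by a prime in the table, 0 otherwise */
+static int is_divisible(unsigned long n)
+{
+   size_t ix;
+   for (ix = 0; ix < sizeof(prime_tab) / sizeof(prime_tab[0]); ix++) {
+      if (n % prime_tab[ix] == 0) {
+         return 1;
+      }
+   }
+   return 0;
+}
+
+int main(void)
+{
+   /* 221 = 13 * 17 is caught, 227 is prime and survives */
+   printf("%d %d\n", is_divisible(221), is_divisible(227));
+   return 0;
+}
+\end{verbatim}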
+
+\subsection{The Fermat Test}
+The Fermat test is probably one of the oldest tests to have a non-trivial probability of success. It is based on the fact that if $n$ is in
+fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$. The reason being that if $n$ is prime then the order of
+the multiplicative subgroup is $n - 1$. Any base $a$ with $(a, n) = 1$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to
+$a^1 = a$.
+
+If $n$ is composite then any given base $a$ does not have to have an order which divides $n - 1$, in which case
+it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$. However, this test is not absolute as it is possible that the order
+of a base will divide $n - 1$, in which case $n$ would be reported as prime. Such a base yields what is known as a Fermat pseudo-prime. Several
+integers known as Carmichael numbers will be pseudo-primes to all valid bases. Fortunately such numbers are extremely rare as $n$ grows
+in size.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_fermat}. \\
+\textbf{Input}. mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$. \\
+\textbf{Output}. $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$. \\
+\hline \\
+1. $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
+2. If $t = b$ then \\
+\hspace{3mm}2.1 $c = 1$ \\
+3. else \\
+\hspace{3mm}3.1 $c = 0$ \\
+4. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_fermat}
+\end{figure}
+\textbf{Algorithm mp\_prime\_fermat.}
+This algorithm determines whether an mp\_int $a$ is a probable prime to the base $b$ or not. It uses a single modular exponentiation to
+determine the result.
+
+EXAM,bn_mp_prime_fermat.c
+
+\subsection{The Miller-Rabin Test}
+The Miller-Rabin (citation) test is another primality test which has tighter error bounds than the Fermat test specifically with sequentially chosen
+candidate integers. The algorithm is based on the observation that if $n - 1 = 2^kr$ and if $b^r \nequiv \pm 1$ then after up to $k - 1$ squarings the
+value must be equal to $-1$. The squarings are stopped as soon as $-1$ is observed. If the value of $1$ is observed first it means that
+some value not congruent to $\pm 1$ when squared equals one which cannot occur if $n$ is prime.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
+\textbf{Input}. mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$. \\
+\textbf{Output}. $c = 1$ if $a$ is a Miller-Rabin prime to the base $b$, otherwise $c = 0$. \\
+\hline
+1. $a' \leftarrow a - 1$ \\
+2. $r \leftarrow a'$ \\
+3. $c \leftarrow 0, s \leftarrow 0$ \\
+4. While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}4.1 $s \leftarrow s + 1$ \\
+\hspace{3mm}4.2 $r \leftarrow \lfloor r / 2 \rfloor$ \\
+5. $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
+6. If $y \nequiv \pm 1$ then \\
+\hspace{3mm}6.1 $j \leftarrow 1$ \\
+\hspace{3mm}6.2 While $j \le (s - 1)$ and $y \nequiv a'$ \\
+\hspace{6mm}6.2.1 $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
+\hspace{6mm}6.2.2 If $y = 1$ then goto step 8. \\
+\hspace{6mm}6.2.3 $j \leftarrow j + 1$ \\
+\hspace{3mm}6.3 If $y \nequiv a'$ goto step 8. \\
+7. $c \leftarrow 1$\\
+8. Return(\textit{MP\_OKAY}). 
\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_miller\_rabin}
+\end{figure}
+\textbf{Algorithm mp\_prime\_miller\_rabin.}
+This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$. It will set $c = 1$ if the algorithm cannot determine
+if $a$ is composite or $c = 0$ if $a$ is provably composite. The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$.
+
+If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove if $a$ is composite or not. Otherwise, the algorithm will
+square $y$ up to $s - 1$ times stopping only when $y \equiv -1$. If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
+is provably composite. If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite. If $a$ is not provably
+composite then it is \textit{probably} prime.
+
+EXAM,bn_mp_prime_miller_rabin.c
+
+
+
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}, IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp. 293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Designs, Codes and Cryptography, March 2002
+
+\bibitem[7]{BARRETT}
+Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
+
+\bibitem[8]{MONT}
+P. L. Montgomery, \textit{Modular multiplication without trial division}, Mathematics of Computation, 44(170):519-521, April 1985.
+
+\bibitem[9]{DRMET}
+Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
+
+\bibitem[10]{MMB}
+J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
+
+\bibitem[11]{RSAREF}
+R. L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
+
+\bibitem[12]{DHREF}
+Whitfield Diffie, Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
+
+\bibitem[13]{IEEE}
+IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
+
+\bibitem[14]{GMP}
+GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
+
+\bibitem[15]{MPI}
+Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
+
+\bibitem[16]{OPENSSL}
+OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
+
+\bibitem[17]{LIP}
+Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
+
+\bibitem[18]{ISOC}
+JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
+
+\bibitem[19]{JAVA}
+The Sun Java Website, \url{http://java.sun.com/}
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\end{document}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tommath.tex	Tue Jun 15 14:42:57 2004 +0000
@@ -0,0 +1,10694 @@
+\documentclass[b5paper]{book}
+\usepackage{hyperref}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit abs}}
+\def\rep{{\mathit rep}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\author{\mbox{
+%\begin{small}
+\begin{tabular}{c}
+Tom St Denis \\
+Algonquin College \\
+\\
+Mads Rasmussen \\
+Open Communications Security \\
+\\
+Greg Rose \\
+QUALCOMM Australia \\
+\end{tabular}
+%\end{small}
+}
+}
+\maketitle
+This text has been placed in the public domain. This text corresponds to the v0.30 release of the
+LibTomMath project.
+
+\begin{alltt}
+Tom St Denis
+111 Banning Rd
+Ottawa, Ontario
+K2L 1C3
+Canada
+
+Phone: 1-613-836-3160
+Email: [email protected]
+\end{alltt}
+
+This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{}
+{\em book} macro package and the Perl {\em booker} package.
+
+\tableofcontents
+\listoffigures
+\chapter*{Prefaces to the Draft Edition}
+I started this text in April 2003 to complement my LibTomMath library. That is, to explain how to implement the functions
+contained in LibTomMath. The goal is to have a textbook that any Computer Science student can use when implementing their
+own multiple precision arithmetic. The plan I wanted to follow was to flesh out all the
+ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time. Chance
+would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
+text.
+
+Choosing not to waste any time I dove right into the project even before my spring semester was finished. I wrote a bit
+off and on at first. The moment my exams were finished I jumped into long 12 to 16 hour days. The result after only
+a couple of months was a ten chapter, three hundred page draft that I quickly distributed to anyone who wanted
+to read it. I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara. 
So far I have
+managed to grab a certain level of attention; having people from around the world ask me for copies of the text was certainly
+rewarding.
+
+Now we are past December 2003. By this time I had pictured that I would have at least finished my second draft of the text.
+Currently I am far off from this goal. I've done partial re-writes of chapters one, two and three but they are not even
+finished yet. I haven't given up on the project, only had some setbacks. First O'Reilly declined to publish the text, then
+Addison-Wesley, and Greg has tried another whose name I do not know. However, at this point I want to focus my energy
+onto finishing the book, not securing a contract.
+
+So why am I writing this text? It seems like a lot of work right? Most certainly it is a lot of work writing a textbook.
+Even the simplest introductory material has to be lined with references and figures. A lot of the text has to be re-written
+from point form to prose form to ensure an easier read. Why am I doing all this work for free then? Simple. My philosophy
+is quite simply ``Open Source. Open Academia. Open Minds'' which means that to achieve a goal of open minds, that is,
+people willing to accept new ideas and explore the unknown, you have to make available material they can access freely
+without hindrance.
+
+I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
+to depend upon. I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
+software. Several educational institutions use it as a matter of course and many freelance developers use it as
+part of their projects. To further my contributions I started the LibTomMath project in December 2002 aimed at providing
+multiple precision arithmetic routines that students could learn from. That is, to write routines that are not only easy
+to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.
+
+The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in. In the end, when all is
+said and done the text will be usable by educational institutions as a reference on multiple precision arithmetic.
+
+At this time I feel I should share a little information about myself. The most common question I was asked at
+Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended. The unfortunate
+truth is that I neither teach at nor attend a school of academic reputation. I'm currently at Algonquin College which
+is what I'd like to call a ``somewhat academic but mostly vocational'' college. In other words, job training.
+
+I'm a 21 year old computer science student, mostly self-taught in the areas I am aware of (which includes a half-dozen
+computer science fields, a few fields of mathematics and some English). I look forward to teaching someday but I am
+still far off from that goal.
+
+Now it would be improper for me to not introduce the rest of the text's co-authors. While they are only contributing
+corrections and editorial feedback, their support has been tremendously helpful in presenting the concepts laid out
+in the text so far. Greg has always been there for me. He has tracked my LibTom projects since their inception and even
+sent cheques to help pay tuition from time to time. His background has provided a wonderful source to bounce ideas off
+of and improve the quality of my writing. 
Mads is another fellow who has just ``been there''. I don't even recall what
+his interest in the LibTom projects is but I'm definitely glad he has been around. His ability to catch logical errors
+in my written English has saved me on several occasions to say the least.
+
+What to expect next? Well this is still a rough draft. I've only had the chance to update a few chapters. However, I've
+been getting the feeling that people are starting to use my text and I owe them some updated material. My current tentative
+plan is to edit one chapter every two weeks starting January 4th. It seems insane but my lower course load at college
+should provide ample time. By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
+people who will take it.
+
+\begin{flushright} Tom St Denis \end{flushright}
+
+\newpage
+I found the opportunity to work with Tom appealing for several reasons. Not only could I broaden my own horizons, but also
+contribute to educating others facing the problem of having to handle big number mathematical calculations.
+
+This book is Tom's child and he has been caring for and fostering the project ever since the beginning, with a clear idea of
+how he wanted the project to turn out. I have helped by proofreading the text and we have had several discussions about
+the layout and language used.
+
+I hold a masters degree in cryptography from the University of Southern Denmark and have always been interested in the
+practical aspects of cryptography.
+
+Having worked in the security consultancy business for several years in S\~{a}o Paulo, Brazil, I have been in touch with a
+great deal of work in which multiple precision mathematics was needed. Understanding the possibilities for speeding up
+multiple precision calculations is often very important since we deal with outdated machine architecture where modular
+reductions, for example, become painfully slow.
+
+This text is for people who stop and wonder when first examining algorithms such as RSA and ask
+themselves, ``You tell me this is only secure for large numbers, fine; but how do you implement these numbers?''
+
+\begin{flushright}
+Mads Rasmussen
+
+S\~{a}o Paulo - SP
+
+Brazil
+\end{flushright}
+
+\newpage
+It's all because I broke my leg. That just happened to be at about the same time that Tom asked for someone to review the section of the book about
+Karatsuba multiplication. I was laid up, alone and immobile, and thought ``Why not?'' I vaguely knew what Karatsuba multiplication was, but not
+really, so I thought I could help, learn, and stop myself from watching daytime cable TV, all at once.
+
+At the time of writing this, I've still not met Tom or Mads in meatspace. I've been following Tom's progress since his first splash on the
+sci.crypt Usenet news group. I watched him go from a clueless newbie, to the cryptographic equivalent of a reformed smoker, to a real
+contributor to the field, over a period of about two years. I've been impressed with his obvious intelligence, and astounded by his productivity.
+Of course, he's young enough to be my own child, so he doesn't have my problems with staying awake.
+
+When I reviewed that single section of the book, in its very earliest form, I was very pleasantly surprised. So I decided to collaborate more fully,
+and at least review all of it, and perhaps write some bits too. 
There's still a long way to go with it, and I have watched a number of close
+friends go through the mill of publication, so I think that the way to go is longer than Tom thinks it is. Nevertheless, it's a good effort,
+and I'm pleased to be involved with it.
+
+\begin{flushright}
+Greg Rose, Sydney, Australia, June 2003.
+\end{flushright}
+
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{Multiple Precision Arithmetic}
+
+\subsection{What is Multiple Precision Arithmetic?}
+When we think of long-hand arithmetic such as addition or multiplication we rarely consider the fact that we instinctively
+raise or lower the precision of the numbers we are dealing with. For example, in decimal we almost immediately can
+reason that $7$ times $6$ is $42$. However, $42$ has two digits of precision as opposed to the one digit we started with.
+A further multiplication by say $3$ results in the larger precision result $126$. In these few examples we have multiple
+precisions for the numbers we are working with. Despite the various levels of precision a single set\footnote{With the occasional optimization.}
+ of algorithms can be designed to accommodate them.
+
+By way of comparison a fixed or single precision operation would lose precision on various operations. For example, in
+the decimal system with fixed precision $6 \cdot 7 = 2$.
+
+Essentially at the heart of computer based multiple precision arithmetic are the same long-hand algorithms taught in
+schools to manually add, subtract, multiply and divide.
+
+\subsection{The Need for Multiple Precision Arithmetic}
+The most prevalent need for multiple precision arithmetic, often referred to as ``bignum'' math, is within the implementation
+of public-key cryptography algorithms. Algorithms such as RSA \cite{RSAREF} and Diffie-Hellman \cite{DHREF} require
+integers of significant magnitude to resist known cryptanalytic attacks. For example, at the time of this writing a
+typical RSA modulus would be at least greater than $10^{309}$. However, modern programming languages such as ISO C \cite{ISOC} and
+Java \cite{JAVA} only provide intrinsic support for integers which are relatively small and single precision.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{|r|c|}
+\hline \textbf{Data Type} & \textbf{Range} \\
+\hline char  & $-128 \ldots 127$ \\
+\hline short & $-32768 \ldots 32767$ \\
+\hline long  & $-2147483648 \ldots 2147483647$ \\
+\hline long long & $-9223372036854775808 \ldots 9223372036854775807$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Typical Data Types for the C Programming Language}
+\label{fig:ISOC}
+\end{figure}
+
+The largest data type guaranteed to be provided by the ISO C programming
+language\footnote{As per the ISO C standard. However, each compiler vendor is allowed to augment the precision as they
+see fit.} can only represent values up to $10^{19}$ as shown in figure \ref{fig:ISOC}. On its own the C language is
+insufficient to accommodate the magnitude required for the problem at hand. An RSA modulus of magnitude $10^{19}$ could be
+trivially factored\footnote{A Pollard-Rho factoring would take only $2^{16}$ time.} on the average desktop computer,
+rendering any protocol based on the algorithm insecure. Multiple precision algorithms solve this very problem by
+extending the range of representable integers while using single precision data types.
+
+Most advancements in fast multiple precision arithmetic stem from the need for faster and more efficient cryptographic
+primitives. 
Faster modular reduction and exponentiation algorithms such as Barrett's algorithm, which have appeared in
+various cryptographic journals, can render algorithms such as RSA and Diffie-Hellman more efficient. In fact, several
+major companies such as RSA Security, Certicom and Entrust have built entire product lines on the implementation and
+deployment of efficient algorithms.
+
+However, cryptography is not the only field of study that can benefit from fast multiple precision integer routines.
+Another auxiliary use of multiple precision integers is high precision floating point data types.
+The basic IEEE \cite{IEEE} standard floating point type is made up of an integer mantissa $q$, an exponent $e$ and a sign bit $s$.
+Numbers are given in the form $n = q \cdot b^e \cdot (-1)^s$ where $b = 2$ is the most common base for IEEE. Since IEEE
+floating point is meant to be implemented in hardware the precision of the mantissa is often fairly small
+(\textit{23, 48 and 64 bits}). The mantissa is merely an integer and a multiple precision integer could be used to create
+a mantissa of much larger precision than hardware alone can efficiently support. This approach could be useful where
+scientific applications must minimize the total output error over long calculations.
+
+Another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
+In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
+
+\subsection{Benefits of Multiple Precision Arithmetic}
+\index{precision}
+The benefit of multiple precision representations over single or fixed precision representations is that
+no precision is lost while representing the result of an operation which requires excess precision. For example,
+the product of two $n$-bit integers requires at least $2n$ bits of precision to be represented faithfully. A multiple
+precision algorithm would augment the precision of the destination to accommodate the result while a single precision system
+would truncate excess bits to maintain a fixed level of precision.
+
+It is possible to implement algorithms which require large integers with fixed precision algorithms. For example, elliptic
+curve cryptography (\textit{ECC}) is often implemented on smartcards by fixing the precision of the integers to the maximum
+size the system will ever need. Such an approach can lead to vastly simpler algorithms which can accommodate the
+integers required even if the host platform cannot natively accommodate them\footnote{For example, the average smartcard
+processor has an 8 bit accumulator.}. However, as efficient as such an approach may be, the resulting source code is not
+normally very flexible. It cannot, at runtime, accommodate inputs of higher magnitude than the designer anticipated.
+
+Multiple precision algorithms have the most overhead of any style of arithmetic. For the most part the
+overhead can be kept to a minimum with careful planning, but overall, it is not well suited for most memory starved
+platforms. However, multiple precision algorithms do offer the most flexibility in terms of the magnitude of the
+inputs. That is, the same algorithms based on multiple precision integers can accommodate any reasonable size input
+without the designer's explicit forethought. This leads to lower cost of ownership for the code as it only has to
+be written and tested once. 
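+
+The point about products is easy to observe with native ISO C types. The following is a minimal sketch, assuming the C99 fixed width types:
+the full $2n$-bit product of two $n$-bit operands survives only if the operands are promoted before the multiplication.
+
+\begin{verbatim}
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+int main(void)
+{
+   uint32_t a = 0xFFFFFFFFu, b = 0xFFFFFFFFu;
+
+   /* fixed precision: the product wraps modulo 2^32 and precision is lost */
+   uint32_t truncated = a * b;
+
+   /* promoting to 64 bits first keeps all 2n bits of the product */
+   uint64_t full = (uint64_t)a * b;
+
+   printf("%" PRIu32 "\n", truncated);   /* prints 1 */
+   printf("%" PRIu64 "\n", full);        /* prints 18446744065119617025 */
+   return 0;
+}
+\end{verbatim}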
+
+\section{Purpose of This Text}
+The purpose of this text is to instruct the reader regarding how to implement efficient multiple precision algorithms. That is, to explain
+not only a limited subset of the core theory behind the algorithms but also the various ``house keeping''
+elements that are neglected by authors of other texts on the subject. Several well renowned texts \cite{TAOCPV2,HAC}
+give considerably detailed explanations of the theoretical aspects of algorithms and often very little information
+regarding the practical implementation aspects.
+
+In most cases how an algorithm is explained and how it is actually implemented are two very different concepts. For
+example, the Handbook of Applied Cryptography (\textit{HAC}), algorithm 14.7 on page 594, gives a relatively simple
+algorithm for performing multiple precision integer addition. However, the description lacks any discussion concerning
+the fact that the two integer inputs may be of differing magnitudes. As a result the implementation is not as simple
+as the text would lead people to believe. Similarly the division routine (\textit{algorithm 14.20, pp. 598}) does not
+discuss how to handle sign or how to handle the dividend's decreasing magnitude in the main loop (\textit{step \#3}).
+
+Both texts also do not discuss several key optimal algorithms required such as ``Comba'' and Karatsuba multipliers
+and fast modular inversion, which we consider practical oversights. These optimal algorithms are vital to achieve
+any form of useful performance in non-trivial applications.
+
+To solve this problem the focus of this text is on the practical aspects of implementing a multiple precision integer
+package. As a case study the ``LibTomMath''\footnote{Available at \url{http://math.libtomcrypt.org}} package is used
+to demonstrate algorithms with real implementations\footnote{In the ISO C programming language.} that have been field
+tested and work very well. The LibTomMath library is freely available on the Internet for all uses and this text
+discusses a very large portion of the inner workings of the library.
+
+The algorithms that are presented will always include at least one ``pseudo-code'' description followed
+by the actual C source code that implements the algorithm. The pseudo-code can be used to implement the same
+algorithm in other programming languages as the reader sees fit.
+
+This text shall also serve as a walkthrough of the creation of multiple precision algorithms from scratch, showing
+the reader how the algorithms fit together as well as where to start on various tasks.
+
+\section{Discussion and Notation}
+\subsection{Notation}
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$ and represent
+the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$. The elements of the array $x$ are said to be the radix $\beta$ digits
+of the integer. For example, $x = (1,2,3)_{10}$ would represent the integer
+$1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.
+
+\index{mp\_int}
+The term ``mp\_int'' shall refer to a composite structure which contains the digits of the integer it represents, as well
+as auxiliary data required to manipulate the data. These additional members are discussed further in section
+\ref{sec:MPINT}. For the purposes of this text a ``multiple precision integer'' and an ``mp\_int'' are assumed to be
+synonymous. When an algorithm is specified to accept an mp\_int variable it is assumed the various auxiliary data members
+are present as well. 
An expression of the type \textit{variablename.item} implies that it should evaluate to the
+member named ``item'' of the variable. For example, a string of characters may have a member ``length'' which would
+evaluate to the number of characters in the string. If the string $a$ equals ``hello'' then it follows that
+$a.length = 5$.
+
+For certain discussions more generic algorithms are presented to help the reader understand the final algorithm used
+to solve a given problem. When an algorithm is described as accepting an integer input it is assumed the input is
+a plain integer with no additional multiple-precision members. That is, algorithms that use integers as opposed to
+mp\_ints as inputs do not concern themselves with the housekeeping operations required such as memory management. These
+algorithms will be used to establish the relevant theory which will subsequently be used to describe a multiple
+precision algorithm to solve the same problem.
+
+\subsection{Precision Notation}
+For the purposes of this text a single precision variable must be able to represent integers in the range
+$0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range
+$0 \le x < q \beta^2$. The variable $\beta$ represents the radix of a single digit of a multiple precision integer and
+must be of the form $q^p$ for $q, p \in \Z^+$. The extra radix-$q$ factor allows additions and subtractions to proceed
+without truncation of the carry. Since all modern computers are binary, it is assumed that $q$ is two, for all intents
+and purposes.
+
+\index{mp\_digit} \index{mp\_word}
+Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent
+a single precision integer type, while the data type \textbf{mp\_word} will represent a double precision integer type. In
+several algorithms (notably the Comba routines) temporary results will be stored in arrays of double precision mp\_words.
+For the purposes of this text $x_j$ will refer to the $j$'th digit of a single precision array and $\hat x_j$ will refer to
+the $j$'th digit of a double precision array. Whenever an expression is to be assigned to a double precision
+variable it is assumed that all single precision variables are promoted to double precision during the evaluation.
+Expressions that are assigned to a single precision variable are truncated to fit within the precision of a single
+precision data type.
+
+For example, if $\beta = 10^2$ a single precision data type may represent a value in the
+range $0 \le x < 10^3$, while a double precision data type may represent a value in the range $0 \le x < 10^5$. Let
+$a = 23$ and $b = 49$ represent two single precision variables. The single precision product shall be written
+as $c \leftarrow a \cdot b$ while the double precision product shall be written as $\hat c \leftarrow a \cdot b$.
+In this particular case, $\hat c = 1127$ and $c = 127$. The most significant digit of the product would not fit
+in a single precision data type and as a result $c \ne \hat c$.
+
+\subsection{Algorithm Inputs and Outputs}
+Within the algorithm descriptions all variables are assumed to be scalars of either single or double precision
+as indicated. The only exception to this rule is when variables have been indicated to be of type mp\_int. This
+distinction is important as scalars are often used as array indices and various other counters. 
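+
+In source code the single and double precision pairing maps onto a pair of typedefs. The fragment below is a minimal sketch, assuming a
+hypothetical configuration of 16-bit digits and 32-bit words (the library selects the actual types at build time); it shows a double
+precision product $\hat c$ alongside its single precision truncation $c$.
+
+\begin{verbatim}
+#include <stdio.h>
+#include <stdint.h>
+
+/* hypothetical configuration: 16-bit digits, 32-bit words */
+typedef uint16_t mp_digit;
+typedef uint32_t mp_word;
+
+int main(void)
+{
+   mp_digit a = 0xFFFF, b = 0xFFFF;
+
+   /* promote to double precision before multiplying: c_hat = 0xFFFE0001 */
+   mp_word c_hat = (mp_word)a * (mp_word)b;
+
+   /* truncate to single precision: only the low digit 0x0001 remains */
+   mp_digit c = (mp_digit)c_hat;
+
+   printf("%lu %lu\n", (unsigned long)c_hat, (unsigned long)c);
+   return 0;
+}
+\end{verbatim}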
+
+\subsection{Mathematical Expressions}
+The $\lfloor \mbox{ } \rfloor$ brackets imply an expression truncated to an integer not greater than the expression
+itself. For example, $\lfloor 5.7 \rfloor = 5$. Similarly the $\lceil \mbox{ } \rceil$ brackets imply an expression
+rounded to an integer not less than the expression itself. For example, $\lceil 5.1 \rceil = 6$. Typically when
+the $/$ division symbol is used the intention is to perform an integer division with truncation. For example,
+$5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity. When an expression is written as a
+fraction a real value division is implied, for example ${5 \over 2} = 2.5$.
+
+The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
+of the integer. For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.
+
+\subsection{Work Effort}
+\index{big-Oh}
+To measure the efficiency of the specified algorithms, a modified big-Oh notation is used. In this system all
+single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.
+That is, a single precision addition, multiplication and division are assumed to take the same time to
+complete. While this is generally not true in practice, it will simplify the discussions considerably.
+
+Some algorithms have slight advantages over others which is why some constants will not be removed in
+the notation. For example, a normal baseline multiplication (section \ref{sec:basemult}) requires $O(n^2)$ work while a
+baseline squaring (section \ref{sec:basesquare}) requires $O({{n^2 + n}\over 2})$ work. In standard big-Oh notation these
+would both be said to be equivalent to $O(n^2)$. However,
+in the context of this text this is not the case as the magnitude of the inputs will typically be rather small. As a
+result small constant factors in the work effort will make an observable difference in algorithm efficiency.
+
+All of the algorithms presented in this text have a polynomial time work level. That is, of the form
+$O(n^k)$ for $n, k \in \Z^{+}$. This will help make useful comparisons in terms of the speed of the algorithms and how
+various optimizations will help pay off in the long run.
+
+\section{Exercises}
+Within the more advanced chapters a section will be set aside to give the reader some challenging exercises related to
+the discussion at hand. These exercises are not designed to be prize winning problems, but instead to be thought
+provoking. Wherever possible the problems are forward minded, stating problems that will be answered in subsequent
+chapters. The reader is encouraged to finish the exercises as they appear to get a better understanding of the
+subject material.
+
+That being said, the problems are designed to affirm knowledge of a particular subject matter. Students in particular
+are encouraged to verify they can answer the problems correctly before moving on.
+
+Similar to the exercises of \cite[pp. ix]{TAOCPV2} these exercises are given a scoring system based on the difficulty of
+the problem. However, unlike \cite{TAOCPV2} the problems do not get nearly as hard. The scoring of these
+exercises ranges from one (the easiest) to five (the hardest). The following table summarizes the
+scoring system used. 
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|l|}
+\hline $\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
+                            & minutes to solve. Usually does not involve much computer time \\
+                            & to solve. \\
+\hline $\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
+                            & time usage. Usually requires a program to be written to \\
+                            & solve the problem. \\
+\hline $\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
+                            & of work. Usually involves trivial research and development of \\
+                            & new theory from the perspective of a student. \\
+\hline $\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
+                            & of work and research, the solution to which will demonstrate \\
+                            & a higher mastery of the subject matter. \\
+\hline $\left [ 5 \right ]$ & A hard problem that involves concepts that are difficult for a \\
+                            & novice to solve. Solutions to these problems will demonstrate a \\
+                            & complete mastery of the given subject. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Exercise Scoring System}
+\end{figure}
+
+Problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
+devising new theory. These problems are quick tests to see if the material is understood. Problems at the second level
+are also designed to be easy but will require a program or algorithm to be implemented to arrive at the answer. These
+two levels are essentially entry level questions.
+
+Problems at the third level are meant to be a bit more difficult than the first two levels. The answer is often
+fairly obvious but arriving at an exacting solution requires some thought and skill. These problems will almost always
+involve devising a new algorithm or implementing a variation of another algorithm previously presented. Readers who can
+answer these questions will feel comfortable with the concepts behind the topic at hand.
+
+Problems at the fourth level are meant to be similar to those of the level three questions except they will require
+additional research to be completed. The reader will most likely not know the answer right away, nor will the text provide
+the exact details of the answer until a subsequent chapter.
+
+Problems at the fifth level are meant to be the hardest
+problems relative to all the other problems in the chapter. People who can correctly answer fifth level problems have a
+mastery of the subject matter at hand.
+
+Often problems will be tied together. The purpose of this is to start a chain of thought that will be discussed in future chapters. The reader
+is encouraged to answer the follow-up problems and try to draw the relevance between problems.
+
+\section{Introduction to LibTomMath}
+
+\subsection{What is LibTomMath?}
+LibTomMath is a free and open source multiple precision integer library written entirely in portable ISO C. By portable it
+is meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on
+any given platform.
+
+The library has been successfully tested under numerous operating systems including Unix\footnote{All of these
+trademarks belong to their respective rightful owners.}, MacOS, Windows, Linux, PalmOS and on standalone hardware such
+as the Gameboy Advance. 
The library is designed to contain enough functionality to be able to develop applications such
+as public key cryptosystems and still maintain a relatively small footprint.
+
+\subsection{Goals of LibTomMath}
+
+Libraries which obtain the most efficiency are rarely written in a high level programming language such as C. However,
+even though this library is written entirely in ISO C, considerable care has been taken to optimize the algorithm implementations within the
+library. Specifically the code has been written to work well with the GNU C Compiler (\textit{GCC}) on both x86 and ARM
+processors. Wherever possible, highly efficient algorithms, such as Karatsuba multiplication, sliding window
+exponentiation and Montgomery reduction have been provided to make the library more efficient.
+
+Even with the nearly optimal and specialized algorithms that have been included the Application Programming Interface
+(\textit{API}) has been kept as simple as possible. Often generic place holder routines will make use of specialized
+algorithms automatically without the developer's specific attention. One such example is the generic multiplication
+algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication
+based on the magnitude of the inputs and the configuration of the library.
+
+Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project. Ideally the library should
+be source compatible with another popular library which makes it more attractive for developers to use. In this case the
+MPI library was used as an API template for all the basic functions. MPI was chosen because it is another library that fits
+in the same niche as LibTomMath. Even though LibTomMath uses MPI as the template for the function names and argument
+passing conventions, it has been written from scratch by Tom St Denis.
+
+The project is also meant to act as a learning tool for students, the logic being that no easy-to-follow ``bignum''
+library exists which can be used to teach computer science students how to perform fast and reliable multiple precision
+integer arithmetic. To this end the source code has been given quite a few comments and algorithm discussion points.
+
+\section{Choice of LibTomMath}
+LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
+for more worthy reasons. Other libraries such as GMP \cite{GMP}, MPI \cite{MPI}, LIP \cite{LIP} and OpenSSL
+\cite{OPENSSL} have multiple precision integer arithmetic routines but would not be ideal for this text for
+reasons that will be explained in the following sub-sections.
+
+\subsection{Code Base}
+The LibTomMath code base is all portable ISO C source code. This means that there are no platform dependent conditional
+segments of code littered throughout the source. This clean and uncluttered approach to the library means that a
+developer can more readily discern the true intent of a given section of source code without trying to keep track of
+what conditional code will be used.
+
+The code base of LibTomMath is well organized. Each function is in its own separate source code file
+which allows the reader to find a given function very quickly. On average there are $76$ lines of code per source
+file which makes the source very easy to follow. By comparison MPI and LIP are single file projects making code tracing
+very hard. GMP has many conditional code segments which also hinder tracing. 
+
+When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
+ which is fairly small compared to GMP (over $250$KiB). LibTomMath is slightly larger than MPI (which compiles to about
+$50$KiB) but LibTomMath is also much faster and more complete than MPI.
+
+\subsection{API Simplicity}
+LibTomMath is designed after the MPI library and shares the API design. Quite often programs that use MPI will build
+with LibTomMath without change. The function names correlate directly to the action they perform. Almost all of the
+functions share the same parameter passing convention. The learning curve is fairly shallow with the API provided
+which is an extremely valuable benefit for the student and developer alike.
+
+The LIP library is an example of a library with an API that is awkward to work with. LIP uses function names that are often ``compressed'' to
+illegible short hand. LibTomMath does not share this characteristic.
+
+The GMP library also does not return error codes. Instead it uses a POSIX.1 \cite{POSIX1} signal system where errors
+are signaled to the host application. This happens to be the fastest approach but definitely not the most versatile. In
+effect a math error (i.e. invalid input, heap error, etc) can cause a program to stop functioning which is definitely
+undesirable in many situations.
+
+\subsection{Optimizations}
+While LibTomMath is certainly not the fastest library (GMP often beats LibTomMath by a factor of two) it does
+feature a set of optimal algorithms for tasks such as modular reduction, exponentiation, multiplication and squaring. GMP
+and LIP also feature such optimizations while MPI only uses baseline algorithms with no optimizations. GMP lacks a few
+of the additional modular reduction optimizations that LibTomMath features\footnote{At the time of this writing GMP
+only had Barrett and Montgomery modular reduction algorithms.}.
+
+LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
+exponentiation. In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually
+slower than the best libraries such as GMP and OpenSSL by only a small factor.
+
+\subsection{Portability and Stability}
+LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler
+(\textit{GCC}). This means that without changes the library will build without configuration or setting up any
+variables. LIP and MPI will build ``out of the box'' as well but have numerous known bugs. Most notably the author of
+MPI has recently stopped working on his library and LIP has long since been discontinued.
+
+GMP requires a configuration script to run and will not build out of the box. GMP and LibTomMath are still in active
+development and are very stable across a variety of platforms.
+
+\subsection{Choice}
+LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
+the case study of this text. Various source files from the LibTomMath project will be included within the text. However,
+the reader is encouraged to download their own copy of the library to actually be able to work with the library. 
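+
+As a taste of the MPI-style API before the details are developed, the following fragment reads two integers from radix-10 strings, multiplies
+them and prints the product. It is a minimal sketch: only the first error return is checked and the output buffer size is chosen arbitrarily.
+
+\begin{verbatim}
+#include <stdio.h>
+#include <stdlib.h>
+#include <tommath.h>
+
+int main(void)
+{
+   mp_int a, b, c;
+   char buf[512];
+
+   /* initialize three mp_ints at once */
+   if (mp_init_multi(&a, &b, &c, NULL) != MP_OKAY) {
+      return EXIT_FAILURE;
+   }
+
+   /* read two operands from radix-10 strings and multiply them */
+   mp_read_radix(&a, "1234567890123456789", 10);
+   mp_read_radix(&b, "9876543210987654321", 10);
+   mp_mul(&a, &b, &c);
+
+   /* write the product back out as a radix-10 string */
+   mp_toradix(&c, buf, 10);
+   printf("%s\n", buf);
+
+   mp_clear_multi(&a, &b, &c, NULL);
+   return EXIT_SUCCESS;
+}
+\end{verbatim}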
+
+\chapter{Getting Started}
+\section{Library Basics}
+The trick to writing any useful library of source code is to build a solid foundation and work outwards from it. First,
+a problem along with allowable solution parameters should be identified and analyzed. In this particular case the
+inability to accommodate multiple precision integers is the problem. Furthermore, the solution must be written
+as portable source code that is reasonably efficient across several different computer platforms.
+
+After a foundation is formed the remainder of the library can be designed and implemented in a hierarchical fashion.
+That is, to implement the lowest level dependencies first and work towards the most abstract functions last. For example,
+before implementing a modular exponentiation algorithm one would implement a modular reduction algorithm.
+By building outwards from a base foundation instead of using a parallel design methodology the resulting project is
+highly modular. Being highly modular is a desirable property of any project as it often means the resulting product
+has a small footprint and updates are easy to perform.
+
+Usually when I start a project I will begin with the header file. I define the data types I think I will need and
+prototype the initial functions that are not dependent on other functions (within the library). After I
+implement these base functions I prototype more dependent functions and implement them. The process repeats until
+I implement all of the functions I require. For example, in the case of LibTomMath I implemented functions such as
+mp\_init() well before I implemented mp\_mul() and even further before I implemented mp\_exptmod(). As an example of
+why this design works, note that the Karatsuba and Toom-Cook multipliers were written \textit{after} the
+dependent function mp\_exptmod() was written. Adding the new multiplication algorithms did not require changes to the
+mp\_exptmod() function itself and lowered the total cost of ownership (\textit{so to speak}) and of development
+for new algorithms. This methodology allows new algorithms to be tested in a complete framework with relative ease.
+
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/design_process.ps}
+\caption{Design Flow of the First Few Original LibTomMath Functions.}
+\label{pic:design_process}
+\end{figure}
+\end{center}
+
+Only after the majority of the functions were in place did I pursue a less hierarchical approach to auditing and optimizing
+the source code. For example, one day I may audit the multipliers and the next day the polynomial basis functions.
+
+It only makes sense to begin the text with the preliminary data types and support algorithms required as well.
+This chapter discusses the core algorithms of the library which are the dependencies of every other algorithm.
+
+\section{What is a Multiple Precision Integer?}
+Recall that most programming languages, in particular ISO C \cite{ISOC}, only have fixed precision data types that on their own cannot
+be used to represent values larger than their precision will allow. The purpose of multiple precision algorithms is
+to use fixed precision data types to create and manipulate multiple precision integers which may represent values
+that are very large.
+
+As a well-known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits. In the decimal system
+the largest single digit value is $9$. However, by concatenating digits together larger numbers may be represented.
Newly prepended digits
+(\textit{to the left}) are said to be in a different power of ten column. That is, the number $123$ can be described as having a $1$ in the hundreds
+column, $2$ in the tens column and $3$ in the ones column. Or more formally $123 = 1 \cdot 10^2 + 2 \cdot 10^1 + 3 \cdot 10^0$. Computer based
+multiple precision arithmetic is essentially the same concept. Larger integers are represented by adjoining fixed
+precision computer words with the exception that a different radix is used.
+
+What most people probably do not think about explicitly are the various other attributes that describe a multiple precision
+integer. For example, the integer $154_{10}$ has two immediately obvious properties. First, the integer is positive,
+that is the sign of this particular integer is positive as opposed to negative. Second, the integer has three digits in
+its representation. There is an additional property that the integer possesses that does not concern pencil-and-paper
+arithmetic. The third property is how many digit placeholders are available to hold the integer.
+
+The human analogy of this third property is ensuring there is enough space on the paper to write the integer. For example,
+if one starts writing a large number too far to the right on a piece of paper they will have to erase it and move left.
+Similarly, computer algorithms must maintain strict control over memory usage to ensure that the digits of an integer
+will not exceed the allowed boundaries. These three properties make up what is known as a multiple precision
+integer or mp\_int for short.
+
+\subsection{The mp\_int Structure}
+\label{sec:MPINT}
+The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer. The ISO C standard does not provide for
+any such data type but it does provide for making composite data types known as structures. The following is the structure definition
+used within LibTomMath.
+
+\index{mp\_int}
+\begin{verbatim}
+typedef struct {
+   int used, alloc, sign;
+   mp_digit *dp;
+} mp_int;
+\end{verbatim}
+
+The mp\_int structure can be broken down as follows.
+
+\begin{enumerate}
+\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
+a given integer. The \textbf{used} count must be positive (or zero) and may not exceed the \textbf{alloc} count.
+
+\item The \textbf{alloc} parameter denotes how
+many digits are available in the array for functions to use before it has to increase in size. When the \textbf{used} count
+of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the
+array to accommodate the precision of the result.
+
+\item The pointer \textbf{dp} points to a dynamically allocated array of digits that represent the given multiple
+precision integer. It is padded with $(\textbf{alloc} - \textbf{used})$ zero digits. The array is maintained in least
+significant digit order. As a pencil and paper analogy the array is organized such that the rightmost digits are stored
+first starting at the location indexed by zero\footnote{In C all arrays begin at zero.} in the array.
For example,
+if \textbf{dp} contains $\lbrace a, b, c, \ldots \rbrace$ where \textbf{dp}$_0 = a$, \textbf{dp}$_1 = b$, \textbf{dp}$_2 = c$, $\ldots$ then
+it would represent the integer $a + b\beta + c\beta^2 + \ldots$.
+
+\index{MP\_ZPOS} \index{MP\_NEG}
+\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).
+\end{enumerate}
+
+\subsubsection{Valid mp\_int Structures}
+Several rules are placed on the state of an mp\_int structure and are assumed to be followed for reasons of efficiency.
+The only exceptions are when the structure is passed to initialization functions such as mp\_init() and mp\_init\_copy().
+
+\begin{enumerate}
+\item The value of \textbf{alloc} may not be less than one. That is \textbf{dp} always points to a previously allocated
+array of digits.
+\item The value of \textbf{used} may not exceed \textbf{alloc} and must be greater than or equal to zero.
+\item The value of \textbf{used} implies that the digit at index $(used - 1)$ of the \textbf{dp} array is non-zero. That is,
+leading zero digits in the most significant positions must be trimmed.
+   \begin{enumerate}
+   \item Digits in the \textbf{dp} array at and above the \textbf{used} location must be zero.
+   \end{enumerate}
+\item The value of \textbf{sign} must be \textbf{MP\_ZPOS} if \textbf{used} is zero;
+this represents the mp\_int value of zero.
+\end{enumerate}
+
+\section{Argument Passing}
+A convention of argument passing must be adopted early on in the development of any library. Making the function
+prototypes consistent will help eliminate many headaches in the future as the library grows to significant complexity.
+In LibTomMath the multiple precision integer functions accept parameters from left to right as pointers to mp\_int
+structures. That means that the source (input) operands are placed on the left and the destination (output) on the right.
+Consider the following examples.
+
+\begin{verbatim}
+   mp_mul(&a, &b, &c);   /* c = a * b */
+   mp_add(&a, &b, &a);   /* a = a + b */
+   mp_sqr(&a, &b);       /* b = a * a */
+\end{verbatim}
+
+The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
+functions and make sense of them. For example, the first function would read ``multiply a and b and store in c''.
+
+Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around, to mimic the order
+of assignment expressions. That is, the destination (output) is on the left and arguments (inputs) are on the right. In
+truth, it is entirely a matter of preference. In the case of LibTomMath the convention from the MPI library has been
+adopted.
+
+Another very useful design consideration, provided for in LibTomMath, is whether to allow argument sources to also be a
+destination. For example, the second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$. This is an important
+feature to implement since it allows the calling functions to cut down on the number of variables they must maintain.
+However, to implement this feature specific care has to be given to ensure the destination is not modified before the
+source is fully read.
+
+\section{Return Values}
+A well implemented application, no matter what its purpose, should trap as many runtime errors as possible and return them
+to the caller. By catching runtime errors a library can be guaranteed to prevent undefined behaviour. However, the end
+developer can still manage to cause a library to crash.
For example, by passing an invalid pointer an application may
+fault by dereferencing memory not owned by the application.
+
+In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for
+instance) and memory allocation errors. It will not check that the mp\_int passed to any function is valid nor
+will it check pointers for validity. Any function that can cause a runtime error will return an error code as an
+\textbf{int} data type with one of the following values.
+
+\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value} & \textbf{Meaning} \\
+\hline \textbf{MP\_OKAY} & The function was successful \\
+\hline \textbf{MP\_VAL} & One of the input value(s) was invalid \\
+\hline \textbf{MP\_MEM} & The function ran out of heap memory \\
+\hline
+\end{tabular}
+\end{center}
+
+When an error is detected within a function it should free any memory it allocated, often during the initialization of
+temporary mp\_ints, and return as soon as possible. The goal is to leave the system in the same state it was when the
+function was called. Error checking with this style of API is fairly simple.
+
+\begin{verbatim}
+   int err;
+   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
+      printf("Error: %s\n", mp_error_to_string(err));
+      exit(EXIT_FAILURE);
+   }
+\end{verbatim}
+
+The GMP \cite{GMP} library uses C style \textit{signals} to flag errors which is of questionable use. Not all errors are fatal
+and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
+
+\section{Initialization and Clearing}
+The logical starting point when actually writing multiple precision integer functions is the initialization and
+clearing of the mp\_int structures. These two algorithms will be used by the majority of the higher level algorithms.
+
+Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
+the integer. Often it is optimal to allocate a sufficiently large pre-set number of digits even though
+the initial integer will represent zero. If only a single digit were allocated quite a few subsequent re-allocations
+would occur when operations are performed on the integers. There is a tradeoff between how many default digits to allocate
+and how many re-allocations are tolerable. Obviously allocating an excessive amount of digits initially will waste
+memory and become unmanageable.
+
+If the memory for the digits has been successfully allocated then the rest of the members of the structure must
+be initialized. Since the initial state of an mp\_int is to represent the zero integer, the allocated digits must be set
+to zero. The \textbf{used} count is set to zero and the \textbf{sign} set to \textbf{MP\_ZPOS}.
+
+\subsection{Initializing an mp\_int}
+An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
+structure are set to valid values. The mp\_init algorithm will perform such an action.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init}. \\
+\textbf{Input}. An mp\_int $a$ \\
+\textbf{Output}. Allocate memory and initialize $a$ to a known valid mp\_int state. \\
+\hline \\
+1. Allocate memory for \textbf{MP\_PREC} digits. \\
+2. If the allocation failed return(\textit{MP\_MEM}) \\
+3. for $n$ from $0$ to $MP\_PREC - 1$ do \\
+\hspace{3mm}3.1 $a_n \leftarrow 0$\\
+4.
$a.sign \leftarrow MP\_ZPOS$\\
+5. $a.used \leftarrow 0$\\
+6. $a.alloc \leftarrow MP\_PREC$\\
+7. Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init}
+\end{figure}
+
+\textbf{Algorithm mp\_init.}
+The \textbf{MP\_PREC} name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.}
+used to dictate the minimum precision of allocated mp\_int integers. Ideally, it is at least equal to $32$ since for most
+purposes that will be more than enough.
+
+Memory for the default number of digits is allocated first. If the allocation fails the algorithm returns immediately
+with the \textbf{MP\_MEM} error code. If the allocation succeeds the remaining members of the mp\_int structure
+must be initialized to reflect the default initial state.
+
+The allocated digits are all set to zero (step three) to ensure they are in a known state. The \textbf{sign}, \textbf{used}
+and \textbf{alloc} are subsequently initialized to represent the zero integer. By step seven the algorithm returns a success
+code and the mp\_int $a$ has been successfully initialized to a valid state representing the integer zero.
+
+\textbf{Remark.}
+This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
+when the ``to'' keyword is placed between two expressions. For example, ``for $a$ from $b$ to $c$ do'' means that
+a subsequent expression (or body of expressions) is to be evaluated up to $c - b + 1$ times so long as $b \le c$. In each
+iteration the variable $a$ is substituted for a new integer that lies inclusively between $b$ and $c$. If $b > c$ occurred
+the loop would not iterate. By contrast if the ``downto'' keyword were used in place of ``to'' the loop would iterate
+decrementally.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* init a new bigint */
+018 int mp_init (mp_int * a)
+019 \{
+020   /* allocate memory required and clear it */
+021   a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
+022   if (a->dp == NULL) \{
+023     return MP_MEM;
+024   \}
+025 
+026   /* set the used to zero, allocated digits to the default precision
+027    * and sign to positive */
+028   a->used  = 0;
+029   a->alloc = MP_PREC;
+030   a->sign  = MP_ZPOS;
+031 
+032   return MP_OKAY;
+033 \}
+\end{alltt}
+\end{small}
+
+One immediate observation of this initialization function is that it does not return a pointer to an mp\_int structure. It
+is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack. The
+call to mp\_init() is used only to initialize the members of the structure to a known default state.
+
+Before any of the other members of the structure are initialized memory from the application heap is allocated with
+the calloc() function (line 21). The size of the allocated memory is large enough to hold \textbf{MP\_PREC}
+mp\_digit variables. The calloc() function is used instead\footnote{calloc() will allocate memory in the same
+manner as malloc() except that it also sets the contents to zero upon successfully allocating the memory.} of malloc()
+since digits have to be set to zero for the function to finish correctly. The \textbf{OPT\_CAST} token is a macro
+definition which will turn into a cast from void * to mp\_digit * for C++ compilers. It is not required for C compilers.
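+
+As a brief aside, the following fragment (a hypothetical usage sketch, not taken from the library) illustrates the
+calling pattern implied above: the structure itself is allocated by the caller, here on the stack, and mp\_init() merely
+prepares its members.
+
+\begin{verbatim}
+   mp_int x;                 /* structure itself lives on the stack */
+
+   if (mp_init(&x) != MP_OKAY) {
+      /* the digits could not be allocated; x must not be used */
+   }
+   /* x is now a valid mp_int representing the integer zero */
+\end{verbatim}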
+
+After the memory has been successfully allocated the remainder of the members are initialized
+(lines 28 through 30) to their respective default states. At this point the algorithm has succeeded and
+a success code is returned to the calling function.
+
+If this function returns \textbf{MP\_OKAY} it is safe to assume the mp\_int structure has been properly initialized and
+is safe to use with other functions within the library.
+
+\subsection{Clearing an mp\_int}
+When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be
+returned to the application's memory pool with the mp\_clear algorithm.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clear}. \\
+\textbf{Input}. An mp\_int $a$ \\
+\textbf{Output}. The memory for $a$ is freed for reuse. \\
+\hline \\
+1. If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
+2. for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1 $a_n \leftarrow 0$ \\
+3. Free the memory allocated for the digits of $a$. \\
+4. $a.used \leftarrow 0$ \\
+5. $a.alloc \leftarrow 0$ \\
+6. $a.sign \leftarrow MP\_ZPOS$ \\
+7. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clear}
+\end{figure}
+
+\textbf{Algorithm mp\_clear.}
+This algorithm releases the memory allocated for an mp\_int back into the memory pool for reuse. It is designed
+such that a given mp\_int structure can be cleared multiple times between initializations without attempting to
+free the memory twice\footnote{In ISO C for example, calling free() twice on the same memory block causes undefined
+behaviour.}.
+
+The first step determines if the mp\_int structure has been marked as free already. If it has, the algorithm returns
+success immediately as no further actions are required. Otherwise, the algorithm will proceed to put the structure
+in a known empty and otherwise invalid state. First the digits of the mp\_int are set to zero. The memory that has been allocated for the
+digits is then freed. The \textbf{used} and \textbf{alloc} counts are both set to zero and the \textbf{sign} set to
+\textbf{MP\_ZPOS}. This known fixed state for cleared mp\_int structures will make debugging easier for the end
+developer. That is, if they spot (via their debugger) an mp\_int they are using that is in this state it will be
+obvious that they erroneously and prematurely cleared the mp\_int structure.
+
+Note that once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
+with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* clear one (frees) */
+018 void
+019 mp_clear (mp_int * a)
+020 \{
+021   /* only do anything if a hasn't been freed previously */
+022   if (a->dp != NULL) \{
+023     /* first zero the digits */
+024     memset (a->dp, 0, sizeof (mp_digit) * a->used);
+025 
+026     /* free ram */
+027     XFREE(a->dp);
+028 
+029     /* reset members to make debugging easier */
+030     a->dp    = NULL;
+031     a->alloc = a->used = 0;
+032     a->sign  = MP_ZPOS;
+033   \}
+034 \}
+\end{alltt}
+\end{small}
+
+The ``if'' statement (line 22) prevents the heap from being corrupted if a user double-frees an
+mp\_int. This is because once the memory is freed the pointer is set to \textbf{NULL} (line 30).
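+
+As a contrived illustration (a hypothetical sketch, not taken from the library), the following fragment is harmless
+precisely because of that check:
+
+\begin{verbatim}
+   mp_int x;
+
+   if (mp_init(&x) != MP_OKAY) {
+      return EXIT_FAILURE;
+   }
+   mp_clear(&x);
+   mp_clear(&x);   /* a no-op, since x.dp was set to NULL above */
+\end{verbatim}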
+
+Without the check, code that accidentally calls mp\_clear twice for a given mp\_int structure would try to free the memory
+allocated for the digits twice. This may cause some C libraries to signal a fault. By setting the pointer to
+\textbf{NULL} it also helps debug code that may inadvertently free the mp\_int while it is still needed, because
+subsequent attempts to reference the digits should fail immediately. The allocated digits are set to zero before being freed (line 24).
+This is ideal for cryptographic situations where the integer that the mp\_int represents might need to be kept a secret.
+
+\section{Maintenance Algorithms}
+
+The previous sections describe how to initialize and clear an mp\_int structure. To further support operations
+that are to be performed on mp\_int structures (such as addition and multiplication) the dependent algorithms must be
+able to augment the precision of an mp\_int and
+initialize mp\_ints with differing initial conditions.
+
+These algorithms complete the set of low level algorithms required to work with mp\_int structures in the higher level
+algorithms such as addition, multiplication and modular exponentiation.
+
+\subsection{Augmenting an mp\_int's Precision}
+When storing a value in an mp\_int structure, a sufficient number of digits must be available to accommodate the entire
+result of an operation without loss of precision. Quite often the size of the array given by the \textbf{alloc} member
+is large enough to simply increase the \textbf{used} digit count. However, when the size of the array is too small it
+must be re-sized appropriately to accommodate the result. The mp\_grow algorithm will provide this functionality.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_grow}. \\
+\textbf{Input}. An mp\_int $a$ and an integer $b$. \\
+\textbf{Output}. $a$ is expanded to accommodate $b$ digits. \\
+\hline \\
+1. if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
+2. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+3. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+4. Re-Allocate the array of digits $a$ to size $v$ \\
+5. If the allocation failed then return(\textit{MP\_MEM}). \\
+6. for $n$ from $a.alloc$ to $v - 1$ do \\
+\hspace{+3mm}6.1 $a_n \leftarrow 0$ \\
+7. $a.alloc \leftarrow v$ \\
+8. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_grow}
+\end{figure}
+
+\textbf{Algorithm mp\_grow.}
+It is ideal to prevent re-allocations from being performed if they are not required (step one). This is useful to
+prevent mp\_ints from growing excessively in code that erroneously calls mp\_grow.
+
+The requested digit count is padded up to the next multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} (steps two and three).
+This helps prevent many trivial reallocations that would grow an mp\_int by trivially small values.
+
+It is assumed that the reallocation (step four) leaves the lower $a.alloc$ digits of the mp\_int intact. This is much
+akin to how the \textit{realloc} function from the standard C library works. Since the newly allocated digits are
+assumed to contain undefined values they are initially set to zero.
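+
+To make the padding rule of steps two and three concrete, assume for the sake of illustration that
+$MP\_PREC = 32$. A request for $b = 35$ digits gives $u = 35 \mbox{ (mod }32\mbox{)} = 3$ and
+$v = 35 + 2 \cdot 32 - 3 = 96$, that is, the array is grown to exactly three blocks of \textbf{MP\_PREC}
+digits rather than to the bare $35$.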
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* grow as required */
+018 int mp_grow (mp_int * a, int size)
+019 \{
+020   int     i;
+021   mp_digit *tmp;
+022 
+023   /* if the alloc size is smaller alloc more ram */
+024   if (a->alloc < size) \{
+025     /* ensure there are always at least MP_PREC digits extra on top */
+026     size += (MP_PREC * 2) - (size % MP_PREC);
+027 
+028     /* reallocate the array a->dp
+029      *
+030      * We store the return in a temporary variable
+031      * in case the operation failed we don't want
+032      * to overwrite the dp member of a.
+033      */
+034     tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
+035     if (tmp == NULL) \{
+036       /* reallocation failed but "a" is still valid [can be freed] */
+037       return MP_MEM;
+038     \}
+039 
+040     /* reallocation succeeded so set a->dp */
+041     a->dp = tmp;
+042 
+043     /* zero excess digits */
+044     i        = a->alloc;
+045     a->alloc = size;
+046     for (; i < a->alloc; i++) \{
+047       a->dp[i] = 0;
+048     \}
+049   \}
+050   return MP_OKAY;
+051 \}
+\end{alltt}
+\end{small}
+
+The first step is to see if we actually need to perform a re-allocation at all (line 24). If a reallocation
+must occur the digit count is padded upwards to help prevent many trivial reallocations (line 26). Next the reallocation is performed
+and the return of realloc() is stored in a temporary pointer named $tmp$ (line 34). The return is stored in a temporary
+instead of $a.dp$ to prevent the code from losing the original pointer in case the reallocation fails. Had the return been stored
+in $a.dp$ instead there would be no way to reclaim the heap originally used.
+
+If the reallocation fails the function will return \textbf{MP\_MEM} (line 37), otherwise, the value of $tmp$ is assigned
+to the pointer $a.dp$ and the function continues. A simple for loop from line 46 to line 48 will zero all digits
+that were above the old \textbf{alloc} limit to make sure the integer is in a known state.
+
+\subsection{Initializing Variable Precision mp\_ints}
+Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size
+of input mp\_ints to a given algorithm. The purpose of algorithm mp\_init\_size is similar to mp\_init except that it
+will allocate \textit{at least} a specified number of digits.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_size}. \\
+\textbf{Input}. An mp\_int $a$ and the requested number of digits $b$. \\
+\textbf{Output}. $a$ is initialized to hold at least $b$ digits. \\
+\hline \\
+1. $u \leftarrow b \mbox{ (mod }MP\_PREC\mbox{)}$ \\
+2. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+3. Allocate $v$ digits. \\
+4. for $n$ from $0$ to $v - 1$ do \\
+\hspace{3mm}4.1 $a_n \leftarrow 0$ \\
+5. $a.sign \leftarrow MP\_ZPOS$\\
+6. $a.used \leftarrow 0$\\
+7. $a.alloc \leftarrow v$\\
+8. Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_init\_size}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_size.}
+This algorithm will initialize an mp\_int structure $a$ like algorithm mp\_init with the exception that the number of
+digits allocated can be controlled by the second input argument $b$. The input size is padded upwards so it is a
+multiple of \textbf{MP\_PREC} plus an additional \textbf{MP\_PREC} digits. This padding is used to prevent trivial
+allocations from becoming a bottleneck in the rest of the algorithms.
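+
+As a hypothetical sketch (not taken from the library), a routine that knows it is about to form a product could
+reserve room for the worst case result up front and avoid any growth later on:
+
+\begin{verbatim}
+   mp_int c;
+
+   /* a product of a and b requires at most a.used + b.used digits */
+   if (mp_init_size(&c, a.used + b.used) != MP_OKAY) {
+      return MP_MEM;
+   }
+\end{verbatim}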
+
+Like algorithm mp\_init, the mp\_int structure is initialized to a default state representing the integer zero. This
+particular algorithm is useful if it is known ahead of time the approximate size of the input. If the approximation is
+correct no further memory re-allocations are required to work with the mp\_int.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* init an mp_int for a given size */
+018 int mp_init_size (mp_int * a, int size)
+019 \{
+020   /* pad size so there are always extra digits */
+021   size += (MP_PREC * 2) - (size % MP_PREC);
+022 
+023   /* alloc mem */
+024   a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), size);
+025   if (a->dp == NULL) \{
+026     return MP_MEM;
+027   \}
+028   a->used  = 0;
+029   a->alloc = size;
+030   a->sign  = MP_ZPOS;
+031 
+032   return MP_OKAY;
+033 \}
+\end{alltt}
+\end{small}
+
+The number of digits $b$ requested is padded (line 21) by first augmenting it to the next multiple of
+\textbf{MP\_PREC} and then adding \textbf{MP\_PREC} to the result. If the memory can be successfully allocated the
+mp\_int is placed in a default state representing the integer zero. Otherwise, the error code \textbf{MP\_MEM} will be
+returned (line 26).
+
+The digits are allocated and set to zero at the same time with the calloc() function (line 24). The
+\textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set
+to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines 28, 29 and 30). If the function
+returns successfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the
+functions to work with.
+
+\subsection{Multiple Integer Initializations and Clearings}
+Occasionally a function will require a series of mp\_int data types to be made available simultaneously.
+The purpose of algorithm mp\_init\_multi is to initialize a variable length array of mp\_int structures in a single
+statement. It is essentially a shortcut to multiple initializations.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_multi}. \\
+\textbf{Input}. Variable length array $V_k$ of mp\_int variables of length $k$. \\
+\textbf{Output}. The array is initialized such that each mp\_int of $V_k$ is ready to use. \\
+\hline \\
+1. for $n$ from 0 to $k - 1$ do \\
+\hspace{+3mm}1.1. Initialize the mp\_int $V_n$ (\textit{mp\_init}) \\
+\hspace{+3mm}1.2. If initialization failed then do \\
+\hspace{+6mm}1.2.1. for $j$ from $0$ to $n$ do \\
+\hspace{+9mm}1.2.1.1. Free the mp\_int $V_j$ (\textit{mp\_clear}) \\
+\hspace{+6mm}1.2.2. Return(\textit{MP\_MEM}) \\
+2. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_multi}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_multi.}
+The algorithm will initialize the array of mp\_int variables one at a time. If a runtime error has been detected
+(\textit{step 1.2}) all of the previously initialized variables are cleared. The goal is an ``all or nothing''
+initialization which allows for quick recovery from runtime errors.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_multi.c
+\vspace{-3mm}
+\begin{alltt}
+016 #include <stdarg.h>
+017 
+018 int mp_init_multi(mp_int *mp, ...) 
+019 \{
+020   mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+021   int n = 0;                 /* Number of ok inits */
+022   mp_int* cur_arg = mp;
+023   va_list args;
+024 
+025   va_start(args, mp);        /* init args to next argument from caller */
+026   while (cur_arg != NULL) \{
+027     if (mp_init(cur_arg) != MP_OKAY) \{
+028       /* Oops - error! Back-track and mp_clear what we already
+029          succeeded in init-ing, then return error.
+030       */
+031       va_list clean_args;
+032 
+033       /* end the current list */
+034       va_end(args);
+035 
+036       /* now start cleaning up */
+037       cur_arg = mp;
+038       va_start(clean_args, mp);
+039       while (n--) \{
+040         mp_clear(cur_arg);
+041         cur_arg = va_arg(clean_args, mp_int*);
+042       \}
+043       va_end(clean_args);
+044       res = MP_MEM;
+045       break;
+046     \}
+047     n++;
+048     cur_arg = va_arg(args, mp_int*);
+049   \}
+050   va_end(args);
+051   return res;                /* Assumed ok, if error flagged above. */
+052 \}
+053 
+\end{alltt}
+\end{small}
+
+This function initializes a variable length list of mp\_int structure pointers. However, instead of having the mp\_int
+structures in an actual C array they are simply passed as arguments to the function. This function makes use of the
+``...'' argument syntax of the C programming language. The list is terminated with a final \textbf{NULL} argument
+appended on the right.
+
+The function uses the ``stdarg.h'' \textit{va} functions to step portably through the arguments to the function. A count
+$n$ of successfully initialized mp\_int structures is maintained (line 47) such that if a failure does occur,
+the algorithm can backtrack and free the previously initialized structures (lines 27 to 46).
+
+
+\subsection{Clamping Excess Digits}
+When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of
+the function instead of checking during the computation. For example, a multiplication of an $i$ digit number by a
+$j$ digit number produces a result of at most $i + j$ digits. It is entirely possible that the result is $i + j - 1$
+though, with no final carry into the last position. However, suppose the destination had to be first expanded
+(\textit{via mp\_grow}) to accommodate $i + j - 1$ digits then further expanded to accommodate the final carry.
+That would be a considerable waste of time since heap operations are relatively slow.
+
+The ideal solution is to always assume the result is $i + j$ digits and fix up the \textbf{used} count after the function
+terminates. This way a single heap operation (\textit{at most}) is required. However, if the result was not checked
+there would be an excess high order zero digit.
+
+For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$. The leading zero digit
+will not contribute to the precision of the result. In fact, through subsequent operations more leading zero digits would
+accumulate to the point the size of the integer would be prohibitive. As a result even though the precision is very
+low the representation is excessively large.
+
+The mp\_clamp algorithm is designed to solve this very problem. It will trim high-order zeros by decrementing the
+\textbf{used} count until a non-zero most significant digit is found. Also in this system, zero is considered to be a
+positive number which means that if the \textbf{used} count is decremented to zero, the sign must be set to
+\textbf{MP\_ZPOS}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clamp}. \\
+\textbf{Input}. An mp\_int $a$ \\
+\textbf{Output}.
Any excess leading zero digits of $a$ are removed \\
+\hline \\
+1. while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
+\hspace{+3mm}1.1 $a.used \leftarrow a.used - 1$ \\
+2. if $a.used = 0$ then do \\
+\hspace{+3mm}2.1 $a.sign \leftarrow MP\_ZPOS$ \\
+\hline \\
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clamp}
+\end{figure}
+
+\textbf{Algorithm mp\_clamp.}
+As can be expected this algorithm is very simple. The loop on step one is expected to iterate only once or twice at
+the most. For example, this will happen in cases where there is not a carry to fill the last position. Step two fixes the sign for
+when all of the digits are zero to ensure that the mp\_int is valid at all times.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* trim unused digits 
+018  *
+019  * This is used to ensure that leading zero digits are
+020  * trimmed and the leading "used" digit will be non-zero
+021  * Typically very fast.  Also fixes the sign if there
+022  * are no more leading digits
+023  */
+024 void
+025 mp_clamp (mp_int * a)
+026 \{
+027   /* decrease used while the most significant digit is
+028    * zero.
+029    */
+030   while (a->used > 0 && a->dp[a->used - 1] == 0) \{
+031     --(a->used);
+032   \}
+033 
+034   /* reset the sign flag if used == 0 */
+035   if (a->used == 0) \{
+036     a->sign = MP_ZPOS;
+037   \}
+038 \}
+\end{alltt}
+\end{small}
+
+Note on line 30 how the test on the \textbf{used} count is made on the left of the \&\& operator. In the C programming
+language the operands of \&\& are evaluated left to right with a boolean short-circuit if any condition fails. This is
+important since if the \textbf{used} count is zero the test on the right would read before the start of the array. That is obviously
+undesirable. The parentheses on line 31 are used to make sure the \textbf{used} count is decremented and not
+the pointer ``a''.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
+ & \\
+$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations. \\
+ & \\
+$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
+ & encryption when $\beta = 2^{28}$. \\
+ & \\
+$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp. What does it prevent? \\
+ & \\
+$\left [ 1 \right ]$ & Give an example of when the algorithm mp\_init\_copy might be useful. \\
+ & \\
+\end{tabular}
+
+
+%%%
+% CHAPTER FOUR
+%%%
+
+\chapter{Basic Operations}
+
+\section{Introduction}
+In the previous chapter a series of low level algorithms were established that dealt with initializing and maintaining
+mp\_int structures. This chapter will discuss another set of seemingly non-algebraic algorithms which will form the low
+level basis of the entire library. While these algorithms are relatively trivial it is important to understand how they
+work before proceeding since these algorithms will be used almost intrinsically in the following chapters.
+
+The algorithms in this chapter deal primarily with more ``programmer'' related tasks such as creating copies of
+mp\_int structures, assigning small values to mp\_int structures and comparisons of the values mp\_int structures
+represent.
+
+\section{Assigning Values to mp\_int Structures}
+\subsection{Copying an mp\_int}
+Assigning the value that a given mp\_int structure represents to another mp\_int structure shall be known as making
+a copy for the purposes of this text. The copy of the mp\_int will be a separate entity that represents the same
+value as the mp\_int it was copied from. The mp\_copy algorithm provides this functionality.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_copy}. \\
+\textbf{Input}. An mp\_int $a$ and $b$. \\
+\textbf{Output}. Store a copy of $a$ in $b$. \\
+\hline \\
+1. If $b.alloc < a.used$ then grow $b$ to $a.used$ digits. (\textit{mp\_grow}) \\
+2. for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}2.1 $b_{n} \leftarrow a_{n}$ \\
+3. for $n$ from $a.used$ to $b.used - 1$ do \\
+\hspace{3mm}3.1 $b_{n} \leftarrow 0$ \\
+4. $b.used \leftarrow a.used$ \\
+5. $b.sign \leftarrow a.sign$ \\
+6. return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_copy.}
+This algorithm copies the mp\_int $a$ such that upon successful termination of the algorithm the mp\_int $b$ will
+represent the same integer as the mp\_int $a$. The mp\_int $b$ shall be a complete and distinct copy of the
+mp\_int $a$, meaning that the mp\_int $a$ can be modified without affecting the value of the mp\_int $b$.
+
+If $b$ does not have enough room for the digits of $a$ it must first have its precision augmented via the mp\_grow
+algorithm. The digits of $a$ are copied over the digits of $b$ and any excess digits of $b$ are set to zero (steps two
+and three). The \textbf{used} and \textbf{sign} members of $a$ are finally copied over the respective members of
+$b$.
+
+\textbf{Remark.} This algorithm also introduces a new idiosyncrasy that will be used throughout the rest of the
+text. The error return codes of other algorithms are not explicitly checked in the pseudo-code presented. For example, in
+step one of the mp\_copy algorithm the return of mp\_grow is not explicitly checked to ensure it succeeded. Text space is
+limited so it is assumed that if an algorithm fails it will clear all temporarily allocated mp\_ints and return
+the error code itself. However, the C code presented will demonstrate all of the error handling logic required to
+implement the pseudo-code.
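+
+To illustrate what that error handling logic typically looks like in a caller, consider the following hypothetical
+routine (a sketch for illustration only, not part of the library) which creates a temporary, works with it and
+propagates any error code upwards while releasing the temporary on every path:
+
+\begin{verbatim}
+   int hypothetical_op(mp_int *a, mp_int *b)
+   {
+      mp_int t;
+      int    res;
+
+      if ((res = mp_init(&t)) != MP_OKAY) {
+         return res;               /* nothing to clean up yet */
+      }
+      if ((res = mp_copy(a, &t)) != MP_OKAY) {
+         mp_clear(&t);             /* release the temporary first */
+         return res;
+      }
+      /* ... work with t and b ... */
+      mp_clear(&t);
+      return MP_OKAY;
+   }
+\end{verbatim}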
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* copy, b = a */
+018 int
+019 mp_copy (mp_int * a, mp_int * b)
+020 \{
+021   int     res, n;
+022 
+023   /* if dst == src do nothing */
+024   if (a == b) \{
+025     return MP_OKAY;
+026   \}
+027 
+028   /* grow dest */
+029   if (b->alloc < a->used) \{
+030     if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
+031       return res;
+032     \}
+033   \}
+034 
+035   /* zero b and copy the parameters over */
+036   \{
+037     register mp_digit *tmpa, *tmpb;
+038 
+039     /* pointer aliases */
+040 
+041     /* source */
+042     tmpa = a->dp;
+043 
+044     /* destination */
+045     tmpb = b->dp;
+046 
+047     /* copy all the digits */
+048     for (n = 0; n < a->used; n++) \{
+049       *tmpb++ = *tmpa++;
+050     \}
+051 
+052     /* clear high digits */
+053     for (; n < b->used; n++) \{
+054       *tmpb++ = 0;
+055     \}
+056   \}
+057 
+058   /* copy used count and sign */
+059   b->used = a->used;
+060   b->sign = a->sign;
+061   return MP_OKAY;
+062 \}
+\end{alltt}
+\end{small}
+
+Occasionally a dependent algorithm may copy an mp\_int effectively into itself such as when the input and output
+mp\_int structures passed to a function are one and the same. For this case it is optimal to return immediately without
+copying digits (line 24).
+
+The mp\_int $b$ must have enough digits to accommodate the used digits of the mp\_int $a$. If $b.alloc$ is less than
+$a.used$ the algorithm mp\_grow is used to augment the precision of $b$ (lines 29 to 33). In order to
+simplify the inner loop that copies the digits from $a$ to $b$, two aliases $tmpa$ and $tmpb$ point directly at the digits
+of the mp\_ints $a$ and $b$ respectively. These aliases (lines 42 and 45) allow the compiler to access the digits without first dereferencing the
+mp\_int pointers and then subsequently the pointer to the digits.
+
+After the aliases are established the digits from $a$ are copied into $b$ (lines 48 to 50) and then the excess
+digits of $b$ are set to zero (lines 53 to 55). Both ``for'' loops make use of the pointer aliases and in
+fact the alias for $b$ is carried through into the second ``for'' loop to clear the excess digits. This optimization
+allows the alias to stay in a machine register fairly easily between the two loops.
+
+\textbf{Remarks.} The use of pointer aliases is an implementation methodology first introduced in this function that will
+be used considerably in other functions. Technically, a pointer alias is simply a shorthand used to lower the
+number of pointer dereferencing operations required to access data. For example, a for loop may resemble
+
+\begin{alltt}
+for (x = 0; x < 100; x++) \{
+   a->num[4]->dp[x] = 0;
+\}
+\end{alltt}
+
+This could be re-written using aliases as
+
+\begin{alltt}
+mp_digit *tmpa;
+tmpa = a->num[4]->dp;
+for (x = 0; x < 100; x++) \{
+   *tmpa++ = 0;
+\}
+\end{alltt}
+
+In this case an alias is used to access the
+array of digits within an mp\_int structure directly. It may seem that a pointer alias is strictly not required
+as a compiler may optimize out the redundant pointer operations. However, there are two dominant reasons to use aliases.
+
+The first reason is that most compilers will not effectively optimize pointer arithmetic. For example, some optimizations
+may work for the Microsoft Visual C++ compiler (MSVC) and not for the GNU C Compiler (GCC). Also some optimizations may
+work for GCC and not MSVC. As such it is ideal to find a common ground for as many compilers as possible.
Pointer
+aliases optimize the code considerably before the compiler even reads the source code which means the end compiled code
+stands a better chance of being faster.
+
+The second reason is that pointer aliases often can make an algorithm simpler to read. Consider the first ``for''
+loop of the function mp\_copy() re-written to not use pointer aliases.
+
+\begin{alltt}
+   /* copy all the digits */
+   for (n = 0; n < a->used; n++) \{
+     b->dp[n] = a->dp[n];
+   \}
+\end{alltt}
+
+Whether this code is harder to read depends strongly on the individual. However, it is quantifiably slightly more
+complicated as there are four variables within the statement instead of just two.
+
+\subsubsection{Nested Statements}
+Another commonly used technique in the source routines is that certain sections of code are nested. This is used in
+particular with the pointer aliases to highlight code phases. For example, a Comba multiplier (discussed in chapter six)
+will typically have three different phases. First the temporaries are initialized, then the columns calculated and
+finally the carries are propagated. In this example the middle column production phase will typically be nested as it
+uses temporary variables and aliases the most.
+
+The nesting also simplifies the source code as variables that are nested are only valid for their scope. As a result
+the various temporary variables required do not propagate into other sections of code.
+
+
+\subsection{Creating a Clone}
+Another common operation is to make a local temporary copy of an mp\_int argument. To initialize an mp\_int
+and then copy another existing mp\_int into the newly initialized mp\_int will be known as creating a clone. This is
+useful within functions that need to modify an argument but do not wish to actually modify the original copy. The
+mp\_init\_copy algorithm has been designed to help perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_copy}. \\
+\textbf{Input}. An mp\_int $a$ and $b$\\
+\textbf{Output}. $a$ is initialized to be a copy of $b$. \\
+\hline \\
+1. Init $a$. (\textit{mp\_init}) \\
+2. Copy $b$ to $a$. (\textit{mp\_copy}) \\
+3. Return the status of the copy operation. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_copy.}
+This algorithm will initialize an mp\_int variable and copy another previously initialized mp\_int variable into it. As
+such this algorithm will perform two operations in one step.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* creates "a" then copies b into it */
+018 int mp_init_copy (mp_int * a, mp_int * b)
+019 \{
+020   int     res;
+021 
+022   if ((res = mp_init (a)) != MP_OKAY) \{
+023     return res;
+024   \}
+025   return mp_copy (b, a);
+026 \}
+\end{alltt}
+\end{small}
+
+This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}. Note that
+\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
+and \textbf{a} will be left intact.
+
+\section{Zeroing an Integer}
+Resetting an mp\_int to the default state is a common step in many algorithms. The mp\_zero algorithm will be the algorithm used to
+perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_zero}. \\
+\textbf{Input}. An mp\_int $a$ \\
+\textbf{Output}. Zero the contents of $a$ \\
+\hline \\
+1.
$a.used \leftarrow 0$ \\
+2. $a.sign \leftarrow$ MP\_ZPOS \\
+3. for $n$ from 0 to $a.alloc - 1$ do \\
+\hspace{3mm}3.1 $a_n \leftarrow 0$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_zero}
+\end{figure}
+
+\textbf{Algorithm mp\_zero.}
+This algorithm simply resets an mp\_int to the default state.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* set to zero */
+018 void
+019 mp_zero (mp_int * a)
+020 \{
+021   a->sign = MP_ZPOS;
+022   a->used = 0;
+023   memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+024 \}
+\end{alltt}
+\end{small}
+
+After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the
+\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
+
+\section{Sign Manipulation}
+\subsection{Absolute Value}
+With the mp\_int representation of an integer, calculating the absolute value is trivial. The mp\_abs algorithm will compute
+the absolute value of an mp\_int.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_abs}. \\
+\textbf{Input}. An mp\_int $a$ \\
+\textbf{Output}. Computes $b = \vert a \vert$ \\
+\hline \\
+1. Copy $a$ to $b$. (\textit{mp\_copy}) \\
+2. If the copy failed return(\textit{MP\_MEM}). \\
+3. $b.sign \leftarrow MP\_ZPOS$ \\
+4. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_abs}
+\end{figure}
+
+\textbf{Algorithm mp\_abs.}
+This algorithm computes the absolute value of an mp\_int input. First it copies $a$ over $b$. This is an example of an
+algorithm where the check in mp\_copy that determines if the source and destination are equal proves useful. This allows,
+for instance, the developer to pass the same mp\_int as the source and destination to this function without additional
+logic to handle it.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* b = |a| 
+018  *
+019  * Simple function copies the input and fixes the sign to positive
+020  */
+021 int
+022 mp_abs (mp_int * a, mp_int * b)
+023 \{
+024   int     res;
+025 
+026   /* copy a to b */
+027   if (a != b) \{
+028      if ((res = mp_copy (a, b)) != MP_OKAY) \{
+029        return res;
+030      \}
+031   \}
+032 
+033   /* force the sign of b to positive */
+034   b->sign = MP_ZPOS;
+035 
+036   return MP_OKAY;
+037 \}
+\end{alltt}
+\end{small}
+
+\subsection{Integer Negation}
+With the mp\_int representation of an integer, calculating the negation is also trivial. The mp\_neg algorithm will compute
+the negative of an mp\_int input.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_neg}. \\
+\textbf{Input}. An mp\_int $a$ \\
+\textbf{Output}. Computes $b = -a$ \\
+\hline \\
+1. Copy $a$ to $b$. (\textit{mp\_copy}) \\
+2. If the copy failed return(\textit{MP\_MEM}). \\
+3. If $a.used = 0$ then return(\textit{MP\_OKAY}). \\
+4. If $a.sign = MP\_ZPOS$ then do \\
+\hspace{3mm}4.1 $b.sign = MP\_NEG$. \\
+5. else do \\
+\hspace{3mm}5.1 $b.sign = MP\_ZPOS$. \\
+6. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_neg}
+\end{figure}
+
+\textbf{Algorithm mp\_neg.}
+This algorithm computes the negation of an input. First it copies $a$ over $b$. If $a$ has no used digits then
+the algorithm returns immediately. Otherwise it flips the sign flag and stores the result in $b$. Note that if
+$a$ had no digits then it must be positive by definition.
Had step three been omitted then the algorithm would return
+zero as negative.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* b = -a */
+018 int mp_neg (mp_int * a, mp_int * b)
+019 \{
+020   int     res;
+021   if ((res = mp_copy (a, b)) != MP_OKAY) \{
+022     return res;
+023   \}
+024   if (mp_iszero(b) != MP_YES) \{
+025     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+026   \}
+027   return MP_OKAY;
+028 \}
+\end{alltt}
+\end{small}
+
+\section{Small Constants}
+\subsection{Setting Small Constants}
+Often an mp\_int must be set to a relatively small value such as $1$ or $2$. For these cases the mp\_set algorithm is useful.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set}. \\
+\textbf{Input}. An mp\_int $a$ and a digit $b$ \\
+\textbf{Output}. Make $a$ equivalent to $b$ \\
+\hline \\
+1. Zero $a$ (\textit{mp\_zero}). \\
+2. $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
+3. $a.used \leftarrow \left \lbrace \begin{array}{ll}
+      1 & \mbox{if }a_0 > 0 \\
+      0 & \mbox{if }a_0 = 0
+      \end{array} \right .$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set}
+\end{figure}
+
+\textbf{Algorithm mp\_set.}
+This algorithm sets an mp\_int to a small single digit value. Step number 1 ensures that the integer is reset to the default state. The
+single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* set to a digit */
+018 void mp_set (mp_int * a, mp_digit b)
+019 \{
+020   mp_zero (a);
+021   a->dp[0] = b & MP_MASK;
+022   a->used  = (a->dp[0] != 0) ? 1 : 0;
+023 \}
+\end{alltt}
+\end{small}
+
+Line 20 calls mp\_zero() to clear the mp\_int and reset the sign. Line 21 copies the digit
+into the least significant location. Note the usage of a new constant \textbf{MP\_MASK}. This constant is used to quickly
+reduce an integer modulo $\beta$. Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with
+$MP\_MASK = 2^k - 1$ to perform the reduction. Finally line 22 will set the \textbf{used} member with respect to the
+digit actually set. This function will always make the integer positive.
+
+One important limitation of this function is that it will only set one digit. The size of a digit is not fixed, meaning source that uses
+this function should take that into account. Only trivially small constants can be set using this function.
+
+\subsection{Setting Large Constants}
+To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is ideal. It accepts a ``long''
+data type as input and will always treat it as a 32-bit integer.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set\_int}. \\
+\textbf{Input}. An mp\_int $a$ and a ``long'' integer $b$ \\
+\textbf{Output}. Make $a$ equivalent to $b$ \\
+\hline \\
+1. Zero $a$ (\textit{mp\_zero}) \\
+2. for $n$ from 0 to 7 do \\
+\hspace{3mm}2.1 $a \leftarrow a \cdot 16$ (\textit{mp\_mul\_2d}) \\
+\hspace{3mm}2.2 $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
+\hspace{3mm}2.3 $a_0 \leftarrow a_0 + u$ \\
+\hspace{3mm}2.4 $a.used \leftarrow a.used + 1$ \\
+3.
Clamp excess used digits (\textit{mp\_clamp}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set\_int}
+\end{figure}
+
+\textbf{Algorithm mp\_set\_int.}
+The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the
+mp\_int. Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions. In step 2.2 the
+next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is
+incremented to reflect the addition. The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
+zero digits used and the newly added four bits would be ignored.
+
+Excess zero digits are trimmed in steps 2.1 and 3 by using the higher level algorithms mp\_mul\_2d and mp\_clamp.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* set a 32-bit const */
+018 int mp_set_int (mp_int * a, unsigned long b)
+019 \{
+020   int     x, res;
+021 
+022   mp_zero (a);
+023 
+024   /* set four bits at a time */
+025   for (x = 0; x < 8; x++) \{
+026     /* shift the number up four bits */
+027     if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{
+028       return res;
+029     \}
+030 
+031     /* OR in the top four bits of the source */
+032     a->dp[0] |= (b >> 28) & 15;
+033 
+034     /* shift the source up to the next four bits */
+035     b <<= 4;
+036 
+037     /* ensure that digits are not clamped off */
+038     a->used += 1;
+039   \}
+040   mp_clamp (a);
+041   return MP_OKAY;
+042 \}
+\end{alltt}
+\end{small}
+
+This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes. The weird
+addition on line 38 ensures that the newly added in bits are counted in the number of digits. While it may not
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27
+as well as the call to mp\_clamp() on line 40. Both functions will clamp excess leading digits which keeps
+the number of used digits low.
+
+\section{Comparisons}
+\subsection{Unsigned Comparisons}
+Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers. For example,
+to compare $1,234$ to $1,264$ the digits are extracted by their positions. That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
+to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude
+positions. If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.
+
+The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
+mp\_int variables alone. It will ignore the sign of the two inputs. Such a function is useful when an absolute comparison is required or if the
+signs are known to agree in advance.
+
+To facilitate working with the results of the comparison functions three constants are required.
+ +\begin{figure}[here] +\begin{center} +\begin{tabular}{|r|l|} +\hline \textbf{Constant} & \textbf{Meaning} \\ +\hline \textbf{MP\_GT} & Greater Than \\ +\hline \textbf{MP\_EQ} & Equal To \\ +\hline \textbf{MP\_LT} & Less Than \\ +\hline +\end{tabular} +\end{center} +\caption{Comparison Return Codes} +\end{figure} + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp\_mag}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$. \\ +\textbf{Output}. Unsigned comparison results ($a$ to the left of $b$). \\ +\hline \\ +1. If $a.used > b.used$ then return(\textit{MP\_GT}) \\ +2. If $a.used < b.used$ then return(\textit{MP\_LT}) \\ +3. for n from $a.used - 1$ to 0 do \\ +\hspace{+3mm}3.1 if $a_n > b_n$ then return(\textit{MP\_GT}) \\ +\hspace{+3mm}3.2 if $a_n < b_n$ then return(\textit{MP\_LT}) \\ +4. Return(\textit{MP\_EQ}) \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_cmp\_mag} +\end{figure} + +\textbf{Algorithm mp\_cmp\_mag.} +By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return +\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$. The first two steps compare the number of digits used in both $a$ and $b$. +Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is. +If both have the same number of digits than the actual digits themselves must be compared starting at the leading digit. + +By step three both inputs must have the same number of digits so its safe to start from either $a.used - 1$ or $b.used - 1$ and count down to +the zero'th digit. If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* compare maginitude of two ints (unsigned) */ +018 int mp_cmp_mag (mp_int * a, mp_int * b) +019 \{ +020 int n; +021 mp_digit *tmpa, *tmpb; +022 +023 /* compare based on # of non-zero digits */ +024 if (a->used > b->used) \{ +025 return MP_GT; +026 \} +027 +028 if (a->used < b->used) \{ +029 return MP_LT; +030 \} +031 +032 /* alias for a */ +033 tmpa = a->dp + (a->used - 1); +034 +035 /* alias for b */ +036 tmpb = b->dp + (a->used - 1); +037 +038 /* compare based on digits */ +039 for (n = 0; n < a->used; ++n, --tmpa, --tmpb) \{ +040 if (*tmpa > *tmpb) \{ +041 return MP_GT; +042 \} +043 +044 if (*tmpa < *tmpb) \{ +045 return MP_LT; +046 \} +047 \} +048 return MP_EQ; +049 \} +\end{alltt} +\end{small} + +The two if statements on lines 24 and 28 compare the number of digits in the two inputs. These two are performed before all of the digits +are compared since it is a very cheap test to perform and can potentially save considerable time. The implementation given is also not valid +without those two statements. $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the +array of digits. + +\subsection{Signed Comparisons} +Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}). Based on an unsigned magnitude +comparison a trivial signed comparison algorithm can be written. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_cmp}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. 
Signed Comparison Results ($a$ to the left of $b$) \\
+\hline \\
+1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
+2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
+3.  if $a.sign = MP\_NEG$ then \\
+\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
+4.  Otherwise \\
+\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp.}
+The first two steps compare the signs of the two inputs.  If the signs do not agree then the algorithm can return right away with the appropriate
+comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step
+three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then
+$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* compare two ints (signed)*/
+018   int
+019   mp_cmp (mp_int * a, mp_int * b)
+020   \{
+021     /* compare based on sign */
+022     if (a->sign != b->sign) \{
+023       if (a->sign == MP_NEG) \{
+024         return MP_LT;
+025       \} else \{
+026         return MP_GT;
+027       \}
+028     \}
+029
+030     /* compare digits */
+031     if (a->sign == MP_NEG) \{
+032       /* if negative compare opposite direction */
+033       return mp_cmp_mag(b, a);
+034     \} else \{
+035       return mp_cmp_mag(a, b);
+036     \}
+037   \}
+\end{alltt}
+\end{small}
+
+The two if statements on lines 22 and 23 perform the initial sign comparison.  If the signs are not equal then whichever input
+has the positive sign is larger.  At line 31, the inputs are compared based on magnitudes.  If the signs were both negative then
+the unsigned comparison is performed in the opposite direction (\textit{line 33}).  Otherwise, the signs are assumed to
+be both positive and a forward direction unsigned comparison is performed.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
+                     & \\
+$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits \\
+                     & of two random numbers (of equal magnitude) before a difference is found. \\
+                     & \\
+$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based \\
+                     & on the observations made in the previous problem. \\
+                     &
+\end{tabular}
+
+\chapter{Basic Arithmetic}
+\section{Introduction}
+At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been
+established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These
+algorithms make use of the lower level algorithms and are the crucial building blocks for the multiplication algorithms.  It is very important
+that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms
+which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.
+
+All of the algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right
+logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real
+number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $\beta^2 = 10^2$}).
+Algebraically a binary logical shift is equivalent to a division or multiplication by a power of two.
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
+
+One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
+from the number.  For example, consider $1101_2 >> 1$; using decimal notation this would produce $110.1_2$.  However, with a logical shift the
+result is $110_2$.
+
+\section{Addition and Subtraction}
+In common twos complement fixed precision arithmetic, negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
+$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$ since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.
+As a result subtraction can be performed with a trivial series of logical operations and an addition.
+
+However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
+sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or
+subtraction algorithms with the sign fixed up appropriately.
+
+The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
+the integers respectively.
+
+\subsection{Low Level Addition}
+An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the
+trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have an ``s\_'' prefix.
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
+
+\newpage
+\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
+\hline \\
+1.  if $a.used > b.used$ then \\
+\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
+\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
+\hspace{+3mm}1.3  $x \leftarrow a$ \\
+2.  else \\
+\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
+\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
+\hspace{+3mm}2.3  $x \leftarrow b$ \\
+3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
+4.  $oldused \leftarrow c.used$ \\
+5.  $c.used \leftarrow max + 1$ \\
+6.  $u \leftarrow 0$ \\
+7.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{+3mm}7.1  $c_n \leftarrow a_n + b_n + u$ \\
+\hspace{+3mm}7.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+3mm}7.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+8.  if $min \ne max$ then do \\
+\hspace{+3mm}8.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{+6mm}8.1.1  $c_n \leftarrow x_n + u$ \\
+\hspace{+6mm}8.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+6mm}8.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  $c_{max} \leftarrow u$ \\
+10.  if $oldused > max$ then \\
+\hspace{+3mm}10.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
+\hspace{+6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_add}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_add.}
+This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.
+Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the
+MIX pseudo machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
+
+The first thing that has to be accomplished is to sort out which of the two inputs is the larger.  The addition logic
+will simply add all of the smaller input to the larger input and store that first part of the result in the
+destination.  Then it will apply a simpler addition loop to the excess digits of the larger input.
+
+The first two steps will handle sorting the inputs such that $min$ and $max$ hold the digit counts of the two
+inputs.  The variable $x$ will be an mp\_int alias for the larger input or the second input $b$ if they have the
+same number of digits.  After the inputs are sorted the destination $c$ is grown as required to accommodate the sum
+of the two inputs.  The original \textbf{used} count of $c$ is copied before being set to the new used count.
+
+At this point the first addition loop will go through as many digit positions as both inputs have in common.  The carry
+variable $u$ is set to zero outside the loop.  Inside the loop an ``addition'' step requires three statements to produce
+one digit of the sum.  First
+two digits from $a$ and $b$ are added together along with the carry $u$.  The carry of this step is extracted and stored
+in $u$ and finally the digit of the result $c_n$ is truncated within the range $0 \le c_n < \beta$.
+
+Now all of the digit positions that both inputs have in common have been exhausted.  If $min \ne max$ then $x$ is an alias
+for the input that has more digits.  A simplified addition loop is then used to essentially copy the remaining digits
+and the carry to the destination.
+
+The final carry is stored in $c_{max}$ and digits above $max$ up to $oldused$ are zeroed which completes the addition.
+
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* low level addition, based on HAC pp.594, Algorithm 14.7 */
+018   int
+019   s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     mp_int *x;
+022     int     olduse, res, min, max;
+023
+024     /* find sizes, we let |a| <= |b| which means we have to sort
+025      * them.  
"x" will point to the input with the most digits +026 */ +027 if (a->used > b->used) \{ +028 min = b->used; +029 max = a->used; +030 x = a; +031 \} else \{ +032 min = a->used; +033 max = b->used; +034 x = b; +035 \} +036 +037 /* init result */ +038 if (c->alloc < max + 1) \{ +039 if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{ +040 return res; +041 \} +042 \} +043 +044 /* get old used digit count and set new one */ +045 olduse = c->used; +046 c->used = max + 1; +047 +048 \{ +049 register mp_digit u, *tmpa, *tmpb, *tmpc; +050 register int i; +051 +052 /* alias for digit pointers */ +053 +054 /* first input */ +055 tmpa = a->dp; +056 +057 /* second input */ +058 tmpb = b->dp; +059 +060 /* destination */ +061 tmpc = c->dp; +062 +063 /* zero the carry */ +064 u = 0; +065 for (i = 0; i < min; i++) \{ +066 /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ +067 *tmpc = *tmpa++ + *tmpb++ + u; +068 +069 /* U = carry bit of T[i] */ +070 u = *tmpc >> ((mp_digit)DIGIT_BIT); +071 +072 /* take away carry bit from T[i] */ +073 *tmpc++ &= MP_MASK; +074 \} +075 +076 /* now copy higher words if any, that is in A+B +077 * if A or B has more digits add those in +078 */ +079 if (min != max) \{ +080 for (; i < max; i++) \{ +081 /* T[i] = X[i] + U */ +082 *tmpc = x->dp[i] + u; +083 +084 /* U = carry bit of T[i] */ +085 u = *tmpc >> ((mp_digit)DIGIT_BIT); +086 +087 /* take away carry bit from T[i] */ +088 *tmpc++ &= MP_MASK; +089 \} +090 \} +091 +092 /* add carry */ +093 *tmpc++ = u; +094 +095 /* clear digits above oldused */ +096 for (i = c->used; i < olduse; i++) \{ +097 *tmpc++ = 0; +098 \} +099 \} +100 +101 mp_clamp (c); +102 return MP_OKAY; +103 \} +\end{alltt} +\end{small} + +Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables. Note that $x$ is a pointer to a +mp\_int assigned to the largest input, in effect it is a local alias. Lines 37 to 42 ensure that the destination is grown to +accomodate the result of the addition. + +Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style. The three aliases that are on +lines 55, 58 and 61 represent the two inputs and destination variables respectively. These aliases are used to ensure the +compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int. + +The initial carry $u$ is cleared on line 64, note that $u$ is of type mp\_digit which ensures type compatibility within the +implementation. The initial addition loop begins on line 65 and ends on line 74. Similarly the conditional addition loop +begins on line 80 and ends on line 90. The addition is finished with the final carry being stored in $tmpc$ on line 93. +Note the ``++'' operator on the same line. After line 93 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful +for the next loop on lines 96 to 99 which set any old upper digits to zero. + +\subsection{Low Level Subtraction} +The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm. The principle difference is that the +unsigned subtraction algorithm requires the result to be positive. That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must +be met for this algorithm to function properly. Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly. +This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms. 
+ + +For this algorithm a new variable is required to make the description simpler. Recall from section 1.3.1 that a mp\_digit must be able to represent +the range $0 \le x < 2\beta$ for the algorithms to work correctly. However, it is allowable that a mp\_digit represent a larger range of values. For +this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a +mp\_digit (\textit{this implies $2^{\gamma} > \beta$}). + +For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$. In ISO C an ``unsigned long'' +data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$. + +\newpage\begin{figure}[!here] +\begin{center} +\begin{small} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_sub}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\ +\textbf{Output}. The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\ +\hline \\ +1. $min \leftarrow b.used$ \\ +2. $max \leftarrow a.used$ \\ +3. If $c.alloc < max$ then grow $c$ to hold at least $max$ digits. (\textit{mp\_grow}) \\ +4. $oldused \leftarrow c.used$ \\ +5. $c.used \leftarrow max$ \\ +6. $u \leftarrow 0$ \\ +7. for $n$ from $0$ to $min - 1$ do \\ +\hspace{3mm}7.1 $c_n \leftarrow a_n - b_n - u$ \\ +\hspace{3mm}7.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ +\hspace{3mm}7.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +8. if $min < max$ then do \\ +\hspace{3mm}8.1 for $n$ from $min$ to $max - 1$ do \\ +\hspace{6mm}8.1.1 $c_n \leftarrow a_n - u$ \\ +\hspace{6mm}8.1.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ +\hspace{6mm}8.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ +9. if $oldused > max$ then do \\ +\hspace{3mm}9.1 for $n$ from $max$ to $oldused - 1$ do \\ +\hspace{6mm}9.1.1 $c_n \leftarrow 0$ \\ +10. Clamp excess digits of $c$. (\textit{mp\_clamp}). \\ +11. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{small} +\end{center} +\caption{Algorithm s\_mp\_sub} +\end{figure} + +\textbf{Algorithm s\_mp\_sub.} +This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive. That is when +passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly. This +algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well. As was the case +of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude. + +The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude of $b$. Steps 1 and 2 +set the $min$ and $max$ variables. Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at +most $max$ digits in length as opposed to $max + 1$. Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and +set to the maximal count for the operation. + +The subtraction loop that begins on step seven is essentially the same as the addition loop of algorithm s\_mp\_add except single precision +subtraction is used instead. Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction +loops. 
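+
+Before proceeding, the following standalone program (a demonstration, not LibTomMath source) shows the style of borrow
+extraction in question, taking $\gamma$ from the width of a plain ``unsigned long'':
+
+\begin{alltt}
+#include <limits.h>
+#include <stdio.h>
+
+int main(void)
+\{
+   unsigned long a = 5, b = 7;
+   unsigned long t = a - b;   /* wraps around modulo 2^gamma */
+
+   /* shift the sign bit down gamma - 1 places: 1 iff a < b */
+   unsigned long u = t >> (CHAR_BIT * sizeof(unsigned long) - 1);
+   printf("borrow = %lu\symbol{92}n", u);   /* prints: borrow = 1 */
+   return 0;
+\}
+\end{alltt}
+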
Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry. + +For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$. The least significant bit will force a carry upwards to +the third bit which will be set to zero after the borrow. After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain, When the +third bit of $0101_2$ is subtracted from the result it will cause another carry. In this case though the carry will be forced to propagate all the +way to the most significant bit. + +Recall that $\beta < 2^{\gamma}$. This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most +significant bit. Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that +is needed is a single zero or one bit for the carry. Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the +carry. This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed. + +If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$. Step +10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */ +018 int +019 s_mp_sub (mp_int * a, mp_int * b, mp_int * c) +020 \{ +021 int olduse, res, min, max; +022 +023 /* find sizes */ +024 min = b->used; +025 max = a->used; +026 +027 /* init result */ +028 if (c->alloc < max) \{ +029 if ((res = mp_grow (c, max)) != MP_OKAY) \{ +030 return res; +031 \} +032 \} +033 olduse = c->used; +034 c->used = max; +035 +036 \{ +037 register mp_digit u, *tmpa, *tmpb, *tmpc; +038 register int i; +039 +040 /* alias for digit pointers */ +041 tmpa = a->dp; +042 tmpb = b->dp; +043 tmpc = c->dp; +044 +045 /* set carry to zero */ +046 u = 0; +047 for (i = 0; i < min; i++) \{ +048 /* T[i] = A[i] - B[i] - U */ +049 *tmpc = *tmpa++ - *tmpb++ - u; +050 +051 /* U = carry bit of T[i] +052 * Note this saves performing an AND operation since +053 * if a carry does occur it will propagate all the way to the +054 * MSB. As a result a single shift is enough to get the carry +055 */ +056 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); +057 +058 /* Clear carry from T[i] */ +059 *tmpc++ &= MP_MASK; +060 \} +061 +062 /* now copy higher words if any, e.g. if A has more digits than B */ +063 for (; i < max; i++) \{ +064 /* T[i] = A[i] - U */ +065 *tmpc = *tmpa++ - u; +066 +067 /* U = carry bit of T[i] */ +068 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); +069 +070 /* Clear carry from T[i] */ +071 *tmpc++ &= MP_MASK; +072 \} +073 +074 /* clear digits above used (since we may not have grown result above) */ + +075 for (i = c->used; i < olduse; i++) \{ +076 *tmpc++ = 0; +077 \} +078 \} +079 +080 mp_clamp (c); +081 return MP_OKAY; +082 \} +083 +\end{alltt} +\end{small} + +Line 24 and 25 perform the initial hardcoded sorting of the inputs. In reality the $min$ and $max$ variables are only aliases and are only +used to make the source code easier to read. Again the pointer alias optimization is used within this algorithm. 
Lines 41, 42 and 43 initialize the aliases for
+$a$, $b$ and $c$ respectively.
+
+The first subtraction loop occurs on lines 46 through 60.  The theory behind the subtraction loop is exactly the same as that for
+the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry
+(\textit{see line 56}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND
+the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
+occurs from subtraction.  This method requires two relatively cheap operations to extract the carry.  The other method is to simply
+shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
+twos complement machines, which is a safe assumption to make.
+
+If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines 63 through 72}) is required to propagate the carry through
+$a$ and copy the result to $c$.
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established, an effective high level signed addition algorithm can be
+constructed.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data
+types.
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign}
+flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The signed addition $c = a + b$. \\
+\hline \\
+1.  if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from
+either \cite{TAOCPV2} or \cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly
+straightforward but restricted since subtraction can only produce positive results.
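+
+For example, consider computing $5 + (-7)$.  The signs differ and $\vert 5 \vert < \vert -7 \vert$, so step 2.1 applies: the result
+takes the sign of $b$ and the magnitude $\vert b \vert - \vert a \vert = 7 - 5 = 2$, producing $c = -2$ as expected.
+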
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\label{fig:AddChart}
+\end{figure}
+
+Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three
+specific cases need to be handled.  The return codes of the unsigned operations at steps 1.2, 2.1.2 and 2.2.2 are
+forwarded to step three to check for errors.  This simplifies the description of the algorithm considerably and closely
+follows how the implementation was actually written.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.
+
+For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$.  However, since the sign is set first and then the unsigned addition is performed, the subsequent usage of algorithm mp\_clamp
+within algorithm s\_mp\_add will force $-0$ to become $0$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* high level addition (handles signs) */
+018   int mp_add (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     sa, sb, res;
+021
+022     /* get sign of both inputs */
+023     sa = a->sign;
+024     sb = b->sign;
+025
+026     /* handle two cases, not four */
+027     if (sa == sb) \{
+028       /* both positive or both negative */
+029       /* add their magnitudes, copy the sign */
+030       c->sign = sa;
+031       res = s_mp_add (a, b, c);
+032     \} else \{
+033       /* one positive, the other negative */
+034       /* subtract the one with the greater magnitude from */
+035       /* the one of the lesser magnitude.  The result gets */
+036       /* the sign of the one with the greater magnitude. */
+037       if (mp_cmp_mag (a, b) == MP_LT) \{
+038         c->sign = sb;
+039         res = s_mp_sub (b, a, c);
+040       \} else \{
+041         c->sign = sa;
+042         res = s_mp_sub (a, b, c);
+043       \}
+044     \}
+045     return res;
+046   \}
+047
+\end{alltt}
+\end{small}
+
+The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass the result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is that this algorithm will succeed or fail exactly when the lower
+level functions do.  Returning their return code is sufficient.
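+
+To close out, a short hypothetical driver (not from the library sources) exercising the signed addition:
+
+\begin{alltt}
+#include <tommath.h>   /* assumed header name for the library */
+
+int demo(void)
+\{
+   mp_int a, b, c;
+   int res;
+
+   if ((res = mp_init_multi(&a, &b, &c, NULL)) != MP_OKAY) \{
+      return res;
+   \}
+   mp_set(&a, 42);
+   mp_set(&b, 7);
+   b.sign = MP_NEG;            /* b = -7 */
+
+   /* signs differ and |a| > |b|, so c = |a| - |b| = 35, sign of a */
+   res = mp_add(&a, &b, &c);
+
+   mp_clear_multi(&a, &b, &c, NULL);
+   return res;
+\}
+\end{alltt}
+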
+ +\subsection{High Level Subtraction} +The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm. + +\newpage\begin{figure}[!here] +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_sub}. \\ +\textbf{Input}. Two mp\_ints $a$ and $b$ \\ +\textbf{Output}. The signed subtraction $c = a - b$. \\ +\hline \\ +1. if $a.sign \ne b.sign$ then do \\ +\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\ +2. else do \\ +\hspace{3mm}2.1 if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\ +\hspace{6mm}2.1.1 $c.sign \leftarrow a.sign$ \\ +\hspace{6mm}2.1.2 $c \leftarrow \vert a \vert - \vert b \vert$ (\textit{s\_mp\_sub}) \\ +\hspace{3mm}2.2 else do \\ +\hspace{6mm}2.2.1 $c.sign \leftarrow \left \lbrace \begin{array}{ll} + MP\_ZPOS & \mbox{if }a.sign = MP\_NEG \\ + MP\_NEG & \mbox{otherwise} \\ + \end{array} \right .$ \\ +\hspace{6mm}2.2.2 $c \leftarrow \vert b \vert - \vert a \vert$ \\ +3. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\caption{Algorithm mp\_sub} +\end{figure} + +\textbf{Algorithm mp\_sub.} +This algorithm performs the signed subtraction of two inputs. Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or +\cite{HAC}. Also this algorithm is restricted by algorithm s\_mp\_sub. Chart \ref{fig:SubChart} lists the eight possible inputs and +the operations required. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{|c|c|c|c|c|} +\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ +\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $+$ & $-$ & No & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ +\hline $-$ & $+$ & No & $c = a + b$ & $a.sign$ \\ +\hline &&&& \\ +\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ +\hline &&&& \\ +\hline $+$ & $+$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ +\hline $-$ & $-$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Subtraction Guide Chart} +\label{fig:SubChart} +\end{figure} + +Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction. That is to prevent the +algorithm from producing $-a - -a = -0$ as a result. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* high level subtraction (handles signs) */ +018 int +019 mp_sub (mp_int * a, mp_int * b, mp_int * c) +020 \{ +021 int sa, sb, res; +022 +023 sa = a->sign; +024 sb = b->sign; +025 +026 if (sa != sb) \{ +027 /* subtract a negative from a positive, OR */ +028 /* subtract a positive from a negative. */ +029 /* In either case, ADD their magnitudes, */ +030 /* and use the sign of the first number. */ +031 c->sign = sa; +032 res = s_mp_add (a, b, c); +033 \} else \{ +034 /* subtract a positive from a positive, OR */ +035 /* subtract a negative from a negative. */ +036 /* First, take the difference between their */ +037 /* magnitudes, then... 
*/
+038     if (mp_cmp_mag (a, b) != MP_LT) \{
+039       /* Copy the sign from the first */
+040       c->sign = sa;
+041       /* The first has a larger or equal magnitude */
+042       res = s_mp_sub (a, b, c);
+043     \} else \{
+044       /* The result has the *opposite* sign from */
+045       /* the first number. */
+046       c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+047       /* The second has a larger magnitude */
+048       res = s_mp_sub (b, a, c);
+049     \}
+050   \}
+051     return res;
+052   \}
+053
+\end{alltt}
+\end{small}
+
+Much like in the implementation of algorithm mp\_add, the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function.  On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a
+``greater than or equal to'' comparison.
+
+\section{Bit and Digit Shifting}
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.
+This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.
+
+In order to facilitate operations on polynomials in $x$ as above, a series of simple ``digit'' algorithms have to be established.  That is to shift
+the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient
+operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = 2a$. \\
+\hline \\
+1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
+2.  $oldused \leftarrow b.used$ \\
+3.  $b.used \leftarrow a.used$ \\
+4.  $r \leftarrow 0$ \\
+5.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}5.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.3  $r \leftarrow rr$ \\
+6.  If $r \ne 0$ then do \\
+\hspace{3mm}6.1  $b_{n + 1} \leftarrow r$ \\
+\hspace{3mm}6.2  $b.used \leftarrow b.used + 1$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply an mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such
+an algorithm despite the fact it arises often in other algorithms.  The algorithm is set up much like the lower level algorithm s\_mp\_add since
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.
+
+Step 1 grows the destination as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
+is set to $a.used$ at step 3.  Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 5 is an optimized implementation of the addition loop for this specific case.  That is, since the two values being added together
+are the same, there is no need to perform two reads from the digits of $a$.  Step 5.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration.  Step 5.2 calculates the $n$'th digit of the result as a single precision shift of $a_n$ plus
+the previous carry.  Recall from section 4.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop finishes by
+forwarding the carry to the next iteration.
+
+Step 6 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.
+Step 7 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* b = a*2 */
+018   int mp_mul_2(mp_int * a, mp_int * b)
+019   \{
+020     int     x, res, oldused;
+021
+022     /* grow to accommodate result */
+023     if (b->alloc < a->used + 1) \{
+024       if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{
+025         return res;
+026       \}
+027     \}
+028
+029     oldused = b->used;
+030     b->used = a->used;
+031
+032     \{
+033       register mp_digit r, rr, *tmpa, *tmpb;
+034
+035       /* alias for source */
+036       tmpa = a->dp;
+037
+038       /* alias for dest */
+039       tmpb = b->dp;
+040
+041       /* carry */
+042       r = 0;
+043       for (x = 0; x < a->used; x++) \{
+044
+045         /* get what will be the *next* carry bit from the
+046          * MSB of the current digit
+047          */
+048         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+049
+050         /* now shift up this digit, add in the carry [from the previous] */
+051         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+052
+053         /* copy the carry that would be from the source
+054          * digit into the next iteration
+055          */
+056         r = rr;
+057       \}
+058
+059       /* new leading digit? */
+060       if (r != 0) \{
+061         /* add a MSB which is always 1 at this point */
+062         *tmpb = 1;
+063         ++(b->used);
+064       \}
+065
+066       /* now zero any excess digits on the destination
+067        * that we didn't write to
+068        */
+069       tmpb = b->dp + b->used;
+070       for (x = b->used; x < oldused; x++) \{
+071         *tmpb++ = 0;
+072       \}
+073     \}
+074     b->sign = a->sign;
+075     return MP_OKAY;
+076   \}
+\end{alltt}
+\end{small}
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
+is the use of the logical shift operator on line 51 to perform a single precision doubling.
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = a/2$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $b.used < oldused - 1$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  
$b.sign \leftarrow a.sign$ \\ +9. Clamp excess digits of $b$. (\textit{mp\_clamp}) \\ +10. Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_div\_2} +\end{figure} + +\textbf{Algorithm mp\_div\_2.} +This algorithm will divide an mp\_int by two using logical shifts to the right. Like mp\_mul\_2 it uses a modified low level addition +core as the basis of the algorithm. Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit. The algorithm +could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent +reading past the end of the array of digits. + +Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the +least significant bit not the most significant bit. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* b = a/2 */ +018 int mp_div_2(mp_int * a, mp_int * b) +019 \{ +020 int x, res, oldused; +021 +022 /* copy */ +023 if (b->alloc < a->used) \{ +024 if ((res = mp_grow (b, a->used)) != MP_OKAY) \{ +025 return res; +026 \} +027 \} +028 +029 oldused = b->used; +030 b->used = a->used; +031 \{ +032 register mp_digit r, rr, *tmpa, *tmpb; +033 +034 /* source alias */ +035 tmpa = a->dp + b->used - 1; +036 +037 /* dest alias */ +038 tmpb = b->dp + b->used - 1; +039 +040 /* carry */ +041 r = 0; +042 for (x = b->used - 1; x >= 0; x--) \{ +043 /* get the carry for the next iteration */ +044 rr = *tmpa & 1; +045 +046 /* shift the current digit, add in carry and store */ +047 *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1)); +048 +049 /* forward carry to next iteration */ +050 r = rr; +051 \} +052 +053 /* zero excess digits */ +054 tmpb = b->dp + b->used; +055 for (x = b->used; x < oldused; x++) \{ +056 *tmpb++ = 0; +057 \} +058 \} +059 b->sign = a->sign; +060 mp_clamp (b); +061 return MP_OKAY; +062 \} +\end{alltt} +\end{small} + +\section{Polynomial Basis Operations} +Recall from section 4.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$. Such a representation is also known as +the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single +place. The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer +division and Karatsuba multiplication. + +Converting from an array of digits to polynomial basis is very simple. Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that +$y = \sum_{i=0}^{2} a_i \beta^i$. Simply replace $\beta$ with $x$ and the expression is in polynomial basis. For example, $f(x) = 8x + 9$ is the +polynomial basis representation for $89$ using radix ten. That is, $f(10) = 8(10) + 9 = 89$. + +\subsection{Multiplication by $x$} + +Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one +degree. In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$. From a scalar basis point of view multiplying by $x$ is equivalent to +multiplying by the integer $\beta$. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_lshd}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. 
$a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\ +\hline \\ +1. If $b \le 0$ then return(\textit{MP\_OKAY}). \\ +2. If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits. (\textit{mp\_grow}). \\ +3. If the reallocation failed return(\textit{MP\_MEM}). \\ +4. $a.used \leftarrow a.used + b$ \\ +5. $i \leftarrow a.used - 1$ \\ +6. $j \leftarrow a.used - 1 - b$ \\ +7. for $n$ from $a.used - 1$ to $b$ do \\ +\hspace{3mm}7.1 $a_{i} \leftarrow a_{j}$ \\ +\hspace{3mm}7.2 $i \leftarrow i - 1$ \\ +\hspace{3mm}7.3 $j \leftarrow j - 1$ \\ +8. for $n$ from 0 to $b - 1$ do \\ +\hspace{3mm}8.1 $a_n \leftarrow 0$ \\ +9. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_lshd} +\end{figure} + +\textbf{Algorithm mp\_lshd.} +This algorithm multiplies an mp\_int by the $b$'th power of $x$. This is equivalent to multiplying by $\beta^b$. The algorithm differs +from the other algorithms presented so far as it performs the operation in place instead storing the result in a separate location. The +motivation behind this change is due to the way this function is typically used. Algorithms such as mp\_add store the result in an optionally +different third mp\_int because the original inputs are often still required. Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is +typically used on values where the original value is no longer required. The algorithm will return success immediately if +$b \le 0$ since the rest of algorithm is only valid when $b > 0$. + +First the destination $a$ is grown as required to accomodate the result. The counters $i$ and $j$ are used to form a \textit{sliding window} over +the digits of $a$ of length $b$. The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}). +The loop on step 7 copies the digit from the tail to the head. In each iteration the window is moved down one digit. The last loop on +step 8 sets the lower $b$ digits to zero. + +\newpage +\begin{center} +\begin{figure}[here] +\includegraphics{pics/sliding_window.ps} +\caption{Sliding Window Movement} +\label{pic:sliding_window} +\end{figure} +\end{center} + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* shift left a certain amount of digits */ +018 int mp_lshd (mp_int * a, int b) +019 \{ +020 int x, res; +021 +022 /* if its less than zero return */ +023 if (b <= 0) \{ +024 return MP_OKAY; +025 \} +026 +027 /* grow to fit the new digits */ +028 if (a->alloc < a->used + b) \{ +029 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{ +030 return res; +031 \} +032 \} +033 +034 \{ +035 register mp_digit *top, *bottom; +036 +037 /* increment the used by the shift amount then copy upwards */ +038 a->used += b; +039 +040 /* top */ +041 top = a->dp + a->used - 1; +042 +043 /* base */ +044 bottom = a->dp + a->used - 1 - b; +045 +046 /* much like mp_rshd this is implemented using a sliding window +047 * except the window goes the otherway around. Copying from +048 * the bottom to the top. see bn_mp_rshd.c for more info. +049 */ +050 for (x = a->used - 1; x >= b; x--) \{ +051 *top-- = *bottom--; +052 \} +053 +054 /* zero the lower digits */ +055 top = a->dp; +056 for (x = 0; x < b; x++) \{ +057 *top++ = 0; +058 \} +059 \} +060 return MP_OKAY; +061 \} +\end{alltt} +\end{small} + +The if statement on line 23 ensures that the $b$ variable is greater than zero. 
The \textbf{used} count is incremented by $b$ before +the copy loop begins. This elminates the need for an additional variable in the for loop. The variable $top$ on line 41 is an alias +for the leading digit while $bottom$ on line 44 is an alias for the trailing edge. The aliases form a window of exactly $b$ digits +over the input. + +\subsection{Division by $x$} + +Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_rshd}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\ +\hline \\ +1. If $b \le 0$ then return. \\ +2. If $a.used \le b$ then do \\ +\hspace{3mm}2.1 Zero $a$. (\textit{mp\_zero}). \\ +\hspace{3mm}2.2 Return. \\ +3. $i \leftarrow 0$ \\ +4. $j \leftarrow b$ \\ +5. for $n$ from 0 to $a.used - b - 1$ do \\ +\hspace{3mm}5.1 $a_i \leftarrow a_j$ \\ +\hspace{3mm}5.2 $i \leftarrow i + 1$ \\ +\hspace{3mm}5.3 $j \leftarrow j + 1$ \\ +6. for $n$ from $a.used - b$ to $a.used - 1$ do \\ +\hspace{3mm}6.1 $a_n \leftarrow 0$ \\ +7. $a.used \leftarrow a.used - b$ \\ +8. Return. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_rshd} +\end{figure} + +\textbf{Algorithm mp\_rshd.} +This algorithm divides the input in place by the $b$'th power of $x$. It is analogous to dividing by a $\beta^b$ but much quicker since +it does not require single precision division. This algorithm does not actually return an error code as it cannot fail. + +If the input $b$ is less than one the algorithm quickly returns without performing any work. If the \textbf{used} count is less than or equal +to the shift count $b$ then it will simply zero the input and return. + +After the trivial cases of inputs have been handled the sliding window is setup. Much like the case of algorithm mp\_lshd a sliding window that +is $b$ digits wide is used to copy the digits. Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit. +Also the digits are copied from the leading to the trailing edge. + +Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* shift right a certain amount of digits */ +018 void mp_rshd (mp_int * a, int b) +019 \{ +020 int x; +021 +022 /* if b <= 0 then ignore it */ +023 if (b <= 0) \{ +024 return; +025 \} +026 +027 /* if b > used then simply zero it and return */ +028 if (a->used <= b) \{ +029 mp_zero (a); +030 return; +031 \} +032 +033 \{ +034 register mp_digit *bottom, *top; +035 +036 /* shift the digits down */ +037 +038 /* bottom */ +039 bottom = a->dp; +040 +041 /* top [offset into digits] */ +042 top = a->dp + b; +043 +044 /* this is implemented as a sliding window where +045 * the window is b-digits long and digits from +046 * the top of the window are copied to the bottom +047 * +048 * e.g. +049 +050 b-2 | b-1 | b0 | b1 | b2 | ... 
| bb | ---->
+051                /\symbol{92}                   | ---->
+052                 \symbol{92}-------------------/      ---->
+053      */
+054     for (x = 0; x < (a->used - b); x++) \{
+055       *bottom++ = *top++;
+056     \}
+057
+058     /* zero the top digits */
+059     for (; x < a->used; x++) \{
+060       *bottom++ = 0;
+061     \}
+062   \}
+063
+064   /* remove excess digits */
+065   a->used -= b;
+066 \}
+\end{alltt}
+\end{small}
+
+The only noteworthy element of this routine is the lack of a return type.
+
+-- Will update later to give it a return type...Tom
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist, algorithms for moving the ``in between'' distances are required.  For
+example, the ability to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
+2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
+\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
+5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $d \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+\hspace{3mm}6.4  If $r > 0$ then do \\
+\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than
+$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$
+left.
+
+After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts
+required.  If it is non-zero a modified shift loop is used to calculate the remaining product.
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.
+
+This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$ digits it takes $2n$ ``time'' to
+complete.
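+
+As a hedged usage fragment (error handling elided, names following the listing that comes next), the decomposition above
+means a single call suffices for any shift count:
+
+\begin{alltt}
+/* Hypothetical fragment: multiply a by 2^37 in place.  With
+ * beta = 2^28 this costs one whole digit shift plus a 9 bit
+ * partial shift.  The implementation permits the source and
+ * destination to be the same mp_int. */
+if ((res = mp_mul_2d(&a, 37, &a)) != MP_OKAY) \{
+   return res;
+\}
+\end{alltt}
+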
It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* shift left by a certain bit count */ +018 int mp_mul_2d (mp_int * a, int b, mp_int * c) +019 \{ +020 mp_digit d; +021 int res; +022 +023 /* copy */ +024 if (a != c) \{ +025 if ((res = mp_copy (a, c)) != MP_OKAY) \{ +026 return res; +027 \} +028 \} +029 +030 if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) \{ +031 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) \{ +032 return res; +033 \} +034 \} +035 +036 /* shift by as many digits in the bit count */ +037 if (b >= (int)DIGIT_BIT) \{ +038 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{ +039 return res; +040 \} +041 \} +042 +043 /* shift any bit count < DIGIT_BIT */ +044 d = (mp_digit) (b % DIGIT_BIT); +045 if (d != 0) \{ +046 register mp_digit *tmpc, shift, mask, r, rr; +047 register int x; +048 +049 /* bitmask for carries */ +050 mask = (((mp_digit)1) << d) - 1; +051 +052 /* shift for msbs */ +053 shift = DIGIT_BIT - d; +054 +055 /* alias */ +056 tmpc = c->dp; +057 +058 /* carry */ +059 r = 0; +060 for (x = 0; x < c->used; x++) \{ +061 /* get the higher bits of the current word */ +062 rr = (*tmpc >> shift) & mask; +063 +064 /* shift the current word and OR in the carry */ +065 *tmpc = ((*tmpc << d) | r) & MP_MASK; +066 ++tmpc; +067 +068 /* set the carry to the carry bits of the current word */ +069 r = rr; +070 \} +071 +072 /* set final carry */ +073 if (r != 0) \{ +074 c->dp[(c->used)++] = r; +075 \} +076 \} +077 mp_clamp (c); +078 return MP_OKAY; +079 \} +\end{alltt} +\end{small} + +Notes to be revised when code is updated. -- Tom + +\subsection{Division by Power of Two} + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_div\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ +\hline \\ +1. If $b \le 0$ then do \\ +\hspace{3mm}1.1 $c \leftarrow a$ (\textit{mp\_copy}) \\ +\hspace{3mm}1.2 $d \leftarrow 0$ (\textit{mp\_zero}) \\ +\hspace{3mm}1.3 Return(\textit{MP\_OKAY}). \\ +2. $c \leftarrow a$ \\ +3. $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\ +4. If $b \ge lg(\beta)$ then do \\ +\hspace{3mm}4.1 $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\ +5. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +6. If $k \ne 0$ then do \\ +\hspace{3mm}6.1 $mask \leftarrow 2^k$ \\ +\hspace{3mm}6.2 $r \leftarrow 0$ \\ +\hspace{3mm}6.3 for $n$ from $c.used - 1$ to $0$ do \\ +\hspace{6mm}6.3.1 $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\ +\hspace{6mm}6.3.2 $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\ +\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ +7. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\ +8. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_div\_2d} +\end{figure} + +\textbf{Algorithm mp\_div\_2d.} +This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder. The algorithm is designed much like algorithm +mp\_mul\_2d by first using whole digit shifts then single precision shifts. This algorithm will also produce the remainder of the division +by using algorithm mp\_mod\_2d. 
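+
+As a small worked example, let $a = 19 = 10011_2$ and $b = 3$.  The quotient is $c = \lfloor 19 / 2^3 \rfloor = 2 = 10_2$,
+simply the bits above the third position, while the remainder is $d = 19 \mbox{ (mod }2^3\mbox{)} = 3 = 011_2$, the three bits
+that were shifted off.
+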
+ +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* shift right by a certain bit count (store quotient in c, optional remaind + er in d) */ +018 int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) +019 \{ +020 mp_digit D, r, rr; +021 int x, res; +022 mp_int t; +023 +024 +025 /* if the shift count is <= 0 then we do no work */ +026 if (b <= 0) \{ +027 res = mp_copy (a, c); +028 if (d != NULL) \{ +029 mp_zero (d); +030 \} +031 return res; +032 \} +033 +034 if ((res = mp_init (&t)) != MP_OKAY) \{ +035 return res; +036 \} +037 +038 /* get the remainder */ +039 if (d != NULL) \{ +040 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{ +041 mp_clear (&t); +042 return res; +043 \} +044 \} +045 +046 /* copy */ +047 if ((res = mp_copy (a, c)) != MP_OKAY) \{ +048 mp_clear (&t); +049 return res; +050 \} +051 +052 /* shift by as many digits in the bit count */ +053 if (b >= (int)DIGIT_BIT) \{ +054 mp_rshd (c, b / DIGIT_BIT); +055 \} +056 +057 /* shift any bit count < DIGIT_BIT */ +058 D = (mp_digit) (b % DIGIT_BIT); +059 if (D != 0) \{ +060 register mp_digit *tmpc, mask, shift; +061 +062 /* mask */ +063 mask = (((mp_digit)1) << D) - 1; +064 +065 /* shift for lsb */ +066 shift = DIGIT_BIT - D; +067 +068 /* alias */ +069 tmpc = c->dp + (c->used - 1); +070 +071 /* carry */ +072 r = 0; +073 for (x = c->used - 1; x >= 0; x--) \{ +074 /* get the lower bits of this word in a temp */ +075 rr = *tmpc & mask; +076 +077 /* shift the current word and mix in the carry bits from the previous + word */ +078 *tmpc = (*tmpc >> D) | (r << shift); +079 --tmpc; +080 +081 /* set the carry to the carry bits of the current word found above */ +082 r = rr; +083 \} +084 \} +085 mp_clamp (c); +086 if (d != NULL) \{ +087 mp_exch (&t, d); +088 \} +089 mp_clear (&t); +090 return MP_OKAY; +091 \} +\end{alltt} +\end{small} + +The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies. The remainder $d$ may be optionally +ignored by passing \textbf{NULL} as the pointer to the mp\_int variable. The temporary mp\_int variable $t$ is used to hold the +result of the remainder operation until the end. This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before +the quotient is obtained. + +The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. (-- Fix this paragraph up later, Tom). + +\subsection{Remainder of Division by Power of Two} + +The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$. This +algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mod\_2d}. \\ +\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ +\textbf{Output}. $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ +\hline \\ +1. If $b \le 0$ then do \\ +\hspace{3mm}1.1 $c \leftarrow 0$ (\textit{mp\_zero}) \\ +\hspace{3mm}1.2 Return(\textit{MP\_OKAY}). \\ +2. If $b > a.used \cdot lg(\beta)$ then do \\ +\hspace{3mm}2.1 $c \leftarrow a$ (\textit{mp\_copy}) \\ +\hspace{3mm}2.2 Return the result of step 2.1. \\ +3. $c \leftarrow a$ \\ +4. If step 3 failed return(\textit{MP\_MEM}). \\ +5. for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\ +\hspace{3mm}5.1 $c_n \leftarrow 0$ \\ +6. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ +7. 
$c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\
+9. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$. First, if $b$ is less than or equal to zero the
+result is set to zero. If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns. Otherwise, $a$
+is copied to $c$, the leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017 /* calc a value mod 2**b */
+018 int
+019 mp_mod_2d (mp_int * a, int b, mp_int * c)
+020 \{
+021   int     x, res;
+022
+023   /* if b is <= 0 then zero the int */
+024   if (b <= 0) \{
+025     mp_zero (c);
+026     return MP_OKAY;
+027   \}
+028
+029   /* if the modulus is larger than the value then return */
+030   if (b > (int) (a->used * DIGIT_BIT)) \{
+031     res = mp_copy (a, c);
+032     return res;
+033   \}
+034
+035   /* copy */
+036   if ((res = mp_copy (a, c)) != MP_OKAY) \{
+037     return res;
+038   \}
+039
+040   /* zero digits above the last digit of the modulus */
+041   for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) \{
+042     c->dp[x] = 0;
+043   \}
+044   /* clear the digit that is not completely outside/inside the modulus */
+045   c->dp[b / DIGIT_BIT] &=
+046     (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
+047   mp_clamp (c);
+048   return MP_OKAY;
+049 \}
+\end{alltt}
+\end{small}
+
+-- Add comments later, Tom.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+             & in $O(n)$ time. \\
+             &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low Hamming \\
+             & weight values such as $3$, $5$ and $9$. Extend it to handle all values \\
+             & up to $64$ with a Hamming weight less than three. \\
+             &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+             & $2^k - 1$ as well. \\
+             &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+             & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+             & any $n$-bit input. Note that the time of addition is ignored in the \\
+             & calculation. \\
+             & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+             & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$. Again ignore \\
+             & the cost of addition. \\
+             & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+             & for $n = 64 \ldots 1024$ in steps of $64$. \\
+             & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+             & calculating the result of a signed comparison. \\
+             &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of
+algorithms of any multiple precision integer package.
The set of multiplier algorithms includes integer multiplication, squaring and modular reduction,
+in each of which single precision multiplication is the dominant operation performed. This chapter will discuss integer multiplication
+and squaring, leaving modular reductions for the subsequent chapter.
+
+The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular
+exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$. During a modular
+exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions,
+35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision
+multiplications.
+
+For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied
+against every digit of the other multiplicand. Traditional long-hand multiplication is based on this process; while the techniques can differ the
+overall algorithm used is essentially the same. Only ``recently'' have faster algorithms been studied. The first was Karatsuba multiplication,
+discovered in 1962, which can multiply two numbers with considerably fewer single precision multiplications than the long-hand approach
+requires. This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based
+solutions.
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\label{sec:basemult}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm that school children are taught. The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision
+multiplications are required. More specifically, for an $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required. To
+simplify most discussions, it will be assumed that the inputs have a comparable number of digits.
+
+The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be
+used. This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible. One important
+facet of this algorithm is that it has been modified to only produce a certain number of output digits as resolution. The importance of this
+modification will become evident during the discussion of Barrett modular reduction. Recall that for an $n$ and $m$ digit input the product
+will be at most $n + m$ digits. Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.
+
+Recall from sub-section 4.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}. We shall now extend the variable set to
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}. This implies that $2^{\alpha} > 2 \cdot \beta^2$. The
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 5.2.2 for more
+information}).
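+
+For example, with the defaults discussed later in this chapter, that is $lg(\beta) = 28$ and a $64$-bit mp\_word so that
+$\alpha = 64$, the constant works out to
+
+\begin{equation}
+\delta = 2^{\alpha - 2lg(\beta)} = 2^{64 - 56} = 2^{8} = 256
+\end{equation}
+
+That is, up to $256$ single precision products may be accumulated in one column before any carry information could be lost.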
+ +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\ +\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ +\hline \\ +1. If min$(a.used, b.used) < \delta$ then do \\ +\hspace{3mm}1.1 Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}). \\ +\hspace{3mm}1.2 Return the result of step 1.1 \\ +\\ +Allocate and initialize a temporary mp\_int. \\ +2. Init $t$ to be of size $digs$ \\ +3. If step 2 failed return(\textit{MP\_MEM}). \\ +4. $t.used \leftarrow digs$ \\ +\\ +Compute the product. \\ +5. for $ix$ from $0$ to $a.used - 1$ do \\ +\hspace{3mm}5.1 $u \leftarrow 0$ \\ +\hspace{3mm}5.2 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ +\hspace{3mm}5.3 If $pb < 1$ then goto step 6. \\ +\hspace{3mm}5.4 for $iy$ from $0$ to $pb - 1$ do \\ +\hspace{6mm}5.4.1 $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\ +\hspace{6mm}5.4.2 $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}5.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}5.5 if $ix + pb < digs$ then do \\ +\hspace{6mm}5.5.1 $t_{ix + pb} \leftarrow u$ \\ +6. Clamp excess digits of $t$. \\ +7. Swap $c$ with $t$ \\ +8. Clear $t$ \\ +9. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm s\_mp\_mul\_digs} +\end{figure} + +\textbf{Algorithm s\_mp\_mul\_digs.} +This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits. While it may seem +a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent +algorithm. The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}. +Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the +inputs. + +The first thing this algorithm checks for is whether a Comba multiplier can be used instead. If the minimum digit count of either +input is less than $\delta$, then the Comba method may be used instead. After the Comba method is ruled out, the baseline algorithm begins. A +temporary mp\_int variable $t$ is used to hold the intermediate result of the product. This allows the algorithm to be used to +compute products when either $a = c$ or $b = c$ without overwriting the inputs. + +All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce upto $digs$ digits of output. The $pb$ variable +is given the count of digits to read from $b$ inside the nested loop. If $pb \le 1$ then no more output digits can be produced and the algorithm +will exit the loop. The best way to think of the loops are as a series of $pb \times 1$ multiplications. That is, in each pass of the +innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$. + +For example, consider multiplying $576$ by $241$. That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best +visualized in the following table. 
+ +\begin{figure}[here] +\begin{center} +\begin{tabular}{|c|c|c|c|c|c|l|} +\hline && & 5 & 7 & 6 & \\ +\hline $\times$&& & 2 & 4 & 1 & \\ +\hline &&&&&&\\ + && & 5 & 7 & 6 & $10^0(1)(576)$ \\ + &2 & 3 & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\ + 1 & 3 & 8 & 8 & 1 & 6 & $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\ +\hline +\end{tabular} +\end{center} +\caption{Long-Hand Multiplication Diagram} +\end{figure} + +Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate +count. That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the reult. + +Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable. The multiplication on that step +is assumed to be a double wide output single precision multiplication. That is, two single precision variables are multiplied to produce a +double precision result. The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step +5.4.1 is propagated through the nested loop. If the carry was not propagated immediately it would overflow the single precision digit +$t_{ix+iy}$ and the result would be lost. + +At step 5.5 the nested loop is finished and any carry that was left over should be forwarded. The carry does not have to be added to the $ix+pb$'th +digit since that digit is assumed to be zero at this point. However, if $ix + pb \ge digs$ the carry is not set as it would make the result +exceed the precision requested. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* multiplies |a| * |b| and only computes upto digs digits of result +018 * HAC pp. 595, Algorithm 14.12 Modified so you can control how +019 * many digits of output are created. +020 */ +021 int +022 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +023 \{ +024 mp_int t; +025 int res, pa, pb, ix, iy; +026 mp_digit u; +027 mp_word r; +028 mp_digit tmpx, *tmpt, *tmpy; +029 +030 /* can we use the fast multiplier? 
*/ +031 if (((digs) < MP_WARRAY) && +032 MIN (a->used, b->used) < +033 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{ +034 return fast_s_mp_mul_digs (a, b, c, digs); +035 \} +036 +037 if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{ +038 return res; +039 \} +040 t.used = digs; +041 +042 /* compute the digits of the product directly */ +043 pa = a->used; +044 for (ix = 0; ix < pa; ix++) \{ +045 /* set the carry to zero */ +046 u = 0; +047 +048 /* limit ourselves to making digs digits of output */ +049 pb = MIN (b->used, digs - ix); +050 +051 /* setup some aliases */ +052 /* copy of the digit from a used within the nested loop */ +053 tmpx = a->dp[ix]; +054 +055 /* an alias for the destination shifted ix places */ +056 tmpt = t.dp + ix; +057 +058 /* an alias for the digits of b */ +059 tmpy = b->dp; +060 +061 /* compute the columns of the output and propagate the carry */ +062 for (iy = 0; iy < pb; iy++) \{ +063 /* compute the column as a mp_word */ +064 r = ((mp_word)*tmpt) + +065 ((mp_word)tmpx) * ((mp_word)*tmpy++) + +066 ((mp_word) u); +067 +068 /* the new column is the lower part of the result */ +069 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); +070 +071 /* get the carry word from the result */ +072 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); +073 \} +074 /* set carry if it is placed below digs */ +075 if (ix + iy < digs) \{ +076 *tmpt = u; +077 \} +078 \} +079 +080 mp_clamp (&t); +081 mp_exch (&t, c); +082 +083 mp_clear (&t); +084 return MP_OKAY; +085 \} +\end{alltt} +\end{small} + +Lines 31 to 35 determine if the Comba method can be used first. The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and +the number of digits of output is less than \textbf{MP\_WARRAY}. This new constant is used to control +the stack usage in the Comba routines. By default it is set to $\delta$ but can be reduced when memory is at a premium. + +Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66. Note how all of the +variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$. That is to ensure that double precision operations +are used instead of single precision. The multiplication on line 65 makes use of a specific GCC optimizer behaviour. On the outset it looks like +the compiler will have to use a double precision multiplication to produce the result required. Such an operation would be horribly slow on most +processors and drag this to a crawl. However, GCC is smart enough to realize that double wide output single precision multipliers can be used. For +example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result. + +\subsection{Faster Multiplication by the ``Comba'' Method} + +One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards. This +makes the nested loop very sequential and hard to unroll and implement in parallel. The ``Comba'' \cite{COMBA} method is named after little known +(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested +carry fixup operations. As an interesting aside it seems that Paul Barrett describes a similar technique in +his 1986 paper \cite{BARRETT} written five years before. + +At the heart of the Comba technique is once again the long-hand algorithm. Except in this case a slight twist is placed on how +the columns of the result are produced. 
In the standard long-hand algorithm, rows of products are produced and then added together to form the
+final result. In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.
+
+In the Comba algorithm the columns of the result are produced entirely independently of each other. That is, at the $O(n^2)$ level only a
+simple multiplication and addition step is performed. The carries of the columns are propagated after the nested loop to reduce the amount
+of work required. Succinctly, the first step of the algorithm is to compute the product vector $\vec x$ as follows.
+
+\begin{equation}
+\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
+\end{equation}
+
+Where $\vec x_n$ is the $n$'th column of the output vector. Consider the following example which computes the vector $\vec x$ for the multiplication
+of $576$ and $241$.
+
+\newpage\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+ \hline & & 5 & 7 & 6 & First Input\\
+ \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline & & $1 \cdot 5 = 5$ & $1 \cdot 7 = 7$ & $1 \cdot 6 = 6$ & First pass \\
+ & $4 \cdot 5 = 20$ & $4 \cdot 7+5=33$ & $4 \cdot 6+7=31$ & 6 & Second pass \\
+ $2 \cdot 5 = 10$ & $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31 & 6 & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.
+Now the columns must be fixed by propagating the carry upwards. The resultant vector will have one extra dimension over the input vector which is
+congruent to adding a leading zero digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Comba Fixup}. \\
+\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
+\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
+\hline \\
+1.  for $n$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
+\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
+2.  Return($\vec x$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Comba Fixup}
+\end{figure}
+
+With that algorithm, $k = 5$ and $\beta = 10$ the following vector is produced: $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$. In this case
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded. If the algorithm is correct and, as will be demonstrated shortly, more
+efficient than the baseline algorithm, why not simply always use it?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output
+independently. A serious obstacle arises if a carry is lost, due to lack of precision, before the algorithm has a chance to fix
+the carries. For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications. If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost. For any $m$ and $n$ digit inputs the maximum weight of any column is
+min$(m, n)$, which is fairly obvious.
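+
+To put rough numbers on this concern before deriving the exact bound, the following stand-alone fragment (\textit{a hypothetical
+sketch, not part of the library sources}) computes how many column terms can be accumulated safely with $28$-bit digits and a
+$64$-bit accumulator.
+
+\begin{small}
+\begin{alltt}
+#include <stdio.h>
+
+int main(void)
+\{
+   /* each term is at most (2**28 - 1)**2; k terms are safe
+    * provided k * (2**28 - 1)**2 < 2**64
+    */
+   unsigned long long term = ((1ULL << 28) - 1) * ((1ULL << 28) - 1);
+
+   /* largest safe k, i.e. floor((2**64 - 1) / term) */
+   printf("max column weight = %llu\symbol{92}n", 0xFFFFFFFFFFFFFFFFULL / term);
+
+   return 0;   /* prints 256 */
+\}
+\end{alltt}
+\end{small}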
+ +The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used. Recall +from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision. Given these +two quantities we must not violate the following + +\begin{equation} +k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha} +\end{equation} + +Which reduces to + +\begin{equation} +k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha} +\end{equation} + +Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit. By further re-arrangement of the equation the final solution is +found. + +\begin{equation} +k < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}} +\end{equation} + +The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 2^{64}$ which means that $k$ is bounded by $k < 257$. In this configuration +the smaller input may not have more than $256$ digits if the Comba method is to be used. This is quite satisfactory for most applications since +$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which, is much larger than most public key cryptographic algorithms require. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\ +\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ +\hline \\ +Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\ +1. If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\ +2. If step 1 failed return(\textit{MP\_MEM}).\\ +\\ +Zero the temporary array $\hat W$. \\ +3. for $n$ from $0$ to $digs - 1$ do \\ +\hspace{3mm}3.1 $\hat W_n \leftarrow 0$ \\ +\\ +Compute the columns. \\ +4. for $ix$ from $0$ to $a.used - 1$ do \\ +\hspace{3mm}4.1 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ +\hspace{3mm}4.2 If $pb < 1$ then goto step 5. \\ +\hspace{3mm}4.3 for $iy$ from $0$ to $pb - 1$ do \\ +\hspace{6mm}4.3.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\ +\\ +Propagate the carries upwards. \\ +5. $oldused \leftarrow c.used$ \\ +6. $c.used \leftarrow digs$ \\ +7. If $digs > 1$ then do \\ +\hspace{3mm}7.1. for $ix$ from $1$ to $digs - 1$ do \\ +\hspace{6mm}7.1.1 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\ +\hspace{6mm}7.1.2 $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\ +8. else do \\ +\hspace{3mm}8.1 $ix \leftarrow 0$ \\ +9. $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\ +\\ +Zero excess digits. \\ +10. If $digs < oldused$ then do \\ +\hspace{3mm}10.1 for $n$ from $digs$ to $oldused - 1$ do \\ +\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ +11. Clamp excessive digits of $c$. (\textit{mp\_clamp}) \\ +12. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm fast\_s\_mp\_mul\_digs} +\label{fig:COMBAMULT} +\end{figure} + +\textbf{Algorithm fast\_s\_mp\_mul\_digs.} +This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. The algorithm +essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster. + +The array $\hat W$ is meant to be on the stack when the algorithm is used. 
The size of the array does not change which is ideal. Note also that +unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$. + +The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm. The lack of +a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions. Now that each +iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism. + +To measure the benefits of the Comba method over the baseline method consider the number of operations that are required. If the +cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require +$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers. The Comba method requires only $O(pn^2 + qn)$ time, however in practice, +the speed increase is actually much more. With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply +and addition operations in the nested loop in parallel. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* Fast (comba) multiplier +018 * +019 * This is the fast column-array [comba] multiplier. It is +020 * designed to compute the columns of the product first +021 * then handle the carries afterwards. This has the effect +022 * of making the nested loops that compute the columns very +023 * simple and schedulable on super-scalar processors. +024 * +025 * This has been modified to produce a variable number of +026 * digits of output so if say only a half-product is required +027 * you don't have to compute the upper half (a feature +028 * required for fast Barrett reduction). +029 * +030 * Based on Algorithm 14.12 on pp.595 of HAC. +031 * +032 */ +033 int +034 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) +035 \{ +036 int olduse, res, pa, ix; +037 mp_word W[MP_WARRAY]; +038 +039 /* grow the destination as required */ +040 if (c->alloc < digs) \{ +041 if ((res = mp_grow (c, digs)) != MP_OKAY) \{ +042 return res; +043 \} +044 \} +045 +046 /* clear temp buf (the columns) */ +047 memset (W, 0, sizeof (mp_word) * digs); +048 +049 /* calculate the columns */ +050 pa = a->used; +051 for (ix = 0; ix < pa; ix++) \{ +052 /* this multiplier has been modified to allow you to +053 * control how many digits of output are produced. +054 * So at most we want to make upto "digs" digits of output. +055 * +056 * this adds products to distinct columns (at ix+iy) of W +057 * note that each step through the loop is not dependent on +058 * the previous which means the compiler can easily unroll +059 * the loop without scheduling problems +060 */ +061 \{ +062 register mp_digit tmpx, *tmpy; +063 register mp_word *_W; +064 register int iy, pb; +065 +066 /* alias for the the word on the left e.g. A[ix] * A[iy] */ +067 tmpx = a->dp[ix]; +068 +069 /* alias for the right side */ +070 tmpy = b->dp; +071 +072 /* alias for the columns, each step through the loop adds a new +073 term to each column +074 */ +075 _W = W + ix; +076 +077 /* the number of digits is limited by their placement. E.g. 
+078 we avoid multiplying digits that will end up above the # of +079 digits of precision requested +080 */ +081 pb = MIN (b->used, digs - ix); +082 +083 for (iy = 0; iy < pb; iy++) \{ +084 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); +085 \} +086 \} +087 +088 \} +089 +090 /* setup dest */ +091 olduse = c->used; +092 c->used = digs; +093 +094 \{ +095 register mp_digit *tmpc; +096 +097 /* At this point W[] contains the sums of each column. To get the +098 * correct result we must take the extra bits from each column and +099 * carry them down +100 * +101 * Note that while this adds extra code to the multiplier it +102 * saves time since the carry propagation is removed from the +103 * above nested loop.This has the effect of reducing the work +104 * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the +105 * cost of the shifting. On very small numbers this is slower +106 * but on most cryptographic size numbers it is faster. +107 * +108 * In this particular implementation we feed the carries from +109 * behind which means when the loop terminates we still have one +110 * last digit to copy +111 */ +112 tmpc = c->dp; +113 for (ix = 1; ix < digs; ix++) \{ +114 /* forward the carry from the previous temp */ +115 W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); +116 +117 /* now extract the previous digit [below the carry] */ +118 *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); +119 \} +120 /* fetch the last digit */ +121 *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); +122 +123 /* clear unused digits [that existed in the old copy of c] */ +124 for (; ix < olduse; ix++) \{ +125 *tmpc++ = 0; +126 \} +127 \} +128 mp_clamp (c); +129 return MP_OKAY; +130 \} +\end{alltt} +\end{small} + +The memset on line 47 clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication +implementation a series of aliases (\textit{lines 67, 70 and 75}) are used to simplify the inner $O(n^2)$ loop. +In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass. + +The inner loop on lines 83, 84 and 85 is where the algorithm will spend the majority of the time, which is why it has been +stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}. On x86 processors the multiplication and additions amount to at the +very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three +(\textit{one load, one store, one multiply-add}). For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop +and scheduling the instructions so there are very few dependency stalls. + +In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference. However, in the $O(n^2)$ nested loop of the +baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next +digit. As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can +be simultaneously used. + +\subsection{Polynomial Basis Multiplication} +To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication. In the following algorithms +the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and +$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required. 
In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
+
+The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$. The coefficients $w_i$ will
+directly yield the desired product when $\beta$ is substituted for $x$. Solving directly for the $2n + 1$ coefficients
+requires $O(n^2)$ time and would in practice be slower than the Comba technique.
+
+However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown
+coefficients. This means that by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with
+Gaussian elimination. This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in
+effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.
+
+The coefficients of the polynomial $W(x)$ are unknown, which makes finding $W(y)$ for any value of $y$ directly impossible. However, since
+$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place. The benefit of this technique stems from the
+fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$. As a result, finding the $2n + 1$ relations required
+by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
+
+When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$. The $\zeta_0$ term
+is simply the product $W(0) = w_0 = a_0 \cdot b_0$. The $\zeta_1$ term is the product
+$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$. The third point $\zeta_{\infty}$ is less obvious but rather
+simple to explain. The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.
+The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$. Note that the
+points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
+
+If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points}
+$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ for small values of $q$. The term ``mirror point'' stems from the fact that
+$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$. For
+example, when $n = 2$ and $q = 1$ the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
+
+\begin{eqnarray}
+\zeta_{2} = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
+16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
+\end{eqnarray}
+
+Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts. For example, when $n = 2$ the
+polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$. This technique of polynomial representation is known as Horner's method.
+
+As a general rule, when the inputs are split into $n$ parts each there are $2n - 1$ multiplications. Each multiplication is of
+multiplicands that have $n$ times fewer digits than the inputs.
The asymptotic running time of this algorithm is
+$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}). Figure~\ref{fig:exponent}
+summarizes the exponents for various values of $n$.
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Split into $n$ Parts} & \textbf{Exponent} & \textbf{Notes}\\
+\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
+\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
+\hline $4$ & $1.403677461$ &\\
+\hline $5$ & $1.365212389$ &\\
+\hline $10$ & $1.278753601$ &\\
+\hline $100$ & $1.149426538$ &\\
+\hline $1000$ & $1.100270931$ &\\
+\hline $10000$ & $1.075252070$ &\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
+\label{fig:exponent}
+\end{figure}
+
+At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$. However, the overhead
+of solving for the $2001$ terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
+numbers.
+
+\subsubsection{Cutoff Point}
+The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach. However,
+the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved. This makes the
+polynomial basis approach more costly to use with small inputs.
+
+Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}). There exists a
+point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and
+when $m > y$ the Comba methods are slower than the polynomial basis algorithms.
+
+The exact location of $y$ depends on several key architectural elements of the computer platform in question.
+
+\begin{enumerate}
+\item The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc. For example,
+on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$. The higher the ratio in favour of multiplication the lower
+the cutoff point $y$ will be.
+
+\item The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}). Generally speaking, as the number of splits
+grows the complexity grows substantially. Ideally solving the system will only involve addition, subtraction and shifting of integers. This
+directly reflects on the ratio previously mentioned.
+
+\item To a lesser extent memory bandwidth and function call overheads. Provided the values are in the processor cache this is less of an
+influence over the cutoff point.
+
+\end{enumerate}
+
+A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met. For example, if the point
+is set too low then there will be values of $m$ such that $m > y$ for which the Comba method is still faster. Finding the cutoff points is fairly
+simple when a high resolution timer is available.
+
+\subsection{Karatsuba Multiplication}
+Karatsuba \cite{KARA} multiplication, when originally proposed in 1962, was among the first algorithms to break the $O(n^2)$ barrier for
+general purpose multiplication.
Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with +light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent. + +\begin{equation} +f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd +\end{equation} + +Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product. Applying +this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique. It turns +out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points +$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$. Consider the resultant system of equations. + +\begin{center} +\begin{tabular}{rcrcrcrc} +$\zeta_{0}$ & $=$ & & & & & $w_0$ \\ +$-\zeta_{-1}$ & $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\ +$\zeta_{\infty}$ & $=$ & $w_2$ & & & & \\ +\end{tabular} +\end{center} + +By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for. The simplicity +of this system of equations has made Karatsuba fairly popular. In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.} +making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman. It is worth noting that the point +$\zeta_1$ could be substituted for $-\zeta_{-1}$. In this case the first and third row are subtracted instead of added to the second row. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\ +\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ +\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\ +\hline \\ +1. Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\ +2. If step 2 failed then return(\textit{MP\_MEM}). \\ +\\ +Split the input. e.g. $a = x1 \cdot \beta^B + x0$ \\ +3. $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\ +4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\ +5. $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\ +6. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\ +7. $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\ +\\ +Calculate the three products. \\ +8. $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\ +9. $x1y1 \leftarrow x1 \cdot y1$ \\ +10. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\ +11. $x0 \leftarrow y1 - y0$ \\ +12. $t1 \leftarrow t1 \cdot x0$ \\ +\\ +Calculate the middle term. \\ +13. $x0 \leftarrow x0y0 + x1y1$ \\ +14. $t1 \leftarrow x0 - t1$ \\ +\\ +Calculate the final product. \\ +15. $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\ +16. $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\ +17. $t1 \leftarrow x0y0 + t1$ \\ +18. $c \leftarrow t1 + x1y1$ \\ +19. Clear all of the temporary variables. \\ +20. Return(\textit{MP\_OKAY}).\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_karatsuba\_mul} +\end{figure} + +\textbf{Algorithm mp\_karatsuba\_mul.} +This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm. It is loosely based on the description +from Knuth \cite[pp. 294-295]{TAOCPV2}. 
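+
+As a small worked example (\textit{illustrative only, using base $10$ rather than base $\beta$}), consider $1234 \cdot 5678$ with the
+split at $10^2$, so that $x1 = 12, x0 = 34, y1 = 56$ and $y0 = 78$. Only three multiplications are required.
+
+\begin{center}
+\begin{tabular}{rcl}
+$x0 \cdot y0$ & $=$ & $34 \cdot 78 = 2652$ \\
+$x1 \cdot y1$ & $=$ & $12 \cdot 56 = 672$ \\
+$(x1 - x0) \cdot (y1 - y0)$ & $=$ & $(-22) \cdot (-22) = 484$ \\
+\end{tabular}
+\end{center}
+
+The middle coefficient is then $2652 + 672 - 484 = 2840$ and the full product is re-assembled as
+$672 \cdot 10^4 + 2840 \cdot 10^2 + 2652 = 7006652$, which is indeed $1234 \cdot 5678$.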
+ +\index{radix point} +In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen. The radix point chosen must +be used for both of the inputs meaning that it must be smaller than the smallest input. Step 3 chooses the radix point $B$ as half of the +smallest input \textbf{used} count. After the radix point is chosen the inputs are split into lower and upper halves. Step 4 and 5 +compute the lower halves. Step 6 and 7 computer the upper halves. + +After the halves have been computed the three intermediate half-size products must be computed. Step 8 and 9 compute the trivial products +$x0 \cdot y0$ and $x1 \cdot y1$. The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed. By using $x0$ instead +of an additional temporary variable, the algorithm can avoid an addition memory allocation operation. + +The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_mul.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* c = |a| * |b| using Karatsuba Multiplication using +018 * three half size multiplications +019 * +020 * Let B represent the radix [e.g. 2**DIGIT_BIT] and +021 * let n represent half of the number of digits in +022 * the min(a,b) +023 * +024 * a = a1 * B**n + a0 +025 * b = b1 * B**n + b0 +026 * +027 * Then, a * b => +028 a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0 +029 * +030 * Note that a1b1 and a0b0 are used twice and only need to be +031 * computed once. So in total three half size (half # of +032 * digit) multiplications are performed, a0b0, a1b1 and +033 * (a1-b1)(a0-b0) +034 * +035 * Note that a multiplication of half the digits requires +036 * 1/4th the number of single precision multiplications so in +037 * total after one call 25% of the single precision multiplications +038 * are saved. Note also that the call to mp_mul can end up back +039 * in this function if the a0, a1, b0, or b1 are above the threshold. +040 * This is known as divide-and-conquer and leads to the famous +041 * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than +042 * the standard O(N**2) that the baseline/comba methods use. +043 * Generally though the overhead of this method doesn't pay off +044 * until a certain size (N ~ 80) is reached. 
+045 */ +046 int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) +047 \{ +048 mp_int x0, x1, y0, y1, t1, x0y0, x1y1; +049 int B, err; +050 +051 /* default the return code to an error */ +052 err = MP_MEM; +053 +054 /* min # of digits */ +055 B = MIN (a->used, b->used); +056 +057 /* now divide in two */ +058 B = B >> 1; +059 +060 /* init copy all the temps */ +061 if (mp_init_size (&x0, B) != MP_OKAY) +062 goto ERR; +063 if (mp_init_size (&x1, a->used - B) != MP_OKAY) +064 goto X0; +065 if (mp_init_size (&y0, B) != MP_OKAY) +066 goto X1; +067 if (mp_init_size (&y1, b->used - B) != MP_OKAY) +068 goto Y0; +069 +070 /* init temps */ +071 if (mp_init_size (&t1, B * 2) != MP_OKAY) +072 goto Y1; +073 if (mp_init_size (&x0y0, B * 2) != MP_OKAY) +074 goto T1; +075 if (mp_init_size (&x1y1, B * 2) != MP_OKAY) +076 goto X0Y0; +077 +078 /* now shift the digits */ +079 x0.sign = x1.sign = a->sign; +080 y0.sign = y1.sign = b->sign; +081 +082 x0.used = y0.used = B; +083 x1.used = a->used - B; +084 y1.used = b->used - B; +085 +086 \{ +087 register int x; +088 register mp_digit *tmpa, *tmpb, *tmpx, *tmpy; +089 +090 /* we copy the digits directly instead of using higher level functions +091 * since we also need to shift the digits +092 */ +093 tmpa = a->dp; +094 tmpb = b->dp; +095 +096 tmpx = x0.dp; +097 tmpy = y0.dp; +098 for (x = 0; x < B; x++) \{ +099 *tmpx++ = *tmpa++; +100 *tmpy++ = *tmpb++; +101 \} +102 +103 tmpx = x1.dp; +104 for (x = B; x < a->used; x++) \{ +105 *tmpx++ = *tmpa++; +106 \} +107 +108 tmpy = y1.dp; +109 for (x = B; x < b->used; x++) \{ +110 *tmpy++ = *tmpb++; +111 \} +112 \} +113 +114 /* only need to clamp the lower words since by definition the +115 * upper words x1/y1 must have a known number of digits +116 */ +117 mp_clamp (&x0); +118 mp_clamp (&y0); +119 +120 /* now calc the products x0y0 and x1y1 */ +121 /* after this x0 is no longer required, free temp [x0==t2]! */ +122 if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY) +123 goto X1Y1; /* x0y0 = x0*y0 */ +124 if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY) +125 goto X1Y1; /* x1y1 = x1*y1 */ +126 +127 /* now calc x1-x0 and y1-y0 */ +128 if (mp_sub (&x1, &x0, &t1) != MP_OKAY) +129 goto X1Y1; /* t1 = x1 - x0 */ +130 if (mp_sub (&y1, &y0, &x0) != MP_OKAY) +131 goto X1Y1; /* t2 = y1 - y0 */ +132 if (mp_mul (&t1, &x0, &t1) != MP_OKAY) +133 goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */ +134 +135 /* add x0y0 */ +136 if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY) +137 goto X1Y1; /* t2 = x0y0 + x1y1 */ +138 if (mp_sub (&x0, &t1, &t1) != MP_OKAY) +139 goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ +140 +141 /* shift by B */ +142 if (mp_lshd (&t1, B) != MP_OKAY) +143 goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */ +144 if (mp_lshd (&x1y1, B * 2) != MP_OKAY) +145 goto X1Y1; /* x1y1 = x1y1 << 2*B */ +146 +147 if (mp_add (&x0y0, &t1, &t1) != MP_OKAY) +148 goto X1Y1; /* t1 = x0y0 + t1 */ +149 if (mp_add (&t1, &x1y1, c) != MP_OKAY) +150 goto X1Y1; /* t1 = x0y0 + t1 + x1y1 */ +151 +152 /* Algorithm succeeded set the return code to MP_OKAY */ +153 err = MP_OKAY; +154 +155 X1Y1:mp_clear (&x1y1); +156 X0Y0:mp_clear (&x0y0); +157 T1:mp_clear (&t1); +158 Y1:mp_clear (&y1); +159 Y0:mp_clear (&y0); +160 X1:mp_clear (&x1); +161 X0:mp_clear (&x0); +162 ERR: +163 return err; +164 \} +\end{alltt} +\end{small} + +The new coding element in this routine, not seen in previous routines, is the usage of goto statements. The conventional +wisdom is that goto statements should be avoided. 
This is generally true, however when every single function call can fail, it makes sense +to handle error recovery with a single piece of code. Lines 61 to 75 handle initializing all of the temporary variables +required. Note how each of the if statements goes to a different label in case of failure. This allows the routine to correctly free only +the temporaries that have been successfully allocated so far. + +The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large. This saves the +additional reallocation that would have been necessary. Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective +number of digits for the next section of code. + +The first algebraic portion of the algorithm is to split the two inputs into their halves. However, instead of using mp\_mod\_2d and mp\_rshd +to extract the halves, the respective code has been placed inline within the body of the function. To initialize the halves, the \textbf{used} and +\textbf{sign} members are copied first. The first for loop on line 98 copies the lower halves. Since they are both the same magnitude it +is simpler to calculate both lower halves in a single loop. The for loop on lines 104 and 109 calculate the upper halves $x1$ and +$y1$ respectively. + +By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs. + +When line 153 is reached, the algorithm has completed succesfully. The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that +the same code that handles errors can be used to clear the temporary variables and return. + +\subsection{Toom-Cook $3$-Way Multiplication} +Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 2$ except that the points are +chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce. Here, the points $\zeta_{0}$, +$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients +of the $W(x)$. + +With the five relations that Toom-Cook specifies, the following system of equations is formed. + +\begin{center} +\begin{tabular}{rcrcrcrcrcr} +$\zeta_0$ & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$ \\ +$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$ \\ +$\zeta_1$ & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$ \\ +$\zeta_2$ & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$ \\ +$\zeta_{\infty}$ & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$ \\ +\end{tabular} +\end{center} + +A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power +of two, two divisions by three and one multiplication by three. All of these $19$ sub-operations require less than quadratic time, meaning that +the algorithm can be faster than a baseline multiplication. However, the greater complexity of this algorithm places the cutoff point +(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_toom\_mul}. \\ +\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ +\textbf{Output}. 
$c \leftarrow a \cdot b $ \\
+\hline \\
+Split $a$ and $b$ into three pieces. E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
+1. $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
+2. $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+3. $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+4. $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+5. $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+6. $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+7. $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
+\\
+Find the five equations for $w_0, w_1, ..., w_4$. \\
+8. $w_0 \leftarrow a_0 \cdot b_0$ \\
+9. $w_4 \leftarrow a_2 \cdot b_2$ \\
+10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
+11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
+13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
+14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
+15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
+16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
+17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
+\\
+Continued on the next page.\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
+\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}. $c \leftarrow a \cdot b $ \\
+\hline \\
+Now solve the system of equations. \\
+18. $w_1 \leftarrow w_1 - w_4$, $w_3 \leftarrow w_3 - w_0$ \\
+19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
+20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
+21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
+23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
+24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
+25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
+\\
+Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
+26. for $n$ from $1$ to $4$ do \\
+\hspace{3mm}26.1 $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
+27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
+28. Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toom\_mul (continued)}
+\end{figure}
+
+\textbf{Algorithm mp\_toom\_mul.}
+This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach. Compared to Karatsuba multiplication, this
+algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead. In this
+description, several statements have been compounded to save space.
The intention is that the statements are executed from left to right across +any given step. + +The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively. From these smaller +integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required. + +The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively. The relation $w_1, w_2$ and $w_3$ correspond +to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{2}$ and $\zeta_{1}$ respectively. These are found using logical shifts to independently find +$f(y)$ and $g(y)$ which significantly speeds up the algorithm. + +After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients +$w_1, w_2$ and $w_3$ to be isolated. The steps 18 through 25 perform the system reduction required as previously described. Each step of +the reduction represents the comparable matrix operation that would be performed had this been performed by pencil. For example, step 18 indicates +that row $1$ must be subtracted from row $4$ and simultaneously row $0$ subtracted from row $3$. + +Once the coeffients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known. By substituting $\beta^{k}$ for $x$, the integer +result $a \cdot b$ is produced. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_toom\_mul.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* multiplication using the Toom-Cook 3-way algorithm */ +018 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c) +019 \{ +020 mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2; +021 int res, B; +022 +023 /* init temps */ +024 if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, +025 &a0, &a1, &a2, &b0, &b1, +026 &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{ +027 return res; +028 \} +029 +030 /* B */ +031 B = MIN(a->used, b->used) / 3; +032 +033 /* a = a2 * B**2 + a1 * B + a0 */ +034 if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{ +035 goto ERR; +036 \} +037 +038 if ((res = mp_copy(a, &a1)) != MP_OKAY) \{ +039 goto ERR; +040 \} +041 mp_rshd(&a1, B); +042 mp_mod_2d(&a1, DIGIT_BIT * B, &a1); +043 +044 if ((res = mp_copy(a, &a2)) != MP_OKAY) \{ +045 goto ERR; +046 \} +047 mp_rshd(&a2, B*2); +048 +049 /* b = b2 * B**2 + b1 * B + b0 */ +050 if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{ +051 goto ERR; +052 \} +053 +054 if ((res = mp_copy(b, &b1)) != MP_OKAY) \{ +055 goto ERR; +056 \} +057 mp_rshd(&b1, B); +058 mp_mod_2d(&b1, DIGIT_BIT * B, &b1); +059 +060 if ((res = mp_copy(b, &b2)) != MP_OKAY) \{ +061 goto ERR; +062 \} +063 mp_rshd(&b2, B*2); +064 +065 /* w0 = a0*b0 */ +066 if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{ +067 goto ERR; +068 \} +069 +070 /* w4 = a2 * b2 */ +071 if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{ +072 goto ERR; +073 \} +074 +075 /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */ +076 if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{ +077 goto ERR; +078 \} +079 if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{ +080 goto ERR; +081 \} +082 if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{ +083 goto ERR; +084 \} +085 if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{ +086 goto ERR; +087 \} +088 +089 if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{ +090 goto ERR; +091 \} +092 if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{ +093 goto ERR; +094 \} +095 if ((res = mp_mul_2(&tmp2, &tmp2)) != 
MP_OKAY) \{ +096 goto ERR; +097 \} +098 if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{ +099 goto ERR; +100 \} +101 +102 if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{ +103 goto ERR; +104 \} +105 +106 /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */ +107 if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{ +108 goto ERR; +109 \} +110 if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{ +111 goto ERR; +112 \} +113 if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{ +114 goto ERR; +115 \} +116 if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{ +117 goto ERR; +118 \} +119 +120 if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{ +121 goto ERR; +122 \} +123 if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{ +124 goto ERR; +125 \} +126 if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{ +127 goto ERR; +128 \} +129 if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{ +130 goto ERR; +131 \} +132 +133 if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{ +134 goto ERR; +135 \} +136 +137 +138 /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */ +139 if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{ +140 goto ERR; +141 \} +142 if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{ +143 goto ERR; +144 \} +145 if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{ +146 goto ERR; +147 \} +148 if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{ +149 goto ERR; +150 \} +151 if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{ +152 goto ERR; +153 \} +154 +155 /* now solve the matrix +156 +157 0 0 0 0 1 +158 1 2 4 8 16 +159 1 1 1 1 1 +160 16 8 4 2 1 +161 1 0 0 0 0 +162 +163 using 12 subtractions, 4 shifts, +164 2 small divisions and 1 small multiplication +165 */ +166 +167 /* r1 - r4 */ +168 if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{ +169 goto ERR; +170 \} +171 /* r3 - r0 */ +172 if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{ +173 goto ERR; +174 \} +175 /* r1/2 */ +176 if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{ +177 goto ERR; +178 \} +179 /* r3/2 */ +180 if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{ +181 goto ERR; +182 \} +183 /* r2 - r0 - r4 */ +184 if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{ +185 goto ERR; +186 \} +187 if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{ +188 goto ERR; +189 \} +190 /* r1 - r2 */ +191 if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{ +192 goto ERR; +193 \} +194 /* r3 - r2 */ +195 if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{ +196 goto ERR; +197 \} +198 /* r1 - 8r0 */ +199 if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{ +200 goto ERR; +201 \} +202 if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{ +203 goto ERR; +204 \} +205 /* r3 - 8r4 */ +206 if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{ +207 goto ERR; +208 \} +209 if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{ +210 goto ERR; +211 \} +212 /* 3r2 - r1 - r3 */ +213 if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{ +214 goto ERR; +215 \} +216 if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{ +217 goto ERR; +218 \} +219 if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{ +220 goto ERR; +221 \} +222 /* r1 - r2 */ +223 if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{ +224 goto ERR; +225 \} +226 /* r3 - r2 */ +227 if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{ +228 goto ERR; +229 \} +230 /* r1/3 */ +231 if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{ +232 goto ERR; +233 \} +234 /* r3/3 */ +235 if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{ +236 goto ERR; +237 \} +238 +239 /* at this point shift W[n] by B*n */ +240 if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{ +241 goto ERR; +242 \} +243 if ((res = mp_lshd(&w2, 
2*B)) != MP_OKAY) \{ +244 goto ERR; +245 \} +246 if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{ +247 goto ERR; +248 \} +249 if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{ +250 goto ERR; +251 \} +252 +253 if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{ +254 goto ERR; +255 \} +256 if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{ +257 goto ERR; +258 \} +259 if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{ +260 goto ERR; +261 \} +262 if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{ +263 goto ERR; +264 \} +265 +266 ERR: +267 mp_clear_multi(&w0, &w1, &w2, &w3, &w4, +268 &a0, &a1, &a2, &b0, &b1, +269 &b2, &tmp1, &tmp2, NULL); +270 return res; +271 \} +272 +\end{alltt} +\end{small} + +-- Comments to be added during editing phase. + +\subsection{Signed Multiplication} +Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required. So far all +of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_mul}. \\ +\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ +\textbf{Output}. $c \leftarrow a \cdot b$ \\ +\hline \\ +1. If $a.sign = b.sign$ then \\ +\hspace{3mm}1.1 $sign = MP\_ZPOS$ \\ +2. else \\ +\hspace{3mm}2.1 $sign = MP\_ZNEG$ \\ +3. If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then \\ +\hspace{3mm}3.1 $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\ +4. else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\ +\hspace{3mm}4.1 $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\ +5. else \\ +\hspace{3mm}5.1 $digs \leftarrow a.used + b.used + 1$ \\ +\hspace{3mm}5.2 If $digs < MP\_ARRAY$ and min$(a.used, b.used) \le \delta$ then \\ +\hspace{6mm}5.2.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs. \\ +\hspace{3mm}5.3 else \\ +\hspace{6mm}5.3.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs. \\ +6. $c.sign \leftarrow sign$ \\ +7. Return the result of the unsigned multiplication performed. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_mul} +\end{figure} + +\textbf{Algorithm mp\_mul.} +This algorithm performs the signed multiplication of two inputs. It will make use of any of the three unsigned multiplication algorithms +available when the input is of appropriate size. The \textbf{sign} of the result is not set until the end of the algorithm since algorithm +s\_mp\_mul\_digs will clear it. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_mul.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* high level multiplication (handles sign) */ +018 int mp_mul (mp_int * a, mp_int * b, mp_int * c) +019 \{ +020 int res, neg; +021 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG; +022 +023 /* use Toom-Cook? */ +024 if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) \{ +025 res = mp_toom_mul(a, b, c); +026 /* use Karatsuba? */ +027 \} else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) \{ +028 res = mp_karatsuba_mul (a, b, c); +029 \} else \{ +030 /* can we use the fast multiplier? 
+031 * +032 * The fast multiplier can be used if the output will +033 * have less than MP_WARRAY digits and the number of +034 * digits won't affect carry propagation +035 */ +036 int digs = a->used + b->used + 1; +037 +038 if ((digs < MP_WARRAY) && +039 MIN(a->used, b->used) <= +040 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{ +041 res = fast_s_mp_mul_digs (a, b, c, digs); +042 \} else \{ +043 res = s_mp_mul (a, b, c); +044 \} +045 \} +046 c->sign = neg; +047 return res; +048 \} +\end{alltt} +\end{small} + +The implementation is rather simplistic and is not particularly noteworthy. Line 23 computes the sign of the result using the ``?'' +operator from the C programming language. Line 40 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$. + +\section{Squaring} +\label{sec:basesquare} + +Squaring is a special case of multiplication where both multiplicands are equal. At first it may seem like there is no significant optimization +available but in fact there is. Consider the multiplication of $576$ against $241$. In total there will be nine single precision multiplications +performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot 6$, $2 \cdot 7$ and $2 \cdot 5$. Now consider +the multiplication of $123$ against $123$. The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, +$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$. On closer inspection some of the products are equivalent. For example, $3 \cdot 2 = 2 \cdot 3$ +and $3 \cdot 1 = 1 \cdot 3$. + +For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$ +required for multiplication. The following diagram gives an example of the operations required. + +\begin{figure}[here] +\begin{center} +\begin{tabular}{ccccc|c} +&&1&2&3&\\ +$\times$ &&1&2&3&\\ +\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\ + & $2 \cdot 1$ & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\ + $1 \cdot 1$ & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\ +\end{tabular} +\end{center} +\caption{Squaring Optimization Diagram} +\end{figure} + +Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious. For the purposes of this discussion let $x$ +represent the number being squared. The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it. + +The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product. Every non-square term of a column will +appear twice hence the name ``double product''. Every odd column is made up entirely of double products. In fact every column is made up of double +products and at most one square (\textit{see the exercise section}). + +The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, +occurs at column $2k + 1$. For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. +Column two of row one is a square and column three is the first unique column. + +\subsection{The Baseline Squaring Algorithm} +The baseline squaring algorithm is meant to be a catch-all squaring algorithm. It will handle any of the input sizes that the faster routines +will not handle. 
+ +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{s\_mp\_sqr}. \\ +\textbf{Input}. mp\_int $a$ \\ +\textbf{Output}. $b \leftarrow a^2$ \\ +\hline \\ +1. Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits. (\textit{mp\_init\_size}) \\ +2. If step 1 failed return(\textit{MP\_MEM}) \\ +3. $t.used \leftarrow 2 \cdot a.used + 1$ \\ +4. For $ix$ from 0 to $a.used - 1$ do \\ +\hspace{3mm}Calculate the square. \\ +\hspace{3mm}4.1 $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\ +\hspace{3mm}4.2 $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{3mm}Calculate the double products after the square. \\ +\hspace{3mm}4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}4.4 For $iy$ from $ix + 1$ to $a.used - 1$ do \\ +\hspace{6mm}4.4.1 $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\ +\hspace{6mm}4.4.2 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}4.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +\hspace{3mm}Set the last carry. \\ +\hspace{3mm}4.5 While $u > 0$ do \\ +\hspace{6mm}4.5.1 $iy \leftarrow iy + 1$ \\ +\hspace{6mm}4.5.2 $\hat r \leftarrow t_{ix + iy} + u$ \\ +\hspace{6mm}4.5.3 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ +\hspace{6mm}4.5.4 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ +5. Clamp excess digits of $t$. (\textit{mp\_clamp}) \\ +6. Exchange $b$ and $t$. \\ +7. Clear $t$ (\textit{mp\_clear}) \\ +8. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm s\_mp\_sqr} +\end{figure} + +\textbf{Algorithm s\_mp\_sqr.} +This algorithm computes the square of an input using the three observations on squaring. It is based fairly faithfully on algorithm 14.16 of HAC +\cite[pp.596-597]{HAC}. Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring. This allows the +destination mp\_int to be the same as the source mp\_int. + +The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while +the inner loop computes the columns of the partial result. Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate +the carry and compute the double products. + +The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this +very algorithm. The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that +when it is multiplied by two, it can be properly represented by a mp\_word. + +Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial +results calculated so far. This involves expensive carry propagation which will be eliminated in the next algorithm. 
+ +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sqr.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */ +018 int +019 s_mp_sqr (mp_int * a, mp_int * b) +020 \{ +021 mp_int t; +022 int res, ix, iy, pa; +023 mp_word r; +024 mp_digit u, tmpx, *tmpt; +025 +026 pa = a->used; +027 if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{ +028 return res; +029 \} +030 +031 /* default used is maximum possible size */ +032 t.used = 2*pa + 1; +033 +034 for (ix = 0; ix < pa; ix++) \{ +035 /* first calculate the digit at 2*ix */ +036 /* calculate double precision result */ +037 r = ((mp_word) t.dp[2*ix]) + +038 ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]); +039 +040 /* store lower part in result */ +041 t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK)); +042 +043 /* get the carry */ +044 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT)); +045 +046 /* left hand side of A[ix] * A[iy] */ +047 tmpx = a->dp[ix]; +048 +049 /* alias for where to store the results */ +050 tmpt = t.dp + (2*ix + 1); +051 +052 for (iy = ix + 1; iy < pa; iy++) \{ +053 /* first calculate the product */ +054 r = ((mp_word)tmpx) * ((mp_word)a->dp[iy]); +055 +056 /* now calculate the double precision result, note we use +057 * addition instead of *2 since it's easier to optimize +058 */ +059 r = ((mp_word) *tmpt) + r + r + ((mp_word) u); +060 +061 /* store lower part */ +062 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); +063 +064 /* get carry */ +065 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT)); +066 \} +067 /* propagate upwards */ +068 while (u != ((mp_digit) 0)) \{ +069 r = ((mp_word) *tmpt) + ((mp_word) u); +070 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); +071 u = (mp_digit)(r >> ((mp_word) DIGIT_BIT)); +072 \} +073 \} +074 +075 mp_clamp (&t); +076 mp_exch (&t, b); +077 mp_clear (&t); +078 return MP_OKAY; +079 \} +\end{alltt} +\end{small} + +Inside the outer loop (\textit{see line 34}) the square term is calculated on line 37. Line 44 extracts the carry from the square +term. Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines 47 and 50 respectively. The doubling is performed using two +additions (\textit{see line 59}) since it is usually faster than shifting,if not at least as fast. + +\subsection{Faster Squaring by the ``Comba'' Method} +A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop. Squaring has an additional +drawback that it must double the product inside the inner loop as well. As for multiplication, the Comba technique can be used to eliminate these +performance hazards. + +The first obvious solution is to make an array of mp\_words which will hold all of the columns. This will indeed eliminate all of the carry +propagation operations from the inner loop. However, the inner product must still be doubled $O(n^2)$ times. The solution stems from the simple fact +that $2a + 2b + 2c = 2(a + b + c)$. That is the sum of all of the double products is equal to double the sum of all the products. For example, +$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$. + +However, we cannot simply double all of the columns, since the squares appear only once per row. The most practical solution is to have two mp\_word +arrays. One array will hold the squares and the other array will hold the double products. With both arrays the doubling and carry propagation can be +moved to a $O(n)$ work level outside the $O(n^2)$ level. 
+ +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\ +\textbf{Input}. mp\_int $a$ \\ +\textbf{Output}. $b \leftarrow a^2$ \\ +\hline \\ +Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\ +1. If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits. (\textit{mp\_grow}). \\ +2. If step 1 failed return(\textit{MP\_MEM}). \\ +3. for $ix$ from $0$ to $2a.used + 1$ do \\ +\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\ +\hspace{3mm}3.2 $\hat {X}_{ix} \leftarrow 0$ \\ +4. for $ix$ from $0$ to $a.used - 1$ do \\ +\hspace{3mm}Compute the square.\\ +\hspace{3mm}4.1 $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\ +\\ +\hspace{3mm}Compute the double products.\\ +\hspace{3mm}4.2 for $iy$ from $ix + 1$ to $a.used - 1$ do \\ +\hspace{6mm}4.2.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\ +5. $oldused \leftarrow b.used$ \\ +6. $b.used \leftarrow 2a.used + 1$ \\ +\\ +Double the products and propagate the carries simultaneously. \\ +7. $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\ +8. for $ix$ from $1$ to $2a.used$ do \\ +\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\ +\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\ +\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\ +9. $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\ +10. if $2a.used + 1 < oldused$ then do \\ +\hspace{3mm}10.1 for $ix$ from $2a.used + 1$ to $oldused$ do \\ +\hspace{6mm}10.1.1 $b_{ix} \leftarrow 0$ \\ +11. Clamp excess digits from $b$. (\textit{mp\_clamp}) \\ +12. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm fast\_s\_mp\_sqr} +\end{figure} + +\textbf{Algorithm fast\_s\_mp\_sqr.} +This algorithm computes the square of an input using the Comba technique. It is designed to be a replacement for algorithm s\_mp\_sqr when +the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$. + +This routine requires two arrays of mp\_words to be placed on the stack. The first array $\hat W$ will hold the double products and the second +array $\hat X$ will hold the squares. Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most +processors to simply make it a full size array. + +The loop on step 3 will zero the two arrays to prepare them for the squaring step. Step 4.1 computes the squares of the product. Note how +it simply assigns the value into the $\hat X$ array. The nested loop on step 4.2 computes the doubles of the products. This loop +computes the sum of the products for each column. They are not doubled until later. + +After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards. It makes sense to do both +operations at the same time. The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the +squares in place. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* fast squaring +018 * +019 * This is the comba method where the columns of the product +020 * are computed first then the carries are computed. This +021 * has the effect of making a very simple inner loop that +022 * is executed the most +023 * +024 * W2 represents the outer products and W the inner. 
+025 * +026 * A further optimizations is made because the inner +027 * products are of the form "A * B * 2". The *2 part does +028 * not need to be computed until the end which is good +029 * because 64-bit shifts are slow! +030 * +031 * Based on Algorithm 14.16 on pp.597 of HAC. +032 * +033 */ +034 int fast_s_mp_sqr (mp_int * a, mp_int * b) +035 \{ +036 int olduse, newused, res, ix, pa; +037 mp_word W2[MP_WARRAY], W[MP_WARRAY]; +038 +039 /* calculate size of product and allocate as required */ +040 pa = a->used; +041 newused = pa + pa + 1; +042 if (b->alloc < newused) \{ +043 if ((res = mp_grow (b, newused)) != MP_OKAY) \{ +044 return res; +045 \} +046 \} +047 +048 /* zero temp buffer (columns) +049 * Note that there are two buffers. Since squaring requires +050 * a outer and inner product and the inner product requires +051 * computing a product and doubling it (a relatively expensive +052 * op to perform n**2 times if you don't have to) the inner and +053 * outer products are computed in different buffers. This way +054 * the inner product can be doubled using n doublings instead of +055 * n**2 +056 */ +057 memset (W, 0, newused * sizeof (mp_word)); +058 memset (W2, 0, newused * sizeof (mp_word)); +059 +060 /* This computes the inner product. To simplify the inner N**2 loop +061 * the multiplication by two is done afterwards in the N loop. +062 */ +063 for (ix = 0; ix < pa; ix++) \{ +064 /* compute the outer product +065 * +066 * Note that every outer product is computed +067 * for a particular column only once which means that +068 * there is no need todo a double precision addition +069 * into the W2[] array. +070 */ +071 W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]); +072 +073 \{ +074 register mp_digit tmpx, *tmpy; +075 register mp_word *_W; +076 register int iy; +077 +078 /* copy of left side */ +079 tmpx = a->dp[ix]; +080 +081 /* alias for right side */ +082 tmpy = a->dp + (ix + 1); +083 +084 /* the column to store the result in */ +085 _W = W + (ix + ix + 1); +086 +087 /* inner products */ +088 for (iy = ix + 1; iy < pa; iy++) \{ +089 *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); +090 \} +091 \} +092 \} +093 +094 /* setup dest */ +095 olduse = b->used; +096 b->used = newused; +097 +098 /* now compute digits +099 * +100 * We have to double the inner product sums, add in the +101 * outer product sums, propagate carries and convert +102 * to single precision. +103 */ +104 \{ +105 register mp_digit *tmpb; +106 +107 /* double first value, since the inner products are +108 * half of what they should be +109 */ +110 W[0] += W[0] + W2[0]; +111 +112 tmpb = b->dp; +113 for (ix = 1; ix < newused; ix++) \{ +114 /* double/add next digit */ +115 W[ix] += W[ix] + W2[ix]; +116 +117 /* propagate carry forwards [from the previous digit] */ +118 W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); +119 +120 /* store the current digit now that the carry isn't +121 * needed +122 */ +123 *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); +124 \} +125 /* set the last value. Note even if the carry is zero +126 * this is required since the next step will not zero +127 * it if b originally had a value at b->dp[2*a.used] +128 */ +129 *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); +130 +131 /* clear high digits of b if there were any originally */ +132 for (; ix < olduse; ix++) \{ +133 *tmpb++ = 0; +134 \} +135 \} +136 +137 mp_clamp (b); +138 return MP_OKAY; +139 \} +\end{alltt} +\end{small} + +-- Write something deep and insightful later, Tom. 
+ +\subsection{Polynomial Basis Squaring} +The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring. The minor exception +is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$. Instead of performing $2n + 1$ +multiplications to find the $\zeta$ relations, squaring operations are performed instead. + +\subsection{Karatsuba Squaring} +Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square. +Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial. The Karatsuba equation can be modified to square a +number with the following equation. + +\begin{equation} +h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2 +\end{equation} + +Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$. As in +Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of +$O \left ( n^{lg(3)} \right )$. + +If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm +instead? The answer to this arises from the cutoff point for squaring. As in multiplication there exists a cutoff point, at which the +time required for a Comba based squaring and a Karatsuba based squaring meet. Due to the overhead inherent in the Karatsuba method, the cutoff +point is fairly high. For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits. + +Consider squaring a 200 digit number with this technique. It will be split into two 100 digit halves which are subsequently squared. +The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm. If Karatsuba multiplication +were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\ +\textbf{Input}. mp\_int $a$ \\ +\textbf{Output}. $b \leftarrow a^2$ \\ +\hline \\ +1. Initialize the following temporary mp\_ints: $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\ +2. If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\ +\\ +Split the input. e.g. $a = x1\beta^B + x0$ \\ +3. $B \leftarrow \lfloor a.used / 2 \rfloor$ \\ +4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\ +5. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_lshd}) \\ +\\ +Calculate the three squares. \\ +6. $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\ +7. $x1x1 \leftarrow x1^2$ \\ +8. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\ +9. $t1 \leftarrow t1^2$ \\ +\\ +Compute the middle term. \\ +10. $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\ +11. $t1 \leftarrow t2 - t1$ \\ +\\ +Compute final product. \\ +12. $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\ +13. $x1x1 \leftarrow x1x1\beta^{2B}$ \\ +14. $t1 \leftarrow t1 + x0x0$ \\ +15. $b \leftarrow t1 + x1x1$ \\ +16. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_karatsuba\_sqr} +\end{figure} + +\textbf{Algorithm mp\_karatsuba\_sqr.} +This algorithm computes the square of an input $a$ using the Karatsuba technique. 
This algorithm is very similar to the Karatsuba based +multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings. + +The radix point for squaring is simply placed exactly in the middle of the digits when the input has an odd number of digits, otherwise it is +placed just below the middle. Step 3, 4 and 5 compute the two halves required using $B$ +as the radix point. The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form. + +By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$. +Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then +this method is faster. Assuming no further recursions occur, the difference can be estimated with the following inequality. + +Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or +machine clock cycles.}. + +\begin{equation} +5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2 +\end{equation} + +For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$. This implies that the following inequality should hold. +\begin{center} +\begin{tabular}{rcl} +${5n \over 3} + 3n^2 + 3n$ & $<$ & ${n \over 3} + 6n^2$ \\ +${5 \over 3} + 3n + 3$ & $<$ & ${1 \over 3} + 6n$ \\ +${13 \over 9}$ & $<$ & $n$ \\ +\end{tabular} +\end{center} + +This results in a cutoff point around $n = 2$. As a consequence it is actually faster to compute the middle term the ``long way'' on processors +where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication. On +the Intel P4 processor this ratio is 1:29 making this method even more beneficial. The only common exception is the ARMv4 processor which has a +ratio of 1:7. } than simpler operations such as addition. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_sqr.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* Karatsuba squaring, computes b = a*a using three +018 * half size squarings +019 * +020 * See comments of mp_karatsuba_mul for details. It +021 * is essentially the same algorithm but merely +022 * tuned to perform recursive squarings. 
+023 */ +024 int mp_karatsuba_sqr (mp_int * a, mp_int * b) +025 \{ +026 mp_int x0, x1, t1, t2, x0x0, x1x1; +027 int B, err; +028 +029 err = MP_MEM; +030 +031 /* min # of digits */ +032 B = a->used; +033 +034 /* now divide in two */ +035 B = B >> 1; +036 +037 /* init copy all the temps */ +038 if (mp_init_size (&x0, B) != MP_OKAY) +039 goto ERR; +040 if (mp_init_size (&x1, a->used - B) != MP_OKAY) +041 goto X0; +042 +043 /* init temps */ +044 if (mp_init_size (&t1, a->used * 2) != MP_OKAY) +045 goto X1; +046 if (mp_init_size (&t2, a->used * 2) != MP_OKAY) +047 goto T1; +048 if (mp_init_size (&x0x0, B * 2) != MP_OKAY) +049 goto T2; +050 if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY) +051 goto X0X0; +052 +053 \{ +054 register int x; +055 register mp_digit *dst, *src; +056 +057 src = a->dp; +058 +059 /* now shift the digits */ +060 dst = x0.dp; +061 for (x = 0; x < B; x++) \{ +062 *dst++ = *src++; +063 \} +064 +065 dst = x1.dp; +066 for (x = B; x < a->used; x++) \{ +067 *dst++ = *src++; +068 \} +069 \} +070 +071 x0.used = B; +072 x1.used = a->used - B; +073 +074 mp_clamp (&x0); +075 +076 /* now calc the products x0*x0 and x1*x1 */ +077 if (mp_sqr (&x0, &x0x0) != MP_OKAY) +078 goto X1X1; /* x0x0 = x0*x0 */ +079 if (mp_sqr (&x1, &x1x1) != MP_OKAY) +080 goto X1X1; /* x1x1 = x1*x1 */ +081 +082 /* now calc (x1-x0)**2 */ +083 if (mp_sub (&x1, &x0, &t1) != MP_OKAY) +084 goto X1X1; /* t1 = x1 - x0 */ +085 if (mp_sqr (&t1, &t1) != MP_OKAY) +086 goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */ +087 +088 /* add x0y0 */ +089 if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY) +090 goto X1X1; /* t2 = x0x0 + x1x1 */ +091 if (mp_sub (&t2, &t1, &t1) != MP_OKAY) +092 goto X1X1; /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */ +093 +094 /* shift by B */ +095 if (mp_lshd (&t1, B) != MP_OKAY) +096 goto X1X1; /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */ +097 if (mp_lshd (&x1x1, B * 2) != MP_OKAY) +098 goto X1X1; /* x1x1 = x1x1 << 2*B */ +099 +100 if (mp_add (&x0x0, &t1, &t1) != MP_OKAY) +101 goto X1X1; /* t1 = x0x0 + t1 */ +102 if (mp_add (&t1, &x1x1, b) != MP_OKAY) +103 goto X1X1; /* t1 = x0x0 + t1 + x1x1 */ +104 +105 err = MP_OKAY; +106 +107 X1X1:mp_clear (&x1x1); +108 X0X0:mp_clear (&x0x0); +109 T2:mp_clear (&t2); +110 T1:mp_clear (&t1); +111 X1:mp_clear (&x1); +112 X0:mp_clear (&x0); +113 ERR: +114 return err; +115 \} +\end{alltt} +\end{small} + +This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul. It uses the same inline style to copy and +shift the input into the two halves. The loop from line 53 to line 69 has been modified since only one input exists. The \textbf{used} +count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin. At this point $x1$ and $x0$ are valid equivalents +to the respective halves as if mp\_rshd and mp\_mod\_2d had been used. + +By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered. On the Athlon the cutoff point +is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}). On slower processors such as the Intel P4 +it is actually below the Comba limit (\textit{at 110 digits}). + +This routine uses the same error trap coding style as mp\_karatsuba\_sqr. As the temporary variables are initialized errors are redirected to +the error trap higher up. If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally. + +\textit{Last paragraph sucks. re-write! 
-- Tom} + +\subsection{Toom-Cook Squaring} +The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used +instead of multiplication to find the five relations.. The reader is encouraged to read the description of the latter algorithm and try to +derive their own Toom-Cook squaring algorithm. + +\subsection{High Level Squaring} +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_sqr}. \\ +\textbf{Input}. mp\_int $a$ \\ +\textbf{Output}. $b \leftarrow a^2$ \\ +\hline \\ +1. If $a.used \ge TOOM\_SQR\_CUTOFF$ then \\ +\hspace{3mm}1.1 $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\ +2. else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\ +\hspace{3mm}2.1 $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\ +3. else \\ +\hspace{3mm}3.1 $digs \leftarrow a.used + b.used + 1$ \\ +\hspace{3mm}3.2 If $digs < MP\_ARRAY$ and $a.used \le \delta$ then \\ +\hspace{6mm}3.2.1 $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr. \\ +\hspace{3mm}3.3 else \\ +\hspace{6mm}3.3.1 $b \leftarrow a^2$ using algorithm s\_mp\_sqr. \\ +4. $b.sign \leftarrow MP\_ZPOS$ \\ +5. Return the result of the unsigned squaring performed. \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_sqr} +\end{figure} + +\textbf{Algorithm mp\_sqr.} +This algorithm computes the square of the input using one of four different algorithms. If the input is very large and has at least +\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used. If +neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_sqr.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* computes b = a*a */ +018 int +019 mp_sqr (mp_int * a, mp_int * b) +020 \{ +021 int res; +022 +023 /* use Toom-Cook? */ +024 if (a->used >= TOOM_SQR_CUTOFF) \{ +025 res = mp_toom_sqr(a, b); +026 /* Karatsuba? */ +027 \} else if (a->used >= KARATSUBA_SQR_CUTOFF) \{ +028 res = mp_karatsuba_sqr (a, b); +029 \} else \{ +030 /* can we use the fast comba multiplier? */ +031 if ((a->used * 2 + 1) < MP_WARRAY && +032 a->used < +033 (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) \{ +034 res = fast_s_mp_sqr (a, b); +035 \} else \{ +036 res = s_mp_sqr (a, b); +037 \} +038 \} +039 b->sign = MP_ZPOS; +040 return res; +041 \} +\end{alltt} +\end{small} + +\section*{Exercises} +\begin{tabular}{cl} +$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\ + & that have different number of digits in Karatsuba multiplication. \\ + & \\ +$\left [ 3 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\ + & of double products and at most one square is stated. Prove this statement. \\ + & \\ +$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\ + & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\ + & \\ +$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\ + & \\ +$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\ + & \\ +$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\ + & required for equation $6.7$ to be true. 
\\ + & \\ +\end{tabular} + +\chapter{Modular Reduction} +\section{Basics of Modular Reduction} +\index{modular residue} +Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, +such as factoring. Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set. A number $a$ is said to be \textit{reduced} +modulo another number $b$ by finding the remainder of the division $a/b$. Full integer division with remainder is a topic to be covered +in~\ref{sec:division}. + +Modular reduction is equivalent to solving for $r$ in the following equation. $a = bq + r$ where $q = \lfloor a/b \rfloor$. The result +$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$. In other vernacular $r$ is known as the +``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and +other forms of residues. + +Modular reductions are normally used to create either finite groups, rings or fields. The most common usage for performance driven modular reductions +is in modular exponentiation algorithms. That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible. This operation is used in the +RSA and Diffie-Hellman public key algorithms, for example. Modular multiplication and squaring also appears as a fundamental operation in +Elliptic Curve cryptographic algorithms. As will be discussed in the subsequent chapter there exist fast algorithms for computing modular +exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications. These algorithms will produce partial results in the +range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms. They have also been used to create redundancy check +algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems. + +\section{The Barrett Reduction} +The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate +division. Barretts observation was that the residue $c$ of $a$ modulo $b$ is equal to + +\begin{equation} +c = a - b \cdot \lfloor a/b \rfloor +\end{equation} + +Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP\footnote{It is worth noting that Barrett's paper +targeted the DSP56K processor.} intuition would indicate the next step would be to replace $a/b$ by a multiplication by the reciprocal. However, +DSP intuition on its own will not work as these numbers are considerably larger than the precision of common DSP floating point data types. +It would take another common optimization to optimize the algorithm. + +\subsection{Fixed Point Arithmetic} +The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers. Fixed +point arithmetic would become very popular as it greatly optimize the ``3d-shooter'' genre of games in the mid 1990s when floating point units were +fairly slow if not unavailable. The idea behind fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit +integer and a $q$-bit fraction part (\textit{where $p+q = k$}). + +In this system a $k$-bit integer $n$ would actually represent $n/2^q$. 
For example, with $q = 4$ the integer $n = 37$ would actually represent the +value $2.3125$. To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized by +moving the implied decimal point back to where it should be. For example, with $q = 4$ to multiply the integers $9$ and $5$ they must be converted +to fixed point first by multiplying by $2^q$. Let $a = 9(2^q)$ represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the +fixed point representation of $5$. The product $ab$ is equal to $45(2^{2q})$ which when normalized by dividing by $2^q$ produces $45(2^q)$. + +This technique became popular since a normal integer multiplication and logical shift right are the only required operations to perform a multiplication +of two fixed point numbers. Using fixed point arithmetic, division can be easily approximated by multiplying by the reciprocal. If $2^q$ is +equivalent to one than $2^q/b$ is equivalent to the fixed point approximation of $1/b$ using real arithmetic. Using this fact dividing an integer +$a$ by another integer $b$ can be achieved with the following expression. + +\begin{equation} +\lfloor a / b \rfloor \mbox{ }\approx\mbox{ } \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor +\end{equation} + +The precision of the division is proportional to the value of $q$. If the divisor $b$ is used frequently as is the case with +modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift. Both operations +are considerably faster than division on most processors. + +Consider dividing $19$ by $5$. The correct result is $\lfloor 19/5 \rfloor = 3$. With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which +leads to a product of $19$ which when divided by $2^q$ produces $2$. However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and +the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct. The value of $2^q$ must be close to or ideally +larger than the dividend. In effect if $a$ is the dividend then $q$ should allow $0 \le \lfloor a/2^q \rfloor \le 1$ in order for this approach +to work correctly. Plugging this form of divison into the original equation the following modular residue equation arises. + +\begin{equation} +c = a - b \cdot \lfloor (a \cdot \lfloor 2^q / b \rfloor)/2^q \rfloor +\end{equation} + +Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol. Using the $\mu$ +variable also helps re-inforce the idea that it is meant to be computed once and re-used. + +\begin{equation} +c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor +\end{equation} + +Provided that $2^q \ge a$ this algorithm will produce a quotient that is either exactly correct or off by a value of one. In the context of Barrett +reduction the value of $a$ is bound by $0 \le a \le (b - 1)^2$ meaning that $2^q \ge b^2$ is sufficient to ensure the reciprocal will have enough +precision. + +Let $n$ represent the number of digits in $b$. This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and +another $n^2$ single precision multiplications to find the residue. In total $3n^2$ single precision multiplications are required to +reduce the number. + +For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$. 
Consider reducing +$a = 180388626447$ modulo $b$ using the above reduction equation. The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$. +By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found. + +\subsection{Choosing a Radix Point} +Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications. If that were the best +that could be achieved a full division\footnote{A division requires approximately $O(2cn^2)$ single precision multiplications for a small value of $c$. +See~\ref{sec:division} for further details.} might as well be used in its place. The key to optimizing the reduction is to reduce the precision of +the initial multiplication that finds the quotient. + +Let $a$ represent the number of which the residue is sought. Let $b$ represent the modulus used to find the residue. Let $m$ represent +the number of digits in $b$. For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$, which is generally true if +two $m$-digit numbers have been multiplied. Dividing $a$ by $b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer. Digits below the +$m - 1$'th digit of $a$ will contribute at most a value of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$. Another way to +express this is by re-writing $a$ as two parts. If $a' \equiv a \mbox{ (mod }b^m\mbox{)}$ and $a'' = a - a'$ then +${a \over b} \equiv {{a' + a''} \over b}$ which is equivalent to ${a' \over b} + {a'' \over b}$. Since $a'$ is bound to be less than $b$ the quotient +is bound by $0 \le {a' \over b} < 1$. + +Since the digits of $a'$ do not contribute much to the quotient the observation is that they might as well be zero. However, if the digits +``might as well be zero'' they might as well not be there in the first place. Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input +with the irrelevant digits trimmed. Now the modular reduction is trimmed to the almost equivalent equation + +\begin{equation} +c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor +\end{equation} + +Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$ where in this case $q$ is a multiple of $lg(\beta)$. Also note that the +exponent on the divisor when added to the amount $q_0$ was shifted by equals $2m$. If the optimization had not been performed the divisor +would have the exponent $2m$ so in the end the exponents do ``add up''. Using the above equation the quotient +$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most two. The original fixed point quotient can be off +by as much as one (\textit{provided the radix point is chosen suitably}) and now that the lower irrelevent digits have been trimmed the quotient +can be off by an additional value of one for a total of at most two. This implies that +$0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. By first subtracting $b$ times the quotient and then conditionally subtracting +$b$ once or twice the residue is found. + +The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single +precision multiplications, ignoring the subtractions required. In total $2m^2 + m$ single precision multiplications are required to find the residue. +This is considerably faster than the original attempt. 
+ +For example, let $\beta = 10$ represent the radix of the digits. Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ +represent the value of which the residue is desired. In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$. +With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$. The quotient is then +$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$. Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ +is found. + +\subsection{Trimming the Quotient} +So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications. As +it stands now the algorithm is already fairly fast compared to a full integer division algorithm. However, there is still room for +optimization. + +After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower +half of the product. It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision +multiplications. If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly. +In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed. + +The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number. Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision +multiplications would be required. Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number +of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications. + +\subsection{Trimming the Residue} +After the quotient has been calculated it is used to reduce the input. As previously noted the algorithm is not exact and it can be off by a small +multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. If $b$ is $m$ digits than the +result of reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are +implicitly zero. + +The next optimization arises from this very fact. Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full +$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed. Similarly the value of $a$ can +be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifes the subtraction as well. A multiplication that produces +only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications. + +With both optimizations in place the algorithm is the algorithm Barrett proposed. It requires $m^2 + 2m - 1$ single precision multiplications which +is considerably faster than the straightforward $3m^2$ method. + +\subsection{The Barrett Algorithm} +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_reduce}. \\ +\textbf{Input}. mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor, m = \lceil lg_{\beta}(b) \rceil, (0 \le a < b^2, b > 1)$ \\ +\textbf{Output}. $a \mbox{ (mod }b\mbox{)}$ \\ +\hline \\ +Let $m$ represent the number of digits in $b$. \\ +1. 
Make a copy of $a$ and store it in $q$. (\textit{mp\_init\_copy}) \\ +2. $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\ +\\ +Produce the quotient. \\ +3. $q \leftarrow q \cdot \mu$ (\textit{note: only produce digits at or above $m-1$}) \\ +4. $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\ +\\ +Subtract the multiple of modulus from the input. \\ +5. $a \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\ +6. $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\ +7. $a \leftarrow a - q$ (\textit{mp\_sub}) \\ +\\ +Add $\beta^{m+1}$ if a carry occured. \\ +8. If $a < 0$ then (\textit{mp\_cmp\_d}) \\ +\hspace{3mm}8.1 $q \leftarrow 1$ (\textit{mp\_set}) \\ +\hspace{3mm}8.2 $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\ +\hspace{3mm}8.3 $a \leftarrow a + q$ \\ +\\ +Now subtract the modulus if the residue is too large (e.g. quotient too small). \\ +9. While $a \ge b$ do (\textit{mp\_cmp}) \\ +\hspace{3mm}9.1 $c \leftarrow a - b$ \\ +10. Clear $q$. \\ +11. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_reduce} +\end{figure} + +\textbf{Algorithm mp\_reduce.} +This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm. It is loosely based on algorithm 14.42 of HAC +\cite[pp. 602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}. The algorithm has several restrictions and assumptions which must +be adhered to for the algorithm to work. + +First the modulus $b$ is assumed to be positive and greater than one. If the modulus were less than or equal to one than subtracting +a multiple of it would either accomplish nothing or actually enlarge the input. The input $a$ must be in the range $0 \le a < b^2$ in order +for the quotient to have enough precision. If $a$ is the product of two numbers that were already reduced modulo $b$, this will not be a problem. +Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish. The value of $\mu$ is passed as an argument to this +algorithm and is assumed to be calculated and stored before the algorithm is used. + +Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position. An algorithm called +$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task. The algorithm is based on $s\_mp\_mul\_digs$ except that +instead of stopping at a given level of precision it starts at a given level of precision. This optimal algorithm can only be used if the number +of digits in $b$ is very much smaller than $\beta$. + +While it is known that +$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue, so an implied +``borrow'' from the higher digits might leave a negative result. After the multiple of the modulus has been subtracted from $a$ the residue must be +fixed up in case it is negative. The invariant $\beta^{m+1}$ must be added to the residue to make it positive again. + +The while loop at step 9 will subtract $b$ until the residue is less than $b$. If the algorithm is performed correctly this step is +performed at most twice, and on average once. However, if $a \ge b^2$ than it will iterate substantially more times than it should. 
+ +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* reduces x mod m, assumes 0 < x < m**2, mu is +018 * precomputed via mp_reduce_setup. +019 * From HAC pp.604 Algorithm 14.42 +020 */ +021 int +022 mp_reduce (mp_int * x, mp_int * m, mp_int * mu) +023 \{ +024 mp_int q; +025 int res, um = m->used; +026 +027 /* q = x */ +028 if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{ +029 return res; +030 \} +031 +032 /* q1 = x / b**(k-1) */ +033 mp_rshd (&q, um - 1); +034 +035 /* according to HAC this optimization is ok */ +036 if (((unsigned long) um) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{ +037 if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{ +038 goto CLEANUP; +039 \} +040 \} else \{ +041 if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{ +042 goto CLEANUP; +043 \} +044 \} +045 +046 /* q3 = q2 / b**(k+1) */ +047 mp_rshd (&q, um + 1); +048 +049 /* x = x mod b**(k+1), quick (no division) */ +050 if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{ +051 goto CLEANUP; +052 \} +053 +054 /* q = q * m mod b**(k+1), quick (no division) */ +055 if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{ +056 goto CLEANUP; +057 \} +058 +059 /* x = x - q */ +060 if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{ +061 goto CLEANUP; +062 \} +063 +064 /* If x < 0, add b**(k+1) to it */ +065 if (mp_cmp_d (x, 0) == MP_LT) \{ +066 mp_set (&q, 1); +067 if ((res = mp_lshd (&q, um + 1)) != MP_OKAY) +068 goto CLEANUP; +069 if ((res = mp_add (x, &q, x)) != MP_OKAY) +070 goto CLEANUP; +071 \} +072 +073 /* Back off if it's too big */ +074 while (mp_cmp (x, m) != MP_LT) \{ +075 if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{ +076 goto CLEANUP; +077 \} +078 \} +079 +080 CLEANUP: +081 mp_clear (&q); +082 +083 return res; +084 \} +\end{alltt} +\end{small} + +The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up. This essentially halves +the number of single precision multiplications required. However, the optimization is only safe if $\beta$ is much larger than the number of digits +in the modulus. In the source code this is evaluated on lines 36 to 44 where algorithm s\_mp\_mul\_high\_digs is used when it is +safe to do so. + +\subsection{The Barrett Setup Algorithm} +In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance. Ideally this value should be computed once and stored for +future use so that the Barrett algorithm can be used without delay. + +\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_reduce\_setup}. \\ +\textbf{Input}. mp\_int $a$ ($a > 1$) \\ +\textbf{Output}. $\mu \leftarrow \lfloor \beta^{2m}/a \rfloor$ \\ +\hline \\ +1. $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot m}$ (\textit{mp\_2expt}) \\ +2. $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\ +3. Return(\textit{MP\_OKAY}) \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_reduce\_setup} +\end{figure} + +\textbf{Algorithm mp\_reduce\_setup.} +This algorithm computes the reciprocal $\mu$ required for Barrett reduction. First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot m}$ which +is equivalent and much faster. The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$. 
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* pre-calculate the value required for Barrett reduction
+018  * For a given modulus "b" it calculates the value required in "a"
+019  */
+020 int
+021 mp_reduce_setup (mp_int * a, mp_int * b)
+022 \{
+023   int res;
+024 
+025   if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) \{
+026     return res;
+027   \}
+028   return mp_div (a, b, a, NULL);
+029 \}
+\end{alltt}
+\end{small}
+
+This simple routine calculates the reciprocal $\mu$ required by Barrett reduction. Note the extended usage of algorithm mp\_div where the variable
+which would receive the remainder is passed as NULL. As will be discussed in~\ref{sec:division} the division routine allows both the quotient and the
+remainder to be passed as NULL meaning to ignore the value.
+
+\section{The Montgomery Reduction}
+Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting
+form of reduction in common use. It computes a modular residue which is not actually equal to the residue of the input but is instead equal to the
+residue multiplied by a constant. However, as perplexing as this may sound the algorithm is relatively simple and very efficient.
+
+Throughout this entire section the variable $n$ will represent the modulus used to form the residue. As will be discussed shortly the value of
+$n$ must be odd. The variable $x$ will represent the quantity of which the residue is sought. Similar to the Barrett algorithm the input
+is restricted to $0 \le x < n^2$. To begin the description some simple number theory facts must be established.
+
+\textbf{Fact 1.} Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$. Another way
+to explain this is that $n$ (\textit{or any multiple of $n$}) is congruent to zero modulo $n$, and adding zero will not change the value of the residue.
+
+\textbf{Fact 2.} If $x$ is even then dividing it by two in $\Z$ produces a result congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$. Actually
+this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division by $k$ in $\Z$ will be congruent to
+multiplication by $k^{-1}$ modulo $n$.
+
+From these two facts the following algorithm can be derived.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction}. \\
+\textbf{Input}. Integer $x$, $n$ and $k$ \\
+\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1. for $t$ from $1$ to $k$ do \\
+\hspace{3mm}1.1 If $x$ is odd then \\
+\hspace{6mm}1.1.1 $x \leftarrow x + n$ \\
+\hspace{3mm}1.2 $x \leftarrow x/2$ \\
+2. Return $x$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction}
+\end{figure}
+
+The algorithm reduces the input one bit at a time using the two congruences stated previously. Inside the loop $n$, which is odd, is
+added to $x$ if $x$ is odd. This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two. Since
+$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$. Let $r$ represent the
+final result of the Montgomery algorithm. If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to
+$0 \le r < \lfloor x/2^k \rfloor + n$.
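+
+To make the bit-wise method concrete, here is a small self-contained sketch in plain C on machine integers. The function is illustrative
+only; it assumes $n$ is odd and that $x + n$ never overflows an unsigned long.
+
+\begin{small}
+\begin{alltt}
+/* bit-wise Montgomery reduction: returns a value congruent
+ * to 2**(-k) * x (mod n); n must be odd
+ */
+unsigned long mont_bitwise(unsigned long x, unsigned long n, int k)
+\{
+   int t;
+
+   for (t = 0; t < k; t++) \{
+      if (x & 1) \{
+         x += n;   /* add the modulus to force x even */
+      \}
+      x >>= 1;     /* exact division by two */
+   \}
+
+   /* at most one final subtraction is required */
+   return (x >= n) ? x - n : x;
+\}
+\end{alltt}
+\end{small}
+
+For instance, mont\_bitwise(5555, 257, 8) returns $99$, tracing exactly the example worked in the figure below.
+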
Because of this bound, at most a single subtraction is required to obtain the desired residue.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
+\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
+\hline $2$ & $x/2 = 1453$ \\
+\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
+\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
+\hline $5$ & $x/2 = 278$ \\
+\hline $6$ & $x/2 = 139$ \\
+\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
+\hline $8$ & $x/2 = 99$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (I)}
+\label{fig:MONT1}
+\end{figure}
+
+Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$. The result of the algorithm $r = 99$ is
+congruent to the value of $2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$. When $r$ is multiplied by $2^8$ modulo $257$ the correct residue
+$r \equiv 158$ is produced.
+
+Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$. The current algorithm requires $2k^2$ single precision shifts
+and $k^2$ single precision additions. At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.
+Fortunately there exists an alternative representation of the algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
+\textbf{Input}. Integer $x$, $n$ and $k$ \\
+\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1. for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 If the $t$'th bit of $x$ is one then \\
+\hspace{6mm}1.1.1 $x \leftarrow x + 2^tn$ \\
+2. Return $x/2^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified I)}
+\end{figure}
+
+This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2. The number of single
+precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|l|r|}
+\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} & \textbf{Result ($x$) in Binary} \\
+\hline -- & $5555$ & $1010110110011$ \\
+\hline $1$ & $x + 2^{0}n = 5812$ & $1011010110100$ \\
+\hline $2$ & $5812$ & $1011010110100$ \\
+\hline $3$ & $x + 2^{2}n = 6840$ & $1101010111000$ \\
+\hline $4$ & $x + 2^{3}n = 8896$ & $10001011000000$ \\
+\hline $5$ & $8896$ & $10001011000000$ \\
+\hline $6$ & $8896$ & $10001011000000$ \\
+\hline $7$ & $x + 2^{6}n = 25344$ & $110001100000000$ \\
+\hline $8$ & $25344$ & $110001100000000$ \\
+\hline -- & $x/2^k = 99$ & \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example of Montgomery Reduction (II)}
+\label{fig:MONT2}
+\end{figure}
+
+Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$.
+With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the
+loop. Note that for iterations $t = 2, 5, 6$ and $8$ the result $x$ is not changed. In those iterations the $t$'th bit of $x$ is
+zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.
+
+\subsection{Digit Based Montgomery Reduction}
+Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on a digit-by-digit basis.
Consider the
+previous algorithm re-written to compute the Montgomery reduction in this new fashion.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
+\textbf{Input}. Integer $x$, $n$ and $k$ \\
+\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1. for $t$ from $0$ to $k - 1$ do \\
+\hspace{3mm}1.1 $x \leftarrow x + \mu n \beta^t$ \\
+2. Return $x/\beta^k$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Montgomery Reduction (modified II)}
+\end{figure}
+
+The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue. If the first digit of
+the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit. This
+problem breaks down to solving the following congruence.
+
+\begin{center}
+\begin{tabular}{rcl}
+$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
+$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\end{tabular}
+\end{center}
+
+In each iteration of the loop on step 1 a new value of $\mu$ must be calculated. The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used
+extensively in this algorithm and should be precomputed. Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.
+
+For example, let $\beta = 10$ represent the radix. Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$. Let $x = 33$
+represent the value to reduce.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
+\hline -- & $33$ & --\\
+\hline $0$ & $33 + \mu n = 50$ & $1$ \\
+\hline $1$ & $50 + \mu n \beta = 900$ & $5$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Montgomery Reduction}
+\end{figure}
+
+The result $900$ is then divided by $\beta^k$ to produce the final value $9$. The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$
+which implies the result is not the modular residue of $x$ modulo $n$. However, recall that the residue is actually multiplied by $\beta^{-k}$ in
+the algorithm. To get the true residue the value must be multiplied by $\beta^k$. In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
+the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.
+
+\subsection{Baseline Montgomery Reduction}
+The baseline Montgomery reduction algorithm will produce the residue for any size input. It is designed to be a catch-all algorithm for
+Montgomery reductions.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
+\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+1. $digs \leftarrow 2n.used + 1$ \\
+2. If $digs < MP\_WARRAY$ and $n.used < \delta$ then \\
+\hspace{3mm}2.1 Use algorithm fast\_mp\_montgomery\_reduce instead. \\
+\\
+Setup $x$ for the reduction. \\
+3. If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
+4. $x.used \leftarrow digs$ \\
+\\
+Eliminate the lower $k$ digits. \\
+5. For $ix$ from $0$ to $k - 1$ do \\
+\hspace{3mm}5.1 $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}5.2 $u \leftarrow 0$ \\
+\hspace{3mm}5.3 For $iy$ from $0$ to $k - 1$ do \\
+\hspace{6mm}5.3.1 $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
+\hspace{6mm}5.3.2 $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.3.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.4 While $u > 0$ do \\
+\hspace{6mm}5.4.1 $iy \leftarrow iy + 1$ \\
+\hspace{6mm}5.4.2 $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
+\hspace{6mm}5.4.3 $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
+\hspace{6mm}5.4.4 $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Divide by $\beta^k$ and fix up as required. \\
+6. $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
+7. If $x \ge n$ then \\
+\hspace{3mm}7.1 $x \leftarrow x - n$ \\
+8. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_reduce.}
+This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm. The algorithm is loosely based
+on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop. The
+restrictions on this algorithm are fairly easy to adapt to. First $0 \le x < n^2$ bounds the input to numbers in the same range as
+for the Barrett algorithm. Additionally if $n > 1$ and $n$ is odd there will exist a modular inverse $\rho$, which must be calculated in
+advance of this algorithm. Finally the variable $k$ is fixed, being a pseudonym for $n.used$.
+
+Step 2 decides whether a faster Montgomery algorithm can be used. It is based on the Comba technique meaning that there are limits on
+the size of the input. This algorithm is discussed in sub-section 6.3.3.
+
+Step 5 is the main reduction loop of the algorithm. The value of $\mu$ is calculated once per iteration in the outer loop. The inner loop
+calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits. Both the addition and
+multiplication are performed in the same loop to save time and memory. Step 5.4 will handle any additional carries that escape the inner loop.
+
+A quick inspection shows that this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision
+multiplications in the inner loop, for a total of $n^2 + n$ single precision multiplications. This compares favourably to Barrett which
+requires $n^2 + 2n - 1$ single precision multiplications.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* computes xR**-1 == x (mod N) via Montgomery Reduction */
+018 int
+019 mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+020 \{
+021   int      ix, res, digs;
+022   mp_digit mu;
+023 
+024   /* can the fast reduction [comba] method be used?
+025    *
+026    * Note that unlike in mp_mul you're safely allowed *less*
+027    * than the available columns [255 per default] since carries
+028    * are fixed up in the inner loop.
+029    */
+030   digs = n->used * 2 + 1;
+031   if ((digs < MP_WARRAY) &&
+032       n->used <
+033       (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+034     return fast_mp_montgomery_reduce (x, n, rho);
+035   \}
+036 
+037   /* grow the input as required */
+038   if (x->alloc < digs) \{
+039     if ((res = mp_grow (x, digs)) != MP_OKAY) \{
+040       return res;
+041     \}
+042   \}
+043   x->used = digs;
+044 
+045   for (ix = 0; ix < n->used; ix++) \{
+046     /* mu = ai * rho mod b
+047      *
+048      * The value of rho must be precalculated via
+049      * bn_mp_montgomery_setup() such that
+050      * it equals -1/n0 mod b this allows the
+051      * following inner loop to reduce the
+052      * input one digit at a time
+053      */
+054     mu = (mp_digit) (((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK);
+055 
+056     /* a = a + mu * m * b**i */
+057     \{
+058       register int iy;
+059       register mp_digit *tmpn, *tmpx, u;
+060       register mp_word r;
+061 
+062       /* alias for digits of the modulus */
+063       tmpn = n->dp;
+064 
+065       /* alias for the digits of x [the input] */
+066       tmpx = x->dp + ix;
+067 
+068       /* set the carry to zero */
+069       u = 0;
+070 
+071       /* Multiply and add in place */
+072       for (iy = 0; iy < n->used; iy++) \{
+073         /* compute product and sum */
+074         r = ((mp_word)mu) * ((mp_word)*tmpn++) +
+075             ((mp_word) u) + ((mp_word) * tmpx);
+076 
+077         /* get carry */
+078         u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+079 
+080         /* fix digit */
+081         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
+082       \}
+083       /* At this point the ix'th digit of x should be zero */
+084 
+085 
+086       /* propagate carries upwards as required*/
+087       while (u) \{
+088         *tmpx += u;
+089         u = *tmpx >> DIGIT_BIT;
+090         *tmpx++ &= MP_MASK;
+091       \}
+092     \}
+093   \}
+094 
+095   /* at this point the n.used'th least
+096    * significant digits of x are all zero
+097    * which means we can shift x to the
+098    * right by n.used digits and the
+099    * residue is unchanged.
+100    */
+101 
+102   /* x = x/b**n.used */
+103   mp_clamp(x);
+104   mp_rshd (x, n->used);
+105 
+106   /* if x >= n then x = x - n */
+107   if (mp_cmp_mag (x, n) != MP_LT) \{
+108     return s_mp_sub (x, n, x);
+109   \}
+110 
+111   return MP_OKAY;
+112 \}
+\end{alltt}
+\end{small}
+
+This is the baseline implementation of the Montgomery reduction algorithm. Lines 30 to 35 determine if the Comba based
+routine can be used instead. Line 54 computes the value of $\mu$ for that particular iteration of the outer loop.
+
+The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop. The alias $tmpx$ refers to the $ix$'th digit of $x$ and
+the alias $tmpn$ refers to the modulus $n$.
+
+\subsection{Faster ``Comba'' Montgomery Reduction}
+
+The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however as presented it is
+slower in practice due to the serial nature of the inner loop. The Barrett reduction algorithm requires two slightly modified multipliers
+which can be implemented with the Comba technique. The Montgomery reduction algorithm cannot directly use the Comba technique to any
+significant advantage since the inner loop calculates a $k \times 1$ product $k$ times.
+
+The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$. This means the
+carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit. The solution turns out to be simple: perform a
+Comba style multiplication and, inside the outer loop just after the inner loop, fix up the $ix + 1$'th digit by forwarding the carry. A toy
+sketch of this fix-up follows.
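+
+The following toy illustration uses hypothetical 8-bit digits held in 32-bit columns (not the library's actual types) to show that the
+fix-up amounts to a single statement after the inner loop.
+
+\begin{small}
+\begin{alltt}
+typedef unsigned char toy_digit;  /* hypothetical beta = 2**8 digits  */
+typedef unsigned int  toy_word;   /* columns wide enough for products */
+
+/* one outer iteration: add mu * n * beta**ix into the columns W,
+ * then forward one carry so W[ix+1] is a valid digit for the next mu
+ */
+void toy_outer_step(toy_word *W, toy_digit *n, int used,
+                    toy_digit rho, int ix)
+\{
+   int iy;
+
+   /* mu = W[ix] * rho mod beta; the cast performs the mod */
+   toy_digit mu = (toy_digit)(W[ix] * rho);
+
+   /* Comba style inner loop: no carries are propagated here */
+   for (iy = 0; iy < used; iy++) \{
+      W[ix + iy] += (toy_word)mu * n[iy];
+   \}
+
+   /* the single carry fix-up; W[ix] itself will be discarded */
+   W[ix + 1] += W[ix] >> 8;
+\}
+\end{alltt}
+\end{small}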
+
+With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
+the speed of the algorithm.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
+\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
+\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
+\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
+1. if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
+Copy the digits of $x$ into the array $\hat W$ \\
+2. For $ix$ from $0$ to $x.used - 1$ do \\
+\hspace{3mm}2.1 $\hat W_{ix} \leftarrow x_{ix}$ \\
+3. For $ix$ from $x.used$ to $2n.used - 1$ do \\
+\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\
+Eliminate the lower $k$ digits. \\
+4. for $ix$ from $0$ to $n.used - 1$ do \\
+\hspace{3mm}4.1 $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.2 For $iy$ from $0$ to $n.used - 1$ do \\
+\hspace{6mm}4.2.1 $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
+\hspace{3mm}4.3 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Propagate carries upwards. \\
+5. for $ix$ from $n.used$ to $2n.used + 1$ do \\
+\hspace{3mm}5.1 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
+Shift right and reduce modulo $\beta$ simultaneously. \\
+6. for $ix$ from $0$ to $n.used + 1$ do \\
+\hspace{3mm}6.1 $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
+Zero excess digits and fixup $x$. \\
+7. if $x.used > n.used + 1$ then do \\
+\hspace{3mm}7.1 for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
+\hspace{6mm}7.1.1 $x_{ix} \leftarrow 0$ \\
+8. $x.used \leftarrow n.used + 1$ \\
+9. Clamp excess digits of $x$. \\
+10. If $x \ge n$ then \\
+\hspace{3mm}10.1 $x \leftarrow x - n$ \\
+11. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_mp\_montgomery\_reduce}
+\end{figure}
+
+\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
+This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique. It is on most computer platforms significantly
+faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}). The algorithm has the same restrictions
+on the input as the baseline reduction algorithm. Two additional restrictions are imposed on this algorithm: the number of digits $k$ in the
+modulus $n$ must not violate $MP\_WARRAY > 2k + 1$ and $k < \delta$. When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
+a modulus of at most $3,556$ bits in length.
+
+As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product. It is initially filled with the
+contents of $x$ with the excess digits zeroed. The reduction loop is at heart very similar to the baseline loop. The multiplication on step
+4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$. Some multipliers such
+as those on the ARM processors take a variable length of time to complete depending on the number of bytes of result they must produce.
By performing
+a single precision multiplication instead, only half as much time is spent.
+
+Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work. That is what step
+4.3 will do. In effect over the $n.used$ iterations of the outer loop the $n.used$ lower columns all have their carries propagated forwards. Note
+how the upper bits of those same words are not reduced modulo $\beta$. This is because those values will be discarded shortly and there is no
+point in reducing them.
+
+Step 5 will propagate the remainder of the carries upwards. On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
+stored in the destination $x$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_mp\_montgomery\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* computes xR**-1 == x (mod N) via Montgomery Reduction
+018  *
+019  * This is an optimized implementation of mp_montgomery_reduce
+020  * which uses the comba method to quickly calculate the columns of the
+021  * reduction.
+022  *
+023  * Based on Algorithm 14.32 on pp.601 of HAC.
+024 */
+025 int
+026 fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
+027 \{
+028   int     ix, res, olduse;
+029   mp_word W[MP_WARRAY];
+030 
+031   /* get old used count */
+032   olduse = x->used;
+033 
+034   /* grow a as required */
+035   if (x->alloc < n->used + 1) \{
+036     if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
+037       return res;
+038     \}
+039   \}
+040 
+041   /* first we have to get the digits of the input into
+042    * an array of double precision words W[...]
+043    */
+044   \{
+045     register mp_word *_W;
+046     register mp_digit *tmpx;
+047 
+048     /* alias for the W[] array */
+049     _W = W;
+050 
+051     /* alias for the digits of x */
+052     tmpx = x->dp;
+053 
+054     /* copy the digits of a into W[0..a->used-1] */
+055     for (ix = 0; ix < x->used; ix++) \{
+056       *_W++ = *tmpx++;
+057     \}
+058 
+059     /* zero the high words of W[a->used..m->used*2] */
+060     for (; ix < n->used * 2 + 1; ix++) \{
+061       *_W++ = 0;
+062     \}
+063   \}
+064 
+065   /* now we proceed to zero successive digits
+066    * from the least significant upwards
+067    */
+068   for (ix = 0; ix < n->used; ix++) \{
+069     /* mu = ai * m' mod b
+070      *
+071      * We avoid a double precision multiplication (which isn't required)
+072      * by casting the value down to a mp_digit.  Note this requires
+073      * that W[ix-1] have the carry cleared (see after the inner loop)
+074      */
+075     register mp_digit mu;
+076     mu = (mp_digit) (((W[ix] & MP_MASK) * rho) & MP_MASK);
+077 
+078     /* a = a + mu * m * b**i
+079      *
+080      * This is computed in place and on the fly.  The multiplication
+081      * by b**i is handled by offsetting which columns the results
+082      * are added to.
+083      *
+084      * Note the comba method normally doesn't handle carries in the
+085      * inner loop.  In this case we fix the carry from the previous
+086      * column since the Montgomery reduction requires digits of the
+087      * result (so far) [see above] to work.  This is
+088      * handled by fixing up one carry after the inner loop.  The
+089      * carry fixups are done in order so after these loops the
+090      * first m->used words of W[] have the carries fixed
+091      */
+092     \{
+093       register int iy;
+094       register mp_digit *tmpn;
+095       register mp_word *_W;
+096 
+097       /* alias for the digits of the modulus */
+098       tmpn = n->dp;
+099 
+100       /* Alias for the columns set by an offset of ix */
+101       _W = W + ix;
+102 
+103       /* inner loop */
+104       for (iy = 0; iy < n->used; iy++) \{
+105         *_W++ += ((mp_word)mu) * ((mp_word)*tmpn++);
+106       \}
+107     \}
+108 
+109     /* now fix carry for next digit, W[ix+1] */
+110     W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+111   \}
+112 
+113   /* now we have to propagate the carries and
+114    * shift the words downward [all those least
+115    * significant digits we zeroed].
+116    */
+117   \{
+118     register mp_digit *tmpx;
+119     register mp_word *_W, *_W1;
+120 
+121     /* now fix rest of carries */
+122 
+123     /* alias for current word */
+124     _W1 = W + ix;
+125 
+126     /* alias for next word, where the carry goes */
+127     _W = W + ++ix;
+128 
+129     for (; ix <= n->used * 2 + 1; ix++) \{
+130       *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+131     \}
+132 
+133     /* copy out, A = A/b**n
+134      *
+135      * The result is A/b**n but instead of converting from an
+136      * array of mp_word to mp_digit than calling mp_rshd
+137      * we just copy them in the right order
+138      */
+139 
+140     /* alias for destination word */
+141     tmpx = x->dp;
+142 
+143     /* alias for shifted double precision result */
+144     _W = W + n->used;
+145 
+146     for (ix = 0; ix < n->used + 1; ix++) \{
+147       *tmpx++ = (mp_digit)(*_W++ & ((mp_word) MP_MASK));
+148     \}
+149 
+150     /* zero oldused digits, if the input a was larger than
+151      * m->used+1 we'll have to clear the digits
+152      */
+153     for (; ix < olduse; ix++) \{
+154       *tmpx++ = 0;
+155     \}
+156   \}
+157 
+158   /* set the max used and clamp */
+159   x->used = n->used + 1;
+160   mp_clamp (x);
+161 
+162   /* if A >= m then A = A - m */
+163   if (mp_cmp_mag (x, n) != MP_LT) \{
+164     return s_mp_sub (x, n, x);
+165   \}
+166   return MP_OKAY;
+167 \}
+\end{alltt}
+\end{small}
+
+The $\hat W$ array is first filled with the digits of $x$ by the loop on line 55, then the remaining words are zeroed by the loop on
+line 60. Both loops share the same alias variables to make the code easier to read.
+
+The value of $\mu$ is calculated in an interesting fashion. First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit. This
+forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision. Line 110 fixes the carry
+for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
+
+The for loop on line 129 propagates the rest of the carries upwards through the columns. The for loop on line 146 reduces the columns
+modulo $\beta$ and shifts them $k$ places at the same time. The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
+digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.
+
+\subsection{Montgomery Setup}
+To calculate the variable $\rho$ a relatively simple algorithm is required.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
+\textbf{Input}. mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
+\textbf{Output}. $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
+\hline \\
+1. $b \leftarrow n_0$ \\
+2. If $b$ is even return(\textit{MP\_VAL}) \\
+3. $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
+4. for $k$ from 0 to $\lceil lg(lg(\beta)) \rceil - 2$ do \\
+\hspace{3mm}4.1 $x \leftarrow x \cdot (2 - bx)$ \\
+5. $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
+6. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_montgomery\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_montgomery\_setup.}
+This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms. It uses a very interesting trick
+to calculate $1/n_0$ when $\beta$ is a power of two.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* setups the montgomery reduction stuff */
+018 int
+019 mp_montgomery_setup (mp_int * n, mp_digit * rho)
+020 \{
+021   mp_digit x, b;
+022 
+023 /* fast inversion mod 2**k
+024  *
+025  * Based on the fact that
+026  *
+027  * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+028  *                    =>  2*X*A - X*X*A*A = 1
+029  *                    =>  2*(1) - (1)     = 1
+030  */
+031   b = n->dp[0];
+032 
+033   if ((b & 1) == 0) \{
+034     return MP_VAL;
+035   \}
+036 
+037   x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+038   x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
+039 #if !defined(MP_8BIT)
+040   x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
+041 #endif
+042 #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+043   x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
+044 #endif
+045 #ifdef MP_64BIT
+046   x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
+047 #endif
+048 
+049   /* rho = -1/m mod b */
+050   *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
+051 
+052   return MP_OKAY;
+053 \}
+\end{alltt}
+\end{small}
+
+This source code computes the value of $\rho$ required to perform Montgomery reduction. It has been modified to avoid performing excess
+multiplications when $\beta$ is not the default 28-bits.
+
+\section{The Diminished Radix Algorithm}
+The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
+or Montgomery methods for certain forms of moduli. The technique is based on the following simple congruence.
+
+\begin{equation}
+(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
+\end{equation}
+
+This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive. It used the fact that if $n = 2^{31}$ and $k = 1$
+then an x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift. The proof
+of the above equation is very simple. First write $x$ in the product form.
+
+\begin{equation}
+x = qn + r
+\end{equation}
+
+Now reduce both sides modulo $(n - k)$.
+
+\begin{equation}
+x \equiv qk + r \mbox{ (mod }(n-k)\mbox{)}
+\end{equation}
+
+The variable $n$ reduces modulo $n - k$ to $k$. By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$
+into the equation the original congruence is reproduced, thus concluding the proof. The following algorithm is based on this observation.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Diminished Radix Reduction}. \\
+\textbf{Input}. Integer $x$, $n$, $k$ \\
+\textbf{Output}. $x \mbox{ mod } (n - k)$ \\
+\hline \\
+1. $q \leftarrow \lfloor x / n \rfloor$ \\
+2. $q \leftarrow k \cdot q$ \\
+3. $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
+4. $x \leftarrow x + q$ \\
+5. If $x \ge (n - k)$ then \\
+\hspace{3mm}5.1 $x \leftarrow x - (n - k)$ \\
+\hspace{3mm}5.2 Goto step 1. \\
+6. Return $x$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Diminished Radix Reduction}
+\label{fig:DR}
+\end{figure}
+
+This algorithm will reduce $x$ modulo $n - k$ and return the residue. If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
+once or twice and occasionally three times. For simplicity's sake the value of $x$ is bounded by the following simple polynomial.
+
+\begin{equation}
+0 \le x < n^2 + k^2 - 2nk
+\end{equation}
+
+The true bound is $0 \le x < (n - k - 1)^2$ but the expanded form has quite a few more terms. The value of $q$ after step 1 is bounded by the following.
+
+\begin{equation}
+q < n - 2k + k^2/n
+\end{equation}
+
+Since $k^2$ is considerably smaller than $n$ the $k^2/n$ term contributes less than one to the bound and may be ignored. The value of $x$ after step 3 is bounded trivially as
+$0 \le x < n$. By step four the sum $x + q$ is bounded by
+
+\begin{equation}
+0 \le q + x < (k + 1)n - 2k^2 - 1
+\end{equation}
+
+With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3. After the second pass it is highly unlikely that the
+sum in step 4 will exceed $n - k$. In practice fewer than three passes of the algorithm are required to reduce virtually every input in the
+range $0 \le x < (n - k - 1)^2$.
+
+\begin{figure}
+\begin{small}
+\begin{center}
+\begin{tabular}{|l|}
+\hline
+$x = 123456789, n = 256, k = 3$ \\
+\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
+$q \leftarrow q*k = 1446759$ \\
+$x \leftarrow x \mbox{ mod } n = 21$ \\
+$x \leftarrow x + q = 1446780$ \\
+$x \leftarrow x - (n - k) = 1446527$ \\
+\hline
+$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
+$q \leftarrow q*k = 16950$ \\
+$x \leftarrow x \mbox{ mod } n = 127$ \\
+$x \leftarrow x + q = 17077$ \\
+$x \leftarrow x - (n - k) = 16824$ \\
+\hline
+$q \leftarrow \lfloor x/n \rfloor = 65$ \\
+$q \leftarrow q*k = 195$ \\
+$x \leftarrow x \mbox{ mod } n = 184$ \\
+$x \leftarrow x + q = 379$ \\
+$x \leftarrow x - (n - k) = 126$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Example Diminished Radix Reduction}
+\label{fig:EXDR}
+\end{figure}
+
+Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$. Note that even though $x$
+is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast. In this case only
+three passes were required to find the residue $x \equiv 126$.
+
+
+\subsection{Choice of Moduli}
+On the surface this looks like a very expensive algorithm. It requires a couple of subtractions followed by a multiplication and other
+modular reductions. The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
+
+Division in general is a very expensive operation to perform. The one exception is when the division is by a power of the radix of representation used.
+Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right. Similarly division
+by two (\textit{or powers of two}) is very simple for binary computers to perform. It would therefore seem logical to choose $n$ of the form $2^p$
+which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.
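+
+As a hedged machine-integer sketch (small operands only, hypothetical function name), choosing $n = 2^p$ turns each pass of
+figure~\ref{fig:DR} into a shift, a mask, a multiply and an addition.
+
+\begin{small}
+\begin{alltt}
+/* reduce x modulo 2**p - k with the Diminished Radix method;
+ * the division and reduction modulo 2**p are a shift and a mask
+ */
+unsigned long dr_reduce(unsigned long x, int p, unsigned long k)
+\{
+   unsigned long m = (1UL << p) - k;   /* the modulus n - k */
+
+   while (x >= m) \{
+      unsigned long q = x >> p;        /* q = floor(x / 2**p) */
+      x = (x & ((1UL << p) - 1)) + q * k;
+      if (x >= m) \{
+         x -= m;                       /* single subtraction per pass */
+      \}
+   \}
+   return x;
+\}
+\end{alltt}
+\end{small}
+
+Running this on the example of figure~\ref{fig:EXDR}, dr\_reduce(123456789, 8, 3) walks through the same three passes and returns $126$.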
+
+However, there is one operation related to division by powers of two that is even faster than this. If $n = \beta^p$ then the division may be
+performed by moving whole digits to the right $p$ places. In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.
+Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ merely requires zeroing the digits above the $p-1$'th digit of $x$.
+
+Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
+modulus'' will refer to a modulus of the form $2^p - k$. The word ``restricted'' in this case refers to the fact that it is based on the
+$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.
+
+\subsection{Choice of $k$}
+Now that division and reduction (\textit{steps 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
+in step 2 is the most expensive operation. Fortunately the choice of $k$ is not terribly limited. For all intents and purposes it might
+as well be a single digit. The smaller the value of $k$ is the faster the algorithm will be.
+
+\subsection{Restricted Diminished Radix Reduction}
+The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$. This algorithm can reduce
+an input $x$ within the range $0 \le x < n^2$ using only a couple of passes of the algorithm demonstrated in figure~\ref{fig:DR}. The implementation
+of this algorithm has been optimized to avoid the additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition
+of $x$ and $q$. The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular
+exponentiations are performed.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_reduce}. \\
+\textbf{Input}. mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
+\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
+\textbf{Output}. $x \mbox{ mod } n$ \\
+\hline \\
+1. $m \leftarrow n.used$ \\
+2. If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
+3. $\mu \leftarrow 0$ \\
+4. for $i$ from $0$ to $m - 1$ do \\
+\hspace{3mm}4.1 $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
+\hspace{3mm}4.2 $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}4.3 $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+5. $x_{m} \leftarrow \mu$ \\
+6. for $i$ from $m + 1$ to $x.used - 1$ do \\
+\hspace{3mm}6.1 $x_{i} \leftarrow 0$ \\
+7. Clamp excess digits of $x$. \\
+8. If $x \ge n$ then \\
+\hspace{3mm}8.1 $x \leftarrow x - n$ \\
+\hspace{3mm}8.2 Goto step 3. \\
+9. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_reduce}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_reduce.}
+This algorithm will perform the Diminished Radix reduction of $x$ modulo $n$. It has similar restrictions to those of the Barrett reduction
+with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k < \beta$.
+
+This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization. The division by $\beta^m$, multiplication by $k$
+and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.
The division by $\beta^m$ is emulated by accessing
+the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position. After the loop the $m$'th
+digit is set to the carry and the upper digits are zeroed. Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happened to
+$x$ before the addition of the multiple of the upper half.
+
+At step 8 if $x$ is still greater than or equal to $n$ another pass of the algorithm is required. First $n$ is subtracted from $x$ and then the algorithm resumes
+at step 3.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_reduce.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
+018  *
+019  * Based on algorithm from the paper
+020  *
+021  * "Generating Efficient Primes for Discrete Log Cryptosystems"
+022  *                 Chae Hoon Lim, Pil Joong Lee,
+023  *          POSTECH Information Research Laboratories
+024  *
+025  * The modulus must be of a special format [see manual]
+026  *
+027  * Has been modified to use algorithm 7.10 from the LTM book instead
+028  *
+029  * Input x must be in the range 0 <= x <= (n-1)**2
+030  */
+031 int
+032 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
+033 \{
+034   int      err, i, m;
+035   mp_word  r;
+036   mp_digit mu, *tmpx1, *tmpx2;
+037 
+038   /* m = digits in modulus */
+039   m = n->used;
+040 
+041   /* ensure that "x" has at least 2m digits */
+042   if (x->alloc < m + m) \{
+043     if ((err = mp_grow (x, m + m)) != MP_OKAY) \{
+044       return err;
+045     \}
+046   \}
+047 
+048 /* top of loop, this is where the code resumes if
+049  * another reduction pass is required.
+050  */
+051 top:
+052   /* aliases for digits */
+053   /* alias for lower half of x */
+054   tmpx1 = x->dp;
+055 
+056   /* alias for upper half of x, or x/B**m */
+057   tmpx2 = x->dp + m;
+058 
+059   /* set carry to zero */
+060   mu = 0;
+061 
+062   /* compute (x mod B**m) + k * [x/B**m] inline and inplace */
+063   for (i = 0; i < m; i++) \{
+064     r        = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+065     *tmpx1++ = (mp_digit)(r & MP_MASK);
+066     mu       = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
+067   \}
+068 
+069   /* set final carry */
+070   *tmpx1++ = mu;
+071 
+072   /* zero words above m */
+073   for (i = m + 1; i < x->used; i++) \{
+074     *tmpx1++ = 0;
+075   \}
+076 
+077   /* clamp, sub and return */
+078   mp_clamp (x);
+079 
+080   /* if x >= n then subtract and reduce again
+081    * Each successive "recursion" makes the input smaller and smaller.
+082    */
+083   if (mp_cmp_mag (x, n) != MP_LT) \{
+084     s_mp_sub(x, n, x);
+085     goto top;
+086   \}
+087   return MP_OKAY;
+088 \}
+\end{alltt}
+\end{small}
+
+The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$. The label on line 51 is where
+the algorithm will resume if further reduction passes are required. In theory it could be placed at the top of the function, however the size of
+the modulus and the question of whether $x$ is large enough are invariant after the first pass, so re-checking them would be a waste of time.
+
+The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits. By reading digits from $x$ offset by $m$ digits
+a division by $\beta^m$ can be simulated virtually for free. The loop on line 63 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
+in this algorithm.
+
+By line 70 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.
Similarly by line 73 the
+same pointer will point to the $m+1$'th digit where the zeroes will be placed.
+
+Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.
+With the same logic at line 84 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
+as well. Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
+does not need to be checked.
+
+\subsubsection{Setup}
+To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required. This algorithm is not complicated but is provided for
+completeness.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_setup}. \\
+\textbf{Input}. mp\_int $n$ \\
+\textbf{Output}. $k = \beta - n_0$ \\
+\hline \\
+1. $k \leftarrow \beta - n_0$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_setup}
+\end{figure}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* determines the setup value */
+018 void mp_dr_setup(mp_int *a, mp_digit *d)
+019 \{
+020    /* the casts are required if DIGIT_BIT is one less than
+021     * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+022     */
+023    *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) -
+024         ((mp_word)a->dp[0]));
+025 \}
+026 
+\end{alltt}
+\end{small}
+
+\subsubsection{Modulus Detection}
+Another useful algorithm is one that detects a restricted Diminished Radix modulus. An integer is said to be
+of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
+\textbf{Input}. mp\_int $n$ \\
+\textbf{Output}. $1$ if $n$ is in D.R. form, $0$ otherwise \\
+\hline
+1. If $n.used < 2$ then return($0$). \\
+2. for $ix$ from $1$ to $n.used - 1$ do \\
+\hspace{3mm}2.1 If $n_{ix} \ne \beta - 1$ return($0$). \\
+3. Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_dr\_is\_modulus}
+\end{figure}
+
+\textbf{Algorithm mp\_dr\_is\_modulus.}
+This algorithm determines if a value is in Diminished Radix form. Step 1 rejects the obvious cases where fewer than two digits are
+in the mp\_int. Step 2 tests all but the first digit to see if they are equal to $\beta - 1$. If the algorithm manages to get to
+step 3 then $n$ must be of Diminished Radix form.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_is\_modulus.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* determines if a number is a valid DR modulus */
+018 int mp_dr_is_modulus(mp_int *a)
+019 \{
+020    int ix;
+021 
+022    /* must be at least two digits */
+023    if (a->used < 2) \{
+024       return 0;
+025    \}
+026 
+027    /* must be of the form b**k - a [a <= b] so all
+028     * but the first digit must be equal to -1 (mod b).
+029     */
+030    for (ix = 1; ix < a->used; ix++) \{
+031       if (a->dp[ix] != MP_MASK) \{
+032          return 0;
+033       \}
+034    \}
+035    return 1;
+036 \}
+037 
+\end{alltt}
+\end{small}
+
+\subsection{Unrestricted Diminished Radix Reduction}
+The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.
This algorithm
+is a straightforward adaptation of the algorithm in figure~\ref{fig:DR}.
+
+In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead. However, this new
+algorithm is still much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k}. \\
+\textbf{Input}. mp\_int $a$ and $n$. mp\_digit $k$ \\
+\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
+\textbf{Output}. $a \mbox{ (mod }n\mbox{)}$ \\
+\hline
+1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\
+2. While $a \ge n$ do \\
+\hspace{3mm}2.1 $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
+\hspace{3mm}2.2 $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
+\hspace{3mm}2.3 $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
+\hspace{3mm}2.4 $a \leftarrow a + q$ (\textit{s\_mp\_add}) \\
+\hspace{3mm}2.5 If $a \ge n$ then do \\
+\hspace{6mm}2.5.1 $a \leftarrow a - n$ \\
+3. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k.}
+This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$. Division by $2^p$ is emulated with a right
+shift which makes the algorithm fairly inexpensive to use.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* reduces a modulo n where n is of the form 2**p - d */
+018 int
+019 mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
+020 \{
+021    mp_int q;
+022    int    p, res;
+023 
+024    if ((res = mp_init(&q)) != MP_OKAY) \{
+025       return res;
+026    \}
+027 
+028    p = mp_count_bits(n);
+029 top:
+030    /* q = a/2**p, a = a mod 2**p */
+031    if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
+032       goto ERR;
+033    \}
+034 
+035    if (d != 1) \{
+036       /* q = q * d */
+037       if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) \{
+038          goto ERR;
+039       \}
+040    \}
+041 
+042    /* a = a + q */
+043    if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
+044       goto ERR;
+045    \}
+046 
+047    if (mp_cmp_mag(a, n) != MP_LT) \{
+048       s_mp_sub(a, n, a);
+049       goto top;
+050    \}
+051 
+052 ERR:
+053    mp_clear(&q);
+054    return res;
+055 \}
+056 
+\end{alltt}
+\end{small}
+
+The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$. The call to mp\_div\_2d
+on line 31 calculates both the quotient $q$ and the remainder $a$ required. By doing both in a single function call the code size
+is kept fairly small. The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
+any multiplications.
+
+The unsigned routines s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their signed counterparts since the inputs are only valid if they are
+positive. By using the unsigned versions the overhead is kept to a minimum.
+
+\subsubsection{Unrestricted Setup}
+To setup this reduction algorithm the value of $k = 2^p - n$ is required.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
+\textbf{Input}. mp\_int $n$ \\
+\textbf{Output}. $k = 2^p - n$ \\
+\hline
+1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\
+2. $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
+3. $x \leftarrow x - n$ (\textit{mp\_sub}) \\
+4. $k \leftarrow x_0$ \\
+5. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_2k\_setup}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_2k\_setup.}
+This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k. By making a temporary variable $x$ equal to $2^p$ a subtraction
+is sufficient to solve for $k$. Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k\_setup.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* determines the setup value */
+018 int
+019 mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+020 \{
+021    int res, p;
+022    mp_int tmp;
+023 
+024    if ((res = mp_init(&tmp)) != MP_OKAY) \{
+025       return res;
+026    \}
+027 
+028    p = mp_count_bits(a);
+029    if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
+030       mp_clear(&tmp);
+031       return res;
+032    \}
+033 
+034    if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
+035       mp_clear(&tmp);
+036       return res;
+037    \}
+038 
+039    *d = tmp.dp[0];
+040    mp_clear(&tmp);
+041    return MP_OKAY;
+042 \}
+\end{alltt}
+\end{small}
+
+\subsubsection{Unrestricted Detection}
+An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following is true.
+
+\begin{enumerate}
+\item The number has only one digit.
+\item The number has more than one digit and every bit from the $lg(\beta)$'th to the most significant is one.
+\end{enumerate}
+
+If either condition is true then there is a power of two $2^p$ such that $0 < 2^p - n < \beta$. If the input has only
+one digit then it will always be of the correct form. Otherwise all of the bits above the first digit must be one. This arises from the fact
+that there will be a value of $k$ which when added to the modulus causes a carry in the first digit that propagates all the way to the most
+significant bit. The resulting sum will be a power of two.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
+\textbf{Input}. mp\_int $n$ \\
+\textbf{Output}. $1$ if of proper form, $0$ otherwise \\
+\hline
+1. If $n.used = 0$ then return($0$). \\
+2. If $n.used = 1$ then return($1$). \\
+3. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\
+4. for $x$ from $lg(\beta)$ to $p$ do \\
+\hspace{3mm}4.1 If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$'th digit of $n$ is zero then return($0$). \\
+5. Return($1$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_reduce\_is\_2k}
+\end{figure}
+
+\textbf{Algorithm mp\_reduce\_is\_2k.}
+This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.
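+
+Putting the three unrestricted routines together, the following minimal hedged sketch (assuming $a$ and $n$ are initialized mp\_ints)
+reduces $a$ modulo an unrestricted Diminished Radix modulus $n$; the listing of mp\_reduce\_is\_2k itself follows.
+
+\begin{small}
+\begin{alltt}
+/* hedged sketch: a = a mod n for n of the form 2**p - k */
+int reduce_2k_demo(mp_int *a, mp_int *n)
+\{
+   mp_digit d;
+   int      res;
+
+   /* verify the modulus really is of the required form */
+   if (mp_reduce_is_2k(n) == 0) \{
+      return MP_VAL;
+   \}
+
+   /* d = 2**p - n */
+   if ((res = mp_reduce_2k_setup(n, &d)) != MP_OKAY) \{
+      return res;
+   \}
+
+   /* a = a mod n */
+   return mp_reduce_2k(a, n, d);
+\}
+\end{alltt}
+\end{small}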
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_is\_2k.c
+\vspace{-3mm}
+\begin{alltt}
+016 
+017 /* determines if mp_reduce_2k can be used */
+018 int mp_reduce_is_2k(mp_int *a)
+019 \{
+020    int ix, iy, iz, iw;
+021 
+022    if (a->used == 0) \{
+023       return 0;
+024    \} else if (a->used == 1) \{
+025       return 1;
+026    \} else if (a->used > 1) \{
+027       iy = mp_count_bits(a);
+028       iz = 1;
+029       iw = 1;
+030 
+031       /* Test every bit from the second digit up, must be 1 */
+032       for (ix = DIGIT_BIT; ix < iy; ix++) \{
+033          if ((a->dp[iw] & iz) == 0) \{
+034             return 0;
+035          \}
+036          iz <<= 1;
+037          if (iz > (int)MP_MASK) \{
+038             ++iw;
+039             iz = 1;
+040          \}
+041       \}
+042    \}
+043    return 1;
+044 \}
+045 
+\end{alltt}
+\end{small}
+
+
+
+\section{Algorithm Comparison}
+So far three very different algorithms for modular reduction have been discussed. Each of the algorithms has its own strengths and weaknesses
+which makes having such a selection very useful. The following table summarizes the three algorithms along with comparisons of work factors. Since
+all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.
+
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|c|c|}
+\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
+\hline Barrett & $m^2 + 2m - 1$ & None & $79$ & $1087$ & $4223$ \\
+\hline Montgomery & $m^2 + m$ & $n$ must be odd & $72$ & $1056$ & $4160$ \\
+\hline D.R. & $2m$ & $n = \beta^m - k$ & $16$ & $64$ & $128$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+
+In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete. However, in practice since Montgomery
+reduction can be written as a single function with the Comba technique it is much faster. Barrett reduction suffers from the overhead of
+calling the half precision multipliers, addition and division by $\beta$ algorithms.
+
+For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice. The one set of algorithms where Diminished Radix reduction truly
+shines are those based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}. In these algorithms
+primes of the form $\beta^m - k$ can be found and shared amongst users. These primes will allow the Diminished Radix algorithm to be used in
+modular exponentiation to greatly speed up the operation.
+
+
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
+                     & calculates the correct value of $\rho$. \\
+                     & \\
+$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly. \\
+                     & \\
+$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
+                     & (\textit{figure~\ref{fig:DR}}) terminates. Also derive the probability that it will \\
+                     & terminate within $1 \le k \le 10$ iterations. \\
+                     & \\
+\end{tabular}
+
+
+\chapter{Exponentiation}
+Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$. A variant of exponentiation, computed
+in a finite field or ring, is called modular exponentiation. This latter style of operation is typically used in public key
+cryptosystems such as RSA and Diffie-Hellman.
The ability to quickly compute modular exponentiations is of great benefit to any
+such cryptosystem and many methods have been sought to speed it up.
+
+\section{Exponentiation Basics}
+A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired. However, as $b$ grows in size
+the number of multiplications becomes prohibitive. Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
+with a $1024$-bit key. Such a calculation could never be completed as it would simply take far too long.
+
+Fortunately there is a very simple algorithm based on the laws of exponents. Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
+are two trivial relationships between the base and the exponent. Let $b_i$ represent the $i$'th bit of $b$ starting from the least
+significant bit. If $b$ is a $k$-bit integer then the following equation is true.
+
+\begin{equation}
+a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
+\end{equation}
+
+By taking the base $a$ logarithm of both sides of the equation the following equation results.
+
+\begin{equation}
+b = \sum_{i=0}^{k-1}2^i \cdot b_i
+\end{equation}
+
+The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
+$a^{2^{i+1}}$. This observation forms the basis of essentially all fast exponentiation algorithms. It requires $k$ squarings and on average
+$k \over 2$ multiplications to compute the result. This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
+
+While this method is a considerable speedup there are further improvements to be made. For example, the $a^{2^i}$ term does not need to
+be computed in an auxiliary variable. Consider the following equivalent algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Left to Right Exponentiation}. \\
+\textbf{Input}. Integer $a$, $b$ and $k$ \\
+\textbf{Output}. $c = a^b$ \\
+\hline \\
+1. $c \leftarrow 1$ \\
+2. for $i$ from $k - 1$ to $0$ do \\
+\hspace{3mm}2.1 $c \leftarrow c^2$ \\
+\hspace{3mm}2.2 $c \leftarrow c \cdot a^{b_i}$ \\
+3. Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Left to Right Exponentiation}
+\label{fig:LTOR}
+\end{figure}
+
+This algorithm starts from the most significant bit and works towards the least significant bit. When the $i$'th bit of $b$ is set $a$ is
+multiplied against the current product. In each iteration the product is squared which doubles the exponent of the individual terms of the
+product.
+
+For example, let $b = 101100_2 \equiv 44_{10}$. The following chart demonstrates the actions of the algorithm.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|}
+\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
+\hline - & $1$ \\
+\hline $5$ & $a$ \\
+\hline $4$ & $a^2$ \\
+\hline $3$ & $a^4 \cdot a$ \\
+\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
+\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
+\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Left to Right Exponentiation}
+\end{figure}
+
+When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal to $a^{44}$ which is the desired exponentiation. This particular algorithm is
+called ``Left to Right'' because it reads the exponent in that order. All of the exponentiation algorithms that will be presented are of this nature.
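+
+A minimal machine-integer sketch of figure~\ref{fig:LTOR} follows; the function name is illustrative and overflow is ignored, so only
+tiny operands are meaningful.
+
+\begin{small}
+\begin{alltt}
+/* left to right binary exponentiation of a k-bit exponent b */
+unsigned long ltor_expt(unsigned long a, unsigned long b, int k)
+\{
+   unsigned long c = 1;
+   int i;
+
+   for (i = k - 1; i >= 0; i--) \{
+      c = c * c;              /* square: doubles every exponent */
+      if ((b >> i) & 1) \{
+         c = c * a;           /* multiply when the bit is set   */
+      \}
+   \}
+   return c;
+\}
+\end{alltt}
+\end{small}
+
+For instance, ltor\_expt(3, 5, 3) returns $3^5 = 243$.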
+
+\subsection{Single Digit Exponentiation}
+The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended
+to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of
+$b$ that are greater than three.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_expt\_d}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
+2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
+3.  for $x$ from 1 to $lg(\beta)$ do \\
+\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
+\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
+\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
+\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
+4.  Clear $g$. \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_expt\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_expt\_d.}
+This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
+quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the
+exponent is a fixed width.
+
+A copy of $a$ is made first to allow the destination variable $c$ to be the same as the source variable $a$.  The result is set to the initial value of
+$1$ in the subsequent step.
+
+Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
+on step 3.1.  In the following step, if the most significant bit of $b$ is one, the copy of $a$ is multiplied against $c$.  The value
+of $b$ is shifted left one bit to make the next bit down from the most significant bit the new most significant bit.  In effect each
+iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_expt\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* calculate c = a**b using a square-multiply algorithm */
+018   int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+019   \{
+020     int     res, x;
+021     mp_int  g;
+022   
+023     if ((res = mp_init_copy (&g, a)) != MP_OKAY) \{
+024       return res;
+025     \}
+026   
+027     /* set initial result */
+028     mp_set (c, 1);
+029   
+030     for (x = 0; x < (int) DIGIT_BIT; x++) \{
+031       /* square */
+032       if ((res = mp_sqr (c, c)) != MP_OKAY) \{
+033         mp_clear (&g);
+034         return res;
+035       \}
+036   
+037       /* if the bit is set multiply */
+038       if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) \{
+039         if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{
+040            mp_clear (&g);
+041            return res;
+042         \}
+043       \}
+044   
+045       /* shift to next bit */
+046       b <<= 1;
+047     \}
+048   
+049     mp_clear (&g);
+050     return MP_OKAY;
+051   \}
+\end{alltt}
+\end{small}
+
+Line 28 sets the initial value of the result to $1$.  Next the loop on line 30 steps through each bit of the exponent starting from
+the most significant down towards the least significant.  The invariant squaring operation placed on line 32 is performed first.  After
+the squaring the result $c$ is multiplied by the base $g$ if and only if the most significant bit of the exponent is set.  The shift on line
+46 moves all of the bits of the exponent upwards towards the most significant location.  
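+
+As a brief usage sketch (hypothetical driver code, not from the library sources), computing $7^5$ with mp\_expt\_d
+could look like the following.
+
+\begin{verbatim}
+mp_int a, c;
+int res;
+
+if ((res = mp_init_multi(&a, &c, NULL)) != MP_OKAY) {
+   /* handle error */
+}
+mp_set(&a, 7);                               /* a = 7 */
+if ((res = mp_expt_d(&a, 5, &c)) != MP_OKAY) {
+   /* handle error */
+}
+/* c now holds 7^5 = 16807 */
+mp_clear_multi(&a, &c, NULL);
+\end{verbatim}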
+
+\section{$k$-ary Exponentiation}
+When calculating an exponentiation the most time consuming bottleneck is the multiplications, which are in general a small factor
+slower than squarings.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
+the $i$'th $k$-bit digit of the exponent $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
+computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is, it is a small window onto only a
+portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  $c \leftarrow c^{2^k}$ \\
+\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
+\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{$k$-ary Exponentiation}
+\label{fig:KARY}
+\end{figure}
+
+The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
+precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
+$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.
+However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
+
+Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
+original algorithm would on average have required $200$ multiplications and $400$ squarings to compute the same value.  The total number of squarings
+has increased slightly but the number of multiplications has nearly halved.
+
+\subsection{Optimal Values of $k$}
+An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
+approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
+for various exponent sizes and compares the number of multiplications and squarings required against algorithm~\ref{fig:LTOR}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with~\ref{fig:LTOR}} \\
+\hline $16$ & $2$ & $27$ & $24$ \\
+\hline $32$ & $3$ & $49$ & $48$ \\
+\hline $64$ & $3$ & $92$ & $96$ \\
+\hline $128$ & $4$ & $175$ & $192$ \\
+\hline $256$ & $4$ & $335$ & $384$ \\
+\hline $512$ & $5$ & $645$ & $768$ \\
+\hline $1024$ & $6$ & $1257$ & $1536$ \\
+\hline $2048$ & $6$ & $2452$ & $3072$ \\
+\hline $4096$ & $7$ & $4808$ & $6144$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
+\label{fig:OPTK}
+\end{figure}
+
+\subsection{Sliding-Window Exponentiation}
+A simple modification to the previous algorithm is to generate only the upper half of the table, in the range $2^{k-1} \le g < 2^k$.
Essentially this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the
+algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.
+
+Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|c|c|}
+\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with~\ref{fig:KARY}} \\
+\hline $16$ & $3$ & $24$ & $27$ \\
+\hline $32$ & $3$ & $45$ & $49$ \\
+\hline $64$ & $4$ & $87$ & $92$ \\
+\hline $128$ & $4$ & $167$ & $175$ \\
+\hline $256$ & $5$ & $322$ & $335$ \\
+\hline $512$ & $6$ & $628$ & $645$ \\
+\hline $1024$ & $6$ & $1225$ & $1257$ \\
+\hline $2048$ & $7$ & $2403$ & $2452$ \\
+\hline $4096$ & $8$ & $4735$ & $4808$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
+\label{fig:OPTK2}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
+\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
+\textbf{Output}.  $c = a^b$ \\
+\hline \\
+1.  $c \leftarrow 1$ \\
+2.  for $i$ from $t - 1$ to $0$ do \\
+\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
+\hspace{6mm}2.1.1  $c \leftarrow c^2$ \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
+\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store them in $g$. \\
+\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
+\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
+3.  Return $c$. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Sliding Window $k$-ary Exponentiation}
+\end{figure}
+
+Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
+algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
+the size as the previous table.
+
+Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as
+the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the
+exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The lone zero digits in the second representation mark where
+a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$
+squarings.  The second method requires $8$ multiplications and $18$ squarings.
+
+In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.
+
+\section{Modular Exponentiation}
+
+Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing
+$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it
+modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.
+
+This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
+one of the algorithms presented in chapter six.  
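+
+The following fragment is an illustrative sketch only (it uses the right-to-left binary method on unsigned machine
+integers rather than the library's mp\_int routines) but it shows the reduce-after-every-step pattern plainly.
+
+\begin{verbatim}
+/* d = a^b (mod c) with a reduction after every square or
+   multiply; assumes each product fits in an unsigned long. */
+unsigned long exptmod_small(unsigned long a, unsigned long b,
+                            unsigned long c)
+{
+   unsigned long d = 1;
+
+   a %= c;
+   while (b > 0) {
+      if (b & 1) {
+         d = (d * a) % c;      /* multiply, then reduce */
+      }
+      a = (a * a) % c;         /* square, then reduce   */
+      b >>= 1;
+   }
+   return d;
+}
+\end{verbatim}
+
+Keeping every intermediate value below $c^2$ in this fashion is what allows the reduction algorithms of chapter six
+to be applied after each step.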
+
+Before the actual modular exponentiation algorithm can be written, a wrapper algorithm must be written first.  This algorithm
+will allow the exponent $x$ to be negative, in which case $y \equiv \left (1 / g \right )^{\vert x \vert} \mbox{ (mod }p\mbox{)}$ is computed instead.  The
+value of $(1/g) \mbox{ mod }p$ is computed using the modular inverse (\textit{see \ref{sec:modinv}}).  If no inverse exists the algorithm
+terminates with an error.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  If $p.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  If $x.sign = MP\_NEG$ then \\
+\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }p\mbox{)}$ \\
+\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
+\hspace{3mm}2.3  Compute $y \equiv g'^{x'} \mbox{ (mod }p\mbox{)}$ via recursion. \\
+3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
+\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
+4.  else \\
+\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_exptmod}
+\end{figure}
+
+\textbf{Algorithm mp\_exptmod.}
+The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm
+which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation
+except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are grouped into the same exponentiation
+algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_exptmod.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   
+018   /* this is a shell function that calls either the normal or Montgomery
+019    * exptmod functions.  Originally the call to the montgomery code was
+020    * embedded in the normal function but that wasted alot of stack space
+021    * for nothing (since 99% of the time the Montgomery code would be called)
+022    */
+023   int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+024   \{
+025     int dr;
+026   
+027     /* modulus P must be positive */
+028     if (P->sign == MP_NEG) \{
+029        return MP_VAL;
+030     \}
+031   
+032     /* if exponent X is negative we have to recurse */
+033     if (X->sign == MP_NEG) \{
+034        mp_int tmpG, tmpX;
+035        int err;
+036   
+037        /* first compute 1/G mod P */
+038        if ((err = mp_init(&tmpG)) != MP_OKAY) \{
+039           return err;
+040        \}
+041        if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) \{
+042           mp_clear(&tmpG);
+043           return err;
+044        \}
+045   
+046        /* now get |X| */
+047        if ((err = mp_init(&tmpX)) != MP_OKAY) \{
+048           mp_clear(&tmpG);
+049           return err;
+050        \}
+051        if ((err = mp_abs(X, &tmpX)) != MP_OKAY) \{
+052           mp_clear_multi(&tmpG, &tmpX, NULL);
+053           return err;
+054        \}
+055   
+056        /* and now compute (1/G)**|X| instead of G**X [X < 0] */
+057        err = mp_exptmod(&tmpG, &tmpX, P, Y);
+058        mp_clear_multi(&tmpG, &tmpX, NULL);
+059        return err;
+060     \}
+061   
+062     /* is it a DR modulus? */
+063     dr = mp_dr_is_modulus(P);
+064   
+065     /* if not, is it a uDR modulus? */
+066     if (dr == 0) \{
+067        dr = mp_reduce_is_2k(P) << 1;
+068     \}
+069   
+070     /* if the modulus is odd or dr != 0 use the fast method */
+071     if (mp_isodd (P) == 1 || dr != 0) \{
+072        return mp_exptmod_fast (G, X, P, Y, dr);
+073     \} else \{
+074        /* otherwise use the generic Barrett reduction technique */
+075        return s_mp_exptmod (G, X, P, Y);
+076     \}
+077   \}
+078   
+\end{alltt}
+\end{small}
+
+In order to keep the algorithms in a known state the first step on line 28 is to reject any negative modulus as input.  If the exponent is
+negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
+the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values and a positive
+exponent.
+
+If the exponent is positive the algorithm resumes the exponentiation.  Line 63 determines if the modulus is of the restricted Diminished Radix
+form.  If it is not, line 67 attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
+of three values.
+
+\begin{enumerate}
+\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
+\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
+\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
+\end{enumerate}
+
+Line 71 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
+the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.
+
+\subsection{Barrett Modular Exponentiation}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod}. \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+1.  $k \leftarrow lg(x)$ \\
+2.  $winsize \leftarrow \left \lbrace \begin{array}{ll}
+                              2 &  \mbox{if }k \le 7 \\
+                              3 &  \mbox{if }7 < k \le 36 \\
+                              4 &  \mbox{if }36 < k \le 140 \\
+                              5 &  \mbox{if }140 < k \le 450 \\
+                              6 &  \mbox{if }450 < k \le 1303 \\
+                              7 &  \mbox{if }1303 < k \le 3529 \\
+                              8 &  \mbox{if }3529 < k \\
+                              \end{array} \right .$ \\
+3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
+4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
+5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
+\\
+Setup the table of small powers of $g$.  First find $g^{2^{winsize - 1}}$ and then fill in the entries above it. \\
+6.  $k \leftarrow 2^{winsize - 1}$ \\
+7.  $M_{k} \leftarrow M_1$ \\
+8.  for $ix$ from 0 to $winsize - 2$ do \\
+\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr}) \\
+\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
+\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
+\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
+10.  $res \leftarrow 1$ \\
+\\
+Start Sliding Window. \\
+11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
+12.  Loop \\
+\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
+\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
+\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
+\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
+\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
+\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
+Continued on next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
+\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
+\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
+\hline \\
+\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
+\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
+\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
+\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
+\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
+\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.6.3  Goto step 12. \\
+\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
+\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
+\hspace{3mm}12.9  $mode \leftarrow 2$ \\
+\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
+\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
+\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize - 1$ do \\
+\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
+\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
+\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}Reset the window. \\
+\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
+\\
+No more windows left.  Check for residual bits of exponent. \\
+13.  If $mode = 2$ and $bitcpy > 0$ then do \\
+\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
+\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
+\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
+\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
+\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
+\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
+14.  $y \leftarrow res$ \\
+15.  Clear $res$, $\mu$ and the $M$ array. \\
+16.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_exptmod (continued)}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_exptmod.}
+This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
+algorithm to keep the product small throughout the algorithm.
+
+The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the
+larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
+table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.
+
+After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
+the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$
+times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
+
+Now that the table is available the sliding window may begin.  
The following list describes the functions of all the variables in the window.
+\begin{enumerate}
+\item The variable $mode$ dictates how the bits of the exponent are interpreted.
+\begin{enumerate}
+   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply
+         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.
+   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits
+         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.
+   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
+         downwards.
+\end{enumerate}
+\item The variable $bitcnt$ indicates how many bits of the current digit of the exponent are left to be read.  When it reaches zero a new digit
+      is fetched from the exponent.
+\item The variable $buf$ holds the currently read digit of the exponent.
+\item The variable $digidx$ is an index into the exponent's digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
+\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
+      the appropriate operations performed.
+\item The variable $bitbuf$ holds the current bits of the window being formed.
+\end{enumerate}
+
+All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
+inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
+read and if there are no digits left then the loop terminates.
+
+After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
+upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to
+trailing edges the entire exponent is read from most significant bit to least significant bit.
+
+At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the
+algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Steps 12.6 and 12.7 through
+12.10 handle the two cases of $mode = 1$ and $mode = 2$ respectively.
+
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/expt_state.ps}
+\caption{Sliding Window State Diagram}
+\label{pic:expt_state}
+\end{figure}
+\end{center}
+
+By step 13 there are no more digits left in the exponent.  However, there may be partial bits left in the window.  If $mode = 2$ then
+a Left-to-Right algorithm is used to process the remaining few bits.  
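+
+As a small illustrative trace (hypothetical values, not drawn from the sources), let $winsize = 2$ and let the
+exponent be $100110_2 = 38_{10}$.  Reading the bits from most significant to least significant the variables evolve
+as follows, where the $mode$ column shows the value after the bit has been processed.
+
+\begin{center}
+\begin{small}
+\begin{tabular}{|c|c|l|c|}
+\hline \textbf{Bit read} & \textbf{$mode$} & \textbf{Action} & \textbf{$res$} \\
+\hline $1$ & $2$ & start window, $bitbuf = 10_2$ & $1$ \\
+\hline $0$ & $1$ & window full: square twice, multiply by $M_{2}$ & $a^2$ \\
+\hline $0$ & $1$ & single squaring & $a^4$ \\
+\hline $1$ & $2$ & start window, $bitbuf = 10_2$ & $a^4$ \\
+\hline $1$ & $1$ & window full: square twice, multiply by $M_{3}$ & $a^{19}$ \\
+\hline $0$ & $1$ & single squaring & $a^{38}$ \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+
+The final value $a^{38}$ agrees with $100110_2 = 38$.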
+ +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c +\vspace{-3mm} +\begin{alltt} +016 +017 #ifdef MP_LOW_MEM +018 #define TAB_SIZE 32 +019 #else +020 #define TAB_SIZE 256 +021 #endif +022 +023 int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) +024 \{ +025 mp_int M[TAB_SIZE], res, mu; +026 mp_digit buf; +027 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; +028 +029 /* find window size */ +030 x = mp_count_bits (X); +031 if (x <= 7) \{ +032 winsize = 2; +033 \} else if (x <= 36) \{ +034 winsize = 3; +035 \} else if (x <= 140) \{ +036 winsize = 4; +037 \} else if (x <= 450) \{ +038 winsize = 5; +039 \} else if (x <= 1303) \{ +040 winsize = 6; +041 \} else if (x <= 3529) \{ +042 winsize = 7; +043 \} else \{ +044 winsize = 8; +045 \} +046 +047 #ifdef MP_LOW_MEM +048 if (winsize > 5) \{ +049 winsize = 5; +050 \} +051 #endif +052 +053 /* init M array */ +054 /* init first cell */ +055 if ((err = mp_init(&M[1])) != MP_OKAY) \{ +056 return err; +057 \} +058 +059 /* now init the second half of the array */ +060 for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{ +061 if ((err = mp_init(&M[x])) != MP_OKAY) \{ +062 for (y = 1<<(winsize-1); y < x; y++) \{ +063 mp_clear (&M[y]); +064 \} +065 mp_clear(&M[1]); +066 return err; +067 \} +068 \} +069 +070 /* create mu, used for Barrett reduction */ +071 if ((err = mp_init (&mu)) != MP_OKAY) \{ +072 goto __M; +073 \} +074 if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{ +075 goto __MU; +076 \} +077 +078 /* create M table +079 * +080 * The M table contains powers of the base, +081 * e.g. M[x] = G**x mod P +082 * +083 * The first half of the table is not +084 * computed though accept for M[0] and M[1] +085 */ +086 if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{ +087 goto __MU; +088 \} +089 +090 /* compute the value at M[1<<(winsize-1)] by squaring +091 * M[1] (winsize-1) times +092 */ +093 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{ +094 goto __MU; +095 \} +096 +097 for (x = 0; x < (winsize - 1); x++) \{ +098 if ((err = mp_sqr (&M[1 << (winsize - 1)], +099 &M[1 << (winsize - 1)])) != MP_OKAY) \{ +100 goto __MU; +101 \} +102 if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{ +103 goto __MU; +104 \} +105 \} +106 +107 /* create upper table, that is M[x] = M[x-1] * M[1] (mod P) +108 * for x = (2**(winsize - 1) + 1) to (2**winsize - 1) +109 */ +110 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{ +111 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{ +112 goto __MU; +113 \} +114 if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) \{ +115 goto __MU; +116 \} +117 \} +118 +119 /* setup result */ +120 if ((err = mp_init (&res)) != MP_OKAY) \{ +121 goto __MU; +122 \} +123 mp_set (&res, 1); +124 +125 /* set initial mode and bit cnt */ +126 mode = 0; +127 bitcnt = 1; +128 buf = 0; +129 digidx = X->used - 1; +130 bitcpy = 0; +131 bitbuf = 0; +132 +133 for (;;) \{ +134 /* grab next digit as required */ +135 if (--bitcnt == 0) \{ +136 /* if digidx == -1 we are out of digits */ +137 if (digidx == -1) \{ +138 break; +139 \} +140 /* read next digit and reset the bitcnt */ +141 buf = X->dp[digidx--]; +142 bitcnt = (int) DIGIT_BIT; +143 \} +144 +145 /* grab the next msb from the exponent */ +146 y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1; +147 buf <<= (mp_digit)1; +148 +149 /* if the bit is zero and mode == 0 then we ignore it +150 * These represent the leading zero bits before the first 1 bit +151 * in the exponent. 
Technically this opt is not required but it
+152    * does lower the # of trivial squaring/reductions used
+153    */
+154       if (mode == 0 && y == 0) \{
+155         continue;
+156       \}
+157   
+158       /* if the bit is zero and mode == 1 then we square */
+159       if (mode == 1 && y == 0) \{
+160         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+161           goto __RES;
+162         \}
+163         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+164           goto __RES;
+165         \}
+166         continue;
+167       \}
+168   
+169       /* else we add it to the window */
+170       bitbuf |= (y << (winsize - ++bitcpy));
+171       mode = 2;
+172   
+173       if (bitcpy == winsize) \{
+174         /* ok window is filled so square as required and multiply  */
+175         /* square first */
+176         for (x = 0; x < winsize; x++) \{
+177           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+178             goto __RES;
+179           \}
+180           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+181             goto __RES;
+182           \}
+183         \}
+184   
+185         /* then multiply */
+186         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
+187           goto __RES;
+188         \}
+189         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+190           goto __RES;
+191         \}
+192   
+193         /* empty window and reset */
+194         bitcpy = 0;
+195         bitbuf = 0;
+196         mode = 1;
+197       \}
+198     \}
+199   
+200     /* if bits remain then square/multiply */
+201     if (mode == 2 && bitcpy > 0) \{
+202       /* square then multiply if the bit is set */
+203       for (x = 0; x < bitcpy; x++) \{
+204         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
+205           goto __RES;
+206         \}
+207         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+208           goto __RES;
+209         \}
+210   
+211         bitbuf <<= 1;
+212         if ((bitbuf & (1 << winsize)) != 0) \{
+213           /* then multiply */
+214           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
+215             goto __RES;
+216           \}
+217           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
+218             goto __RES;
+219           \}
+220         \}
+221       \}
+222     \}
+223   
+224     mp_exch (&res, Y);
+225     err = MP_OKAY;
+226   __RES:mp_clear (&res);
+227   __MU:mp_clear (&mu);
+228   __M:
+229     mp_clear(&M[1]);
+230     for (x = 1<<(winsize-1); x < (1 << winsize); x++) \{
+231       mp_clear (&M[x]);
+232     \}
+233     return err;
+234   \}
+\end{alltt}
+\end{small}
+
+Lines 31 through 41 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
+from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement
+on line 37 the value of $x$ is already known to be greater than $140$.
+
+The conditional piece of code beginning on line 47 allows the window size to be restricted to five bits.  This logic is used to ensure
+the table of precomputed powers of $G$ remains relatively small.
+
+The for loop on line 60 initializes the $M$ array while lines 71 and 74 create and compute the value of $\mu$ required for
+Barrett reduction.
+
+-- More later.
+
+\section{Quick Power of Two}
+Calculating $a = 2^b$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
+equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_2expt}. \\
+\textbf{Input}.   integer $b$ \\
+\textbf{Output}.  $a \leftarrow 2^b$ \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
+3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
+4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_2expt}
+\end{figure}
+
+\textbf{Algorithm mp\_2expt.}
+This algorithm zeroes $a$, grows it large enough to hold the $b$'th bit and then sets that single bit.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_2expt.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* computes a = 2**b 
+018    *
+019    * Simple algorithm which zeroes the int, grows it then just sets one bit
+020    * as required.
+021    */
+022   int
+023   mp_2expt (mp_int * a, int b)
+024   \{
+025     int     res;
+026   
+027     /* zero a as per default */
+028     mp_zero (a);
+029   
+030     /* grow a to accomodate the single bit */
+031     if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) \{
+032       return res;
+033     \}
+034   
+035     /* set the used count of where the bit will go */
+036     a->used = b / DIGIT_BIT + 1;
+037   
+038     /* put the single bit in its place */
+039     a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+040   
+041     return MP_OKAY;
+042   \}
+\end{alltt}
+\end{small}
+
+\chapter{Higher Level Algorithms}
+
+This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
+routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.
+
+The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
+for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the second operand for a variety of operations.
+These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate
+various representations of integers.  For example, converting from an mp\_int to a character string.
+
+\section{Integer Division with Remainder}
+\label{sec:division}
+
+Aside from modular exponentiation, integer division is the most intensive algorithm to compute.  Like addition, subtraction and multiplication
+the basis of this algorithm is the long-hand division algorithm taught to school children.  Throughout this discussion several common variables
+will be used.  Let $x$ represent the divisor and $y$ represent the dividend.  Let $q$ represent the integer quotient $\lfloor y / x \rfloor$ and
+let $r$ represent the remainder $r = y - x \lfloor y / x \rfloor$.  The following simple algorithm will be used to start the discussion.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Radix-$\beta$ Integer Division}. \\
+\textbf{Input}.   integer $x$ and $y$ \\
+\textbf{Output}.  $q = \lfloor y/x\rfloor, r = y - xq$ \\
+\hline \\
+1.  $q \leftarrow 0$ \\
+2.  $n \leftarrow \vert \vert y \vert \vert - \vert \vert x \vert \vert$ \\
+3.  for $t$ from $n$ down to $0$ do \\
+\hspace{3mm}3.1  Maximize $k$ such that $kx\beta^t$ is less than or equal to $y$ and $(k + 1)x\beta^t$ is greater. \\
+\hspace{3mm}3.2  $q \leftarrow q + k\beta^t$ \\
+\hspace{3mm}3.3  $y \leftarrow y - kx\beta^t$ \\
+4.  $r \leftarrow y$ \\
+5.  Return($q, r$) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Radix-$\beta$ Integer Division}
+\label{fig:raddiv}
+\end{figure}
+
+As children we are taught this very simple algorithm for the case of $\beta = 10$.  Almost instinctively several optimizations are taught, though
+the reasons for their existence are never explained.  For this example let $y = 5471$ represent the dividend and $x = 23$ represent the divisor.  
+
+To find the first digit of the quotient the value of $k$ must be maximized such that $kx\beta^t$ is less than or equal to $y$ and
+simultaneously $(k + 1)x\beta^t$ is greater than $y$.  Implicitly $k$ is the maximum value the $t$'th digit of the quotient may have.  The habitual method
+used to find the maximum is to ``eyeball'' the two numbers, typically only the leading digits, and quickly estimate a quotient.  By only using leading
+digits a much simpler division may be used to form an educated guess at what the value must be.  In this case $k = \lfloor 54/23\rfloor = 2$ quickly
+arises as a possible solution.  Indeed $2x\beta^2 = 4600$ is less than $y = 5471$ and simultaneously $(k + 1)x\beta^2 = 6900$ is larger than $y$.
+As a result $k\beta^2$ is added to the quotient which now equals $q = 200$ and $4600$ is subtracted from $y$ to give a remainder of $y = 871$.
+
+Again this process is repeated to produce the quotient digit $k = 3$ which makes the quotient $q = 200 + 3\beta = 230$ and the remainder
+$y = 871 - 3x\beta = 181$.  Finally the last iteration of the loop produces $k = 7$ which leads to the quotient $q = 230 + 7 = 237$ and the
+remainder $y = 181 - 7x = 20$.  The final quotient and remainder found are $q = 237$ and $r = y = 20$ which are indeed correct since
+$237 \cdot 23 + 20 = 5471$ is true.
+
+\subsection{Quotient Estimation}
+\label{sec:divest}
+As alluded to earlier the quotient digit $k$ can be estimated from only the leading digits of both the divisor and dividend.  When $p$ leading
+digits are used from both the divisor and dividend to form an estimation the accuracy of the estimation rises as $p$ grows.  Technically
+speaking the estimation is based on assuming the lower $\vert \vert y \vert \vert - p$ and $\vert \vert x \vert \vert - p$ digits of the
+dividend and divisor are zero.
+
+The value of the estimation may be off by a few values in either direction and in general is fairly accurate.  A simplification \cite[pp. 271]{TAOCPV2}
+of the estimation technique is to use $t + 1$ digits of the dividend and $t$ digits of the divisor, in particular when $t = 1$.  The estimate
+using this technique is never too small.  For the following proof let $t = \vert \vert y \vert \vert - 1$ and $s = \vert \vert x \vert \vert - 1$
+represent the most significant digits of the dividend and divisor respectively.
+
+\textbf{Proof.}\textit{  The quotient $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ is greater than or equal to
+$k = \lfloor y / (x \cdot \beta^{\vert \vert y \vert \vert - \vert \vert x \vert \vert - 1}) \rfloor$. }
+The first obvious case is when $\hat k = \beta - 1$ in which case the proof is concluded since the real quotient cannot be larger.  For all other
+cases $\hat k = \lfloor (y_t\beta + y_{t-1}) / x_s \rfloor$ and $\hat k x_s \ge y_t\beta + y_{t-1} - x_s + 1$.  The latter portion of the inequality
+$-x_s + 1$ arises from the fact that a truncated integer division will give the same quotient for at most $x_s - 1$ values.  Next a series of
+inequalities will prove the hypothesis.
+
+\begin{equation}
+y - \hat k x \le y - \hat k x_s\beta^s
+\end{equation}
+
+This is trivially true since $x \ge x_s\beta^s$.  Next we replace $\hat kx_s\beta^s$ by the previous inequality for $\hat kx_s$.
+
+\begin{equation}
+y - \hat k x \le y_t\beta^t + \ldots + y_0 - (y_t\beta^t + y_{t-1}\beta^{t-1} - x_s\beta^t + \beta^s)
+\end{equation}
+
+By simplifying the previous inequality the following inequality is formed.  
+
+\begin{equation}
+y - \hat k x \le y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s
+\end{equation}
+
+Subsequently,
+
+\begin{equation}
+y_{t-2}\beta^{t-2} + \ldots + y_0 + x_s\beta^s - \beta^s < x_s\beta^s \le x
+\end{equation}
+
+This proves that $y - \hat kx \le x$, and by consequence $\hat k \ge k$, which concludes the proof.  \textbf{QED}
+
+
+\subsection{Normalized Integers}
+For the purposes of division a normalized input is when the divisor's leading digit $x_n$ is greater than or equal to $\beta / 2$.  By multiplying both
+$x$ and $y$ by $j = \lfloor (\beta / 2) / x_n \rfloor$ the quotient remains unchanged and the remainder is simply $j$ times the original
+remainder.  The purpose of normalization is to ensure the leading digit of the divisor is sufficiently large such that the estimated quotient will
+lie in the domain of a single digit.  Consider the maximum dividend $(\beta - 1) \cdot \beta + (\beta - 1)$ and the minimum divisor $\beta / 2$.
+
+\begin{equation}
+{{\beta^2 - 1} \over { \beta / 2}} \le 2\beta - {2 \over \beta}
+\end{equation}
+
+At most the quotient approaches $2\beta$, however, in practice this will not occur since that would imply the previous quotient digit was too small.
+
+\subsection{Radix-$\beta$ Division with Remainder}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div}. \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+1.  If $b = 0$ return(\textit{MP\_VAL}). \\
+2.  If $\vert a \vert < \vert b \vert$ then do \\
+\hspace{3mm}2.1  $d \leftarrow a$ \\
+\hspace{3mm}2.2  $c \leftarrow 0$ \\
+\hspace{3mm}2.3  Return(\textit{MP\_OKAY}). \\
+\\
+Setup the quotient to receive the digits. \\
+3.  Grow $q$ to $a.used + 2$ digits. \\
+4.  $q \leftarrow 0$ \\
+5.  $x \leftarrow \vert a \vert , y \leftarrow \vert b \vert$ \\
+6.  $sign \leftarrow \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = b.sign \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\\
+Normalize the inputs such that the leading digit of $y$ is greater than or equal to $\beta / 2$. \\
+7.  $norm \leftarrow (lg(\beta) - 1) - (\lceil lg(y) \rceil \mbox{ (mod }lg(\beta)\mbox{)})$ \\
+8.  $x \leftarrow x \cdot 2^{norm}, y \leftarrow y \cdot 2^{norm}$ \\
+\\
+Find the leading digit of the quotient. \\
+9.  $n \leftarrow x.used - 1, t \leftarrow y.used - 1$ \\
+10.  $y \leftarrow y \cdot \beta^{n - t}$ \\
+11.  While ($x \ge y$) do \\
+\hspace{3mm}11.1  $q_{n - t} \leftarrow q_{n - t} + 1$ \\
+\hspace{3mm}11.2  $x \leftarrow x - y$ \\
+12.  $y \leftarrow \lfloor y / \beta^{n-t} \rfloor$ \\
+\\
+Continued on the next page. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div} (continued). \\
+\textbf{Input}.   mp\_int $a, b$ \\
+\textbf{Output}.  $c = \lfloor a/b \rfloor$, $d = a - bc$ \\
+\hline \\
+Now find the remainder of the digits. \\
+13.  for $i$ from $n$ down to $(t + 1)$ do \\
+\hspace{3mm}13.1  If $i > x.used$ then jump to the next iteration of this loop. \\
+\hspace{3mm}13.2  If $x_{i} = y_{t}$ then \\
+\hspace{6mm}13.2.1  $q_{i - t - 1} \leftarrow \beta - 1$ \\
+\hspace{3mm}13.3  else \\
+\hspace{6mm}13.3.1  $\hat r \leftarrow x_{i} \cdot \beta + x_{i - 1}$ \\
+\hspace{6mm}13.3.2  $\hat r \leftarrow \lfloor \hat r / y_{t} \rfloor$ \\
+\hspace{6mm}13.3.3  $q_{i - t - 1} \leftarrow \hat r$ \\
+\hspace{3mm}13.4  $q_{i - t - 1} \leftarrow q_{i - t - 1} + 1$ \\
+\\
+Fixup quotient estimation. \\
+\hspace{3mm}13.5  Loop \\
+\hspace{6mm}13.5.1  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\hspace{6mm}13.5.2  t$1 \leftarrow 0$ \\
+\hspace{6mm}13.5.3  t$1_0 \leftarrow y_{t - 1}, $ t$1_1 \leftarrow y_t,$ t$1.used \leftarrow 2$ \\
+\hspace{6mm}13.5.4  $t1 \leftarrow t1 \cdot q_{i - t - 1}$ \\
+\hspace{6mm}13.5.5  t$2_0 \leftarrow x_{i - 2}, $ t$2_1 \leftarrow x_{i - 1}, $ t$2_2 \leftarrow x_i, $ t$2.used \leftarrow 3$ \\
+\hspace{6mm}13.5.6  If $\vert t1 \vert > \vert t2 \vert$ then goto step 13.5. \\
+\hspace{3mm}13.6  t$1 \leftarrow y \cdot q_{i - t - 1}$ \\
+\hspace{3mm}13.7  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{3mm}13.8  $x \leftarrow x - $ t$1$ \\
+\hspace{3mm}13.9  If $x.sign = MP\_NEG$ then \\
+\hspace{6mm}13.10  t$1 \leftarrow y$ \\
+\hspace{6mm}13.11  t$1 \leftarrow $ t$1 \cdot \beta^{i - t - 1}$ \\
+\hspace{6mm}13.12  $x \leftarrow x + $ t$1$ \\
+\hspace{6mm}13.13  $q_{i - t - 1} \leftarrow q_{i - t - 1} - 1$ \\
+\\
+Finalize the result. \\
+14.  Clamp excess digits of $q$ \\
+15.  $c \leftarrow q, c.sign \leftarrow sign$ \\
+16.  $x.sign \leftarrow a.sign$ \\
+17.  $d \leftarrow \lfloor x / 2^{norm} \rfloor$ \\
+18.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div (continued)}
+\end{figure}
+\textbf{Algorithm mp\_div.}
+This algorithm will calculate quotient and remainder from an integer division given a dividend and divisor.  The algorithm is a signed
+division and will produce a fully qualified quotient and remainder.
+
+First the divisor $b$ must be non-zero which is enforced in step one.  If the divisor is larger than the dividend then the quotient is implicitly
+zero and the remainder is the dividend.
+
+After the first two trivial cases of inputs are handled the variable $q$ is set up to receive the digits of the quotient.  Two unsigned copies of the
+divisor $y$ and dividend $x$ are made as well.  The core of the division algorithm is an unsigned division and will only work if the values are
+positive.  Now the two values $x$ and $y$ must be normalized such that the leading digit of $y$ is greater than or equal to $\beta / 2$.
+This is performed by shifting both to the left by enough bits to get the desired normalization.
+
+At this point the division algorithm can begin producing digits of the quotient.  Recall that the maximum value of the estimation used is
+$2\beta - {2 \over \beta}$ which means that a digit of the quotient must be first produced by another means.  In this case $y$ is shifted
+to the left (\textit{step ten}) so that it has the same number of digits as $x$.  The loop on step eleven will subtract multiples of the
+shifted copy of $y$ until $x$ is smaller.  Since the leading digit of $y$ is greater than or equal to $\beta/2$ this loop will iterate at most two
+times to produce the desired leading digit of the quotient.
+
+Now the remainder of the digits can be produced.  The equation $\hat q = \lfloor {{x_i \beta + x_{i-1}}\over y_t} \rfloor$ is used to fairly
+accurately approximate the true quotient digit.  
The estimation can in theory produce an estimate as high as $2\beta - {2 \over \beta}$ but by
+induction the upper quotient digit is correct (\textit{as established on step eleven}) and the estimate must be less than $\beta$.
+
+Recall from section~\ref{sec:divest} that the estimation is never too low but may be too high.  The next step of the estimation process is
+to refine the estimation.  The loop on step 13.5 uses $x_i\beta^2 + x_{i-1}\beta + x_{i-2}$ and $q_{i - t - 1}(y_t\beta + y_{t-1})$ as a higher
+order approximation to adjust the quotient digit.
+
+After both phases of estimation the quotient digit may still be off by a value of one\footnote{This is similar to the error introduced
+by optimizing Barrett reduction.}.  Steps 13.6 through 13.8 subtract the multiple of the divisor from the dividend (\textit{similar to step 3.3 of
+algorithm~\ref{fig:raddiv}}) and then a multiple of the divisor is subsequently added back if the quotient was too large.
+
+Now that the quotient has been determined, finalizing the result is a matter of clamping the quotient, fixing the sizes and de-normalizing the
+remainder.  An important aspect of this algorithm seemingly overlooked in other descriptions such as that of Algorithm 14.20 of HAC \cite[pp. 598]{HAC}
+is that when the estimations are being made (\textit{inside the loop on step 13.5}) the digits $y_{t-1}$, $x_{i-2}$ and $x_{i-1}$ may lie
+outside their respective boundaries.  For example, if $t = 0$ or $i \le 1$ then the digits would be undefined.  In those cases the digits should
+respectively be replaced with a zero.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* integer signed division. 
+018    * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+019    * HAC pp.598 Algorithm 14.20
+020    *
+021    * Note that the description in HAC is horribly 
+022    * incomplete.  For example, it doesn't consider 
+023    * the case where digits are removed from 'x' in 
+024    * the inner loop.  It also doesn't consider the 
+025    * case that y has fewer than three digits, etc..
+026    *
+027    * The overall algorithm is as described as 
+028    * 14.20 from HAC but fixed to treat these cases.
+029   */
+030   int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+031   \{
+032     mp_int  q, x, y, t1, t2;
+033     int     res, n, t, i, norm, neg;
+034   
+035     /* is divisor zero ? */
+036     if (mp_iszero (b) == 1) \{
+037       return MP_VAL;
+038     \}
+039   
+040     /* if a < b then q=0, r = a */
+041     if (mp_cmp_mag (a, b) == MP_LT) \{
+042       if (d != NULL) \{
+043         res = mp_copy (a, d);
+044       \} else \{
+045         res = MP_OKAY;
+046       \}
+047       if (c != NULL) \{
+048         mp_zero (c);
+049       \}
+050       return res;
+051     \}
+052   
+053     if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) \{
+054       return res;
+055     \}
+056     q.used = a->used + 2;
+057   
+058     if ((res = mp_init (&t1)) != MP_OKAY) \{
+059       goto __Q;
+060     \}
+061   
+062     if ((res = mp_init (&t2)) != MP_OKAY) \{
+063       goto __T1;
+064     \}
+065   
+066     if ((res = mp_init_copy (&x, a)) != MP_OKAY) \{
+067       goto __T2;
+068     \}
+069   
+070     if ((res = mp_init_copy (&y, b)) != MP_OKAY) \{
+071       goto __X;
+072     \}
+073   
+074     /* fix the sign */
+075     neg = (a->sign == b->sign) ? 
MP_ZPOS : MP_NEG; +076 x.sign = y.sign = MP_ZPOS; +077 +078 /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */ +079 norm = mp_count_bits(&y) % DIGIT_BIT; +080 if (norm < (int)(DIGIT_BIT-1)) \{ +081 norm = (DIGIT_BIT-1) - norm; +082 if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) \{ +083 goto __Y; +084 \} +085 if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) \{ +086 goto __Y; +087 \} +088 \} else \{ +089 norm = 0; +090 \} +091 +092 /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */ +093 n = x.used - 1; +094 t = y.used - 1; +095 +096 /* while (x >= y*b**n-t) do \{ q[n-t] += 1; x -= y*b**\{n-t\} \} */ +097 if ((res = mp_lshd (&y, n - t)) != MP_OKAY) \{ /* y = y*b**\{n-t\} */ +098 goto __Y; +099 \} +100 +101 while (mp_cmp (&x, &y) != MP_LT) \{ +102 ++(q.dp[n - t]); +103 if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) \{ +104 goto __Y; +105 \} +106 \} +107 +108 /* reset y by shifting it back down */ +109 mp_rshd (&y, n - t); +110 +111 /* step 3. for i from n down to (t + 1) */ +112 for (i = n; i >= (t + 1); i--) \{ +113 if (i > x.used) \{ +114 continue; +115 \} +116 +117 /* step 3.1 if xi == yt then set q\{i-t-1\} to b-1, +118 * otherwise set q\{i-t-1\} to (xi*b + x\{i-1\})/yt */ +119 if (x.dp[i] == y.dp[t]) \{ +120 q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1); +121 \} else \{ +122 mp_word tmp; +123 tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT); +124 tmp |= ((mp_word) x.dp[i - 1]); +125 tmp /= ((mp_word) y.dp[t]); +126 if (tmp > (mp_word) MP_MASK) +127 tmp = MP_MASK; +128 q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK)); +129 \} +130 +131 /* while (q\{i-t-1\} * (yt * b + y\{t-1\})) > +132 xi * b**2 + xi-1 * b + xi-2 +133 +134 do q\{i-t-1\} -= 1; +135 */ +136 q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK; +137 do \{ +138 q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK; +139 +140 /* find left hand */ +141 mp_zero (&t1); +142 t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1]; +143 t1.dp[1] = y.dp[t]; +144 t1.used = 2; +145 if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) \{ +146 goto __Y; +147 \} +148 +149 /* find right hand */ +150 t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2]; +151 t2.dp[1] = (i - 1 < 0) ? 
0 : x.dp[i - 1];
+152         t2.dp[2] = x.dp[i];
+153         t2.used = 3;
+154       \} while (mp_cmp_mag(&t1, &t2) == MP_GT);
+155   
+156       /* step 3.3 x = x - q\{i-t-1\} * y * b**\{i-t-1\} */
+157       if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) \{
+158         goto __Y;
+159       \}
+160   
+161       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
+162         goto __Y;
+163       \}
+164   
+165       if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) \{
+166         goto __Y;
+167       \}
+168   
+169       /* if x < 0 then \{ x = x + y*b**\{i-t-1\}; q\{i-t-1\} -= 1; \} */
+170       if (x.sign == MP_NEG) \{
+171         if ((res = mp_copy (&y, &t1)) != MP_OKAY) \{
+172           goto __Y;
+173         \}
+174         if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) \{
+175           goto __Y;
+176         \}
+177         if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) \{
+178           goto __Y;
+179         \}
+180   
+181         q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+182       \}
+183     \}
+184   
+185     /* now q is the quotient and x is the remainder 
+186      * [which we have to normalize] 
+187      */
+188   
+189     /* get sign before writing to c */
+190     x.sign = a->sign;
+191   
+192     if (c != NULL) \{
+193       mp_clamp (&q);
+194       mp_exch (&q, c);
+195       c->sign = neg;
+196     \}
+197   
+198     if (d != NULL) \{
+199       mp_div_2d (&x, norm, &x, NULL);
+200       mp_exch (&x, d);
+201     \}
+202   
+203     res = MP_OKAY;
+204   
+205   __Y:mp_clear (&y);
+206   __X:mp_clear (&x);
+207   __T2:mp_clear (&t2);
+208   __T1:mp_clear (&t1);
+209   __Q:mp_clear (&q);
+210     return res;
+211   \}
+\end{alltt}
+\end{small}
+
+The implementation of this algorithm differs slightly from the pseudo code presented previously.  In this algorithm either of the quotient $c$ or
+remainder $d$ may be passed as a \textbf{NULL} pointer which indicates their value is not desired.  For example, the C code to call the division
+algorithm with only the quotient is 
+
+\begin{verbatim}
+mp_div(&a, &b, &c, NULL); /* c = [a/b] */
+\end{verbatim}
+
+Lines 36 and 41 handle the two trivial cases of inputs which are division by zero and dividend smaller than the divisor
+respectively.  After the two trivial cases all of the temporary variables are initialized.  Line 75 determines the sign of 
+the quotient and line 76 ensures that both $x$ and $y$ are positive.
+
+The number of bits in the leading digit is calculated on line 79.  Implicitly an mp\_int with $r$ digits will require $lg(\beta)(r-1) + k$ bits
+of precision which when reduced modulo $lg(\beta)$ produces the value of $k$.  In this case $k$ is the number of bits in the leading digit which is
+exactly what is required.  For the algorithm to operate $k$ must equal $lg(\beta) - 1$ and when it does not the inputs must be normalized by shifting
+them to the left by $lg(\beta) - 1 - k$ bits.
+
+Throughout the variables $n$ and $t$ will represent the highest digit of $x$ and $y$ respectively.  These are first used to produce the
+leading digit of the quotient.  The loop beginning on line 112 will produce the remainder of the quotient digits.
+
+The conditional ``continue'' on line 113 is used to prevent the algorithm from reading past the leading edge of $x$ which can occur when the
+algorithm eliminates multiple non-zero digits in a single iteration.  This ensures that $x_i$ is always non-zero since by definition the digits
+of $x$ above the $i$'th position must be zero in order for the quotient to be precise\footnote{Precise as far as integer division is concerned.}.
+
+Lines 142, 143 and 150 through 152 manually construct the high accuracy estimations by setting the digits of the two mp\_int
+variables directly.  
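+
+As a slightly fuller (hypothetical) usage sketch, the identity $a = bc + d$ can be verified after a division; error
+handling is elided for brevity.
+
+\begin{verbatim}
+mp_int a, b, c, d, t;
+
+mp_init_multi(&a, &b, &c, &d, &t, NULL);
+mp_read_radix(&a, "5471", 10);        /* dividend        */
+mp_read_radix(&b, "23", 10);          /* divisor         */
+
+mp_div(&a, &b, &c, &d);               /* c = 237, d = 20 */
+
+mp_mul(&b, &c, &t);                   /* t = b * c       */
+mp_add(&t, &d, &t);                   /* t = b * c + d   */
+/* mp_cmp(&t, &a) == MP_EQ holds here                    */
+
+mp_clear_multi(&a, &b, &c, &d, &t, NULL);
+\end{verbatim}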
+
+\section{Single Digit Helpers}
+
+This section briefly describes a series of single digit helper algorithms which come in handy when working with small constants.  All of
+the helper functions assume the single digit input is positive and will treat it as such.
+
+\subsection{Single Digit Addition and Subtraction}
+
+Both addition and subtraction are performed by ``cheating'' and using mp\_set followed by the higher level addition or subtraction
+algorithms.  As a result these algorithms are substantially simpler, with a slight cost in performance.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = a + b$ \\
+\hline \\
+1.  $t \leftarrow b$ (\textit{mp\_set}) \\
+2.  $c \leftarrow a + t$ \\
+3.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_add\_d}
+\end{figure}
+
+\textbf{Algorithm mp\_add\_d.}
+This algorithm initializes a temporary mp\_int with the value of the single digit and uses algorithm mp\_add to add the two values together.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_add\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* single digit addition */
+018   int
+019   mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+020   \{
+021     int     res, ix, oldused;
+022     mp_digit *tmpa, *tmpc, mu;
+023   
+024     /* grow c as required */
+025     if (c->alloc < a->used + 1) \{
+026        if ((res = mp_grow(c, a->used + 1)) != MP_OKAY) \{
+027           return res;
+028        \}
+029     \}
+030   
+031     /* if a is negative and |a| >= b, call c = |a| - b */
+032     if (a->sign == MP_NEG && (a->used > 1 || a->dp[0] >= b)) \{
+033        /* temporarily fix sign of a */
+034        a->sign = MP_ZPOS;
+035   
+036        /* c = |a| - b */
+037        res = mp_sub_d(a, b, c);
+038   
+039        /* fix sign  */
+040        a->sign = c->sign = MP_NEG;
+041   
+042        return res;
+043     \}
+044   
+045     /* old number of used digits in c */
+046     oldused = c->used;
+047   
+048     /* sign always positive */
+049     c->sign = MP_ZPOS;
+050   
+051     /* source alias */
+052     tmpa    = a->dp;
+053   
+054     /* destination alias */
+055     tmpc    = c->dp;
+056   
+057     /* if a is positive */
+058     if (a->sign == MP_ZPOS) \{
+059        /* add digit, after this we're propagating
+060         * the carry.
+061         */
+062        *tmpc   = *tmpa++ + b;
+063        mu      = *tmpc >> DIGIT_BIT;
+064        *tmpc++ &= MP_MASK;
+065   
+066        /* now handle rest of the digits */
+067        for (ix = 1; ix < a->used; ix++) \{
+068           *tmpc   = *tmpa++ + mu;
+069           mu      = *tmpc >> DIGIT_BIT;
+070           *tmpc++ &= MP_MASK;
+071        \}
+072        /* set final carry */
+073        ix++;
+074        *tmpc++  = mu;
+075   
+076        /* setup size */
+077        c->used = a->used + 1;
+078     \} else \{
+079        /* a was negative and |a| < b */
+080        c->used  = 1;
+081   
+082        /* the result is a single digit */
+083        if (a->used == 1) \{
+084           *tmpc++  =  b - a->dp[0];
+085        \} else \{
+086           *tmpc++  =  b;
+087        \}
+088   
+089        /* setup count so the clearing of oldused
+090         * can fall through correctly
+091         */
+092        ix       = 1;
+093     \}
+094   
+095     /* now zero to oldused */
+096     while (ix++ < oldused) \{
+097        *tmpc++ = 0;
+098     \}
+099     mp_clamp(c);
+100   
+101     return MP_OKAY;
+102   \}
+103   
+\end{alltt}
+\end{small}
+
+Clever use of the letter 't'.
+
+\subsubsection{Subtraction}
+The single digit subtraction algorithm mp\_sub\_d is essentially the same except it uses mp\_sub to subtract the digit from the mp\_int.  
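+
+A short (hypothetical) usage sketch of both helpers, with error handling elided:
+
+\begin{verbatim}
+mp_int a;
+
+mp_init(&a);
+mp_set(&a, 100);              /* a = 100         */
+mp_add_d(&a, 5, &a);          /* a = a + 5 = 105 */
+mp_sub_d(&a, 7, &a);          /* a = a - 7 = 98  */
+mp_clear(&a);
+\end{verbatim}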
+
+\subsection{Single Digit Multiplication}
+Single digit multiplication arises enough in division and radix conversion that it ought to be implemented as a special case of the baseline
+multiplication algorithm.  Essentially this algorithm is a modified version of algorithm s\_mp\_mul\_digs where one of the multiplicands
+only has one digit.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_d}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c = ab$ \\
+\hline \\
+1.  $pa \leftarrow a.used$ \\
+2.  Grow $c$ to at least $pa + 1$ digits. \\
+3.  $oldused \leftarrow c.used$ \\
+4.  $c.used \leftarrow pa + 1$ \\
+5.  $c.sign \leftarrow a.sign$ \\
+6.  $\mu \leftarrow 0$ \\
+7.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}7.1  $\hat r \leftarrow \mu + a_{ix}b$ \\
+\hspace{3mm}7.2  $c_{ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}7.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+8.  $c_{pa} \leftarrow \mu$ \\
+9.  for $ix$ from $pa + 1$ to $oldused$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits of $c$. \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_d}
+\end{figure}
+\textbf{Algorithm mp\_mul\_d.}
+This algorithm quickly multiplies an mp\_int by a small single digit value.  It is specially tailored to the job and has a minimum of overhead.
+Unlike the full multiplication algorithms this algorithm does not require any significant temporary storage or memory allocations.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiply by a digit */
+018   int
+019   mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+020   \{
+021     mp_digit u, *tmpa, *tmpc;
+022     mp_word  r;
+023     int      ix, res, olduse;
+024   
+025     /* make sure c is big enough to hold a*b */
+026     if (c->alloc < a->used + 1) \{
+027       if ((res = mp_grow (c, a->used + 1)) != MP_OKAY) \{
+028         return res;
+029       \}
+030     \}
+031   
+032     /* get the original destinations used count */
+033     olduse = c->used;
+034   
+035     /* set the sign */
+036     c->sign = a->sign;
+037   
+038     /* alias for a->dp [source] */
+039     tmpa = a->dp;
+040   
+041     /* alias for c->dp [dest] */
+042     tmpc = c->dp;
+043   
+044     /* zero carry */
+045     u = 0;
+046   
+047     /* compute columns */
+048     for (ix = 0; ix < a->used; ix++) \{
+049       /* compute product and carry sum for this term */
+050       r       = ((mp_word) u) + ((mp_word)*tmpa++) * ((mp_word)b);
+051   
+052       /* mask off higher bits to get a single digit */
+053       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+054   
+055       /* send carry into next iteration */
+056       u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+057     \}
+058   
+059     /* store final carry [if any] */
+060     *tmpc++ = u;
+061   
+062     /* now zero digits above the top */
+063     while (ix++ < olduse) \{
+064        *tmpc++ = 0;
+065     \}
+066   
+067     /* set used count */
+068     c->used = a->used + 1;
+069     mp_clamp(c);
+070   
+071     return MP_OKAY;
+072   \}
+\end{alltt}
+\end{small}
+
+In this implementation the destination $c$ may point to the same mp\_int as the source $a$ since the result is written after the digit is
+read from the source.  This function uses pointer aliases $tmpa$ and $tmpc$ for the digits of $a$ and $c$ respectively.
+
+\subsection{Single Digit Division}
+Like the single digit multiplication algorithm, single digit division is also a fairly common algorithm used in radix conversion.  
Since the +divisor is only a single digit a specialized variant of the division algorithm can be used to compute the quotient. + +\newpage\begin{figure}[!here] +\begin{small} +\begin{center} +\begin{tabular}{l} +\hline Algorithm \textbf{mp\_div\_d}. \\ +\textbf{Input}. mp\_int $a$ and a mp\_digit $b$ \\ +\textbf{Output}. $c = \lfloor a / b \rfloor, d = a - cb$ \\ +\hline \\ +1. If $b = 0$ then return(\textit{MP\_VAL}).\\ +2. If $b = 3$ then use algorithm mp\_div\_3 instead. \\ +3. Init $q$ to $a.used$ digits. \\ +4. $q.used \leftarrow a.used$ \\ +5. $q.sign \leftarrow a.sign$ \\ +6. $\hat w \leftarrow 0$ \\ +7. for $ix$ from $a.used - 1$ down to $0$ do \\ +\hspace{3mm}7.1 $\hat w \leftarrow \hat w \beta + a_{ix}$ \\ +\hspace{3mm}7.2 If $\hat w \ge b$ then \\ +\hspace{6mm}7.2.1 $t \leftarrow \lfloor \hat w / b \rfloor$ \\ +\hspace{6mm}7.2.2 $\hat w \leftarrow \hat w \mbox{ (mod }b\mbox{)}$ \\ +\hspace{3mm}7.3 else\\ +\hspace{6mm}7.3.1 $t \leftarrow 0$ \\ +\hspace{3mm}7.4 $q_{ix} \leftarrow t$ \\ +8. $d \leftarrow \hat w$ \\ +9. Clamp excess digits of $q$. \\ +10. $c \leftarrow q$ \\ +11. Return(\textit{MP\_OKAY}). \\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_div\_d} +\end{figure} +\textbf{Algorithm mp\_div\_d.} +This algorithm divides the mp\_int $a$ by the single mp\_digit $b$ using an optimized approach. Essentially in every iteration of the +algorithm another digit of the dividend is reduced and another digit of quotient produced. Provided $b < \beta$ the value of $\hat w$ +after step 7.1 will be limited such that $0 \le \lfloor \hat w / b \rfloor < \beta$. + +If the divisor $b$ is equal to three a variant of this algorithm is used which is called mp\_div\_3. It replaces the division by three with +a multiplication by $\lfloor \beta / 3 \rfloor$ and the appropriate shift and residual fixup. In essence it is much like the Barrett reduction +from chapter seven. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_d.c +\vspace{-3mm} +\begin{alltt} +016 +017 static int s_is_power_of_two(mp_digit b, int *p) +018 \{ +019 int x; +020 +021 for (x = 1; x < DIGIT_BIT; x++) \{ +022 if (b == (((mp_digit)1)<<x)) \{ +023 *p = x; +024 return 1; +025 \} +026 \} +027 return 0; +028 \} +029 +030 /* single digit division (based on routine from MPI) */ +031 int mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d) +032 \{ +033 mp_int q; +034 mp_word w; +035 mp_digit t; +036 int res, ix; +037 +038 /* cannot divide by zero */ +039 if (b == 0) \{ +040 return MP_VAL; +041 \} +042 +043 /* quick outs */ +044 if (b == 1 || mp_iszero(a) == 1) \{ +045 if (d != NULL) \{ +046 *d = 0; +047 \} +048 if (c != NULL) \{ +049 return mp_copy(a, c); +050 \} +051 return MP_OKAY; +052 \} +053 +054 /* power of two ? */ +055 if (s_is_power_of_two(b, &ix) == 1) \{ +056 if (d != NULL) \{ +057 *d = a->dp[0] & ((1<<ix) - 1); +058 \} +059 if (c != NULL) \{ +060 return mp_div_2d(a, ix, c, NULL); +061 \} +062 return MP_OKAY; +063 \} +064 +065 /* three? */ +066 if (b == 3) \{ +067 return mp_div_3(a, c, d); +068 \} +069 +070 /* no easy answer [c'est la vie]. 
Just division */
+071     if ((res = mp_init_size(&q, a->used)) != MP_OKAY) \{
+072        return res;
+073     \}
+074
+075     q.used = a->used;
+076     q.sign = a->sign;
+077     w = 0;
+078     for (ix = a->used - 1; ix >= 0; ix--) \{
+079        w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+080
+081        if (w >= b) \{
+082           t = (mp_digit)(w / b);
+083           w -= ((mp_word)t) * ((mp_word)b);
+084        \} else \{
+085           t = 0;
+086        \}
+087        q.dp[ix] = (mp_digit)t;
+088     \}
+089
+090     if (d != NULL) \{
+091        *d = (mp_digit)w;
+092     \}
+093
+094     if (c != NULL) \{
+095        mp_clamp(&q);
+096        mp_exch(&q, c);
+097     \}
+098     mp_clear(&q);
+099
+100     return res;
+101   \}
+102
+\end{alltt}
+\end{small}
+
+Like the implementation of algorithm mp\_div this algorithm allows either of the quotient or remainder to be passed as a \textbf{NULL} pointer to
+indicate the respective value is not required.  This allows a trivial single digit modular reduction algorithm, mp\_mod\_d, to be created.
+
+The division and remainder on lines 82 and 83 can often be replaced by a single division on most processors.  For example, the 32-bit x86 based
+processors can divide a 64-bit quantity by a 32-bit quantity and produce the quotient and remainder simultaneously.  Unfortunately the GCC
+compiler does not recognize that optimization and will actually produce two function calls to find the quotient and remainder respectively.
+
+\subsection{Single Digit Root Extraction}
+
+Finding the $n$'th root of an integer is fairly easy as far as numerical analysis is concerned.  Algorithms such as the Newton-Raphson approximation
+(\ref{eqn:newton}) will converge very quickly to a root for any continuous function $f(x)$.
+
+\begin{equation}
+x_{i+1} = x_i - {f(x_i) \over f'(x_i)}
+\label{eqn:newton}
+\end{equation}
+
+In this case the $n$'th root is desired and $f(x) = x^n - a$ where $a$ is the integer of which the root is desired.  The derivative of $f(x)$ is
+simply $f'(x) = nx^{n - 1}$.  Of particular importance is that this algorithm will be used over the integers and not over a more continuous domain
+such as the real numbers.  As a result the root found can be above the true root by a few and must be manually adjusted.  Ideally at the end of the
+algorithm the $n$'th root $b$ of an integer $a$ is desired such that $b^n \le a$.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_n\_root}. \\
+\textbf{Input}.   mp\_int $a$ and a mp\_digit $b$ \\
+\textbf{Output}.  $c^b \le a$ \\
+\hline \\
+1.  If $b$ is even and $a.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
+2.  $sign \leftarrow a.sign$ \\
+3.  $a.sign \leftarrow MP\_ZPOS$ \\
+4.  t$2 \leftarrow 2$ \\
+5.  Loop \\
+\hspace{3mm}5.1  t$1 \leftarrow $ t$2$ \\
+\hspace{3mm}5.2  t$3 \leftarrow $ t$1^{b - 1}$ \\
+\hspace{3mm}5.3  t$2 \leftarrow $ t$3 $ $\cdot$ t$1$ \\
+\hspace{3mm}5.4  t$2 \leftarrow $ t$2 - a$ \\
+\hspace{3mm}5.5  t$3 \leftarrow $ t$3 \cdot b$ \\
+\hspace{3mm}5.6  t$3 \leftarrow \lfloor $t$2 / $t$3 \rfloor$ \\
+\hspace{3mm}5.7  t$2 \leftarrow $ t$1 - $ t$3$ \\
+\hspace{3mm}5.8  If t$1 \ne $ t$2$ then goto step 5. \\
+6.  Loop \\
+\hspace{3mm}6.1  t$2 \leftarrow $ t$1^b$ \\
+\hspace{3mm}6.2  If t$2 > a$ then \\
+\hspace{6mm}6.2.1  t$1 \leftarrow $ t$1 - 1$ \\
+\hspace{6mm}6.2.2  Goto step 6. \\
+7.  $a.sign \leftarrow sign$ \\
+8.  $c \leftarrow $ t$1$ \\
+9.  $c.sign \leftarrow sign$ \\
+10.  Return(\textit{MP\_OKAY}).
\\ +\hline +\end{tabular} +\end{center} +\end{small} +\caption{Algorithm mp\_n\_root} +\end{figure} +\textbf{Algorithm mp\_n\_root.} +This algorithm finds the integer $n$'th root of an input using the Newton-Raphson approach. It is partially optimized based on the observation +that the numerator of ${f(x) \over f'(x)}$ can be derived from a partial denominator. That is at first the denominator is calculated by finding +$x^{b - 1}$. This value can then be multiplied by $x$ and have $a$ subtracted from it to find the numerator. This saves a total of $b - 1$ +multiplications by t$1$ inside the loop. + +The initial value of the approximation is t$2 = 2$ which allows the algorithm to start with very small values and quickly converge on the +root. Ideally this algorithm is meant to find the $n$'th root of an input where $n$ is bounded by $2 \le n \le 5$. + +\vspace{+3mm}\begin{small} +\hspace{-5.1mm}{\bf File}: bn\_mp\_n\_root.c +\vspace{-3mm} +\begin{alltt} +016 +017 /* find the n'th root of an integer +018 * +019 * Result found such that (c)**b <= a and (c+1)**b > a +020 * +021 * This algorithm uses Newton's approximation +022 * x[i+1] = x[i] - f(x[i])/f'(x[i]) +023 * which will find the root in log(N) time where +024 * each step involves a fair bit. This is not meant to +025 * find huge roots [square and cube, etc]. +026 */ +027 int mp_n_root (mp_int * a, mp_digit b, mp_int * c) +028 \{ +029 mp_int t1, t2, t3; +030 int res, neg; +031 +032 /* input must be positive if b is even */ +033 if ((b & 1) == 0 && a->sign == MP_NEG) \{ +034 return MP_VAL; +035 \} +036 +037 if ((res = mp_init (&t1)) != MP_OKAY) \{ +038 return res; +039 \} +040 +041 if ((res = mp_init (&t2)) != MP_OKAY) \{ +042 goto __T1; +043 \} +044 +045 if ((res = mp_init (&t3)) != MP_OKAY) \{ +046 goto __T2; +047 \} +048 +049 /* if a is negative fudge the sign but keep track */ +050 neg = a->sign; +051 a->sign = MP_ZPOS; +052 +053 /* t2 = 2 */ +054 mp_set (&t2, 2); +055 +056 do \{ +057 /* t1 = t2 */ +058 if ((res = mp_copy (&t2, &t1)) != MP_OKAY) \{ +059 goto __T3; +060 \} +061 +062 /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */ +063 +064 /* t3 = t1**(b-1) */ +065 if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) \{ +066 goto __T3; +067 \} +068 +069 /* numerator */ +070 /* t2 = t1**b */ +071 if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) \{ +072 goto __T3; +073 \} +074 +075 /* t2 = t1**b - a */ +076 if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) \{ +077 goto __T3; +078 \} +079 +080 /* denominator */ +081 /* t3 = t1**(b-1) * b */ +082 if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) \{ +083 goto __T3; +084 \} +085 +086 /* t3 = (t1**b - a)/(b * t1**(b-1)) */ +087 if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) \{ +088 goto __T3; +089 \} +090 +091 if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) \{ +092 goto __T3; +093 \} +094 \} while (mp_cmp (&t1, &t2) != MP_EQ); +095 +096 /* result can be off by a few so check */ +097 for (;;) \{ +098 if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) \{ +099 goto __T3; +100 \} +101 +102 if (mp_cmp (&t2, a) == MP_GT) \{ +103 if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) \{ +104 goto __T3; +105 \} +106 \} else \{ +107 break; +108 \} +109 \} +110 +111 /* reset the sign of a first */ +112 a->sign = neg; +113 +114 /* set the result */ +115 mp_exch (&t1, c); +116 +117 /* set the sign of the result */ +118 c->sign = neg; +119 +120 res = MP_OKAY; +121 +122 __T3:mp_clear (&t3); +123 __T2:mp_clear (&t2); +124 __T1:mp_clear (&t1); +125 return res; +126 \} +\end{alltt} +\end{small} + +\section{Random Number 
Generation}
+
+Random numbers come up in a variety of activities from public key cryptography to simple simulations and various randomized algorithms.
+Pollard-Rho factoring, for example, can make use of random values as starting points to find factors of a composite integer.  In this case the
+algorithm presented is solely for simulations and not intended for cryptographic use.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rand}. \\
+\textbf{Input}.   An integer $b$ \\
+\textbf{Output}.  A pseudo-random number of $b$ digits \\
+\hline \\
+1.  $a \leftarrow 0$ \\
+2.  If $b \le 0$ return(\textit{MP\_OKAY}) \\
+3.  Pick a non-zero random digit $d$. \\
+4.  $a \leftarrow a + d$ \\
+5.  for $ix$ from 1 to $b - 1$ do \\
+\hspace{3mm}5.1  $a \leftarrow a \cdot \beta$ \\
+\hspace{3mm}5.2  Pick a random digit $d$. \\
+\hspace{3mm}5.3  $a \leftarrow a + d$ \\
+6.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rand}
+\end{figure}
+\textbf{Algorithm mp\_rand.}
+This algorithm produces a pseudo-random integer of $b$ digits.  By ensuring that the first digit is non-zero the algorithm also guarantees that the
+final result has at least $b$ digits.  It relies heavily on a third-party random number generator which should ideally generate all of
+the integers from $0$ to $\beta - 1$ uniformly.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_rand.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* makes a pseudo-random int of a given size */
+018   int
+019   mp_rand (mp_int * a, int digits)
+020   \{
+021     int     res;
+022     mp_digit d;
+023
+024     mp_zero (a);
+025     if (digits <= 0) \{
+026       return MP_OKAY;
+027     \}
+028
+029     /* first place a random non-zero digit */
+030     do \{
+031       d = ((mp_digit) abs (rand ()));
+032     \} while (d == 0);
+033
+034     if ((res = mp_add_d (a, d, a)) != MP_OKAY) \{
+035       return res;
+036     \}
+037
+038     while (--digits > 0) \{
+039       if ((res = mp_lshd (a, 1)) != MP_OKAY) \{
+040         return res;
+041       \}
+042
+043       if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) \{
+044         return res;
+045       \}
+046     \}
+047
+048     return MP_OKAY;
+049   \}
+\end{alltt}
+\end{small}
+
+\section{Formatted Representations}
+The ability to emit a radix-$n$ textual representation of an integer is useful for interacting with human parties.  For example, the ability to
+be given a string of characters such as ``114585'' and turn it into the radix-$\beta$ equivalent would make it easier to enter numbers
+into a program.
+
+\subsection{Reading Radix-n Input}
+For the purposes of this text we will assume that a simple lower ASCII map (\ref{fig:ASC}) is used to map the values from $0$ to $63$ to
+printable characters.  For example, when the character ``N'' is read it represents the integer $23$.  The first $16$ characters of the
+map are for the common representations up to hexadecimal.  After that they match the ``base64'' encoding scheme and are suitably chosen
+such that they are printable.  While outputting as base64 may not be too helpful for human operators it does allow communication via non-binary
+mediums.
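+
+For example, under this map the two character string ``7F'' read in radix $16$ denotes the integer $7 \cdot 16 + 15 = 127$ while the same
+string read in radix $64$ denotes $7 \cdot 64 + 15 = 463$.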
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{cc|cc|cc|cc}
+\hline \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} & \textbf{Value} & \textbf{Char} &  \textbf{Value} & \textbf{Char} \\
+\hline
+0 & 0 & 1 & 1 & 2 & 2 & 3 & 3 \\
+4 & 4 & 5 & 5 & 6 & 6 & 7 & 7 \\
+8 & 8 & 9 & 9 & 10 & A & 11 & B \\
+12 & C & 13 & D & 14 & E & 15 & F \\
+16 & G & 17 & H & 18 & I & 19 & J \\
+20 & K & 21 & L & 22 & M & 23 & N \\
+24 & O & 25 & P & 26 & Q & 27 & R \\
+28 & S & 29 & T & 30 & U & 31 & V \\
+32 & W & 33 & X & 34 & Y & 35 & Z \\
+36 & a & 37 & b & 38 & c & 39 & d \\
+40 & e & 41 & f & 42 & g & 43 & h \\
+44 & i & 45 & j & 46 & k & 47 & l \\
+48 & m & 49 & n & 50 & o & 51 & p \\
+52 & q & 53 & r & 54 & s & 55 & t \\
+56 & u & 57 & v & 58 & w & 59 & x \\
+60 & y & 61 & z & 62 & $+$ & 63 & $/$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Lower ASCII Map}
+\label{fig:ASC}
+\end{figure}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_read\_radix}. \\
+\textbf{Input}.   A string $str$ of length $sn$ and radix $r$. \\
+\textbf{Output}.  The radix-$\beta$ equivalent mp\_int. \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  $ix \leftarrow 0$ \\
+3.  If $str_0 =$ ``-'' then do \\
+\hspace{3mm}3.1  $ix \leftarrow ix + 1$ \\
+\hspace{3mm}3.2  $sign \leftarrow MP\_NEG$ \\
+4.  else \\
+\hspace{3mm}4.1  $sign \leftarrow MP\_ZPOS$ \\
+5.  $a \leftarrow 0$ \\
+6.  for $iy$ from $ix$ to $sn - 1$ do \\
+\hspace{3mm}6.1  Let $y$ denote the position in the map of $str_{iy}$. \\
+\hspace{3mm}6.2  If $str_{iy}$ is not in the map or $y \ge r$ then goto step 7. \\
+\hspace{3mm}6.3  $a \leftarrow a \cdot r$ \\
+\hspace{3mm}6.4  $a \leftarrow a + y$ \\
+7.  If $a \ne 0$ then $a.sign \leftarrow sign$ \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_read\_radix}
+\end{figure}
+\textbf{Algorithm mp\_read\_radix.}
+This algorithm will read an ASCII string and produce the radix-$\beta$ mp\_int representation of the same integer.  A minus symbol ``-'' may precede the
+string to indicate the value is negative, otherwise it is assumed to be positive.  The algorithm will read up to $sn$ characters from the input
+and will stop as soon as it reads a character it cannot map to a valid digit.  This allows numbers to be embedded
+as part of larger input without any significant problem.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_read\_radix.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* read a string [ASCII] in a given radix */
+018   int mp_read_radix (mp_int * a, char *str, int radix)
+019   \{
+020     int     y, res, neg;
+021     char    ch;
+022
+023     /* make sure the radix is ok */
+024     if (radix < 2 || radix > 64) \{
+025       return MP_VAL;
+026     \}
+027
+028     /* if the leading digit is a
+029      * minus set the sign to negative.
+030      */
+031     if (*str == '-') \{
+032       ++str;
+033       neg = MP_NEG;
+034     \} else \{
+035       neg = MP_ZPOS;
+036     \}
+037
+038     /* set the integer to the default of zero */
+039     mp_zero (a);
+040
+041     /* process each digit of the string */
+042     while (*str) \{
+043       /* if the radix < 36 the conversion is case insensitive
+044        * this allows numbers like 1AB and 1ab to represent the same  value
+045        * [e.g. in hex]
+046        */
+047       ch = (char) ((radix < 36) ? toupper (*str) : *str);
+048       for (y = 0; y < 64; y++) \{
+049         if (ch == mp_s_rmap[y]) \{
+050            break;
+051         \}
+052       \}
+053
+054       /* if the char was found in the map
+055        * and is less than the given radix add it
+056        * to the number, otherwise exit the loop.
+057        */
+058       if (y < radix) \{
+059         if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) \{
+060            return res;
+061         \}
+062         if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) \{
+063            return res;
+064         \}
+065       \} else \{
+066         break;
+067       \}
+068       ++str;
+069     \}
+070
+071     /* set the sign only if a != 0 */
+072     if (mp_iszero(a) != 1) \{
+073       a->sign = neg;
+074     \}
+075     return MP_OKAY;
+076   \}
+\end{alltt}
+\end{small}
+
+\subsection{Generating Radix-$n$ Output}
+Generating radix-$n$ output is fairly trivial with a division and remainder algorithm.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_toradix}. \\
+\textbf{Input}.   A mp\_int $a$ and an integer $r$\\
+\textbf{Output}.  The radix-$r$ representation of $a$ \\
+\hline \\
+1.  If $r < 2$ or $r > 64$ return(\textit{MP\_VAL}). \\
+2.  If $a = 0$ then $str = $ ``$0$'' and return(\textit{MP\_OKAY}). \\
+3.  $t \leftarrow a$ \\
+4.  $str \leftarrow$ ``'' \\
+5.  if $t.sign = MP\_NEG$ then \\
+\hspace{3mm}5.1  $str \leftarrow str + $ ``-'' \\
+\hspace{3mm}5.2  $t.sign = MP\_ZPOS$ \\
+6.  While ($t \ne 0$) do \\
+\hspace{3mm}6.1  $d \leftarrow t \mbox{ (mod }r\mbox{)}$ \\
+\hspace{3mm}6.2  $t \leftarrow \lfloor t / r \rfloor$ \\
+\hspace{3mm}6.3  Look up $d$ in the map and store the equivalent character in $y$. \\
+\hspace{3mm}6.4  $str \leftarrow str + y$ \\
+7.  If $str_0 = $``$-$'' then \\
+\hspace{3mm}7.1  Reverse the digits $str_1, str_2, \ldots str_n$. \\
+8.  Otherwise \\
+\hspace{3mm}8.1  Reverse the digits $str_0, str_1, \ldots str_n$. \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_toradix}
+\end{figure}
+\textbf{Algorithm mp\_toradix.}
+This algorithm computes the radix-$r$ representation of an mp\_int $a$.  The ``digits'' of the representation are extracted by reducing the
+successive quotients $\lfloor a / r^k \rfloor$ modulo $r$ until $r^k > a$.  Note that instead of actually dividing by $r^k$ in
+each iteration the quotient $\lfloor a / r \rfloor$ is saved for the next iteration.  As a result a series of trivial $n \times 1$ divisions
+are required instead of a series of $n \times k$ divisions.  One design flaw of this approach is that the digits are produced in the reverse order
+(see~\ref{fig:mpradix}).  To remedy this flaw the digits must be swapped or simply ``reversed''.
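+
+As a usage sketch (this fragment is not part of the library sources; the fixed output buffer is an assumption the caller must guarantee,
+and in practice mp\_radix\_size may be used to obtain the required size), a radix conversion round trip:
+
+\begin{small}
+\begin{alltt}
+mp_int a;
+char   out[80];
+int    res;
+
+if ((res = mp_init(&a)) != MP_OKAY) \{
+   return res;
+\}
+/* read the hexadecimal string "-1A7", that is -423 */
+if ((res = mp_read_radix(&a, "-1A7", 16)) != MP_OKAY) \{
+   return res;
+\}
+/* write it back out in decimal, out now holds "-423" */
+if ((res = mp_toradix(&a, out, 10)) != MP_OKAY) \{
+   return res;
+\}
+mp_clear(&a);
+\end{alltt}
+\end{small}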
+
+\begin{figure}
+\begin{center}
+\begin{tabular}{|c|c|c|}
+\hline \textbf{Value of $a$} & \textbf{Value of $d$} & \textbf{Value of $str$} \\
+\hline $1234$ & -- & -- \\
+\hline $123$  & $4$ & ``4'' \\
+\hline $12$   & $3$ & ``43'' \\
+\hline $1$    & $2$ & ``432'' \\
+\hline $0$    & $1$ & ``4321'' \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Example of Algorithm mp\_toradix.}
+\label{fig:mpradix}
+\end{figure}
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_toradix.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* stores a bignum as a ASCII string in a given radix (2..64) */
+018   int mp_toradix (mp_int * a, char *str, int radix)
+019   \{
+020     int     res, digs;
+021     mp_int  t;
+022     mp_digit d;
+023     char   *_s = str;
+024
+025     /* check range of the radix */
+026     if (radix < 2 || radix > 64) \{
+027       return MP_VAL;
+028     \}
+029
+030     /* quick out if its zero */
+031     if (mp_iszero(a) == 1) \{
+032        *str++ = '0';
+033        *str = '\symbol{92}0';
+034        return MP_OKAY;
+035     \}
+036
+037     if ((res = mp_init_copy (&t, a)) != MP_OKAY) \{
+038       return res;
+039     \}
+040
+041     /* if it is negative output a - */
+042     if (t.sign == MP_NEG) \{
+043       ++_s;
+044       *str++ = '-';
+045       t.sign = MP_ZPOS;
+046     \}
+047
+048     digs = 0;
+049     while (mp_iszero (&t) == 0) \{
+050       if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) \{
+051         mp_clear (&t);
+052         return res;
+053       \}
+054       *str++ = mp_s_rmap[d];
+055       ++digs;
+056     \}
+057
+058     /* reverse the digits of the string.  In this case _s points
+059      * to the first digit [exluding the sign] of the number]
+060      */
+061     bn_reverse ((unsigned char *)_s, digs);
+062
+063     /* append a NULL so the string is properly terminated */
+064     *str = '\symbol{92}0';
+065
+066     mp_clear (&t);
+067     return MP_OKAY;
+068   \}
+069
+\end{alltt}
+\end{small}
+
+\chapter{Number Theoretic Algorithms}
+This chapter discusses several fundamental number theoretic algorithms such as the greatest common divisor, least common multiple and Jacobi
+symbol computation.  These algorithms arise as essential components in several key cryptographic algorithms such as the RSA public key algorithm and
+various Sieve based factoring algorithms.
+
+\section{Greatest Common Divisor}
+The greatest common divisor of two integers $a$ and $b$, often denoted as $(a, b)$, is the largest integer $k$ that divides
+both $a$ and $b$.  That is, $k$ is the largest integer such that $0 \equiv a \mbox{ (mod }k\mbox{)}$ and $0 \equiv b \mbox{ (mod }k\mbox{)}$ occur
+simultaneously.
+
+The most common approach is to reduce one input modulo another.  That is if $a$ and $b$ are divisible by some integer $k$ and if $qa + r = b$ then
+$r$ is also divisible by $k$.  The reduction pattern follows $\left < a , b \right > \rightarrow \left < b, a \mbox{ mod } b \right >$.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (I)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$. \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  $r \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
+\hspace{3mm}1.2  $a \leftarrow b$ \\
+\hspace{3mm}1.3  $b \leftarrow r$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (I)}
+\label{fig:gcd1}
+\end{figure}
+
+This algorithm will quickly converge on the greatest common divisor since the residue $r$ tends to diminish rapidly.
However, divisions are
+relatively expensive operations to perform and should ideally be avoided.  There is another approach based on a similar relationship of
+greatest common divisors.  The faster approach is based on the observation that if $k$ divides both $a$ and $b$ it will also divide $b - a$.
+In particular, we would like $b - a$ to decrease in magnitude which requires that $b \ge a$.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (II)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$. \\
+\hline \\
+1.  While ($b > 0$) do \\
+\hspace{3mm}1.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}1.2  $b \leftarrow b - a$ \\
+2.  Return($a$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (II)}
+\label{fig:gcd2}
+\end{figure}
+
+\textbf{Proof} \textit{Algorithm~\ref{fig:gcd2} will return the greatest common divisor of $a$ and $b$.}
+The algorithm in figure~\ref{fig:gcd2} will eventually terminate since, by $b \ge a$, the subtraction in step 1.2 produces a value less than $b$.  In other
+words in every iteration the tuple $\left < a, b \right >$ decreases in magnitude until eventually $a = b$.  Since both $a$ and $b$ are always
+divisible by the greatest common divisor (\textit{until the last iteration}) and in the last iteration of the algorithm $b = 0$, in the
+second to last iteration of the algorithm $b = a$ and clearly $(a, a) = a$ which concludes the proof.  \textbf{QED}.
+
+As a matter of practicality algorithm \ref{fig:gcd2} decreases far too slowly to be useful, especially if $b$ is much larger than $a$ such that
+$b - a$ is still very much larger than $a$.  A simple addition to the algorithm is to divide $b - a$ by a power of some integer $p$ which does
+not divide the greatest common divisor but will divide $b - a$.  In this case ${b - a} \over p$ is also an integer and still divisible by
+the greatest common divisor.
+
+However, instead of factoring $b - a$ to find a suitable value of $p$ the powers of $p$ can be removed from $a$ and $b$ that are in common first.
+Then inside the loop whenever $b - a$ is divisible by some power of $p$ it can be safely removed.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{Greatest Common Divisor (III)}. \\
+\textbf{Input}.   Two positive integers $a$ and $b$ greater than zero. \\
+\textbf{Output}.  The greatest common divisor $(a, b)$. \\
+\hline \\
+1.  $k \leftarrow 0$ \\
+2.  While $a$ and $b$ are both divisible by $p$ do \\
+\hspace{3mm}2.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+\hspace{3mm}2.2  $b \leftarrow \lfloor b / p \rfloor$ \\
+\hspace{3mm}2.3  $k \leftarrow k + 1$ \\
+3.  While $a$ is divisible by $p$ do \\
+\hspace{3mm}3.1  $a \leftarrow \lfloor a / p \rfloor$ \\
+4.  While $b$ is divisible by $p$ do \\
+\hspace{3mm}4.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+5.  While ($b > 0$) do \\
+\hspace{3mm}5.1  Swap $a$ and $b$ such that $a$ is the smallest of the two. \\
+\hspace{3mm}5.2  $b \leftarrow b - a$ \\
+\hspace{3mm}5.3  While $b$ is divisible by $p$ do \\
+\hspace{6mm}5.3.1  $b \leftarrow \lfloor b / p \rfloor$ \\
+6.  Return($a \cdot p^k$). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm Greatest Common Divisor (III)}
+\label{fig:gcd3}
+\end{figure}
+
+This algorithm is based on algorithm~\ref{fig:gcd2} except it removes powers of $p$ both before and inside the main loop to ensure the tuple
+$\left < a, b \right >$ decreases more rapidly.  The first loop on step two removes powers of $p$ that are in common.  A count, $k$, is kept
+which represents the common divisor $p^k$.  After step two the remaining common divisor of $a$ and $b$ cannot be divisible by $p$.  This means
+that $p$ can be safely divided out of the difference $b - a$ so long as the division leaves no remainder.
+
+In particular the value of $p$ should be chosen such that the division on step 5.3.1 occurs often.  It also helps if division by $p$ is easy
+to compute.  The ideal choice of $p$ is two since division by two amounts to a right logical shift.  Another important observation is that by
+step five both $a$ and $b$ are odd.  Therefore, the difference $b - a$ must be even which means that each iteration removes one bit from the
+largest of the pair.
+
+\subsection{Complete Greatest Common Divisor}
+The algorithms presented so far cannot handle inputs which are zero or negative.  The following algorithm can handle all input cases properly
+and will produce the greatest common divisor.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_gcd}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The greatest common divisor $c = (a, b)$. \\
+\hline \\
+1.  If $a = 0$ and $b \ne 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow b$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a \ne 0$ and $b = 0$ then \\
+\hspace{3mm}2.1  $c \leftarrow a$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  If $a = b = 0$ then \\
+\hspace{3mm}3.1  $c \leftarrow 1$ \\
+\hspace{3mm}3.2  Return(\textit{MP\_OKAY}). \\
+4.  $u \leftarrow \vert a \vert, v \leftarrow \vert b \vert$ \\
+5.  $k \leftarrow 0$ \\
+6.  While $u.used > 0$ and $v.used > 0$ and $u_0 \equiv v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}6.2  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.3  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+7.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+8.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}8.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+9.  While $v.used > 0$ \\
+\hspace{3mm}9.1  If $\vert u \vert > \vert v \vert$ then \\
+\hspace{6mm}9.1.1  Swap $u$ and $v$. \\
+\hspace{3mm}9.2  $v \leftarrow \vert v \vert - \vert u \vert$ \\
+\hspace{3mm}9.3  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{6mm}9.3.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+10.  $c \leftarrow u \cdot 2^k$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_gcd}
+\end{figure}
+\textbf{Algorithm mp\_gcd.}
+This algorithm will produce the greatest common divisor of two mp\_ints $a$ and $b$.  The algorithm was originally based on Algorithm B of
+Knuth \cite[pp. 338]{TAOCPV2} but has been modified to be simpler to explain.  In theory it achieves the same asymptotic working time as
+Algorithm B and in practice this appears to be true.
+
+The first three steps handle the cases where either one or both of the inputs are zero.
If either input is zero the greatest common divisor is the
+largest input or zero if they are both zero.  If the inputs are not trivial then $u$ and $v$ are assigned the absolute values of
+$a$ and $b$ respectively and the algorithm will proceed to reduce the pair.
+
+Step six will divide out any common factors of two and keep track of the count in the variable $k$.  After this step two is no longer a
+factor of the remaining greatest common divisor between $u$ and $v$ and can safely be divided out of either value whenever it is even.  Steps
+seven and eight ensure that $u$ and $v$ respectively have no remaining factors of two.  At most only one of the while loops will iterate since
+they cannot both be even.
+
+By step nine both of $u$ and $v$ are odd which is required for the inner logic.  First the pair are swapped such that $v$ is equal to
+or greater than $u$.  This ensures that the subtraction on step 9.2 will always produce a positive and even result.  Step 9.3 removes any
+factors of two from the difference $v$ to ensure that in the next iteration of the loop both are once again odd.
+
+After $v = 0$ occurs the variable $u$ holds the greatest common divisor of the pair $\left < u, v \right >$ as it stood just after step six.  The result
+must be adjusted by multiplying by the common factors of two ($2^k$) removed earlier.
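+
+As a small worked example (chosen to match the least common multiple example later in this chapter), consider $a = 12$ and $b = 126$.  The
+largest power of two dividing both is $2^1$, so $k = 1$ and the pair becomes $\left < 6, 63 \right >$.  Dividing the remaining factor of two
+out of $u$ gives $\left < 3, 63 \right >$.  The main loop then computes $63 - 3 = 60$ which reduces to $15$, then $15 - 3 = 12$ which reduces
+to $3$, and finally $3 - 3 = 0$.  The result is $u \cdot 2^k = 3 \cdot 2 = 6 = (12, 126)$.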
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_gcd.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* Greatest Common Divisor using the binary method */
+018   int mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     mp_int  u, v;
+021     int     k, u_lsb, v_lsb, res;
+022
+023     /* either zero than gcd is the largest */
+024     if (mp_iszero (a) == 1 && mp_iszero (b) == 0) \{
+025       return mp_abs (b, c);
+026     \}
+027     if (mp_iszero (a) == 0 && mp_iszero (b) == 1) \{
+028       return mp_abs (a, c);
+029     \}
+030
+031     /* optimized.  At this point if a == 0 then
+032      * b must equal zero too
+033      */
+034     if (mp_iszero (a) == 1) \{
+035       mp_zero(c);
+036       return MP_OKAY;
+037     \}
+038
+039     /* get copies of a and b we can modify */
+040     if ((res = mp_init_copy (&u, a)) != MP_OKAY) \{
+041       return res;
+042     \}
+043
+044     if ((res = mp_init_copy (&v, b)) != MP_OKAY) \{
+045       goto __U;
+046     \}
+047
+048     /* must be positive for the remainder of the algorithm */
+049     u.sign = v.sign = MP_ZPOS;
+050
+051     /* B1.  Find the common power of two for u and v */
+052     u_lsb = mp_cnt_lsb(&u);
+053     v_lsb = mp_cnt_lsb(&v);
+054     k     = MIN(u_lsb, v_lsb);
+055
+056     if (k > 0) \{
+057        /* divide the power of two out */
+058        if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) \{
+059           goto __V;
+060        \}
+061
+062        if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) \{
+063           goto __V;
+064        \}
+065     \}
+066
+067     /* divide any remaining factors of two out */
+068     if (u_lsb != k) \{
+069        if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) \{
+070           goto __V;
+071        \}
+072     \}
+073
+074     if (v_lsb != k) \{
+075        if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) \{
+076           goto __V;
+077        \}
+078     \}
+079
+080     while (mp_iszero(&v) == 0) \{
+081        /* make sure v is the largest */
+082        if (mp_cmp_mag(&u, &v) == MP_GT) \{
+083           /* swap u and v to make sure v is >= u */
+084           mp_exch(&u, &v);
+085        \}
+086
+087        /* subtract smallest from largest */
+088        if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) \{
+089           goto __V;
+090        \}
+091
+092        /* Divide out all factors of two */
+093        if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) \{
+094           goto __V;
+095        \}
+096     \}
+097
+098     /* multiply by 2**k which we divided out at the beginning */
+099     if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) \{
+100        goto __V;
+101     \}
+102     c->sign = MP_ZPOS;
+103     res = MP_OKAY;
+104   __V:mp_clear (&u);
+105   __U:mp_clear (&v);
+106     return res;
+107   \}
+\end{alltt}
+\end{small}
+
+This function makes use of the macro mp\_iszero which evaluates to $1$ if the input mp\_int is equivalent to the integer zero and otherwise
+evaluates to $0$.  A related macro, mp\_iseven, evaluates to $1$ if the input mp\_int represents a non-zero even integer and otherwise
+evaluates to $0$.  Note that just because mp\_iseven may evaluate to $0$ does not mean the input is odd, it could also be zero.  The three
+trivial cases of inputs are handled on lines 24 through 37.  After those lines the inputs are assumed to be non-zero.
+
+Lines 40 and 44 make local copies $u$ and $v$ of the inputs $a$ and $b$ respectively.  At this point the common factors of two
+must be divided out of the two inputs.  Lines 52 through 54 count the factors of two in each value with mp\_cnt\_lsb and set the local
+integer $k$ to the number of factors they have in common.  Lines 56 through 65 divide the common power of two out of both values.  It is
+assumed that the number of factors will not exceed the maximum value of a C ``int'' data type\footnote{Strictly speaking no array in C may
+have more entries than are addressable by an ``int'' so this is not a limitation.}.
+
+At this point there are no more common factors of two in the two values.  Lines 68 through 78 remove any independent
+factors of two such that both $u$ and $v$ are guaranteed to be odd before hitting the main body of the algorithm.  The while loop
+on line 80 performs the reduction of the pair until $v$ is equal to zero.  The unsigned comparison and subtraction algorithms are used in
+place of the full signed routines since both values are guaranteed to be positive and the result of the subtraction is guaranteed to be non-negative.
+
+\section{Least Common Multiple}
+The least common multiple of a pair of integers is their product divided by their greatest common divisor.  For two integers $a$ and $b$ the
+least common multiple is normally denoted as $[ a, b ]$ and numerically equivalent to ${ab} \over {(a, b)}$.  For example, if
+$a = 2 \cdot 2 \cdot 3 = 12$ and $b = 2 \cdot 3 \cdot 3 \cdot 7 = 126$ the least common multiple is
+${{12 \cdot 126} \over {(12, 126)}} = {1512 \over 6} = 252$.
+
+The least common multiple arises often in coding theory as well as number theory.  If two functions have periods of $a$ and $b$ respectively they will
+collide, that is be in synchronous states, after only $[ a, b ]$ iterations.  This is why, for example, random number generators based on
+Linear Feedback Shift Registers (LFSR) tend to use registers with periods which are co-prime (\textit{i.e., the greatest common divisor is one}).
+Similarly in number theory if a composite $n$ has two prime factors $p$ and $q$ then the maximal order of any unit of $\Z/n\Z$ will be $[ p - 1, q - 1 ]$.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lcm}. \\
+\textbf{Input}.   mp\_int $a$ and $b$ \\
+\textbf{Output}.  The least common multiple $c = [a, b]$. \\
+\hline \\
+1.  $c \leftarrow (a, b)$ \\
+2.  $t \leftarrow a \cdot b$ \\
+3.  $c \leftarrow \lfloor t / c \rfloor$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lcm}
+\end{figure}
+\textbf{Algorithm mp\_lcm.}
+This algorithm computes the least common multiple of two mp\_int inputs $a$ and $b$.  It computes the least common multiple directly by
+dividing the product of the two inputs by their greatest common divisor.  The implementation below avoids computing the full product by first
+dividing the smaller of the two inputs by the greatest common divisor and then multiplying the quotient by the other input.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_lcm.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* computes least common multiple as |a*b|/(a, b) */
+018   int mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     int     res;
+021     mp_int  t1, t2;
+022
+023
+024     if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) \{
+025       return res;
+026     \}
+027
+028     /* t1 = get the GCD of the two inputs */
+029     if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) \{
+030       goto __T;
+031     \}
+032
+033     /* divide the smallest by the GCD */
+034     if (mp_cmp_mag(a, b) == MP_LT) \{
+035        /* store quotient in t2 such that t2 * b is the LCM */
+036        if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) \{
+037           goto __T;
+038        \}
+039        res = mp_mul(b, &t2, c);
+040     \} else \{
+041        /* store quotient in t2 such that t2 * a is the LCM */
+042        if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) \{
+043           goto __T;
+044        \}
+045        res = mp_mul(a, &t2, c);
+046     \}
+047
+048     /* fix the sign to positive */
+049     c->sign = MP_ZPOS;
+050
+051   __T:
+052     mp_clear_multi (&t1, &t2, NULL);
+053     return res;
+054   \}
+\end{alltt}
+\end{small}
+
+\section{Jacobi Symbol Computation}
+To explain the Jacobi symbol we shall first discuss the Legendre symbol off of which the Jacobi symbol is defined.  The Legendre symbol
+indicates whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
+equivalent to equation \ref{eqn:legendre}.
+
+\begin{equation}
+a^{(p-1)/2} \equiv \begin{array}{rl}
+                              -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
+                              0  &  \mbox{if }p\mbox{ divides }a\mbox{.} \\
+                              1  &  \mbox{if }a\mbox{ is a quadratic residue}.
+\end{array} \mbox{ (mod }p\mbox{)}
+\label{eqn:legendre}
+\end{equation}
+
+\textbf{Proof.}  \textit{Equation \ref{eqn:legendre} correctly identifies the residue status of an integer $a$ modulo a prime $p$.}
+An integer $a$ is a quadratic residue if the following equation has a solution.
+
+\begin{equation}
+x^2 \equiv a \mbox{ (mod }p\mbox{)}
+\label{eqn:root}
+\end{equation}
+
+Consider the following equation.
+
+\begin{equation}
+0 \equiv x^{p-1} - 1 \equiv \left \lbrace \left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \right \rbrace + \left ( a^{(p-1)/2} - 1 \right ) \mbox{ (mod }p\mbox{)}
+\label{eqn:rooti}
+\end{equation}
+
+Whether or not equation \ref{eqn:root} has a solution, equation \ref{eqn:rooti} is always true.  If $a^{(p-1)/2} - 1 \equiv 0 \mbox{ (mod }p\mbox{)}$
+then the quantity in the braces must be zero.  By reduction,
+
+\begin{eqnarray}
+\left (x^2 \right )^{(p-1)/2} - a^{(p-1)/2} \equiv 0 \nonumber \\
+\left (x^2 \right )^{(p-1)/2} \equiv a^{(p-1)/2} \nonumber \\
+x^2 \equiv a \mbox{ (mod }p\mbox{)}
+\end{eqnarray}
+
+As a result there must be a solution to the quadratic equation and in turn $a$ must be a quadratic residue.  If $p$ does not divide $a$ and $a$
+is not a quadratic residue then the only other value $a^{(p-1)/2}$ may be congruent to is $-1$ since
+\begin{equation}
+0 \equiv a^{p - 1} - 1 \equiv (a^{(p-1)/2} + 1)(a^{(p-1)/2} - 1) \mbox{ (mod }p\mbox{)}
+\end{equation}
+One of the terms on the right hand side must be zero.  \textbf{QED}
+
+\subsection{Jacobi Symbol}
+The Jacobi symbol is a generalization of the Legendre symbol for any odd modulus $p$ greater than two, prime or not.  If $p = \prod_{i=0}^n p_i$ then
+the Jacobi symbol $\left ( { a \over p } \right )$ is equal to the following equation.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { a \over p_0} \right ) \left ( { a \over p_1} \right ) \ldots \left ( { a \over p_n} \right )
+\end{equation}
+
+By inspection if $p$ is prime the Jacobi symbol is equivalent to the Legendre symbol.  The following facts\footnote{See HAC \cite[pp. 72-74]{HAC} for
+further details.} will be used to derive an efficient Jacobi symbol algorithm.  Where $p$ is an odd integer greater than two and $a, b \in \Z$ the
+following are true.
+
+\begin{enumerate}
+\item $\left ( { a \over p} \right )$ equals $-1$, $0$ or $1$.
+\item $\left ( { ab \over p} \right ) = \left ( { a \over p} \right )\left ( { b \over p} \right )$.
+\item If $a \equiv b \mbox{ (mod }p\mbox{)}$ then $\left ( { a \over p} \right ) = \left ( { b \over p} \right )$.
+\item $\left ( { 2 \over p} \right )$ equals $1$ if $p \equiv 1$ or $7 \mbox{ (mod }8\mbox{)}$.  Otherwise, it equals $-1$.
+\item $\left ( { a \over p} \right ) \equiv \left ( { p \over a} \right ) \cdot (-1)^{(p-1)(a-1)/4}$.  More specifically
+$\left ( { a \over p} \right ) = \left ( { p \over a} \right )$ if $p \equiv a \equiv 1 \mbox{ (mod }4\mbox{)}$.
+\end{enumerate}
+
+Using these facts if $a = 2^k \cdot a'$ then
+
+\begin{eqnarray}
+\left ( { a \over p } \right ) = \left ( {{2^k} \over p } \right ) \left ( {a' \over p} \right ) \nonumber \\
+                               = \left ( {2 \over p } \right )^k \left ( {a' \over p} \right )
+\label{eqn:jacobi}
+\end{eqnarray}
+
+By fact five,
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { p \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
+\end{equation}
+
+Subsequently by fact three since $p \equiv (p \mbox{ mod }a) \mbox{ (mod }a\mbox{)}$ then
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( { {p \mbox{ mod } a} \over a } \right ) \cdot (-1)^{(p-1)(a-1)/4}
+\end{equation}
+
+By putting both observations into equation \ref{eqn:jacobi} the following simplified equation is formed.
+
+\begin{equation}
+\left ( { a \over p } \right ) = \left ( {2 \over p } \right )^k \left ( {{p\mbox{ mod }a'} \over a'} \right ) \cdot (-1)^{(p-1)(a'-1)/4}
+\end{equation}
+
+The value of $\left ( {{p \mbox{ mod }a'} \over a'} \right )$ can be found by using the same equation recursively.
The value of
+$\left ( {2 \over p } \right )^k$ equals $1$ if $k$ is even otherwise it equals $\left ( {2 \over p } \right )$.  Using this approach the
+factors of $p$ do not have to be known.  Furthermore, if $(a, p) = 1$ then the algorithm will terminate when the recursion requests the
+Jacobi symbol computation of $\left ( {1 \over a'} \right )$ which is simply $1$.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_jacobi}. \\
+\textbf{Input}.   mp\_int $a$ and $p$, $a \ge 0$, $p \ge 3$, $p \equiv 1 \mbox{ (mod }2\mbox{)}$ \\
+\textbf{Output}.  The Jacobi symbol $c = \left ( {a \over p } \right )$. \\
+\hline \\
+1.  If $a = 0$ then \\
+\hspace{3mm}1.1  $c \leftarrow 0$ \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $a = 1$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+\hspace{3mm}2.2  Return(\textit{MP\_OKAY}). \\
+3.  $a' \leftarrow a$ \\
+4.  $k \leftarrow 0$ \\
+5.  While $a'.used > 0$ and $a'_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}5.1  $k \leftarrow k + 1$ \\
+\hspace{3mm}5.2  $a' \leftarrow \lfloor a' / 2 \rfloor$ \\
+6.  If $k \equiv 0 \mbox{ (mod }2\mbox{)}$ then \\
+\hspace{3mm}6.1  $s \leftarrow 1$ \\
+7.  else \\
+\hspace{3mm}7.1  $r \leftarrow p_0 \mbox{ (mod }8\mbox{)}$ \\
+\hspace{3mm}7.2  If $r = 1$ or $r = 7$ then \\
+\hspace{6mm}7.2.1  $s \leftarrow 1$ \\
+\hspace{3mm}7.3  else \\
+\hspace{6mm}7.3.1  $s \leftarrow -1$ \\
+8.  If $p_0 \equiv a'_0 \equiv 3 \mbox{ (mod }4\mbox{)}$ then \\
+\hspace{3mm}8.1  $s \leftarrow -s$ \\
+9.  If $a' \ne 1$ then \\
+\hspace{3mm}9.1  $p' \leftarrow p \mbox{ (mod }a'\mbox{)}$ \\
+\hspace{3mm}9.2  $s \leftarrow s \cdot \mbox{mp\_jacobi}(p', a')$ \\
+10.  $c \leftarrow s$ \\
+11.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_jacobi}
+\end{figure}
+\textbf{Algorithm mp\_jacobi.}
+This algorithm computes the Jacobi symbol for an arbitrary positive integer $a$ with respect to an odd integer $p$ greater than three.  The algorithm
+is based on algorithm 2.149 of HAC \cite[pp. 73]{HAC}.
+
+Step numbers one and two handle the trivial cases of $a = 0$ and $a = 1$ respectively.  Step five determines the number of factors of two in the
+input $a$.  If $k$ is even then the term $\left ( { 2 \over p } \right )^k$ must always evaluate to one.  If $k$ is odd then the term evaluates to one
+if $p_0$ is congruent to one or seven modulo eight, otherwise it evaluates to $-1$.  For example, if $k$ is odd and $p = 15$ the term evaluates to
+one since $15 \equiv 7 \mbox{ (mod }8\mbox{)}$.  After the $\left ( { 2 \over p } \right )^k$ term is handled
+the $(-1)^{(p-1)(a'-1)/4}$ term is computed and multiplied against the current product $s$.  The latter term evaluates to negative one if both $p$ and $a'$
+are congruent to three modulo four, otherwise it evaluates to one.
+
+By step nine if $a'$ does not equal one a recursion is required.  Step 9.1 computes $p' \equiv p \mbox{ (mod }a'\mbox{)}$ and will recurse to compute
+$\left ( {p' \over a'} \right )$ which is multiplied against the current Jacobi product.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_jacobi.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* computes the jacobi c = (a | n) (or Legendre if n is prime)
+018    * HAC pp. 73 Algorithm 2.149
+019    */
+020   int mp_jacobi (mp_int * a, mp_int * p, int *c)
+021   \{
+022     mp_int  a1, p1;
+023     int     k, s, r, res;
+024     mp_digit residue;
+025
+026     /* if p <= 0 return MP_VAL */
+027     if (mp_cmp_d(p, 0) != MP_GT) \{
+028        return MP_VAL;
+029     \}
+030
+031     /* step 1.  if a == 0, return 0 */
+032     if (mp_iszero (a) == 1) \{
+033        *c = 0;
+034        return MP_OKAY;
+035     \}
+036
+037     /* step 2.  if a == 1, return 1 */
+038     if (mp_cmp_d (a, 1) == MP_EQ) \{
+039       *c = 1;
+040       return MP_OKAY;
+041     \}
+042
+043     /* default */
+044     s = 0;
+045
+046     /* step 3.  write a = a1 * 2**k  */
+047     if ((res = mp_init_copy (&a1, a)) != MP_OKAY) \{
+048       return res;
+049     \}
+050
+051     if ((res = mp_init (&p1)) != MP_OKAY) \{
+052       goto __A1;
+053     \}
+054
+055     /* divide out larger power of two */
+056     k = mp_cnt_lsb(&a1);
+057     if ((res = mp_div_2d(&a1, k, &a1, NULL)) != MP_OKAY) \{
+058        goto __P1;
+059     \}
+060
+061     /* step 4.  if e is even set s=1 */
+062     if ((k & 1) == 0) \{
+063       s = 1;
+064     \} else \{
+065       /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+066       residue = p->dp[0] & 7;
+067
+068       if (residue == 1 || residue == 7) \{
+069         s = 1;
+070       \} else if (residue == 3 || residue == 5) \{
+071         s = -1;
+072       \}
+073     \}
+074
+075     /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+076     if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) \{
+077       s = -s;
+078     \}
+079
+080     /* if a1 == 1 we're done */
+081     if (mp_cmp_d (&a1, 1) == MP_EQ) \{
+082       *c = s;
+083     \} else \{
+084       /* n1 = n mod a1 */
+085       if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) \{
+086          goto __P1;
+087       \}
+088       if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) \{
+089          goto __P1;
+090       \}
+091       *c = s * r;
+092     \}
+093
+094     /* done */
+095     res = MP_OKAY;
+096   __P1:mp_clear (&p1);
+097   __A1:mp_clear (&a1);
+098     return res;
+099   \}
+\end{alltt}
+\end{small}
+
+As a matter of practicality the variable $a'$ as per the pseudo-code is represented by the variable $a1$ since the $'$ symbol is not a valid
+character for a C variable name.
+
+The two simple cases of $a = 0$ and $a = 1$ are handled at the very beginning to simplify the algorithm.  If the input is non-trivial the algorithm
+has to proceed to compute the Jacobi symbol.  The variable $s$ is used to hold the current Jacobi product.  Note that $s$ is a C ``int'' data type since
+the values it may obtain are merely $-1$, $0$ and $1$.
+
+After a local copy of $a$ is made all of the factors of two are divided out and the total is stored in $k$.  Technically only the least significant
+bit of $k$ is required since the term $\left ( { 2 \over p } \right )^k$ depends only on whether $k$ is even or odd.
+
+Lines 61 through 73 determine the value of $\left ( { 2 \over p } \right )^k$.  If the least significant bit of $k$ is zero then
+$k$ is even and the value is one.  Otherwise, the value of $s$ depends on which residue class $p$ belongs to modulo eight.  The value of
+$(-1)^{(p-1)(a'-1)/4}$ is computed and multiplied against $s$ on lines 75 through 78.
+
+Finally, if $a1$ does not equal one the algorithm must recurse and compute $\left ( {p' \over a'} \right )$.
+
+\section{Modular Inverse}
+\label{sec:modinv}
+The modular inverse of a number actually refers to the modular multiplicative inverse.  Essentially for any integer $a$ such that $(a, p) = 1$ there
+exists another integer $b$ such that $ab \equiv 1 \mbox{ (mod }p\mbox{)}$.  The integer $b$ is called the multiplicative inverse of $a$ which is
+denoted as $b = a^{-1}$.  Technically speaking modular inversion is a well defined operation for any finite ring or field, not just for rings and
+fields of integers.  However, the latter will be the matter of discussion.
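+
+For example, the multiplicative inverse of $3$ modulo $7$ is $5$ since $3 \cdot 5 = 15 \equiv 1 \mbox{ (mod }7\mbox{)}$.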
+
+The simplest approach is to compute the algebraic inverse of the input.  That is to compute $b \equiv a^{\Phi(p) - 1} \mbox{ (mod }p\mbox{)}$.
+If $\Phi(p)$ is the order of the multiplicative subgroup modulo $p$ then $b$ must be the multiplicative inverse of $a$.  The proof of which is trivial.
+
+\begin{equation}
+ab \equiv a \left (a^{\Phi(p) - 1} \right ) \equiv a^{\Phi(p)} \equiv a^0 \equiv 1 \mbox{ (mod }p\mbox{)}
+\end{equation}
+
+However, as simple as this approach may be it has two serious flaws.  It requires that the value of $\Phi(p)$ be known, which if $p$ is composite
+requires all of the prime factors.  This approach is also very slow as the size of $p$ grows.
+
+A simpler approach is based on the observation that solving for the multiplicative inverse is equivalent to solving the linear
+Diophantine\footnote{See LeVeque \cite[pp. 40-43]{LeVeque} for more information.} equation.
+
+\begin{equation}
+ab + pq = 1
+\end{equation}
+
+Where $a$, $b$, $p$ and $q$ are all integers.  If such a pair of integers $ \left < b, q \right >$ exists then $b$ is the multiplicative inverse of
+$a$ modulo $p$.  The extended Euclidean algorithm (Knuth \cite[pp. 342]{TAOCPV2}) can be used to solve such equations provided $(a, p) = 1$.
+However, instead of using that algorithm directly a variant known as the binary Extended Euclidean algorithm will be used in its place.  The
+binary approach is very similar to the binary greatest common divisor algorithm except it will produce a full solution to the Diophantine
+equation.
+
+\subsection{General Case}
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_invmod}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $(a, b) = 1$, $b \ge 2$, $0 < a < b$. \\
+\textbf{Output}.  The modular inverse $c \equiv a^{-1} \mbox{ (mod }b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_VAL}). \\
+2.  If $b_0 \equiv 1 \mbox{ (mod }2\mbox{)}$ then use algorithm fast\_mp\_invmod. \\
+3.  $x \leftarrow \vert a \vert, y \leftarrow b$ \\
+4.  If $x_0 \equiv y_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ then return(\textit{MP\_VAL}). \\
+5.  $u \leftarrow x, v \leftarrow y, A \leftarrow 1, B \leftarrow 0, C \leftarrow 0, D \leftarrow 1$ \\
+6.  While $u.used > 0$ and $u_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}6.1  $u \leftarrow \lfloor u / 2 \rfloor$ \\
+\hspace{3mm}6.2  If ($A.used > 0$ and $A_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($B.used > 0$ and $B_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}6.2.1  $A \leftarrow A + y$ \\
+\hspace{6mm}6.2.2  $B \leftarrow B - x$ \\
+\hspace{3mm}6.3  $A \leftarrow \lfloor A / 2 \rfloor$ \\
+\hspace{3mm}6.4  $B \leftarrow \lfloor B / 2 \rfloor$ \\
+7.  While $v.used > 0$ and $v_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}7.1  $v \leftarrow \lfloor v / 2 \rfloor$ \\
+\hspace{3mm}7.2  If ($C.used > 0$ and $C_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) or ($D.used > 0$ and $D_0 \equiv 1 \mbox{ (mod }2\mbox{)}$) then \\
+\hspace{6mm}7.2.1  $C \leftarrow C + y$ \\
+\hspace{6mm}7.2.2  $D \leftarrow D - x$ \\
+\hspace{3mm}7.3  $C \leftarrow \lfloor C / 2 \rfloor$ \\
+\hspace{3mm}7.4  $D \leftarrow \lfloor D / 2 \rfloor$ \\
+8.  If $u \ge v$ then \\
+\hspace{3mm}8.1  $u \leftarrow u - v$ \\
+\hspace{3mm}8.2  $A \leftarrow A - C$ \\
+\hspace{3mm}8.3  $B \leftarrow B - D$ \\
+9.  else \\
+\hspace{3mm}9.1  $v \leftarrow v - u$ \\
+\hspace{3mm}9.2  $C \leftarrow C - A$ \\
+\hspace{3mm}9.3  $D \leftarrow D - B$ \\
+10.  If $u \ne 0$ goto step 6. \\
+11.  If $v \ne 1$ return(\textit{MP\_VAL}). \\
+12.  While $C \le 0$ do \\
+\hspace{3mm}12.1  $C \leftarrow C + b$ \\
+13.  While $C \ge b$ do \\
+\hspace{3mm}13.1  $C \leftarrow C - b$ \\
+14.  $c \leftarrow C$ \\
+15.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\end{figure}
+\textbf{Algorithm mp\_invmod.}
+This algorithm computes the modular multiplicative inverse of an integer $a$ modulo an integer $b$.  This algorithm is a variation of the
+extended binary Euclidean algorithm from HAC \cite[pp. 608]{HAC}.  It has been modified to only compute the modular inverse and not a complete
+Diophantine solution.
+
+If $b \le 0$ then the modulus is invalid and MP\_VAL is returned.  Similarly if both $a$ and $b$ are even then there cannot be a multiplicative
+inverse for $a$ and the error is reported.
+
+The astute reader will observe that steps six through nine are very similar to the binary greatest common divisor algorithm mp\_gcd.  In this case
+the other variables of the Diophantine equation are solved as well.  The algorithm terminates when $u = 0$ in which case the solution is
+
+\begin{equation}
+Ca + Db = v
+\end{equation}
+
+If $v$, the greatest common divisor of $a$ and $b$, is not equal to one then the algorithm will report an error as no inverse exists.  Otherwise, $C$
+is the modular inverse of $a$.  The actual value of $C$ is congruent to, but not necessarily equal to, the ideal modular inverse which should lie
+within $1 \le a^{-1} < b$.  Step numbers twelve and thirteen adjust the inverse until it is in range.  If the original input $a$ is within $0 < a < b$
+then only a couple of additions or subtractions will be required to adjust the inverse.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_invmod.c
+\vspace{-3mm}
+\begin{alltt}
+016
+017   /* hac 14.61, pp608 */
+018   int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+019   \{
+020     mp_int  x, y, u, v, A, B, C, D;
+021     int     res;
+022
+023     /* b cannot be negative */
+024     if (b->sign == MP_NEG || mp_iszero(b) == 1) \{
+025       return MP_VAL;
+026     \}
+027
+028     /* if the modulus is odd we can use a faster routine instead */
+029     if (mp_isodd (b) == 1) \{
+030       return fast_mp_invmod (a, b, c);
+031     \}
+032
+033     /* init temps */
+034     if ((res = mp_init_multi(&x, &y, &u, &v,
+035                              &A, &B, &C, &D, NULL)) != MP_OKAY) \{
+036        return res;
+037     \}
+038
+039     /* x = a, y = b */
+040     if ((res = mp_copy (a, &x)) != MP_OKAY) \{
+041       goto __ERR;
+042     \}
+043     if ((res = mp_copy (b, &y)) != MP_OKAY) \{
+044       goto __ERR;
+045     \}
+046
+047     /* 2. [modified] if x,y are both even then return an error! */
+048     if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) \{
+049       res = MP_VAL;
+050       goto __ERR;
+051     \}
+052
+053     /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+054     if ((res = mp_copy (&x, &u)) != MP_OKAY) \{
+055       goto __ERR;
+056     \}
+057     if ((res = mp_copy (&y, &v)) != MP_OKAY) \{
+058       goto __ERR;
+059     \}
+060     mp_set (&A, 1);
+061     mp_set (&D, 1);
+062
+063   top:
+064     /* 4.  while u is even do */
+065     while (mp_iseven (&u) == 1) \{
+066       /* 4.1 u = u/2 */
+067       if ((res = mp_div_2 (&u, &u)) != MP_OKAY) \{
+068         goto __ERR;
+069       \}
+070       /* 4.2 if A or B is odd then */
+071       if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) \{
+072         /* A = (A+y)/2, B = (B-x)/2 */
+073         if ((res = mp_add (&A, &y, &A)) != MP_OKAY) \{
+074            goto __ERR;
+075         \}
+076         if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) \{
+077            goto __ERR;
+078         \}
+079       \}
+080       /* A = A/2, B = B/2 */
+081       if ((res = mp_div_2 (&A, &A)) != MP_OKAY) \{
+082         goto __ERR;
+083       \}
+084       if ((res = mp_div_2 (&B, &B)) != MP_OKAY) \{
+085         goto __ERR;
+086       \}
+087     \}
+088
+089     /* 5.  while v is even do */
+090     while (mp_iseven (&v) == 1) \{
+091       /* 5.1 v = v/2 */
+092       if ((res = mp_div_2 (&v, &v)) != MP_OKAY) \{
+093         goto __ERR;
+094       \}
+095       /* 5.2 if C or D is odd then */
+096       if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) \{
+097         /* C = (C+y)/2, D = (D-x)/2 */
+098         if ((res = mp_add (&C, &y, &C)) != MP_OKAY) \{
+099            goto __ERR;
+100         \}
+101         if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) \{
+102            goto __ERR;
+103         \}
+104       \}
+105       /* C = C/2, D = D/2 */
+106       if ((res = mp_div_2 (&C, &C)) != MP_OKAY) \{
+107         goto __ERR;
+108       \}
+109       if ((res = mp_div_2 (&D, &D)) != MP_OKAY) \{
+110         goto __ERR;
+111       \}
+112     \}
+113
+114     /* 6.  if u >= v then */
+115     if (mp_cmp (&u, &v) != MP_LT) \{
+116       /* u = u - v, A = A - C, B = B - D */
+117       if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) \{
+118         goto __ERR;
+119       \}
+120
+121       if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) \{
+122         goto __ERR;
+123       \}
+124
+125       if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) \{
+126         goto __ERR;
+127       \}
+128     \} else \{
+129       /* v - v - u, C = C - A, D = D - B */
+130       if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) \{
+131         goto __ERR;
+132       \}
+133
+134       if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) \{
+135         goto __ERR;
+136       \}
+137
+138       if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) \{
+139         goto __ERR;
+140       \}
+141     \}
+142
+143     /* if not zero goto step 4 */
+144     if (mp_iszero (&u) == 0)
+145       goto top;
+146
+147     /* now a = C, b = D, gcd == g*v */
+148
+149     /* if v != 1 then there is no inverse */
+150     if (mp_cmp_d (&v, 1) != MP_EQ) \{
+151       res = MP_VAL;
+152       goto __ERR;
+153     \}
+154
+155     /* if its too low */
+156     while (mp_cmp_d(&C, 0) == MP_LT) \{
+157        if ((res = mp_add(&C, b, &C)) != MP_OKAY) \{
+158           goto __ERR;
+159        \}
+160     \}
+161
+162     /* too big */
+163     while (mp_cmp_mag(&C, b) != MP_LT) \{
+164        if ((res = mp_sub(&C, b, &C)) != MP_OKAY) \{
+165           goto __ERR;
+166        \}
+167     \}
+168
+169     /* C is now the inverse */
+170     mp_exch (&C, c);
+171     res = MP_OKAY;
+172   __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+173     return res;
+174   \}
+\end{alltt}
+\end{small}
+
+\subsubsection{Odd Moduli}
+
+When the modulus $b$ is odd the variables $A$ and $C$ are fixed and are not required to compute the inverse.  In particular by attempting to solve
+the Diophantine $Cb + Da = 1$ only $B$ and $D$ are required to find the inverse of $a$.
+
+The algorithm fast\_mp\_invmod is a direct adaptation of algorithm mp\_invmod with all steps involving either $A$ or $C$ removed.  This
+optimization will halve the time required to compute the modular inverse.
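+
+As a usage sketch (this fragment is not part of the library sources and the values are arbitrary), computing $3^{-1} \mbox{ (mod }7\mbox{)}$:
+
+\begin{small}
+\begin{alltt}
+mp_int a, b, c;
+int    res;
+
+if ((res = mp_init_multi(&a, &b, &c, NULL)) != MP_OKAY) \{
+   return res;
+\}
+mp_set(&a, 3);                /* a = 3                      */
+mp_set(&b, 7);                /* b = 7, odd, so the fast    */
+                              /* variant is used internally */
+if ((res = mp_invmod(&a, &b, &c)) != MP_OKAY) \{
+   return res;
+\}
+/* c now holds 5 since 3 * 5 = 15 = 1 (mod 7) */
+mp_clear_multi(&a, &b, &c, NULL);
+\end{alltt}
+\end{small}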
+
+\section{Primality Tests}
+
+A non-zero integer $a$ is said to be prime if it is not evenly divisible by any integer other than one and itself.  For example, $a = 7$ is prime
+since the integers $2 \ldots 6$ do not evenly divide $a$.  By contrast, $a = 6$ is not prime since $a = 6 = 2 \cdot 3$.
+
+Prime numbers arise frequently in cryptography since they allow finite fields to be formed.  The ability to quickly determine whether an integer
+is prime has been an active subject of cryptography and number theory for a considerable time.  The algorithms that will be presented are all
+probabilistic algorithms: when they report an integer is composite it must be composite, but when they report an integer is prime the report may
+be incorrect.
+
+As will be discussed, it is possible to limit the probability of error so well that for practical purposes the probability of error might as
+well be zero.  For the purposes of these discussions let $n$ represent the candidate integer whose primality is in question.
+
+\subsection{Trial Division}
+
+Trial division means to attempt to evenly divide a candidate integer by small prime integers.  If the candidate can be evenly divided it obviously
+cannot be prime.  By dividing by all primes $1 < p \le \sqrt{n}$ this test can actually prove whether an integer is prime.  However, such a test
+would require a prohibitive amount of time as $n$ grows.
+
+Instead of dividing by every prime, a smaller, more manageable set of primes may be used instead.  By performing trial division with only a subset
+of the primes less than $\sqrt{n} + 1$ the algorithm cannot prove that a candidate is prime.  However, it can often prove that a candidate is not
+prime.
+
+The benefit of this test is that trial division by small values is fairly efficient, especially compared to the other algorithms that will be
+discussed shortly.  The probability that this approach correctly identifies a composite candidate when tested with all primes up to $q$ is given by
+$1 - {1.12 \over \ln(q)}$.  The graph (\ref{pic:primality}, will be added later) demonstrates the probability of success for the range
+$3 \le q \le 100$.
+
+At approximately $q = 30$ the gain of performing further tests diminishes fairly quickly.  At $q = 90$ further testing is generally not going to
+be of any practical use.  In the case of LibTomMath the default limit $q = 256$ was chosen since it is not too high and will eliminate
+approximately $80\%$ of all candidate integers (by the estimate above, $1 - 1.12/\ln(256) \approx 0.8$).  The constant \textbf{PRIME\_SIZE} is
+equal to the number of primes in the test base.  The array \_\_prime\_tab is an array of the first \textbf{PRIME\_SIZE} prime numbers.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_is\_divisible}. \\
+\textbf{Input}.   mp\_int $n$ \\
+\textbf{Output}.  $c = 1$ if $n$ is divisible by a small prime, otherwise $c = 0$. \\
+\hline \\
+1.  For $ix$ from $0$ to $PRIME\_SIZE - 1$ do \\
+\hspace{3mm}1.1  $d \leftarrow n \mbox{ (mod }\_\_prime\_tab_{ix}\mbox{)}$ \\
+\hspace{3mm}1.2  If $d = 0$ then \\
+\hspace{6mm}1.2.1  $c \leftarrow 1$ \\
+\hspace{6mm}1.2.2  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow 0$ \\
+3.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_is\_divisible}
+\end{figure}
+\textbf{Algorithm mp\_prime\_is\_divisible.}
+This algorithm attempts to determine if a candidate integer $n$ is composite by performing trial divisions.  For example, $n = 91 = 7 \cdot 13$
+would be caught since $91 \equiv 0 \mbox{ (mod }7\mbox{)}$.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_is\_divisible.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* determines if an integer is divisible by one 
+018    * of the first PRIME_SIZE primes or not
+019    *
+020    * sets result to 0 if not, 1 if yes
+021    */
+022   int mp_prime_is_divisible (mp_int * a, int *result)
+023   \{
+024     int      err, ix;
+025     mp_digit res;
+026   
+027     /* default to not */
+028     *result = MP_NO;
+029   
+030     for (ix = 0; ix < PRIME_SIZE; ix++) \{
+031       /* what is a mod __prime_tab[ix] */
+032       if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) \{
+033         return err;
+034       \}
+035   
+036       /* is the residue zero? */
+037       if (res == 0) \{
+038         *result = MP_YES;
+039         return MP_OKAY;
+040       \}
+041     \}
+042   
+043     return MP_OKAY;
+044   \}
+\end{alltt}
+\end{small}
+
+The algorithm defaults the result to $0$ (\textbf{MP\_NO}) so that a candidate is not falsely flagged if an error occurs.  The values in the prime
+table are all specified to be within the range of an mp\_digit.  The table \_\_prime\_tab is defined in the following file.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_prime\_tab.c
+\vspace{-3mm}
+\begin{alltt}
+016   const mp_digit __prime_tab[] = \{
+017     0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+018     0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+019     0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+020     0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+021   #ifndef MP_8BIT
+022     0x0083,
+023     0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+024     0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+025     0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+026     0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+027   
+028     0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+029     0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+030     0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+031     0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+032     0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+033     0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+034     0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+035     0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+036   
+037     0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+038     0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+039     0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+040     0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+041     0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+042     0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+043     0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+044     0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+045   
+046     0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+047     0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+048     0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+049     0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+050     0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+051     0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+052     0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+053     0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+054   #endif
+055   \};
+\end{alltt}
+\end{small}
+
+Note that there are two possible tables.  When an mp\_digit is 7 bits long only the primes up to $127$ may be included, otherwise the primes
+up to $1619$ are used.  The value of \textbf{PRIME\_SIZE} is therefore a constant dependent on the size of an mp\_digit.
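+
+As a usage sketch (the routine below is a hypothetical driver, not part of the library), trial division is typically used to cheaply screen a
+candidate before the more expensive tests that follow are applied.
+
+\begin{small}
+\begin{alltt}
+#include <stdio.h>
+#include <tommath.h>
+
+/* hypothetical helper: report whether a small prime divides n */
+int screen_candidate(mp_int *n)
+\{
+   int err, divisible;
+
+   if ((err = mp_prime_is_divisible(n, &divisible)) != MP_OKAY) \{
+      return err;
+   \}
+
+   if (divisible == MP_YES) \{
+      /* a prime in __prime_tab divides n, so n is composite
+       * unless n itself happens to be one of the table entries */
+      puts("composite (by trial division)");
+   \} else \{
+      puts("no small factor found");
+   \}
+   return MP_OKAY;
+\}
+\end{alltt}
+\end{small}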
+
+\subsection{The Fermat Test}
+The Fermat test is probably one of the oldest tests to have a non-trivial probability of success.  It is based on the fact that if $n$ is in
+fact prime then $a^{n} \equiv a \mbox{ (mod }n\mbox{)}$ for all $0 < a < n$.  The reason is that if $n$ is prime then the order of
+the multiplicative group is $n - 1$.  Any base $a$ must have an order which divides $n - 1$ and as such $a^n$ is equivalent to
+$a^1 = a$.
+
+If $n$ is composite then any given base $a$ does not have to have an order which divides $n - 1$, in which case
+it is possible that $a^n \nequiv a \mbox{ (mod }n\mbox{)}$.  However, this test is not absolute: the order of a base may divide $n - 1$ even
+when $n$ is composite, in which case $n$ would be reported as prime.  Such a base yields what is known as a Fermat pseudo-prime.  Certain
+composite integers, known as Carmichael numbers, are pseudo-primes to all valid bases.  Fortunately such numbers are extremely rare as $n$ grows
+in size.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_fermat}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$. \\
+\textbf{Output}.  $c = 1$ if $b^a \equiv b \mbox{ (mod }a\mbox{)}$, otherwise $c = 0$. \\
+\hline \\
+1.  $t \leftarrow b^a \mbox{ (mod }a\mbox{)}$ \\
+2.  If $t = b$ then \\
+\hspace{3mm}2.1  $c \leftarrow 1$ \\
+3.  else \\
+\hspace{3mm}3.1  $c \leftarrow 0$ \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_fermat}
+\end{figure}
+\textbf{Algorithm mp\_prime\_fermat.}
+This algorithm determines whether an mp\_int $a$ passes the Fermat test to the base $b$ or not.  It uses a single modular exponentiation to
+determine the result.
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_fermat.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* performs one Fermat test.
+018    * 
+019    * If "a" were prime then b**a == b (mod a) since the order of
+020    * the multiplicative sub-group would be phi(a) = a-1.  That means
+021    * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
+022    *
+023    * Sets result to 1 if the congruence holds, or zero otherwise.
+024    */
+025   int mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+026   \{
+027     mp_int  t;
+028     int     err;
+029   
+030     /* default to composite  */
+031     *result = MP_NO;
+032   
+033     /* ensure b > 1 */
+034     if (mp_cmp_d(b, 1) != MP_GT) \{
+035        return MP_VAL;
+036     \}
+037   
+038     /* init t */
+039     if ((err = mp_init (&t)) != MP_OKAY) \{
+040       return err;
+041     \}
+042   
+043     /* compute t = b**a mod a */
+044     if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) \{
+045       goto __T;
+046     \}
+047   
+048     /* is it equal to b? */
+049     if (mp_cmp (&t, b) == MP_EQ) \{
+050       *result = MP_YES;
+051     \}
+052   
+053     err = MP_OKAY;
+054   __T:mp_clear (&t);
+055     return err;
+056   \}
+\end{alltt}
+\end{small}
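+
+A usage sketch follows (a hypothetical driver, not part of the library); it performs a single Fermat test of a candidate to the fixed base $2$.
+
+\begin{small}
+\begin{alltt}
+#include <tommath.h>
+
+/* hypothetical helper: one Fermat test of n to the base 2.
+ * On success *result is MP_YES if n passed the test or MP_NO
+ * if n is provably composite.                                */
+int fermat_base_2(mp_int *n, int *result)
+\{
+   mp_int b;
+   int err;
+
+   /* b = 2 */
+   if ((err = mp_init_set(&b, 2)) != MP_OKAY) \{
+      return err;
+   \}
+
+   err = mp_prime_fermat(n, &b, result);
+
+   mp_clear(&b);
+   return err;
+\}
+\end{alltt}
+\end{small}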
+
+\subsection{The Miller-Rabin Test}
+The Miller-Rabin test \cite{HAC} is another primality test which has tighter error bounds than the Fermat test, specifically with sequentially
+chosen candidate integers.  The algorithm is based on the observation that if $n$ is prime and $n - 1 = 2^kr$ with $r$ odd, then for any base $b$
+either $b^r \equiv \pm 1 \mbox{ (mod }n\mbox{)}$ or one of the first $k - 1$ squarings of $b^r$ must be congruent to $-1$.  The squarings are
+stopped as soon as $-1$ is observed.  If the value of $1$ is observed first it means that some value not congruent to $\pm 1$ when squared equals
+one, which cannot occur if $n$ is prime.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_prime\_miller\_rabin}. \\
+\textbf{Input}.   mp\_int $a$ and $b$, $a \ge 2$, $0 < b < a$. \\
+\textbf{Output}.  $c = 1$ if $a$ is a Miller-Rabin probable prime to the base $b$, otherwise $c = 0$. \\
+\hline
+1.  $a' \leftarrow a - 1$ \\
+2.  $r \leftarrow a'$ \\
+3.  $c \leftarrow 0, s \leftarrow 0$ \\
+4.  While $r.used > 0$ and $r_0 \equiv 0 \mbox{ (mod }2\mbox{)}$ \\
+\hspace{3mm}4.1  $s \leftarrow s + 1$ \\
+\hspace{3mm}4.2  $r \leftarrow \lfloor r / 2 \rfloor$ \\
+5.  $y \leftarrow b^r \mbox{ (mod }a\mbox{)}$ \\
+6.  If $y \nequiv \pm 1$ then \\
+\hspace{3mm}6.1  $j \leftarrow 1$ \\
+\hspace{3mm}6.2  While $j \le (s - 1)$ and $y \nequiv a'$ \\
+\hspace{6mm}6.2.1  $y \leftarrow y^2 \mbox{ (mod }a\mbox{)}$ \\
+\hspace{6mm}6.2.2  If $y = 1$ then goto step 8. \\
+\hspace{6mm}6.2.3  $j \leftarrow j + 1$ \\
+\hspace{3mm}6.3  If $y \nequiv a'$ goto step 8. \\
+7.  $c \leftarrow 1$\\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_prime\_miller\_rabin}
+\end{figure}
+\textbf{Algorithm mp\_prime\_miller\_rabin.}
+This algorithm performs one trial round of the Miller-Rabin algorithm to the base $b$.  It will set $c = 1$ if the algorithm cannot determine
+if $a$ is composite or $c = 0$ if $a$ is provably composite.  The values of $s$ and $r$ are computed such that $a' = a - 1 = 2^sr$ where $r$
+is odd.
+
+If the value $y \equiv b^r$ is congruent to $\pm 1$ then the algorithm cannot prove whether $a$ is composite or not.  Otherwise, the algorithm will
+square $y$ up to $s - 1$ times, stopping only when $y \equiv -1$.  If $y^2 \equiv 1$ and $y \nequiv \pm 1$ then the algorithm can report that $a$
+is provably composite.  If the algorithm performs $s - 1$ squarings and $y \nequiv -1$ then $a$ is provably composite.  If $a$ is not provably
+composite then it is \textit{probably} prime.
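+
+As a worked example, consider the Carmichael number $a = 561 = 3 \cdot 11 \cdot 17$ with the base $b = 2$.  Here $a' = 560 = 2^4 \cdot 35$ such
+that $s = 4$ and $r = 35$.  The initial value is $y \equiv 2^{35} \equiv 263 \mbox{ (mod }561\mbox{)}$, which is not congruent to $\pm 1$, and
+the successive squarings produce
+
+\begin{equation}
+263^2 \equiv 166, \qquad 166^2 \equiv 67, \qquad 67^2 \equiv 1 \mbox{ (mod }561\mbox{)}
+\end{equation}
+
+Since $1$ is observed before $-1 \equiv 560$ ever appears, the value $67$ is a non-trivial square root of unity and $561$ is provably composite.
+Note that the final squaring implies $2^{560} \equiv 1 \mbox{ (mod }561\mbox{)}$, so the Fermat test to the base $2$ would not have detected
+this composite.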
+
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_prime\_miller\_rabin.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Miller-Rabin test of "a" to the base of "b" as described in 
+018    * HAC pp. 139 Algorithm 4.24
+019    *
+020    * Sets result to 0 if definitely composite or 1 if probably prime.
+021    * For a random base the chance of error is no more than 1/4 and 
+022    * often very much lower.
+023    */
+024   int mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+025   \{
+026     mp_int  n1, y, r;
+027     int     s, j, err;
+028   
+029     /* default */
+030     *result = MP_NO;
+031   
+032     /* ensure b > 1 */
+033     if (mp_cmp_d(b, 1) != MP_GT) \{
+034        return MP_VAL;
+035     \}
+036   
+037     /* get n1 = a - 1 */
+038     if ((err = mp_init_copy (&n1, a)) != MP_OKAY) \{
+039       return err;
+040     \}
+041     if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) \{
+042       goto __N1;
+043     \}
+044   
+045     /* set 2**s * r = n1 */
+046     if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) \{
+047       goto __N1;
+048     \}
+049   
+050     /* count the number of least significant bits 
+051      * which are zero
+052      */
+053     s = mp_cnt_lsb(&r);
+054   
+055     /* now divide n - 1 by 2**s */
+056     if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) \{
+057       goto __R;
+058     \}
+059   
+060     /* compute y = b**r mod a */
+061     if ((err = mp_init (&y)) != MP_OKAY) \{
+062       goto __R;
+063     \}
+064     if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) \{
+065       goto __Y;
+066     \}
+067   
+068     /* if y != 1 and y != n1 do */
+069     if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) \{
+070       j = 1;
+071       /* while j <= s-1 and y != n1 */
+072       while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) \{
+073         if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) \{
+074            goto __Y;
+075         \}
+076   
+077         /* if y == 1 then composite */
+078         if (mp_cmp_d (&y, 1) == MP_EQ) \{
+079            goto __Y;
+080         \}
+081   
+082         ++j;
+083       \}
+084   
+085       /* if y != n1 then composite */
+086       if (mp_cmp (&y, &n1) != MP_EQ) \{
+087         goto __Y;
+088       \}
+089     \}
+090   
+091     /* probably prime now */
+092     *result = MP_YES;
+093   __Y:mp_clear (&y);
+094   __R:mp_clear (&r);
+095   __N1:mp_clear (&n1);
+096     return err;
+097   \}
+\end{alltt}
+\end{small}
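+
+In practice the tests above are composed: trial division screens out candidates with small factors, after which several Miller-Rabin rounds are
+applied (compare the library's own mp\_prime\_is\_prime).  The routine below is a hypothetical sketch of such a composition; the bases and the
+number of rounds are illustrative only.
+
+\begin{small}
+\begin{alltt}
+#include <tommath.h>
+
+/* hypothetical sketch: screen n by trial division, then run one
+ * Miller-Rabin round for each of a few small prime bases.  Sets
+ * *result to MP_YES if n is probably prime, MP_NO otherwise.    */
+int probably_prime(mp_int *n, int *result)
+\{
+   mp_digit bases[] = \{ 2, 3, 5, 7, 11 \};
+   mp_int b;
+   int err, res, i;
+
+   *result = MP_NO;
+
+   /* step one: trial division by the table of small primes.  Note
+    * that n equal to a table prime is also flagged here; a full
+    * implementation would handle that case separately.            */
+   if ((err = mp_prime_is_divisible(n, &res)) != MP_OKAY) \{
+      return err;
+   \}
+   if (res == MP_YES) \{
+      return MP_OKAY;
+   \}
+
+   if ((err = mp_init(&b)) != MP_OKAY) \{
+      return err;
+   \}
+
+   /* step two: one Miller-Rabin round per base */
+   for (i = 0; i < 5; i++) \{
+      mp_set(&b, bases[i]);
+      err = mp_prime_miller_rabin(n, &b, &res);
+      if (err != MP_OKAY || res == MP_NO) \{
+         mp_clear(&b);
+         return err;   /* an error, or n is provably composite */
+      \}
+   \}
+
+   mp_clear(&b);
+   *result = MP_YES;
+   return MP_OKAY;
+\}
+\end{alltt}
+\end{small}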
+
+
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}, IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp. 293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Designs, Codes and Cryptography, March 2002
+
+\bibitem[7]{BARRETT}
+Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag
+
+\bibitem[8]{MONT}
+P. L. Montgomery, \textit{Modular multiplication without trial division}, Mathematics of Computation, 44(170):519-521, April 1985
+
+\bibitem[9]{DRMET}
+Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
+
+\bibitem[10]{MMB}
+J. Daemen, R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and Progress in the Research of Cryptography, 1993, pp. 80-89
+
+\bibitem[11]{RSAREF}
+R. L. Rivest, A. Shamir, L. Adleman, \textit{A Method for Obtaining Digital Signatures and Public-Key Cryptosystems}
+
+\bibitem[12]{DHREF}
+Whitfield Diffie and Martin E. Hellman, \textit{New Directions in Cryptography}, IEEE Transactions on Information Theory, 1976
+
+\bibitem[13]{IEEE}
+IEEE Standard for Binary Floating-Point Arithmetic (ANSI/IEEE Std 754-1985)
+
+\bibitem[14]{GMP}
+GNU Multiple Precision (GMP), \url{http://www.swox.com/gmp/}
+
+\bibitem[15]{MPI}
+Multiple Precision Integer Library (MPI), Michael Fromberger, \url{http://thayer.dartmouth.edu/~sting/mpi/}
+
+\bibitem[16]{OPENSSL}
+OpenSSL Cryptographic Toolkit, \url{http://openssl.org}
+
+\bibitem[17]{LIP}
+Large Integer Package, \url{http://home.hetnet.nl/~ecstr/LIP.zip}
+
+\bibitem[18]{ISOC}
+JTC1/SC22/WG14, ISO/IEC 9899:1999, ``A draft rationale for the C99 standard.''
+
+\bibitem[19]{JAVA}
+The Sun Java Website, \url{http://java.sun.com/}
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\end{document}