changeset 10:439b7aaaec9e

Get aes from avr231 appnote instead
author Matt Johnston <matt@ucc.asn.au>
date Wed, 12 Jun 2013 22:57:44 +0800
parents 3aa92c7f379c
children e83b35e864d7
files aes.c aes.h
diffstat 2 files changed, 458 insertions(+), 500 deletions(-) [+]
line wrap: on
line diff
--- a/aes.c	Thu Jun 06 00:05:13 2013 +0800
+++ b/aes.c	Wed Jun 12 22:57:44 2013 +0800
@@ -1,539 +1,481 @@
-// advanced encryption standard
-// Original author: Karl Malbrain, [email protected]
-// Ported to Atmel AVR by: Jiri Pittner, [email protected]
+#include "aes.h"
+//#include "loader.h"
 //
-/*
-This work, including the source code, documentation
-and related data, is placed into the public domain.
+#define KEY_COUNT 1
+
+#if KEY_COUNT > 0
+
+//#include "aeskeys.inc"
+
+
+
+
+typedef unsigned char byte;
+
+
+
+#define BPOLY 0x1b //!< Lower 8 bits of (x^8+x^4+x^3+x+1), ie. (x^4+x^3+x+1).
+#define BLOCKSIZE 16 //!< Block size in number of bytes.
+
+
+
+#if KEY_COUNT == 1
+	#define KEYBITS 128 //!< Use AES128.
+#elif KEY_COUNT == 2
+	#define KEYBITS 192 //!< Use AES192.
+#elif KEY_COUNT == 3
+	#define KEYBITS 256 //!< Use AES256.
+#else
+	#error Use 1, 2 or 3 keys!
+#endif
 
-The original author is Karl Malbrain.
+#if KEYBITS == 128
+	#define ROUNDS 10 //!< Number of rounds.
+	#define KEYLENGTH 16 //!< Key length in number of bytes.
+#elif KEYBITS == 192
+	#define ROUNDS 12 //!< Number of rounds.
+	#define KEYLENGTH 24 //!< Key length in number of bytes.
+#elif KEYBITS == 256
+	#define ROUNDS 14 //!< Number of rounds.
+	#define KEYLENGTH 32 //!< Key length in number of bytes.
+#else
+	#error Key must be 128, 192 or 256 bits!
+#endif
+
+#define EXPANDED_KEY_SIZE (BLOCKSIZE * (ROUNDS+1)) //!< 176, 208 or 240 bytes.
+
+
+
+byte block1[ 256 ]; //!< Workspace 1.
+byte block2[ 256 ]; //!< Workspace 2.
+
+
+
+byte * powTbl; //!< Final location of exponentiation lookup table.
+byte * logTbl; //!< Final location of logarithm lookup table.
+byte * sBox; //!< Final location of s-box.
+byte * sBoxInv; //!< Final location of inverse s-box.
+byte * expandedKey; //!< Final location of expanded key.
+
+
+
+void CalcPowLog( byte * powTbl, byte * logTbl )
+{
+        byte i = 0;
+        byte t = 1;
+
+        do {
+		// Use 0x03 as root for exponentiation and logarithms.
+        	powTbl[i] = t;
+       		logTbl[t] = i;
+       		i++;
 
-THIS SOFTWARE IS PROVIDED AS-IS WITHOUT WARRANTY
-OF ANY KIND, NOT EVEN THE IMPLIED WARRANTY OF
-MERCHANTABILITY. THE AUTHOR OF THIS SOFTWARE,
-ASSUMES _NO_ RESPONSIBILITY FOR ANY CONSEQUENCE
-RESULTING FROM THE USE, MODIFICATION, OR
-REDISTRIBUTION OF THIS SOFTWARE.
-*/
+		// Multiply t by 3 in GF(2^8).
+       		t ^= (t << 1) ^ (t & 0x80 ? BPOLY : 0);
+       	} while( t != 1 ); // Cyclic properties ensure that i < 255.
+       	
+       	powTbl[255] = powTbl[0]; // 255 = '-0', 254 = -1, etc.
+}
+
+
+
+void CalcSBox( byte * sBox )
+{
+        byte i, rot;
+        byte temp;
+	byte result;
+
+	// Fill all entries of sBox[].
+	i = 0;
+	do {
+                // Inverse in GF(2^8).
+                if( i > 0 ) {
+	                temp = powTbl[ 255 - logTbl[i] ];
+	        } else {
+                 	temp = 0;
+		}
+
+                // Affine transformation in GF(2).
+                result = temp ^ 0x63; // Start with adding a vector in GF(2).
+                for( rot = 0; rot < 4; rot++ ) {
+                        // Rotate left.
+			temp = (temp<<1) | (temp>>7);
 
-#include <stdlib.h>
-#include <string.h>
+			// Add rotated byte in GF(2).
+			result ^= temp;
+		}
+			
+		// Put result in table.
+                sBox[i] = result;
+	} while( ++i != 0 );
+}	
+
+
+
+void CalcSBoxInv( byte * sBox, byte * sBoxInv )
+{
+	byte i = 0;
+	byte j = 0;
 
-#include <avr/pgmspace.h> //tables have to reside in flash memory
+	// Iterate through all elements in sBoxInv using  i.
+	do {
+		// Search through sBox using j.
+		do {
+			// Check if current j is the inverse of current i.
+			if( sBox[ j ] == i ) {
+				// If so, set sBoxInv and indicate search finished.
+				sBoxInv[ i ] = j;
+				j = 255;
+			}
+		} while( ++j != 0 );
+	} while( ++i != 0 );
+}
+
+
+
+void CycleLeft( byte * row )
+{
+	// Cycle 4 bytes in an array left once.
+	byte temp = row[0];
+	row[0] = row[1];
+	row[1] = row[2];
+	row[2] = row[3];
+	row[3] = temp;
+}
+
 
 
-// AES only supports Nb=4
-#define Nb 4			// number of columns in the state & expanded key
-
-#define Nk 4			// number of columns in a key
-#define Nr 10			// number of rounds in encryption
+void InvMixColumn( byte * column )
+{
+	byte result0, result1, result2, result3;
+	byte column0, column1, column2, column3;
+	byte xor;
 
-#define Sbox(i) (pgm_read_byte(&P_Sbox[i]))
-const unsigned char P_Sbox[256] __attribute__ ((__progmem__)) = {		// forward s-box
-0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
-0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
-0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
-0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
-0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
-0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
-0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
-0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
-0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
-0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
-0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
-0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
-0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
-0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
-0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
+	// This generates more efficient code, at least
+	// with the IAR C compiler.
+	column0 = column[0];
+	column1 = column[1];
+	column2 = column[2];
+	column3 = column[3];
 
-#define InvSbox(i) (pgm_read_byte(&P_InvSbox[i]))
-const unsigned char P_InvSbox[256] __attribute__ ((__progmem__)) = {	// inverse s-box
-0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
-0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
-0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
-0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
-0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
-0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
-0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
-0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
-0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
-0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
-0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
-0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
-0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
-0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
-0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
-0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d};
+	// Partial sums (modular addition using XOR).
+	result0 = column1 ^ column2 ^ column3;
+	result1 = column0 ^ column2 ^ column3;
+	result2 = column0 ^ column1 ^ column3;
+	result3 = column0 ^ column1 ^ column2;
 
-// combined Xtimes2[Sbox[]]
-#define Xtime2Sbox(i) (pgm_read_byte(&P_Xtime2Sbox[i]))
-const unsigned char P_Xtime2Sbox[256] __attribute__ ((__progmem__)) = {
-0xc6, 0xf8, 0xee, 0xf6, 0xff, 0xd6, 0xde, 0x91, 0x60, 0x02, 0xce, 0x56, 0xe7, 0xb5, 0x4d, 0xec, 
-0x8f, 0x1f, 0x89, 0xfa, 0xef, 0xb2, 0x8e, 0xfb, 0x41, 0xb3, 0x5f, 0x45, 0x23, 0x53, 0xe4, 0x9b, 
-0x75, 0xe1, 0x3d, 0x4c, 0x6c, 0x7e, 0xf5, 0x83, 0x68, 0x51, 0xd1, 0xf9, 0xe2, 0xab, 0x62, 0x2a, 
-0x08, 0x95, 0x46, 0x9d, 0x30, 0x37, 0x0a, 0x2f, 0x0e, 0x24, 0x1b, 0xdf, 0xcd, 0x4e, 0x7f, 0xea, 
-0x12, 0x1d, 0x58, 0x34, 0x36, 0xdc, 0xb4, 0x5b, 0xa4, 0x76, 0xb7, 0x7d, 0x52, 0xdd, 0x5e, 0x13, 
-0xa6, 0xb9, 0x00, 0xc1, 0x40, 0xe3, 0x79, 0xb6, 0xd4, 0x8d, 0x67, 0x72, 0x94, 0x98, 0xb0, 0x85, 
-0xbb, 0xc5, 0x4f, 0xed, 0x86, 0x9a, 0x66, 0x11, 0x8a, 0xe9, 0x04, 0xfe, 0xa0, 0x78, 0x25, 0x4b, 
-0xa2, 0x5d, 0x80, 0x05, 0x3f, 0x21, 0x70, 0xf1, 0x63, 0x77, 0xaf, 0x42, 0x20, 0xe5, 0xfd, 0xbf, 
-0x81, 0x18, 0x26, 0xc3, 0xbe, 0x35, 0x88, 0x2e, 0x93, 0x55, 0xfc, 0x7a, 0xc8, 0xba, 0x32, 0xe6, 
-0xc0, 0x19, 0x9e, 0xa3, 0x44, 0x54, 0x3b, 0x0b, 0x8c, 0xc7, 0x6b, 0x28, 0xa7, 0xbc, 0x16, 0xad, 
-0xdb, 0x64, 0x74, 0x14, 0x92, 0x0c, 0x48, 0xb8, 0x9f, 0xbd, 0x43, 0xc4, 0x39, 0x31, 0xd3, 0xf2, 
-0xd5, 0x8b, 0x6e, 0xda, 0x01, 0xb1, 0x9c, 0x49, 0xd8, 0xac, 0xf3, 0xcf, 0xca, 0xf4, 0x47, 0x10, 
-0x6f, 0xf0, 0x4a, 0x5c, 0x38, 0x57, 0x73, 0x97, 0xcb, 0xa1, 0xe8, 0x3e, 0x96, 0x61, 0x0d, 0x0f, 
-0xe0, 0x7c, 0x71, 0xcc, 0x90, 0x06, 0xf7, 0x1c, 0xc2, 0x6a, 0xae, 0x69, 0x17, 0x99, 0x3a, 0x27, 
-0xd9, 0xeb, 0x2b, 0x22, 0xd2, 0xa9, 0x07, 0x33, 0x2d, 0x3c, 0x15, 0xc9, 0x87, 0xaa, 0x50, 0xa5, 
-0x03, 0x59, 0x09, 0x1a, 0x65, 0xd7, 0x84, 0xd0, 0x82, 0x29, 0x5a, 0x1e, 0x7b, 0xa8, 0x6d, 0x2c 
-};
+	// Multiply column bytes by 2 modulo BPOLY.
+	// This operation is done the following way to keep the cycle count
+	// independent of the data contents. Take care when changing this code.
+	xor = 0;
+	if (column0 & 0x80) {
+		xor = BPOLY;
+	}
+	column0 <<= 1;
+	column0  ^= xor;
+	
+	xor = 0;
+	if (column1 & 0x80) {
+		xor = BPOLY;
+	}
+	column1 <<= 1;
+	column1  ^= xor;
+	
+	xor = 0;
+	if (column2 & 0x80) {
+		xor = BPOLY;
+	}
+	column2 <<= 1;
+	column2  ^= xor;
+	
+	xor = 0;
+	if (column3 & 0x80) {
+		xor = BPOLY;
+	}
+	column3 <<= 1;
+	column3  ^= xor;
 
-// combined Xtimes3[Sbox[]]
-#define Xtime3Sbox(i) (pgm_read_byte(&P_Xtime3Sbox[i]))
-const unsigned char P_Xtime3Sbox[256] __attribute__ ((__progmem__)) = {
-0xa5, 0x84, 0x99, 0x8d, 0x0d, 0xbd, 0xb1, 0x54, 0x50, 0x03, 0xa9, 0x7d, 0x19, 0x62, 0xe6, 0x9a, 
-0x45, 0x9d, 0x40, 0x87, 0x15, 0xeb, 0xc9, 0x0b, 0xec, 0x67, 0xfd, 0xea, 0xbf, 0xf7, 0x96, 0x5b, 
-0xc2, 0x1c, 0xae, 0x6a, 0x5a, 0x41, 0x02, 0x4f, 0x5c, 0xf4, 0x34, 0x08, 0x93, 0x73, 0x53, 0x3f, 
-0x0c, 0x52, 0x65, 0x5e, 0x28, 0xa1, 0x0f, 0xb5, 0x09, 0x36, 0x9b, 0x3d, 0x26, 0x69, 0xcd, 0x9f, 
-0x1b, 0x9e, 0x74, 0x2e, 0x2d, 0xb2, 0xee, 0xfb, 0xf6, 0x4d, 0x61, 0xce, 0x7b, 0x3e, 0x71, 0x97, 
-0xf5, 0x68, 0x00, 0x2c, 0x60, 0x1f, 0xc8, 0xed, 0xbe, 0x46, 0xd9, 0x4b, 0xde, 0xd4, 0xe8, 0x4a, 
-0x6b, 0x2a, 0xe5, 0x16, 0xc5, 0xd7, 0x55, 0x94, 0xcf, 0x10, 0x06, 0x81, 0xf0, 0x44, 0xba, 0xe3, 
-0xf3, 0xfe, 0xc0, 0x8a, 0xad, 0xbc, 0x48, 0x04, 0xdf, 0xc1, 0x75, 0x63, 0x30, 0x1a, 0x0e, 0x6d, 
-0x4c, 0x14, 0x35, 0x2f, 0xe1, 0xa2, 0xcc, 0x39, 0x57, 0xf2, 0x82, 0x47, 0xac, 0xe7, 0x2b, 0x95, 
-0xa0, 0x98, 0xd1, 0x7f, 0x66, 0x7e, 0xab, 0x83, 0xca, 0x29, 0xd3, 0x3c, 0x79, 0xe2, 0x1d, 0x76, 
-0x3b, 0x56, 0x4e, 0x1e, 0xdb, 0x0a, 0x6c, 0xe4, 0x5d, 0x6e, 0xef, 0xa6, 0xa8, 0xa4, 0x37, 0x8b, 
-0x32, 0x43, 0x59, 0xb7, 0x8c, 0x64, 0xd2, 0xe0, 0xb4, 0xfa, 0x07, 0x25, 0xaf, 0x8e, 0xe9, 0x18, 
-0xd5, 0x88, 0x6f, 0x72, 0x24, 0xf1, 0xc7, 0x51, 0x23, 0x7c, 0x9c, 0x21, 0xdd, 0xdc, 0x86, 0x85, 
-0x90, 0x42, 0xc4, 0xaa, 0xd8, 0x05, 0x01, 0x12, 0xa3, 0x5f, 0xf9, 0xd0, 0x91, 0x58, 0x27, 0xb9, 
-0x38, 0x13, 0xb3, 0x33, 0xbb, 0x70, 0x89, 0xa7, 0xb6, 0x22, 0x92, 0x20, 0x49, 0xff, 0x78, 0x7a, 
-0x8f, 0xf8, 0x80, 0x17, 0xda, 0x31, 0xc6, 0xb8, 0xc3, 0xb0, 0x77, 0x11, 0xcb, 0xfc, 0xd6, 0x3a 
-};
-
-// modular multiplication tables
-// based on:
-
-// Xtime2[x] = (x & 0x80 ? 0x1b : 0) ^ (x + x)
-// Xtime3[x] = x^Xtime2[x];
+	// More partial sums.
+	result0 ^= column0 ^ column1;
+	result1 ^= column1 ^ column2;
+	result2 ^= column2 ^ column3;
+	result3 ^= column0 ^ column3;
 
-#define Xtime2(i) (pgm_read_byte(&P_Xtime2[i]))
-const unsigned char P_Xtime2[256] __attribute__ ((__progmem__)) = {
-0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, 
-0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, 
-0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, 
-0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, 
-0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, 
-0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, 
-0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, 
-0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, 
-0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, 
-0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, 
-0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, 
-0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, 
-0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, 
-0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, 
-0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, 
-0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5}; 
+	// Multiply column bytes by 2 modulo BPOLY.
+	// This operation is done the following way to keep the cycle count
+	// independent of the data contents. Take care when changing this code.
+	xor = 0;
+	if (column0 & 0x80) {
+		xor = BPOLY;
+	}
+	column0 <<= 1;
+	column0  ^= xor;
+	
+	xor = 0;
+	if (column1 & 0x80) {
+		xor = BPOLY;
+	}
+	column1 <<= 1;
+	column1  ^= xor;
+	
+	xor = 0;
+	if (column2 & 0x80) {
+		xor = BPOLY;
+	}
+	column2 <<= 1;
+	column2  ^= xor;
+	
+	xor = 0;
+	if (column3 & 0x80) {
+		xor = BPOLY;
+	}
+	column3 <<= 1;
+	column3  ^= xor;
 
-#define Xtime9(i) (pgm_read_byte(&P_Xtime9[i]))
-const unsigned char P_Xtime9[256] __attribute__ ((__progmem__)) = {
-0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, 
-0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, 
-0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, 
-0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc, 
-0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, 
-0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91, 
-0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a, 
-0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, 
-0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b, 
-0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b, 
-0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, 
-0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, 
-0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed, 
-0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, 
-0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6, 
-0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46};
-
-#define XtimeB(i) (pgm_read_byte(&P_XtimeB[i]))
-const unsigned char P_XtimeB[256] __attribute__ ((__progmem__)) = {
-0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, 
-0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, 
-0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12, 
-0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2, 
-0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, 
-0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f, 
-0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4, 
-0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, 
-0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e, 
-0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e, 
-0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5, 
-0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55, 
-0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, 
-0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, 
-0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13, 
-0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3}; 
+	// More partial sums.
+	result0 ^= column0 ^ column2;
+	result1 ^= column1 ^ column3;
+	result2 ^= column0 ^ column2;
+	result3 ^= column1 ^ column3;
 
-#define XtimeD(i) (pgm_read_byte(&P_XtimeD[i]))
-const unsigned char P_XtimeD[256] __attribute__ ((__progmem__)) = {
-0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, 
-0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, 
-0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0, 
-0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20, 
-0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, 
-0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, 
-0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d, 
-0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, 
-0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, 
-0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, 
-0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, 
-0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa, 
-0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, 
-0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, 
-0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47, 
-0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97}; 
+	// Multiply column bytes by 2 modulo BPOLY.
+	// This operation is done the following way to keep the cycle count
+	// independent of the data contents. Take care when changing this code.
+	xor = 0;
+	if (column0 & 0x80) {
+		xor = BPOLY;
+	}
+	column0 <<= 1;
+	column0  ^= xor;
+	
+	xor = 0;
+	if (column1 & 0x80) {
+		xor = BPOLY;
+	}
+	column1 <<= 1;
+	column1  ^= xor;
+	
+	xor = 0;
+	if (column2 & 0x80) {
+		xor = BPOLY;
+	}
+	column2 <<= 1;
+	column2  ^= xor;
+	
+	xor = 0;
+	if (column3 & 0x80) {
+		xor = BPOLY;
+	}
+	column3 <<= 1;
+	column3  ^= xor;
 
-#define XtimeE(i) (pgm_read_byte(&P_XtimeE[i]))
-const unsigned char P_XtimeE[256] __attribute__ ((__progmem__)) = {
-0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, 
-0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, 
-0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81, 
-0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61, 
-0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, 
-0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17, 
-0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c, 
-0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, 
-0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b, 
-0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb, 
-0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, 
-0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, 
-0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6, 
-0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, 
-0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, 
-0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d}; 
+	// Final partial sum.
+	column0 ^= column1 ^ column2 ^ column3;
+
+	// Final sums stored into original column bytes.
+	column[0] = result0 ^ column0;
+	column[1] = result1 ^ column0;
+	column[2] = result2 ^ column0;
+	column[3] = result3 ^ column0;
+}
+
+
 
-// exchanges columns in each of 4 rows
-// row0 - unchanged, row1- shifted left 1, 
-// row2 - shifted left 2 and row3 - shifted left 3
-void ShiftRows (unsigned char *state)
+void SubBytes( byte * bytes, byte count )
 {
-unsigned char tmp;
+	do {
+		*bytes = sBox[ *bytes ]; // Substitute every byte in state.
+		bytes++;
+	} while( --count );
+}
 
-	// just substitute row 0
-	state[0] = Sbox(state[0]), state[4] = Sbox(state[4]);
-	state[8] = Sbox(state[8]), state[12] = Sbox(state[12]);
 
-	// rotate row 1
-	tmp = Sbox(state[1]), state[1] = Sbox(state[5]);
-	state[5] = Sbox(state[9]), state[9] = Sbox(state[13]), state[13] = tmp;
 
-	// rotate row 2
-	tmp = Sbox(state[2]), state[2] = Sbox(state[10]), state[10] = tmp;
-	tmp = Sbox(state[6]), state[6] = Sbox(state[14]), state[14] = tmp;
-
-	// rotate row 3
-	tmp = Sbox(state[15]), state[15] = Sbox(state[11]);
-	state[11] = Sbox(state[7]), state[7] = Sbox(state[3]), state[3] = tmp;
+void InvSubBytesAndXOR( byte * bytes, byte * key, byte count )
+{
+	do {
+//		*bytes = sBoxInv[ *bytes ] ^ *key; // Inverse substitute every byte in state and add key.
+		*bytes = block2[ *bytes ] ^ *key; // Use block2 directly. Increases speed.
+		bytes++;
+		key++;
+	} while( --count );
 }
 
-// restores columns in each of 4 rows
-// row0 - unchanged, row1- shifted right 1, 
-// row2 - shifted right 2 and row3 - shifted right 3
-void InvShiftRows (unsigned char *state)
+
+
+void InvShiftRows( byte * state )
 {
-unsigned char tmp;
+	byte temp;
+
+	// Note: State is arranged column by column.
 
-	// restore row 0
-	state[0] = InvSbox(state[0]), state[4] = InvSbox(state[4]);
-	state[8] = InvSbox(state[8]), state[12] = InvSbox(state[12]);
+	// Cycle second row right one time.
+	temp = state[ 1 + 3*4 ];
+	state[ 1 + 3*4 ] = state[ 1 + 2*4 ];
+	state[ 1 + 2*4 ] = state[ 1 + 1*4 ];
+	state[ 1 + 1*4 ] = state[ 1 + 0*4 ];
+	state[ 1 + 0*4 ] = temp;
 
-	// restore row 1
-	tmp = InvSbox(state[13]), state[13] = InvSbox(state[9]);
-	state[9] = InvSbox(state[5]), state[5] = InvSbox(state[1]), state[1] = tmp;
+	// Cycle third row right two times.
+	temp = state[ 2 + 0*4 ];
+	state[ 2 + 0*4 ] = state[ 2 + 2*4 ];
+	state[ 2 + 2*4 ] = temp;
+	temp = state[ 2 + 1*4 ];
+	state[ 2 + 1*4 ] = state[ 2 + 3*4 ];
+	state[ 2 + 3*4 ] = temp;
 
-	// restore row 2
-	tmp = InvSbox(state[2]), state[2] = InvSbox(state[10]), state[10] = tmp;
-	tmp = InvSbox(state[6]), state[6] = InvSbox(state[14]), state[14] = tmp;
-
-	// restore row 3
-	tmp = InvSbox(state[3]), state[3] = InvSbox(state[7]);
-	state[7] = InvSbox(state[11]), state[11] = InvSbox(state[15]), state[15] = tmp;
+	// Cycle fourth row right three times, ie. left once.
+	temp = state[ 3 + 0*4 ];
+	state[ 3 + 0*4 ] = state[ 3 + 1*4 ];
+	state[ 3 + 1*4 ] = state[ 3 + 2*4 ];
+	state[ 3 + 2*4 ] = state[ 3 + 3*4 ];
+	state[ 3 + 3*4 ] = temp;
 }
 
-// recombine and mix each row in a column
-void MixSubColumns (unsigned char *state)
-{
-unsigned char tmp[4 * Nb];
+
 
-	// mixing column 0
-	tmp[0] = Xtime2Sbox(state[0]) ^ Xtime3Sbox(state[5]) ^ Sbox(state[10]) ^ Sbox(state[15]);
-	tmp[1] = Sbox(state[0]) ^ Xtime2Sbox(state[5]) ^ Xtime3Sbox(state[10]) ^ Sbox(state[15]);
-	tmp[2] = Sbox(state[0]) ^ Sbox(state[5]) ^ Xtime2Sbox(state[10]) ^ Xtime3Sbox(state[15]);
-	tmp[3] = Xtime3Sbox(state[0]) ^ Sbox(state[5]) ^ Sbox(state[10]) ^ Xtime2Sbox(state[15]);
+void InvMixColumns( byte * state )
+{
+	InvMixColumn( state + 0*4 );
+	InvMixColumn( state + 1*4 );
+	InvMixColumn( state + 2*4 );
+	InvMixColumn( state + 3*4 );
+}
+
+
 
-	// mixing column 1
-	tmp[4] = Xtime2Sbox(state[4]) ^ Xtime3Sbox(state[9]) ^ Sbox(state[14]) ^ Sbox(state[3]);
-	tmp[5] = Sbox(state[4]) ^ Xtime2Sbox(state[9]) ^ Xtime3Sbox(state[14]) ^ Sbox(state[3]);
-	tmp[6] = Sbox(state[4]) ^ Sbox(state[9]) ^ Xtime2Sbox(state[14]) ^ Xtime3Sbox(state[3]);
-	tmp[7] = Xtime3Sbox(state[4]) ^ Sbox(state[9]) ^ Sbox(state[14]) ^ Xtime2Sbox(state[3]);
+void XORBytes( byte * bytes1, byte * bytes2, byte count )
+{
+	do {
+		*bytes1 ^= *bytes2; // Add in GF(2), ie. XOR.
+		bytes1++;
+		bytes2++;
+	} while( --count );
+}
 
-	// mixing column 2
-	tmp[8] = Xtime2Sbox(state[8]) ^ Xtime3Sbox(state[13]) ^ Sbox(state[2]) ^ Sbox(state[7]);
-	tmp[9] = Sbox(state[8]) ^ Xtime2Sbox(state[13]) ^ Xtime3Sbox(state[2]) ^ Sbox(state[7]);
-	tmp[10]  = Sbox(state[8]) ^ Sbox(state[13]) ^ Xtime2Sbox(state[2]) ^ Xtime3Sbox(state[7]);
-	tmp[11]  = Xtime3Sbox(state[8]) ^ Sbox(state[13]) ^ Sbox(state[2]) ^ Xtime2Sbox(state[7]);
+
 
-	// mixing column 3
-	tmp[12] = Xtime2Sbox(state[12]) ^ Xtime3Sbox(state[1]) ^ Sbox(state[6]) ^ Sbox(state[11]);
-	tmp[13] = Sbox(state[12]) ^ Xtime2Sbox(state[1]) ^ Xtime3Sbox(state[6]) ^ Sbox(state[11]);
-	tmp[14] = Sbox(state[12]) ^ Sbox(state[1]) ^ Xtime2Sbox(state[6]) ^ Xtime3Sbox(state[11]);
-	tmp[15] = Xtime3Sbox(state[12]) ^ Sbox(state[1]) ^ Sbox(state[6]) ^ Xtime2Sbox(state[11]);
-
-	memcpy (state, tmp, sizeof(tmp));
+void CopyBytes( byte * to, byte * from, byte count )
+{
+	do {
+		*to = *from;
+		to++;
+		from++;
+	} while( --count );
 }
 
-// restore and un-mix each row in a column
-void InvMixSubColumns (unsigned char *state)
+
+
+void KeyExpansion( byte * key, byte * expandedKey )
 {
-unsigned char tmp[4 * Nb];
-int i;
+	byte temp[4];
+	byte i;
+	byte Rcon[4] = { 0x01, 0x00, 0x00, 0x00 }; // Round constant.
+	
+#if 0
+    // matt
+	unsigned char BOOTFLASH * key = kTable;
+#endif
 
-	// restore column 0
-	tmp[0] = XtimeE(state[0]) ^ XtimeB(state[1]) ^ XtimeD(state[2]) ^ Xtime9(state[3]);
-	tmp[5] = Xtime9(state[0]) ^ XtimeE(state[1]) ^ XtimeB(state[2]) ^ XtimeD(state[3]);
-	tmp[10] = XtimeD(state[0]) ^ Xtime9(state[1]) ^ XtimeE(state[2]) ^ XtimeB(state[3]);
-	tmp[15] = XtimeB(state[0]) ^ XtimeD(state[1]) ^ Xtime9(state[2]) ^ XtimeE(state[3]);
+	// Copy key to start of expanded key.
+	i = KEYLENGTH;
+	do {
+		*expandedKey = *key;
+		expandedKey++;
+		key++;
+	} while( --i );
+
+	// Prepare last 4 bytes of key in temp.
+	expandedKey -= 4;
+	temp[0] = *(expandedKey++);
+	temp[1] = *(expandedKey++);
+	temp[2] = *(expandedKey++);
+	temp[3] = *(expandedKey++);
 
-	// restore column 1
-	tmp[4] = XtimeE(state[4]) ^ XtimeB(state[5]) ^ XtimeD(state[6]) ^ Xtime9(state[7]);
-	tmp[9] = Xtime9(state[4]) ^ XtimeE(state[5]) ^ XtimeB(state[6]) ^ XtimeD(state[7]);
-	tmp[14] = XtimeD(state[4]) ^ Xtime9(state[5]) ^ XtimeE(state[6]) ^ XtimeB(state[7]);
-	tmp[3] = XtimeB(state[4]) ^ XtimeD(state[5]) ^ Xtime9(state[6]) ^ XtimeE(state[7]);
+	// Expand key.
+	i = KEYLENGTH;
+	while( i < BLOCKSIZE*(ROUNDS+1) ) {
+		// Are we at the start of a multiple of the key size?
+		if( (i % KEYLENGTH) == 0 ) {
+			CycleLeft( temp ); // Cycle left once.
+			SubBytes( temp, 4 ); // Substitute each byte.
+			XORBytes( temp, Rcon, 4 ); // Add constant in GF(2).
+			*Rcon = (*Rcon << 1) ^ (*Rcon & 0x80 ? BPOLY : 0);
+		}
 
-	// restore column 2
-	tmp[8] = XtimeE(state[8]) ^ XtimeB(state[9]) ^ XtimeD(state[10]) ^ Xtime9(state[11]);
-	tmp[13] = Xtime9(state[8]) ^ XtimeE(state[9]) ^ XtimeB(state[10]) ^ XtimeD(state[11]);
-	tmp[2]  = XtimeD(state[8]) ^ Xtime9(state[9]) ^ XtimeE(state[10]) ^ XtimeB(state[11]);
-	tmp[7]  = XtimeB(state[8]) ^ XtimeD(state[9]) ^ Xtime9(state[10]) ^ XtimeE(state[11]);
+		// Keysize larger than 24 bytes, i.e. larger than 192 bits?
+		#if KEYLENGTH > 24
+		// Are we right past a block size?
+		else if( (i % KEYLENGTH) == BLOCKSIZE ) {
+			SubBytes( temp, 4 ); // Substitute each byte.
+		}
+		#endif
 
-	// restore column 3
-	tmp[12] = XtimeE(state[12]) ^ XtimeB(state[13]) ^ XtimeD(state[14]) ^ Xtime9(state[15]);
-	tmp[1] = Xtime9(state[12]) ^ XtimeE(state[13]) ^ XtimeB(state[14]) ^ XtimeD(state[15]);
-	tmp[6] = XtimeD(state[12]) ^ Xtime9(state[13]) ^ XtimeE(state[14]) ^ XtimeB(state[15]);
-	tmp[11] = XtimeB(state[12]) ^ XtimeD(state[13]) ^ Xtime9(state[14]) ^ XtimeE(state[15]);
+		// Add bytes in GF(2) one KEYLENGTH away.
+		XORBytes( temp, expandedKey - KEYLENGTH, 4 );
 
-	for( i=0; i < 4 * Nb; i++ )
-		state[i] = InvSbox(tmp[i]);
+		// Copy result to current 4 bytes.
+		*(expandedKey++) = temp[ 0 ];
+		*(expandedKey++) = temp[ 1 ];
+		*(expandedKey++) = temp[ 2 ];
+		*(expandedKey++) = temp[ 3 ];
+
+		i += 4; // Next 4 bytes.
+	}	
 }
 
-// encrypt/decrypt columns of the key
-// n.b. you can replace this with
-//      byte-wise xor if you wish.
+
+
+void InvCipher( byte * block, byte * expandedKey )
+{
+	byte round = ROUNDS-1;
+	expandedKey += BLOCKSIZE * ROUNDS;
+
+	XORBytes( block, expandedKey, 16 );
+	expandedKey -= BLOCKSIZE;
 
-void AddRoundKey (unsigned *state, unsigned *key)
-{
-int idx;
+	do {
+		InvShiftRows( block );
+		InvSubBytesAndXOR( block, expandedKey, 16 );
+		expandedKey -= BLOCKSIZE;
+		InvMixColumns( block );
+	} while( --round );
 
-	for( idx = 0; idx < 4; idx++ )
-		state[idx] ^= key[idx];
+	InvShiftRows( block );
+	InvSubBytesAndXOR( block, expandedKey, 16 );
 }
 
-unsigned char Rcon[11] = {
-0x00, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36};
+
 
-// produce Nb bytes for each round
-void ExpandKey (unsigned char *key, unsigned char *expkey)
+void aesInit( unsigned char *key, unsigned char * tempbuf ) // Build the lookup tables and the expanded key from key; tempbuf is caller-supplied storage that receives the s-box (presumably 256 bytes - confirm against CalcSBox).
 {
-unsigned char tmp0, tmp1, tmp2, tmp3, tmp4;
-unsigned idx;
+	powTbl = block1; // Exponentiation table goes into workspace 1.
+	logTbl = block2; // Logarithm table goes into workspace 2.
+	CalcPowLog( powTbl, logTbl );
 
-	memcpy (expkey, key, Nk * 4);
+	sBox = tempbuf; // The s-box is placed in the caller's buffer.
+	CalcSBox( sBox );
 
-	for( idx = Nk; idx < Nb * (Nr + 1); idx++ ) {
-		tmp0 = expkey[4*idx - 4];
-		tmp1 = expkey[4*idx - 3];
-		tmp2 = expkey[4*idx - 2];
-		tmp3 = expkey[4*idx - 1];
-		if( !(idx % Nk) ) {
-			tmp4 = tmp3;
-			tmp3 = Sbox(tmp0);
-			tmp0 = Sbox(tmp1) ^ Rcon[idx/Nk];
-			tmp1 = Sbox(tmp2);
-			tmp2 = Sbox(tmp4);
-		} else if( Nk > 6 && idx % Nk == 4 ) {
-			tmp0 = Sbox(tmp0);
-			tmp1 = Sbox(tmp1);
-			tmp2 = Sbox(tmp2);
-			tmp3 = Sbox(tmp3);
-		}
+	expandedKey = block1; // Reuses workspace 1: the expanded key is written over the memory powTbl points at.
+	KeyExpansion( key, expandedKey );
+	
+	sBoxInv = block2; // Must be block2 (block1 now holds the expanded key); this reuses the memory logTbl points at.
+	CalcSBoxInv( sBox, sBoxInv );
+}	
+
+
 
-		expkey[4*idx+0] = expkey[4*idx - 4*Nk + 0] ^ tmp0;
-		expkey[4*idx+1] = expkey[4*idx - 4*Nk + 1] ^ tmp1;
-		expkey[4*idx+2] = expkey[4*idx - 4*Nk + 2] ^ tmp2;
-		expkey[4*idx+3] = expkey[4*idx - 4*Nk + 3] ^ tmp3;
+void aesDecrypt( unsigned char * buffer, unsigned char * chainBlock ) // Decrypt one BLOCKSIZE-byte block in place in buffer; a non-NULL chainBlock enables CBC-style chaining and is updated for the next call.
+{
+	byte temp[ BLOCKSIZE ]; // Holds a copy of the input block.
+
+	CopyBytes( temp, buffer, BLOCKSIZE ); // Save the input before buffer is overwritten by the decryption below.
+	InvCipher( buffer, expandedKey ); // Uses the file-scope expandedKey pointer that aesInit assigns.
+	if (chainBlock) // Chaining is optional: a NULL chainBlock skips it.
+	{
+		XORBytes( buffer, chainBlock, BLOCKSIZE ); // Combine the decrypted data with the current chain block.
+		CopyBytes( chainBlock, temp, BLOCKSIZE ); // The saved input becomes the chain block for the next call.
 	}
 }
 
-// encrypt one 128 bit block
-void Encrypt (unsigned char *in, unsigned char *expkey, unsigned char *out)
-{
-unsigned char state[Nb * 4];
-unsigned round;
-
-	memcpy (state, in, Nb * 4);
-	AddRoundKey ((unsigned *)state, (unsigned *)expkey);
-
-	for( round = 1; round < Nr + 1; round++ ) {
-		if( round < Nr )
-			MixSubColumns (state);
-		else
-			ShiftRows (state);
-
-		AddRoundKey ((unsigned *)state, (unsigned *)expkey + round * Nb);
-	}
-
-	memcpy (out, state, sizeof(state));
-}
-
-void Decrypt (unsigned char *in, unsigned char *expkey, unsigned char *out)
-{
-unsigned char state[Nb * 4];
-unsigned round;
-
-	memcpy (state, in, sizeof(state));
-
-	AddRoundKey ((unsigned *)state, (unsigned *)expkey + Nr * Nb);
-	InvShiftRows(state);
-
-	for( round = Nr; round--; )
-	{
-		AddRoundKey ((unsigned *)state, (unsigned *)expkey + round * Nb);
-		if( round )
-			InvMixSubColumns (state);
-	} 
-
-	memcpy (out, state, sizeof(state));
-}
-
-
-#if 0
-
-/*avr specific routines*/
-#include "backward.h"
-#include <avr/io.h>
-#include <avr/pgmspace.h>
-#include <avr/eeprom.h>
-#include <avr/interrupt.h>
-#include <avr/sleep.h>
-
-
-
-#if defined(at90s2313) || defined(at90s8535)
-#else
-#define ATmega
 #endif
-
-#ifdef ATmega
-#define USR UCSRA
-#endif
-
-
-
-void printP (PGM_P string){
-        char c;
-        c=pgm_read_byte(string);
-                   while (c) {
-                   loop_until_bit_is_set(USR, UDRE);
-                   UDR = c;
-                   c=pgm_read_byte(++string);
-                   }
-                   return;
-                 }
-
-
-
-void print (const char *string){
-                   while (*string) {
-                   loop_until_bit_is_set(USR, UDRE);
-                   UDR = *string++;
-                   }
-                   return;
-                 }
-
-void scan(char *string){
-char c;
-        do      {
-                do {
-                        loop_until_bit_is_set(USR, RXC);
-                        c =UDR;
-                        } while bit_is_set(USR, FE);
-                *string++ = c;
-                //echo the character
-                loop_until_bit_is_set(USR, UDRE);
-                UDR = c;
-                } while ( c != '\r' );
-        loop_until_bit_is_set(USR, UDRE);
-        UDR = '\n';
-        string[-1]=0;
-        }
-
-
-//UART initialize
-#ifdef ATmega
-#define UCR UCSRB
-#define UART_INIT(baud) { \
-UBRRH=0; \
-UBRRL= (XTAL/baud+15)/16-1; \
-UCSRB=(1<<TXEN)|(1<<RXEN); \
-UCSRC=(1<<URSEL)|(1<<UCSZ0)|(1<<UCSZ1)|(1<<USBS); }
-#else
-#define UART_INIT(baud) { \
-UBRR = (XTAL/baud+15)/16-1; \
-sbi(UCR, TXEN); \
-sbi(UCR, RXEN); \
-}
-#endif
-
-#define itoa10(N,S) itoa(N,S,10)
-#define itoa16(N,S) itoa(N,S,16)
-
-
-//DEMO
-
-unsigned char sampleout[16];
-
-
-unsigned char samplekey[] = {0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab,
-0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c};
-
-unsigned char samplein[] = {0x32, 0x43, 0xf6, 0xa8, 0x88, 0x5a, 0x30, 0x8d, 0x31,
-0x31, 0x98, 0xa2, 0xe0, 0x37, 0x07, 0x34};
-
-#define BAUD 115200
-
-int main(void)
-{
-UART_INIT(BAUD);
-
-unsigned char expkey[4 * Nb * (Nr + 1)];
-unsigned char i;
-char c[8];
-
-	printP(PSTR("Original: "));
-	for( i = 0; i < 16; i++ ) {itoa16(samplein[i],c);print(c); print(" ");}
-	printP(PSTR("\n"));
-
-	ExpandKey (samplekey, expkey);
-	Encrypt (samplein, expkey, sampleout);
-
-	printP(PSTR("Encrypted: "));
-	for( i = 0; i < 16; i++ ) {itoa16(sampleout[i],c);print(c); print(" ");}
-	printP(PSTR("\n"));
-
-	Decrypt (sampleout, expkey, samplein);
-
-	printP(PSTR("Decrypted: "));
-
-	for( i = 0; i < 16; i++ ) {itoa16(samplein[i],c);print(c); print(" ");}
-	printP(PSTR("\n"));
-}
-
-
-#endif
\ No newline at end of file
--- a/aes.h	Thu Jun 06 00:05:13 2013 +0800
+++ b/aes.h	Wed Jun 12 22:57:44 2013 +0800
@@ -1,16 +1,32 @@
-#ifndef AES_H
-#define AES_H
-
-// 4*nB*(nK+1)
-#define AES_EXPKEY_SIZE (4*4*(4+1))
-
-void ExpandKey (unsigned char *key, unsigned char *expkey);
-// encrypt one 128 bit block
-void Encrypt (unsigned char *in, unsigned char *expkey, unsigned char *out);
-
-void Decrypt (unsigned char *in, unsigned char *expkey, unsigned char *out);
-
-
-
-
-#endif
\ No newline at end of file
+//=============================================================================
+// Copyright Atmel Corporation 2003. All Rights Reserved.
+//
+// File:			aes.h
+// Compiler:		IAR Atmel AVR C/EC++ Compiler
+// Output Size:
+// Based on work by:	ØE, VU
+// Created:			4-Feb-2003	JP (Atmel Finland)
+// Modified:	
+//
+// Support Mail:	avr@atmel.com
+//
+// Description:		Please refer to Application Note Documentation for more
+//					information.
+//
+//					For details on AES, please refer to the official FIPS 197
+//					document:
+//
+//				http://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
+//
+//=============================================================================
+
+#ifndef AES_H
+#define AES_H
+
+//#include "bootldr.h"
+
+extern void aesInit( unsigned char *key, unsigned char * tempbuf ); // Prepare the decryption tables and expanded key from key; tempbuf is caller-provided storage used for the s-box.
+extern void aesDecrypt(unsigned char *buffer, unsigned char *chainBlock); // Decrypt one 16-byte block in place; chainBlock may be NULL, otherwise it is XORed into the result and then replaced by the input block.
+
+#endif // AES_H
+