comparison sha1-asm.S @ 0:8705acff2494

lots of stuff
author Matt Johnston <matt@ucc.asn.au>
date Sat, 01 Jun 2013 01:38:42 +0800
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8705acff2494
1 /* sha1-asm.S */
2 /*
3 This file is part of the AVR-Crypto-Lib.
4 Copyright (C) 2008 Daniel Otte ([email protected])
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19 /*
20 * Author: Daniel Otte
21 *
22 * License: GPLv3 or later
23 */
24 ; SHA1 implementation in assembler for AVR
25 SHA1_BLOCK_BITS = 512
26 SHA1_HASH_BITS = 160
27
28 .macro precall
29 /* push r18 - r27, r30 - r31*/
30 push r0
31 push r1
32 push r18
33 push r19
34 push r20
35 push r21
36 push r22
37 push r23
38 push r24
39 push r25
40 push r26
41 push r27
42 push r30
43 push r31
44 clr r1
45 .endm
46
47 .macro postcall
48 pop r31
49 pop r30
50 pop r27
51 pop r26
52 pop r25
53 pop r24
54 pop r23
55 pop r22
56 pop r21
57 pop r20
58 pop r19
59 pop r18
60 pop r1
61 pop r0
62 .endm
63
64
65 .macro hexdump length
66 push r27
67 push r26
68 ldi r25, '\r'
69 mov r24, r25
70 call uart_putc
71 ldi r25, '\n'
72 mov r24, r25
73 call uart_putc
74 pop r26
75 pop r27
76 movw r24, r26
77 .if \length > 16
78 ldi r22, lo8(16)
79 ldi r23, hi8(16)
80 push r27
81 push r26
82 call uart_hexdump
83 pop r26
84 pop r27
85 adiw r26, 16
86 hexdump \length-16
87 .else
88 ldi r22, lo8(\length)
89 ldi r23, hi8(\length)
90 call uart_hexdump
91 .endif
92 .endm
93
94 .macro delay
95 /*
96 push r0
97 push r1
98 clr r0
99 1: clr r1
100 2: dec r1
101 brne 2b
102 dec r0
103 brne 1b
104 pop r1
105 pop r0 // */
106 .endm
107
108 /* X points to Block */
109 .macro dbg_hexdump length
110 /*
111 precall
112 hexdump \length
113 postcall
114 // */
115 .endm
116
117
118
119 .section .text
120
121 SPL = 0x3D
122 SPH = 0x3E
123 SREG = 0x3F
124
125
126 ;
127 ;sha1_ctx_t is:
128 ;
129 ; [h0][h1][h2][h3][h4][length]
130 ; hn is 32 bit large, length is 64 bit large
131
132 ;###########################################################
133
134 .global sha1_ctx2hash
135 ; === sha1_ctx2hash ===
136 ; this function converts a state into a normal hash (bytestring)
137 ; param1: the 16-bit destination pointer
138 ; given in r25,r24 (r25 is most significant)
139 ; param2: the 16-bit pointer to sha1_ctx structure
140 ; given in r23,r22
141 sha1_ctx2hash:
142 movw r26, r22
143 movw r30, r24
144 ldi r21, 5
145 sbiw r26, 4
146 1:
147 ldi r20, 4
148 adiw r26, 8
149 2:
150 ld r0, -X
151 st Z+, r0
152 dec r20
153 brne 2b
154
155 dec r21
156 brne 1b
157
158 ret
159
160 ;###########################################################
161
162 .global sha1
163 ; === sha1 ===
164 ; this function calculates SHA-1 hashes from messages in RAM
165 ; param1: the 16-bit hash destination pointer
166 ; given in r25,r24 (r25 is most significant)
167 ; param2: the 16-bit pointer to message
168 ; given in r23,r22
169 ; param3: 32-bit length value (length of message in bits)
170 ; given in r21,r20,r19,r18
171 sha1:
172 sha1_prolog:
173 push r8
174 push r9
175 push r10
176 push r11
177 push r12
178 push r13
179 push r16
180 push r17
181 in r30, SPL
182 in r31, SPH
183 sbiw r30, 5*4+8
184 in r0, SREG
185 cli
186 out SPL, r30
187 out SREG, r0
188 out SPH, r31
189
190 push r25
191 push r24
192 adiw r30, 1
193 movw r16, r30
194
195 movw r8, r18 /* backup of length*/
196 movw r10, r20
197
198 movw r12, r22 /* backup pf msg-ptr */
199
200 movw r24, r16
201 rcall sha1_init
202 /* if length >= 512 */
203 1:
204 tst r11
205 brne 2f
206 tst r10
207 breq 4f
208 2:
209 movw r24, r16
210 movw r22, r12
211 rcall sha1_nextBlock
212 ldi r19, 64
213 add r12, r19
214 adc r13, r1
215 /* length -= 512 */
216 ldi r19, 0x02
217 sub r9, r19
218 sbc r10, r1
219 sbc r11, r1
220 rjmp 1b
221
222 4:
223 movw r24, r16
224 movw r22, r12
225 movw r20, r8
226 rcall sha1_lastBlock
227
228 pop r24
229 pop r25
230 movw r22, r16
231 rcall sha1_ctx2hash
232
233 sha1_epilog:
234 in r30, SPL
235 in r31, SPH
236 adiw r30, 5*4+8
237 in r0, SREG
238 cli
239 out SPL, r30
240 out SREG, r0
241 out SPH, r31
242 pop r17
243 pop r16
244 pop r13
245 pop r12
246 pop r11
247 pop r10
248 pop r9
249 pop r8
250 ret
251
252 ;###########################################################
253
254
255 ; block MUST NOT be larger than 64 bytes
256
257 .global sha1_lastBlock
258 ; === sha1_lastBlock ===
259 ; this function does padding & Co. for calculating SHA-1 hashes
260 ; param1: the 16-bit pointer to sha1_ctx structure
261 ; given in r25,r24 (r25 is most significant)
262 ; param2: an 16-bit pointer to 64 byte block to hash
263 ; given in r23,r22
264 ; param3: an 16-bit integer specifing length of block in bits
265 ; given in r21,r20
266 sha1_lastBlock_localSpace = (SHA1_BLOCK_BITS/8+1)
267
268
269 sha1_lastBlock:
270 cpi r21, 0x02
271 brlo sha1_lastBlock_prolog
272 push r25
273 push r24
274 push r23
275 push r22
276 push r21
277 push r20
278 rcall sha1_nextBlock
279 pop r20
280 pop r21
281 pop r22
282 pop r23
283 pop r24
284 pop r25
285 subi r21, 2
286 ldi r19, 64
287 add r22, r19
288 adc r23, r1
289 rjmp sha1_lastBlock
290 sha1_lastBlock_prolog:
291 /* allocate space on stack */
292 in r30, SPL
293 in r31, SPH
294 in r0, SREG
295 subi r30, lo8(64)
296 sbci r31, hi8(64) /* ??? */
297 cli
298 out SPL, r30
299 out SREG, r0
300 out SPH, r31
301
302 adiw r30, 1 /* SP points to next free byte on stack */
303 mov r18, r20 /* r20 = LSB(length) */
304 lsr r18
305 lsr r18
306 lsr r18
307 bst r21, 0 /* may be we should explain this ... */
308 bld r18, 5 /* now: r18 == length/8 (aka. length in bytes) */
309
310
311 movw r26, r22 /* X points to begin of msg */
312 tst r18
313 breq sha1_lastBlock_post_copy
314 mov r1, r18
315 sha1_lastBlock_copy_loop:
316 ld r0, X+
317 st Z+, r0
318 dec r1
319 brne sha1_lastBlock_copy_loop
320 sha1_lastBlock_post_copy:
321 sha1_lastBlock_insert_stuffing_bit:
322 ldi r19, 0x80
323 mov r0,r19
324 ldi r19, 0x07
325 and r19, r20 /* if we are in bitmode */
326 breq 2f /* no bitmode */
327 1:
328 lsr r0
329 dec r19
330 brne 1b
331 ld r19, X
332 /* maybe we should do some ANDing here, just for safety */
333 or r0, r19
334 2:
335 st Z+, r0
336 inc r18
337
338 /* checking stuff here */
339 cpi r18, 64-8+1
340 brsh 0f
341 rjmp sha1_lastBlock_insert_zeros
342 0:
343 /* oh shit, we landed here */
344 /* first we have to fill it up with zeros */
345 ldi r19, 64
346 sub r19, r18
347 breq 2f
348 1:
349 st Z+, r1
350 dec r19
351 brne 1b
352 2:
353 sbiw r30, 63
354 sbiw r30, 1
355 movw r22, r30
356
357 push r31
358 push r30
359 push r25
360 push r24
361 push r21
362 push r20
363 rcall sha1_nextBlock
364 pop r20
365 pop r21
366 pop r24
367 pop r25
368 pop r30
369 pop r31
370
371 /* now we should subtract 512 from length */
372 movw r26, r24
373 adiw r26, 4*5+1 /* we can skip the lowest byte */
374 ld r19, X
375 subi r19, hi8(512)
376 st X+, r19
377 ldi r18, 6
378 1:
379 ld r19, X
380 sbci r19, 0
381 st X+, r19
382 dec r18
383 brne 1b
384
385 ; clr r18 /* not neccessary ;-) */
386 /* reset Z pointer to begin of block */
387
388 sha1_lastBlock_insert_zeros:
389 ldi r19, 64-8
390 sub r19, r18
391 breq sha1_lastBlock_insert_length
392 clr r1
393 1:
394 st Z+, r1 /* r1 is still zero */
395 dec r19
396 brne 1b
397
398 ; rjmp sha1_lastBlock_epilog
399 sha1_lastBlock_insert_length:
400 movw r26, r24 /* X points to state */
401 adiw r26, 5*4 /* X points to (state.length) */
402 adiw r30, 8 /* Z points one after the last byte of block */
403 ld r0, X+
404 add r0, r20
405 st -Z, r0
406 ld r0, X+
407 adc r0, r21
408 st -Z, r0
409 ldi r19, 6
410 1:
411 ld r0, X+
412 adc r0, r1
413 st -Z, r0
414 dec r19
415 brne 1b
416
417 sbiw r30, 64-8
418 movw r22, r30
419 rcall sha1_nextBlock
420
421 sha1_lastBlock_epilog:
422 in r30, SPL
423 in r31, SPH
424 in r0, SREG
425 adiw r30, 63 ; lo8(64)
426 adiw r30, 1 ; hi8(64)
427 cli
428 out SPL, r30
429 out SREG, r0
430 out SPH, r31
431 clr r1
432 ret
433
434 /**/
435 ;###########################################################
436
437 .global sha1_nextBlock
438 ; === sha1_nextBlock ===
439 ; this is the core function for calculating SHA-1 hashes
440 ; param1: the 16-bit pointer to sha1_ctx structure
441 ; given in r25,r24 (r25 is most significant)
442 ; param2: an 16-bit pointer to 64 byte block to hash
443 ; given in r23,r22
444 sha1_nextBlock_localSpace = (16+5+1)*4 ; 16 32-bit values for w array and 5 32-bit values for a array (total 84 byte)
445
446 xtmp = 0
447 xNULL = 1
448 W1 = 10
449 W2 = 11
450 T1 = 12
451 T2 = 13
452 T3 = 14
453 T4 = 15
454 LoopC = 16
455 S = 17
456 tmp1 = 18
457 tmp2 = 19
458 tmp3 = 20
459 tmp4 = 21
460 F1 = 22
461 F2 = 23
462 F3 = 24
463 F4 = 25
464
465 /* byteorder: high number <--> high significance */
466 sha1_nextBlock:
467 ; initial, let's make some space ready for local vars
468 /* replace push & pop by mem ops? */
469 push r10
470 push r11
471 push r12
472 push r13
473 push r14
474 push r15
475 push r16
476 push r17
477 push r28
478 push r29
479 in r20, SPL
480 in r21, SPH
481 movw r18, r20 ;backup SP
482 ; movw r26, r20 ; X points to free space on stack /* maybe removeable? */
483 movw r30, r22 ; Z points to message
484 subi r20, lo8(sha1_nextBlock_localSpace) ;sbiw can do only up to 63
485 sbci r21, hi8(sha1_nextBlock_localSpace)
486 movw r26, r20 ; X points to free space on stack
487 in r0, SREG
488 cli ; we want to be uninterrupted while updating SP
489 out SPL, r20
490 out SREG, r0
491 out SPH, r21
492
493 push r18
494 push r19 /* push old SP on new stack */
495 push r24
496 push r25 /* param1 will be needed later */
497
498 /* load a[] with state */
499 movw 28, r24 /* load pointer to state in Y */
500 adiw r26, 1 ; X++
501
502 ldi LoopC, 5*4
503 1: ld tmp1, Y+
504 st X+, tmp1
505 dec LoopC
506 brne 1b
507
508 movw W1, r26 /* save pointer to w[0] */
509 /* load w[] with endian fixed message */
510 /* we might also use the changeendian32() function at bottom */
511 movw r30, r22 /* mv param2 (ponter to msg) to Z */
512 ldi LoopC, 16
513 1:
514 ldd tmp1, Z+3
515 st X+, tmp1
516 ldd tmp1, Z+2
517 st X+, tmp1
518 ldd tmp1, Z+1
519 st X+, tmp1
520 ld tmp1, Z
521 st X+, tmp1
522 adiw r30, 4
523 dec LoopC
524 brne 1b
525
526 ;clr LoopC /* LoopC is named t in FIPS 180-2 */
527 clr xtmp
528 sha1_nextBlock_mainloop:
529 mov S, LoopC
530 lsl S
531 lsl S
532 andi S, 0x3C /* S is a bytepointer so *4 */
533 /* load w[s] */
534 movw r26, W1
535 add r26, S /* X points at w[s] */
536 adc r27, xNULL
537 ld T1, X+
538 ld T2, X+
539 ld T3, X+
540 ld T4, X+
541
542 /*
543 push r26
544 push r27
545 push T4
546 push T3
547 push T2
548 push T1
549 in r26, SPL
550 in r27, SPH
551 adiw r26, 1
552 dbg_hexdump 4
553 pop T1
554 pop T2
555 pop T3
556 pop T4
557 pop r27
558 pop r26
559 */
560
561 cpi LoopC, 16
562 brlt sha1_nextBlock_mainloop_core
563 /* update w[s] */
564 ldi tmp1, 2*4
565 rcall 1f
566 ldi tmp1, 8*4
567 rcall 1f
568 ldi tmp1, 13*4
569 rcall 1f
570 rjmp 2f
571 1: /* this might be "outsourced" to save the jump above */
572 add tmp1, S
573 andi tmp1, 0x3f
574 movw r26, W1
575 add r26, tmp1
576 adc r27, xNULL
577 ld tmp2, X+
578 eor T1, tmp2
579 ld tmp2, X+
580 eor T2, tmp2
581 ld tmp2, X+
582 eor T3, tmp2
583 ld tmp2, X+
584 eor T4, tmp2
585 ret
586 2: /* now we just hav to do a ROTL(T) and save T back */
587 mov tmp2, T4
588 rol tmp2
589 rol T1
590 rol T2
591 rol T3
592 rol T4
593 movw r26, W1
594 add r26, S
595 adc r27, xNULL
596 st X+, T1
597 st X+, T2
598 st X+, T3
599 st X+, T4
600
601 sha1_nextBlock_mainloop_core: /* ther core function; T=ROTL5(a) ....*/
602 /* T already contains w[s] */
603 movw r26, W1
604 sbiw r26, 4*1 /* X points at a[4] aka e */
605 ld tmp1, X+
606 add T1, tmp1
607 ld tmp1, X+
608 adc T2, tmp1
609 ld tmp1, X+
610 adc T3, tmp1
611 ld tmp1, X+
612 adc T4, tmp1 /* T = w[s]+e */
613 sbiw r26, 4*5 /* X points at a[0] aka a */
614 ld F1, X+
615 ld F2, X+
616 ld F3, X+
617 ld F4, X+
618 mov tmp1, F4 /* X points at a[1] aka b */
619 ldi tmp2, 5
620 1:
621 rol tmp1
622 rol F1
623 rol F2
624 rol F3
625 rol F4
626 dec tmp2
627 brne 1b
628
629 add T1, F1
630 adc T2, F2
631 adc T3, F3
632 adc T4, F4 /* T = ROTL(a,5) + e + w[s] */
633
634 /* now we have to do this fucking conditional stuff */
635 ldi r30, lo8(sha1_nextBlock_xTable)
636 ldi r31, hi8(sha1_nextBlock_xTable)
637 add r30, xtmp
638 adc r31, xNULL
639 lpm tmp1, Z
640 cp tmp1, LoopC
641 brne 1f
642 inc xtmp
643 1: ldi r30, lo8(sha1_nextBlock_KTable)
644 ldi r31, hi8(sha1_nextBlock_KTable)
645 lsl xtmp
646 lsl xtmp
647 add r30, xtmp
648 adc r31, xNULL
649 lsr xtmp
650 lsr xtmp
651
652 lpm tmp1, Z+
653 add T1, tmp1
654 lpm tmp1, Z+
655 adc T2, tmp1
656 lpm tmp1, Z+
657 adc T3, tmp1
658 lpm tmp1, Z+
659 adc T4, tmp1
660 /* T = ROTL(a,5) + e + kt + w[s] */
661
662 /* Z-4 is just pointing to kt ... */
663 movw r28, r26 /* copy X in Y */
664 adiw r30, 3*4 /* now Z points to the rigth locatin in our jump-vector-table */
665 lsr r31
666 ror r30
667
668 icall
669 mov F1, tmp1
670 icall
671 mov F2, tmp1
672 icall
673 mov F3, tmp1
674 icall
675
676 add T1, F1
677 adc T2, F2
678 adc T3, F3
679 adc T4, tmp1 /* T = ROTL5(a) + f_t(b,c,d) + e + k_t + w[s] */
680 /* X points still at a[1] aka b, Y points at a[2] aka c */
681 /* update a[] */
682 sha1_nextBlock_update_a:
683 /*first we move all vars in a[] "one up" e=d, d=c, c=b, b=a*/
684 //adiw r28, 3*4 /* Y should point at a[4] aka e */
685 movw r28, W1
686 sbiw r28, 4
687
688 ldi tmp2, 4*4
689 1:
690 ld tmp1, -Y
691 std Y+4, tmp1
692 dec tmp2
693 brne 1b
694 /* Y points at a[0] aka a*/
695
696 movw r28, W1
697 sbiw r28, 5*4
698 /* store T in a[0] aka a */
699 st Y+, T1
700 st Y+, T2
701 st Y+, T3
702 st Y+, T4
703 /* Y points at a[1] aka b*/
704
705 /* rotate c */
706 ldd T1, Y+1*4
707 ldd T2, Y+1*4+1
708 ldd T3, Y+1*4+2
709 ldd T4, Y+1*4+3
710 mov tmp1, T1
711 ldi tmp2, 2
712 1: ror tmp1
713 ror T4
714 ror T3
715 ror T2
716 ror T1
717 dec tmp2
718 brne 1b
719 std Y+1*4+0, T1
720 std Y+1*4+1, T2
721 std Y+1*4+2, T3
722 std Y+1*4+3, T4
723 /*
724 push r27
725 push r26
726 movw r26, W1
727 sbiw r26, 4*5
728 dbg_hexdump 4*5
729 pop r26
730 pop r27
731 */
732 inc LoopC
733 cpi LoopC, 80
734 brge 1f
735 rjmp sha1_nextBlock_mainloop
736 /**************************************/
737 1:
738 /* littel patch */
739 sbiw r28, 4
740
741 /* add a[] to state and inc length */
742 pop r27
743 pop r26 /* now X points to state (and Y still at a[0]) */
744 ldi tmp4, 5
745 1: clc
746 ldi tmp3, 4
747 2: ld tmp1, X
748 ld tmp2, Y+
749 adc tmp1, tmp2
750 st X+, tmp1
751 dec tmp3
752 brne 2b
753 dec tmp4
754 brne 1b
755
756 /* now length += 512 */
757 adiw r26, 1 /* we skip the least significant byte */
758 ld tmp1, X
759 ldi tmp2, hi8(512) /* 2 */
760 add tmp1, tmp2
761 st X+, tmp1
762 ldi tmp2, 6
763 1:
764 ld tmp1, X
765 adc tmp1, xNULL
766 st X+, tmp1
767 dec tmp2
768 brne 1b
769
770 ; EPILOG
771 sha1_nextBlock_epilog:
772 /* now we should clean up the stack */
773 pop r21
774 pop r20
775 in r0, SREG
776 cli ; we want to be uninterrupted while updating SP
777 out SPL, r20
778 out SREG, r0
779 out SPH, r21
780
781 clr r1
782 pop r29
783 pop r28
784 pop r17
785 pop r16
786 pop r15
787 pop r14
788 pop r13
789 pop r12
790 pop r11
791 pop r10
792 ret
793
794 sha1_nextBlock_xTable:
795 .byte 20,40,60,0
796 sha1_nextBlock_KTable:
797 .int 0x5a827999
798 .int 0x6ed9eba1
799 .int 0x8f1bbcdc
800 .int 0xca62c1d6
801 sha1_nextBlock_JumpTable:
802 rjmp sha1_nextBlock_Ch
803 nop
804 rjmp sha1_nextBlock_Parity
805 nop
806 rjmp sha1_nextBlock_Maj
807 nop
808 rjmp sha1_nextBlock_Parity
809
810 /* X and Y still point at a[1] aka b ; return value in tmp1 */
811 sha1_nextBlock_Ch:
812 ld tmp1, Y+
813 mov tmp2, tmp1
814 com tmp2
815 ldd tmp3, Y+3 /* load from c */
816 and tmp1, tmp3
817 ldd tmp3, Y+7 /* load from d */
818 and tmp2, tmp3
819 eor tmp1, tmp2
820 ret
821
822 sha1_nextBlock_Maj:
823 ld tmp1, Y+
824 mov tmp2, tmp1
825 ldd tmp3, Y+3 /* load from c */
826 and tmp1, tmp3
827 ldd tmp4, Y+7 /* load from d */
828 and tmp2, tmp4
829 eor tmp1, tmp2
830 and tmp3, tmp4
831 eor tmp1, tmp3
832 ret
833
834 sha1_nextBlock_Parity:
835 ld tmp1, Y+
836 ldd tmp2, Y+3 /* load from c */
837 eor tmp1, tmp2
838 ldd tmp2, Y+7 /* load from d */
839 eor tmp1, tmp2
840 ret
841 /*
842 ch_str: .asciz "\r\nCh"
843 maj_str: .asciz "\r\nMaj"
844 parity_str: .asciz "\r\nParity"
845 */
846 ;###########################################################
847
848 .global sha1_init
849 ;void sha1_init(sha1_ctx_t *state){
850 ; DEBUG_S("\r\nSHA1_INIT");
851 ; state->h[0] = 0x67452301;
852 ; state->h[1] = 0xefcdab89;
853 ; state->h[2] = 0x98badcfe;
854 ; state->h[3] = 0x10325476;
855 ; state->h[4] = 0xc3d2e1f0;
856 ; state->length = 0;
857 ;}
858 ; param1: (Func3,r24) 16-bit pointer to sha1_ctx_t struct in ram
859 ; modifys: Z(r30,r31), Func1, r22
860 sha1_init:
861 movw r26, r24 ; (24,25) --> (26,27) load X with param1
862 ldi r30, lo8((sha1_init_vector))
863 ldi r31, hi8((sha1_init_vector))
864 ldi r22, 5*4 /* bytes to copy */
865 sha1_init_vloop:
866 lpm r23, Z+
867 st X+, r23
868 dec r22
869 brne sha1_init_vloop
870 ldi r22, 8
871 sha1_init_lloop:
872 st X+, r1
873 dec r22
874 brne sha1_init_lloop
875 ret
876
877 sha1_init_vector:
878 .int 0x67452301;
879 .int 0xefcdab89;
880 .int 0x98badcfe;
881 .int 0x10325476;
882 .int 0xc3d2e1f0;
883