diff -cr eccp109-132-2/Makefile sos4/Makefile *** eccp109-132-2/Makefile Fri Aug 2 00:12:33 2002 --- sos4/Makefile Mon Oct 7 21:00:55 2002 *************** *** 231,237 **** $(SP_ASMFILES) $(SP_OSFILES) ; gcc -DSPCL $(SP_OPT) -o eccp109-spCL \ -I$(INC) $(INT)/spCL/eccp109-spCL.c \ ! $(PICFILES) $(SP_ASMFILES) $(SP_OSFILES) -lsocket -lnsl --- 231,238 ---- $(SP_ASMFILES) $(SP_OSFILES) ; gcc -DSPCL $(SP_OPT) -o eccp109-spCL \ -I$(INC) $(INT)/spCL/eccp109-spCL.c \ ! $(PICFILES) $(SP_ASMFILES) $(SP_OSFILES) ! # $(PICFILES) $(SP_ASMFILES) $(SP_OSFILES) -lsocket -lnsl diff -cr eccp109-132-2/asm/sparc/add128.s sos4/asm/sparc/add128.s *** eccp109-132-2/asm/sparc/add128.s Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/add128.s Mon Oct 7 20:58:04 2002 *************** *** 1,49 **** ! /******************************************************************/ ! /* add128.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the sum of two <128-bit integers. */ ! /******************************************************************/ ! /* An input arg may be the same as an output arg. */ ! /******************************************************************/ ! /** */ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 */ ! /** */ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! .section ".text" ! ! .global add128 ! ! .align 4 ! add128: ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! addcc %l7,%l6,%l5 ! l5 <- A[0] + B[0] ! st %l5,[%i0] ! RES[0] <- l5 ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! addxcc %l7,%l6,%l5 ! l5 <- A[1] + B[1] + carry ! st %l5,[%i0+4] ! RES[1] <- l5 ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! addxcc %l7,%l6,%l5 ! l5 <- A[2] + B[2] + carry ! st %l5,[%i0+8] ! RES[2] <- l5 ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! addx %l7,%l6,%l5 ! 
l5 <- A[3] + B[3] + carry ! st %l5,[%i0+12] ! RES[3] <- l5 ! ! ret ! restore --- 1,49 ---- ! /******************************************************************/ ! /* add128.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the sum of two <128-bit integers. */ ! /******************************************************************/ ! /* An input arg may be the same as an output arg. */ ! /******************************************************************/ ! /** */ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 */ ! /** */ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! /*.section ".text"*/ ! ! .global add128 ! ! .align 4 ! add128: ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! addcc %l7,%l6,%l5 ! l5 <- A[0] + B[0] ! st %l5,[%i0] ! RES[0] <- l5 ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! addxcc %l7,%l6,%l5 ! l5 <- A[1] + B[1] + carry ! st %l5,[%i0+4] ! RES[1] <- l5 ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! addxcc %l7,%l6,%l5 ! l5 <- A[2] + B[2] + carry ! st %l5,[%i0+8] ! RES[2] <- l5 ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! addx %l7,%l6,%l5 ! l5 <- A[3] + B[3] + carry ! st %l5,[%i0+12] ! RES[3] <- l5 ! ! ret ! restore diff -cr eccp109-132-2/asm/sparc/addmod_n109.s sos4/asm/sparc/addmod_n109.s *** eccp109-132-2/asm/sparc/addmod_n109.s Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/addmod_n109.s Mon Oct 7 20:59:06 2002 *************** *** 1,130 **** ! /******************************************************************/ ! /* addmod_n109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the sum of two 109-bit integers modulo */ ! /* a fixed 109-bit prime, n. Specifically, the fixed prime is the */ ! /* order of the point P in the Certicom ECCP-109 Challenge: */ ! 
/* n = 1BD5 79792B38 0B049C4D 13A75AE5 */ ! /******************************************************************/ ! /* One or both of the input args may be the same as the output arg*/ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! .section ".text" ! ! .global addmod_n109 ! ! .align 4 ! addmod_n109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! addcc %l7,%l6,%l0 ! C[0] <- A[0] + B[0] ! st %l0,[%i0] ! RES[0] <- C[0] ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! addxcc %l7,%l6,%l1 ! C[1] <- A[1] + B[1] + carry ! st %l1,[%i0+4] ! RES[1] <- C[1] ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! addxcc %l7,%l6,%l2 ! C[2] <- A[2] + B[2] + carry ! st %l2,[%i0+8] ! RES[2] <- C[2] ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! addx %l7,%l6,%l3 ! C[3] <- A[3] + B[3] + carry ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! /******************************************************************/ ! /** The REDUCTION by our prime modulus, n (as 4-words in o0-o3) **/ ! /******************************************************************/ ! ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus n = 1BD5 79792B38 0B049C4D 13A75AE5 ! o3 o2 o1 o0 ! **/ ! ! sethi %hi(0x13A75AE5),%o0 ! o0 <- LS word of n ! or %o0,%lo(0x13A75AE5),%o0 ! sethi %hi(0x0B049C4D),%o1 ! or %o1,%lo(0x0B049C4D),%o1 ! sethi %hi(0x79792B38),%o2 ! or %o2,%lo(0x79792B38),%o2 ! sethi %hi(0x00001BD5),%o3 ! or %o3,%lo(0x00001BD5),%o3 ! o3 <- MS word of n ! ! /** ! [C0,C1,C2,C3] is now in [0,2n) so we need at MOST one ! subtraction to be done. ! ! NB: At this point the values of C0,C1,C2,C3 are held in a ! 
temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! is MS word of C < MS word of n? ! blu .DONE_A ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! ! cmp %l2,%o2 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l1,%o1 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l0,%o0 ! is LS word of C < LS word of n? ! blu .DONE_A ! yes, C < n so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our prime modulus, n. **/ ! /******************************************************************/ ! mov 0,%l0 ! C[0] <- 0 ! mov 0,%l1 ! C[1] <- 0 ! mov 0,%l2 ! C[2] <- 0 ! mov 0,%l3 ! C[3] <- 0 ! ba .DONE_A ! nop ! ! /******************************************************************/ ! /** Here when C is geater than our prime modulus, n. **/ ! /******************************************************************/ ! .SUBTRACT_A: ! subcc %l0,%o0,%l0 ! C[0] <- C[0] - p0 ! subxcc %l1,%o1,%l1 ! C[1] <- C[1] - p1 + carry ! subxcc %l2,%o2,%l2 ! C[2] <- C[2] - p2 + carry ! subx %l3,%o3,%l3 ! C[3] <- C[3] - p3 + carry ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_A: ! st %l0,[%i0] ! RES[0] <- C[0] ! st %l1,[%i0+4] ! RES[1] <- C[1] ! st %l2,[%i0+8] ! RES[2] <- C[2] ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! ret ! restore --- 1,130 ---- ! /******************************************************************/ ! /* addmod_n109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the sum of two 109-bit integers modulo */ ! /* a fixed 109-bit prime, n. Specifically, the fixed prime is the */ ! /* order of the point P in the Certicom ECCP-109 Challenge: */ ! /* n = 1BD5 79792B38 0B049C4D 13A75AE5 */ ! 
/******************************************************************/ ! /* One or both of the input args may be the same as the output arg*/ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! /*.section ".text"*/ ! ! .global addmod_n109 ! ! .align 4 ! addmod_n109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! addcc %l7,%l6,%l0 ! C[0] <- A[0] + B[0] ! st %l0,[%i0] ! RES[0] <- C[0] ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! addxcc %l7,%l6,%l1 ! C[1] <- A[1] + B[1] + carry ! st %l1,[%i0+4] ! RES[1] <- C[1] ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! addxcc %l7,%l6,%l2 ! C[2] <- A[2] + B[2] + carry ! st %l2,[%i0+8] ! RES[2] <- C[2] ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! addx %l7,%l6,%l3 ! C[3] <- A[3] + B[3] + carry ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! /******************************************************************/ ! /** The REDUCTION by our prime modulus, n (as 4-words in o0-o3) **/ ! /******************************************************************/ ! ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus n = 1BD5 79792B38 0B049C4D 13A75AE5 ! o3 o2 o1 o0 ! **/ ! ! sethi %hi(0x13A75AE5),%o0 ! o0 <- LS word of n ! or %o0,%lo(0x13A75AE5),%o0 ! sethi %hi(0x0B049C4D),%o1 ! or %o1,%lo(0x0B049C4D),%o1 ! sethi %hi(0x79792B38),%o2 ! or %o2,%lo(0x79792B38),%o2 ! sethi %hi(0x00001BD5),%o3 ! or %o3,%lo(0x00001BD5),%o3 ! o3 <- MS word of n ! ! /** ! [C0,C1,C2,C3] is now in [0,2n) so we need at MOST one ! subtraction to be done. ! ! NB: At this point the values of C0,C1,C2,C3 are held in a ! temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! 
is MS word of C < MS word of n? ! blu .DONE_A ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! ! cmp %l2,%o2 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l1,%o1 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l0,%o0 ! is LS word of C < LS word of n? ! blu .DONE_A ! yes, C < n so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our prime modulus, n. **/ ! /******************************************************************/ ! mov 0,%l0 ! C[0] <- 0 ! mov 0,%l1 ! C[1] <- 0 ! mov 0,%l2 ! C[2] <- 0 ! mov 0,%l3 ! C[3] <- 0 ! ba .DONE_A ! nop ! ! /******************************************************************/ ! /** Here when C is geater than our prime modulus, n. **/ ! /******************************************************************/ ! .SUBTRACT_A: ! subcc %l0,%o0,%l0 ! C[0] <- C[0] - p0 ! subxcc %l1,%o1,%l1 ! C[1] <- C[1] - p1 + carry ! subxcc %l2,%o2,%l2 ! C[2] <- C[2] - p2 + carry ! subx %l3,%o3,%l3 ! C[3] <- C[3] - p3 + carry ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_A: ! st %l0,[%i0] ! RES[0] <- C[0] ! st %l1,[%i0+4] ! RES[1] <- C[1] ! st %l2,[%i0+8] ! RES[2] <- C[2] ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! ret ! restore diff -cr eccp109-132-2/asm/sparc/addmod_p109.s sos4/asm/sparc/addmod_p109.s *** eccp109-132-2/asm/sparc/addmod_p109.s Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/addmod_p109.s Mon Oct 7 20:58:34 2002 *************** *** 1,130 **** ! /******************************************************************/ ! /* addmod_p109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the sum of two 109-bit integers */ ! /* modulo a fixed 109-bit prime, p. Specifically, the fixed prime */ ! 
/* is the number over which the Certicom ECCP-109 Challenge curve */ ! /* is defined: */ ! /* p = 1BD5 79792B38 0B5B521E 6D9FB599 */ ! /******************************************************************/ ! /* One or both of the input args may be the same as the output arg*/ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! .section ".text" ! ! .global addmod_p109 ! ! .align 4 ! ! addmod_p109: ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! addcc %l7,%l6,%l0 ! C[0] <- A[0] + B[0] ! st %l0,[%i0] ! RES[0] <- C[0] ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! addxcc %l7,%l6,%l1 ! C[1] <- A[1] + B[1] + carry ! st %l1,[%i0+4] ! RES[1] <- C[1] ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! addxcc %l7,%l6,%l2 ! C[2] <- A[2] + B[2] + carry ! st %l2,[%i0+8] ! RES[2] <- C[2] ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! addx %l7,%l6,%l3 ! C[3] <- A[3] + B[3] + carry ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! /******************************************************************/ ! /** The REDUCTION by our prime modulus, p (as 4-words in o0-o3) **/ ! /******************************************************************/ ! ! /** ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! o3 o2 o1 o0 ! **/ ! ! sethi %hi(0x6D9FB599),%o0 ! o0 <- LS word of p ! or %o0,%lo(0x6D9FB599),%o0 ! sethi %hi(0x0B5B521E),%o1 ! or %o1,%lo(0x0B5B521E),%o1 ! sethi %hi(0x79792B38),%o2 ! or %o2,%lo(0x79792B38),%o2 ! sethi %hi(0x00001BD5),%o3 ! or %o3,%lo(0x00001BD5),%o3 ! o3 <- MS word of p ! ! /** ! [C0,C1,C2,C3] is now in [0,2p) so we need at MOST one ! subtraction to be done. ! ! 
NB: At this point the values of C0,C1,C2,C3 are held in a ! temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! is MS word of C < MS word of p? ! blu .DONE_A ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > p so reduction needed ! ! cmp %l2,%o2 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l1,%o1 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l0,%o0 ! is LS word of C < LS word of p? ! blu .DONE_A ! yes, C < p so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > p so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our prime modulus, p. **/ ! /******************************************************************/ ! mov 0,%l0 ! C[0] <- 0 ! mov 0,%l1 ! C[1] <- 0 ! mov 0,%l2 ! C[2] <- 0 ! mov 0,%l3 ! C[3] <- 0 ! ba .DONE_A ! nop ! ! /******************************************************************/ ! /** Here when C is geater than our prime modulus, p. **/ ! /******************************************************************/ ! .SUBTRACT_A: ! subcc %l0,%o0,%l0 ! C[0] <- C[0] - p0 ! subxcc %l1,%o1,%l1 ! C[1] <- C[1] - p1 + carry ! subxcc %l2,%o2,%l2 ! C[2] <- C[2] - p2 + carry ! subx %l3,%o3,%l3 ! C[3] <- C[3] - p3 + carry ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_A: ! st %l0,[%i0] ! RES[0] <- C[0] ! st %l1,[%i0+4] ! RES[1] <- C[1] ! st %l2,[%i0+8] ! RES[2] <- C[2] ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! ret ! restore --- 1,130 ---- ! /******************************************************************/ ! /* addmod_p109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the sum of two 109-bit integers */ ! /* modulo a fixed 109-bit prime, p. Specifically, the fixed prime */ ! /* is the number over which the Certicom ECCP-109 Challenge curve */ ! /* is defined: */ ! 
/* p = 1BD5 79792B38 0B5B521E 6D9FB599 */ ! /******************************************************************/ ! /* One or both of the input args may be the same as the output arg*/ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! /*.section ".text"*/ ! ! .global addmod_p109 ! ! .align 4 ! ! addmod_p109: ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! addcc %l7,%l6,%l0 ! C[0] <- A[0] + B[0] ! st %l0,[%i0] ! RES[0] <- C[0] ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! addxcc %l7,%l6,%l1 ! C[1] <- A[1] + B[1] + carry ! st %l1,[%i0+4] ! RES[1] <- C[1] ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! addxcc %l7,%l6,%l2 ! C[2] <- A[2] + B[2] + carry ! st %l2,[%i0+8] ! RES[2] <- C[2] ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! addx %l7,%l6,%l3 ! C[3] <- A[3] + B[3] + carry ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! /******************************************************************/ ! /** The REDUCTION by our prime modulus, p (as 4-words in o0-o3) **/ ! /******************************************************************/ ! ! /** ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! o3 o2 o1 o0 ! **/ ! ! sethi %hi(0x6D9FB599),%o0 ! o0 <- LS word of p ! or %o0,%lo(0x6D9FB599),%o0 ! sethi %hi(0x0B5B521E),%o1 ! or %o1,%lo(0x0B5B521E),%o1 ! sethi %hi(0x79792B38),%o2 ! or %o2,%lo(0x79792B38),%o2 ! sethi %hi(0x00001BD5),%o3 ! or %o3,%lo(0x00001BD5),%o3 ! o3 <- MS word of p ! ! /** ! [C0,C1,C2,C3] is now in [0,2p) so we need at MOST one ! subtraction to be done. ! ! NB: At this point the values of C0,C1,C2,C3 are held in a ! 
temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! is MS word of C < MS word of p? ! blu .DONE_A ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > p so reduction needed ! ! cmp %l2,%o2 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l1,%o1 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l0,%o0 ! is LS word of C < LS word of p? ! blu .DONE_A ! yes, C < p so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > p so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our prime modulus, p. **/ ! /******************************************************************/ ! mov 0,%l0 ! C[0] <- 0 ! mov 0,%l1 ! C[1] <- 0 ! mov 0,%l2 ! C[2] <- 0 ! mov 0,%l3 ! C[3] <- 0 ! ba .DONE_A ! nop ! ! /******************************************************************/ ! /** Here when C is geater than our prime modulus, p. **/ ! /******************************************************************/ ! .SUBTRACT_A: ! subcc %l0,%o0,%l0 ! C[0] <- C[0] - p0 ! subxcc %l1,%o1,%l1 ! C[1] <- C[1] - p1 + carry ! subxcc %l2,%o2,%l2 ! C[2] <- C[2] - p2 + carry ! subx %l3,%o3,%l3 ! C[3] <- C[3] - p3 + carry ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_A: ! st %l0,[%i0] ! RES[0] <- C[0] ! st %l1,[%i0+4] ! RES[1] <- C[1] ! st %l2,[%i0+8] ! RES[2] <- C[2] ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! ret ! restore diff -cr eccp109-132-2/asm/sparc/incmod_n109.s sos4/asm/sparc/incmod_n109.s *** eccp109-132-2/asm/sparc/incmod_n109.s Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/incmod_n109.s Mon Oct 7 20:59:17 2002 *************** *** 1,117 **** ! /******************************************************************/ ! /* incmod_n109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to increment a 109-bit integer modulo a fixed */ ! /* 109-bit number, n. 
Specifically, the fixed prime is the */ ! /* order of the point P in the Certicom ECCP-109 Challenge: */ ! /* n = 1BD5 79792B38 0B049C4D 13A75AE5 */ ! /* */ ! /* Note that we use this function to sometimes increment a value */ ! /* that is already a little bigger than n, so we can't just test */ ! /* for == n */ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! **/ ! ! .section ".text" ! ! .global incmod_n109 ! ! .align 4 ! incmod_n109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i0],%l0 ! C[0] <- RES[0] ! ld [%i0+4],%l1 ! C[1] <- RES[1] ! ld [%i0+8],%l2 ! C[2] <- RES[2] ! ld [%i0+12],%l3 ! C[3] <- RES[3] ! addcc %l0,1,%l0 ! C[0] <- C[0] + 1 ! addxcc %l1,0,%l1 ! C[1] <- C[1] + carry ! addxcc %l2,0,%l2 ! C[2] <- C[2] + carry ! addx %l3,0,%l3 ! C[3] <- C[3] + carry ! /******************************************************************/ ! /** The REDUCTION by our modulus, n (as 4-words in o0-o3) **/ ! /******************************************************************/ ! ! /** ! Remember: This is another place where we have hardcoded ! the modulus n = 1BD5 79792B38 0B049C4D 13A75AE5 ! o3 o2 o1 o0 ! **/ ! ! sethi %hi(0x13A75AE5),%o0 ! o0 <- LS word of n ! or %o0,%lo(0x13A75AE5),%o0 ! sethi %hi(0x0B049C4D),%o1 ! or %o1,%lo(0x0B049C4D),%o1 ! sethi %hi(0x79792B38),%o2 ! or %o2,%lo(0x79792B38),%o2 ! sethi %hi(0x00001BD5),%o3 ! or %o3,%lo(0x00001BD5),%o3 ! o3 <- MS word of n ! ! /** ! [C0,C1,C2,C3] is now in [0,2n) so we need at MOST one ! subtraction to be done. ! ! NB: At this point the values of C0,C1,C2,C3 are held in a ! temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! is MS word of C < MS word of n? ! blu .DONE_A ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! ! 
cmp %l2,%o2 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l1,%o1 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l0,%o0 ! is LS word of C < LS word of n? ! blu .DONE_A ! yes, C < n so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our modulus, n. **/ ! /******************************************************************/ ! mov 0,%l0 ! C[0] <- 0 ! mov 0,%l1 ! C[1] <- 0 ! mov 0,%l2 ! C[2] <- 0 ! mov 0,%l3 ! C[3] <- 0 ! ba .DONE_A ! nop ! ! /******************************************************************/ ! /** Here when C is geater than our modulus, n. **/ ! /******************************************************************/ ! .SUBTRACT_A: ! subcc %l0,%o0,%l0 ! C[0] <- C[0] - p0 ! subxcc %l1,%o1,%l1 ! C[1] <- C[1] - p1 + carry ! subxcc %l2,%o2,%l2 ! C[2] <- C[2] - p2 + carry ! subx %l3,%o3,%l3 ! C[3] <- C[3] - p3 + carry ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_A: ! st %l0,[%i0] ! RES[0] <- C[0] ! st %l1,[%i0+4] ! RES[1] <- C[1] ! st %l2,[%i0+8] ! RES[2] <- C[2] ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! ret ! restore --- 1,117 ---- ! /******************************************************************/ ! /* incmod_n109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to increment a 109-bit integer modulo a fixed */ ! /* 109-bit number, n. Specifically, the fixed prime is the */ ! /* order of the point P in the Certicom ECCP-109 Challenge: */ ! /* n = 1BD5 79792B38 0B049C4D 13A75AE5 */ ! /* */ ! /* Note that we use this function to sometimes increment a value */ ! /* that is already a little bigger than n, so we can't just test */ ! /* for == n */ ! /******************************************************************/ ! /** **/ ! 
/** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! **/ ! ! /*.section ".text"*/ ! ! .global incmod_n109 ! ! .align 4 ! incmod_n109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i0],%l0 ! C[0] <- RES[0] ! ld [%i0+4],%l1 ! C[1] <- RES[1] ! ld [%i0+8],%l2 ! C[2] <- RES[2] ! ld [%i0+12],%l3 ! C[3] <- RES[3] ! addcc %l0,1,%l0 ! C[0] <- C[0] + 1 ! addxcc %l1,0,%l1 ! C[1] <- C[1] + carry ! addxcc %l2,0,%l2 ! C[2] <- C[2] + carry ! addx %l3,0,%l3 ! C[3] <- C[3] + carry ! /******************************************************************/ ! /** The REDUCTION by our modulus, n (as 4-words in o0-o3) **/ ! /******************************************************************/ ! ! /** ! Remember: This is another place where we have hardcoded ! the modulus n = 1BD5 79792B38 0B049C4D 13A75AE5 ! o3 o2 o1 o0 ! **/ ! ! sethi %hi(0x13A75AE5),%o0 ! o0 <- LS word of n ! or %o0,%lo(0x13A75AE5),%o0 ! sethi %hi(0x0B049C4D),%o1 ! or %o1,%lo(0x0B049C4D),%o1 ! sethi %hi(0x79792B38),%o2 ! or %o2,%lo(0x79792B38),%o2 ! sethi %hi(0x00001BD5),%o3 ! or %o3,%lo(0x00001BD5),%o3 ! o3 <- MS word of n ! ! /** ! [C0,C1,C2,C3] is now in [0,2n) so we need at MOST one ! subtraction to be done. ! ! NB: At this point the values of C0,C1,C2,C3 are held in a ! temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! is MS word of C < MS word of n? ! blu .DONE_A ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! ! cmp %l2,%o2 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l1,%o1 ! blu .DONE_A ! nop ! bgu .SUBTRACT_A ! ! cmp %l0,%o0 ! is LS word of C < LS word of n? ! blu .DONE_A ! yes, C < n so no reduction needed ! nop ! bgu .SUBTRACT_A ! no, C > n so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our modulus, n. **/ ! 
/******************************************************************/ ! mov 0,%l0 ! C[0] <- 0 ! mov 0,%l1 ! C[1] <- 0 ! mov 0,%l2 ! C[2] <- 0 ! mov 0,%l3 ! C[3] <- 0 ! ba .DONE_A ! nop ! ! /******************************************************************/ ! /** Here when C is geater than our modulus, n. **/ ! /******************************************************************/ ! .SUBTRACT_A: ! subcc %l0,%o0,%l0 ! C[0] <- C[0] - p0 ! subxcc %l1,%o1,%l1 ! C[1] <- C[1] - p1 + carry ! subxcc %l2,%o2,%l2 ! C[2] <- C[2] - p2 + carry ! subx %l3,%o3,%l3 ! C[3] <- C[3] - p3 + carry ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_A: ! st %l0,[%i0] ! RES[0] <- C[0] ! st %l1,[%i0+4] ! RES[1] <- C[1] ! st %l2,[%i0+8] ! RES[2] <- C[2] ! st %l3,[%i0+12] ! RES[3] <- C[3] ! ! ret ! restore diff -cr eccp109-132-2/asm/sparc/inv_p109.s sos4/asm/sparc/inv_p109.s *** eccp109-132-2/asm/sparc/inv_p109.s Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/inv_p109.s Mon Oct 7 20:58:45 2002 *************** *** 1,292 **** ! /******************************************************************/ ! /* inv_p109.s */ ! /* Brian Gladman & Chris Monico, 1/5/00 */ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! OP: %i1 ! **/ ! ! .section ".text" ! ! .global inv_p109 ! ! .align 4 ! inv_p109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! /************ ! /* D[0] = l0 ! /* D[1] = l1 ! /* D[2] = l2 ! /* D[3] = l3 ! /* /* B[0] = l4 ! /* B[1] = l5 ! /* B[2] = l6 ! /* B[3] = l7 ! /* /* u[0] = i2 ! /* u[1] = i3 ! /* u[2] = i4 ! /* u[3] = i5 ! /* /* v[0] = o2 ! /* v[1] = o3 ! /* v[2] = o4 ! /* v[3] = o5 ! /************ ! ! 
/** u <- p **/ ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! set 0x6D9FB599,%i2 ! u[0] <- p0 ! set 0x0B5B521E,%i3 ! u[1] <- p1 ! set 0x79792B38,%i4 ! u[2] <- p2 ! set 0x00001BD5,%i5 ! u[3] <- p3 ! ! /** v <- OP **/ ! ! ld [%i1],%o2 ! v[0] <- OP[0] ! ld [%i1+4],%o3 ! v[1] <- OP[1] ! ld [%i1+8],%o4 ! v[2] <- OP[2] ! ld [%i1+12],%o5 ! v[3] <- OP[3] ! ! /** B, D <- 0, D[0] <- 1 **/ ! ! clr %l4 ! B[0] <- 0 ! clr %l5 ! clr %l6 ! clr %l7 ! B[3] <- 0 ! ! mov 0x00000001,%l0 ! D[0] <- 1 ! clr %l1 ! clr %l2 ! clr %l3 ! D[3] <- 0 ! ! .1: ! andcc %i2,0x00000001,%g0 ! u[0] bit0 == 1 ? ! bnz .4 ! yes, goto .4 ! nop ! ! /** DIV2(u) **/ ! ! sll %i5,31,%o0 ! o0 <- bit0 of u[3] ! sra %i5,1,%i5 ! u[3] shift R aritmetic 1 bit ! sll %i4,31,%o1 ! o1 <- bit0 of u[2] ! srl %i4,1,%i4 ! u[2] shift R logical 1 bit ! or %i4,%o0,%i4 ! u[2] bit31 <- u[3] bit0 ! sll %i3,31,%o0 ! o0 <- bit0 of u[1] ! srl %i3,1,%i3 ! u[1] shift R logical 1 bit ! or %i3,%o1,%i3 ! u[1] bit31 <- u[2] bit0 ! sll %i2,31,%o1 ! o1 <- bit0 of u[0] ! srl %i2,1,%i2 ! u[0] shift R logical 1 bit ! or %i2,%o0,%i2 ! u[0] bit31 <- u[1] bit0 ! orcc %o1,%o1,%g0 ! u[0] bit0 (before R shift) == '1'? ! bz .2 ! no, goto .2. Dividend even so skip rounding ! nop ! addcc %i2,1,%i2 ! dividend += 1; round -> 0 ! addxcc %i3,0,%i3 ! addxcc %i4,0,%i4 ! addx %i5,0,%i5 ! ! .2: ! andcc %l4,0x00000001,%g0 ! B[0] bit0 == 1 ? ! bz .3 ! no, goto .3 ! nop ! ! /** sub128(B,B,p) **/ ! ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! set 0x6D9FB599,%o1 ! o1 <- p0 ! subcc %l4,%o1,%l4 ! B[0] <- B[0] - p0 ! set 0x0B5B521E,%o1 ! o1 <- p1 ! subxcc %l5,%o1,%l5 ! B[1] <- B[1] - p1 + carry ! set 0x79792B38,%o1 ! o1 <- p2 ! subxcc %l6,%o1,%l6 ! B[2] <- B[2] - p2 + carry ! set 0x00001BD5,%o1 ! o1 <- p3 ! subx %l7,%o1,%l7 ! B[3] <- B[3] - p3 + carry ! ! .3: ! ! /** DIV2(B) **/ ! ! sll %l7,31,%o0 ! 
o0 <- bit0 of B[3] ! sra %l7,1,%l7 ! B[3] shift R aritmetic 1 bit ! sll %l6,31,%o1 ! o1 <- bit0 of B[2] ! srl %l6,1,%l6 ! B[2] shift R logical 1 bit ! or %l6,%o0,%l6 ! B[2] bit31 <- B[3] bit0 ! sll %l5,31,%o0 ! o0 <- bit0 of B[1] ! srl %l5,1,%l5 ! B[1] shift R logical 1 bit ! or %l5,%o1,%l5 ! B[1] bit31 <- B[2] bit0 ! sll %l4,31,%o1 ! o1 <- bit0 of B[0] ! srl %l4,1,%l4 ! B[0] shift R logical 1 bit ! or %l4,%o0,%l4 ! B[0] bit31 <- B[1] bit0 ! orcc %o1,%o1,%g0 ! B[0] bit0 (before R shift) == '1'? ! bz .1 ! no, goto .1. Dividend even so skip rounding ! nop ! addcc %l4,1,%l4 ! dividend +=; round -> 0 ! addxcc %l5,0,%l5 ! addxcc %l6,0,%l6 ! addx %l7,0,%l7 ! ba .1 ! nop ! ! .4: ! andcc %o2,0x00000001,%g0 ! v[0] bit0 == 1 ? ! bnz .7 ! yes, goto label .7 ! nop ! ! /** DIV2(v) **/ ! ! sll %o5,31,%o0 ! o0 <- bit0 of v[3] ! sra %o5,1,%o5 ! v[3] shift R aritmetic 1 bit ! sll %o4,31,%o1 ! o1 <- bit0 of v[2] ! srl %o4,1,%o4 ! v[2] shift R logical 1 bit ! or %o4,%o0,%o4 ! v[2] bit31 <- v[3] bit0 ! sll %o3,31,%o0 ! o0 <- bit0 of v[1] ! srl %o3,1,%o3 ! v[1] shift R logical 1 bit ! or %o3,%o1,%o3 ! v[1] bit31 <- v[2] bit0 ! sll %o2,31,%o1 ! o1 <- bit0 of v[0] ! srl %o2,1,%o2 ! v[0] shift R logical 1 bit ! or %o2,%o0,%o2 ! v[0] bit31 <- v[1] bit0 ! orcc %o1,%o1,%g0 ! v[0] bit0 (before R shift) == '1'? ! bz .5 ! no, goto .5. Dividend even so skip rounding ! nop ! addcc %o2,1,%o2 ! dividend += 1; round -> 0 ! addxcc %o3,0,%o3 ! addxcc %o4,0,%o4 ! addx %o5,0,%o5 ! ! .5: ! andcc %l0,0x00000001,%g0 ! D[0] bit0 == 1 ? ! bz .6 ! no, goto label .6 ! nop ! ! /** sub128(D,D,p) **/ ! ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! set 0x6D9FB599,%o1 ! o1 <- p0 ! subcc %l0,%o1,%l0 ! D[0] <- D[0] - p0 ! set 0x0B5B521E,%o1 ! o1 <- p1 ! subxcc %l1,%o1,%l1 ! D[1] <- D[1] - p1 + carry ! set 0x79792B38,%o1 ! o1 <- p2 ! subxcc %l2,%o1,%l2 ! D[2] <- D[2] - p2 + carry ! set 0x00001BD5,%o1 ! o1 <- p3 ! subx %l3,%o1,%l3 ! 
D[3] <- D[3] - p3 + carry ! ! .6: ! ! /** DIV2(D) **/ ! ! sll %l3,31,%o0 ! o0 <- bit0 of D[3] ! sra %l3,1,%l3 ! D[3] shift R aritmetic 1 bit ! sll %l2,31,%o1 ! o1 <- bit0 of D[2] ! srl %l2,1,%l2 ! D[2] shift R logical 1 bit ! or %l2,%o0,%l2 ! D[2] bit31 <- D[3] bit0 ! sll %l1,31,%o0 ! o0 <- bit0 of D[1] ! srl %l1,1,%l1 ! D[1] shift R logical 1 bit ! or %l1,%o1,%l1 ! D[1] bit31 <- D[2] bit0 ! sll %l0,31,%o1 ! o1 <- bit0 of D[0] ! srl %l0,1,%l0 ! D[0] shift R logical 1 bit ! or %l0,%o0,%l0 ! D[0] bit31 <- D[1] bit0 ! orcc %o1,%o1,%g0 ! D[0] bit0 (before R shift) == '1'? ! bz .4 ! no, goto .4. Dividend even so skip rounding ! nop ! addcc %l0,1,%l0 ! dividend += 1; round -> 0 ! addxcc %l1,0,%l1 ! addxcc %l2,0,%l2 ! addx %l3,0,%l3 ! ba .4 ! nop ! ! .7: ! subcc %i2,%o2,%g0 ! u[0] - v[0] ! subxcc %i3,%o3,%g0 ! u[1] - v[1] + carry ! subxcc %i4,%o4,%g0 ! u[2] - v[2] + carry ! subxcc %i5,%o5,%g0 ! u[3] - v[3] + carry ! bl .8 ! branch if u < v ! nop ! ! subcc %i2,%o2,%i2 ! u[0] <- u[0] - v[0] ! subxcc %i3,%o3,%i3 ! u[1] <- u[1] - v[1] + carry ! subxcc %i4,%o4,%i4 ! u[2] <- u[2] - v[2] + carry ! subx %i5,%o5,%i5 ! u[3] <- u[3] - v[3] + carry ! subcc %l4,%l0,%l4 ! B[0] <- B[0] - D[0] ! subxcc %l5,%l1,%l5 ! B[1] <- B[1] - D[1] + carry ! subxcc %l6,%l2,%l6 ! B[2] <- B[2] - D[2] + carry ! subx %l7,%l3,%l7 ! B[3] <- B[3] - D[3] + carry ! ba .9 ! nop ! ! .8: ! subcc %o2,%i2,%o2 ! v[0] <- v[0] - u[0] ! subxcc %o3,%i3,%o3 ! v[1] <- v[1] - u[1] + carry ! subxcc %o4,%i4,%o4 ! v[2] <- v[2] - u[2] + carry ! subx %o5,%i5,%o5 ! v[3] <- v[3] - u[3] + carry ! subcc %l0,%l4,%l0 ! D[0] <- D[0] - B[0] ! subxcc %l1,%l5,%l1 ! D[1] <- D[1] - B[1] + carry ! subxcc %l2,%l6,%l2 ! D[2] <- D[2] - B[2] + carry ! subx %l3,%l7,%l3 ! D[3] <- D[3] - B[3] + carry ! ! .9: ! mov %i2,%o0 ! or %i3,%o0,%o0 ! or %i4,%o0,%o0 ! orcc %i5,%o0,%o0 ! bz .10 ! branch if u == 0 ! nop ! ba .1 ! u > 0 ! nop ! ! .10: ! orcc %l3,%l3,%g0 ! D[3] bit 31 == 0? ! bpos .12 ! yes, so return D. ! nop ! ! .11: ! /** ! ! 
Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! .11a: ! set 0x6D9FB599,%o1 ! o1 <- p0 ! addcc %l0,%o1,%l0 ! D[0] <- D[0] + p0 ! set 0x0B5B521E,%o1 ! o1 <- p1 ! addxcc %l1,%o1,%l1 ! D[1] <- D[1] + p1 + carry ! set 0x79792B38,%o1 ! o1 <- p2 ! addxcc %l2,%o1,%l2 ! D[2] <- D[2] + p2 + carry ! set 0x00001BD5,%o1 ! o1 <- p3 ! addxcc %l3,%o1,%l3 ! D[3] <- D[3] + p3 + carry ! bneg .11a ! nop ! ! .12: ! st %l0,[%i0] ! RES[0] <- D[0] ! st %l1,[%i0+4] ! st %l2,[%i0+8] ! st %l3,[%i0+12] ! RES[3] <- D[3] ! ! mov 1,%o0 ! return 1 ! ret ! restore %o0,0,%o0 --- 1,292 ---- ! /******************************************************************/ ! /* inv_p109.s */ ! /* Brian Gladman & Chris Monico, 1/5/00 */ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! OP: %i1 ! **/ ! ! /*.section ".text"*/ ! ! .global inv_p109 ! ! .align 4 ! inv_p109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! /************ ! /* D[0] = l0 ! /* D[1] = l1 ! /* D[2] = l2 ! /* D[3] = l3 ! /* /* B[0] = l4 ! /* B[1] = l5 ! /* B[2] = l6 ! /* B[3] = l7 ! /* /* u[0] = i2 ! /* u[1] = i3 ! /* u[2] = i4 ! /* u[3] = i5 ! /* /* v[0] = o2 ! /* v[1] = o3 ! /* v[2] = o4 ! /* v[3] = o5 ! /************ ! ! /** u <- p **/ ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! set 0x6D9FB599,%i2 ! u[0] <- p0 ! set 0x0B5B521E,%i3 ! u[1] <- p1 ! set 0x79792B38,%i4 ! u[2] <- p2 ! set 0x00001BD5,%i5 ! u[3] <- p3 ! ! /** v <- OP **/ ! ! ld [%i1],%o2 ! v[0] <- OP[0] ! ld [%i1+4],%o3 ! v[1] <- OP[1] ! ld [%i1+8],%o4 ! v[2] <- OP[2] ! ld [%i1+12],%o5 ! v[3] <- OP[3] ! ! /** B, D <- 0, D[0] <- 1 **/ ! ! clr %l4 ! B[0] <- 0 ! clr %l5 ! clr %l6 ! clr %l7 ! 
B[3] <- 0 ! ! mov 0x00000001,%l0 ! D[0] <- 1 ! clr %l1 ! clr %l2 ! clr %l3 ! D[3] <- 0 ! ! .1: ! andcc %i2,0x00000001,%g0 ! u[0] bit0 == 1 ? ! bnz .4 ! yes, goto .4 ! nop ! ! /** DIV2(u) **/ ! ! sll %i5,31,%o0 ! o0 <- bit0 of u[3] ! sra %i5,1,%i5 ! u[3] shift R arithmetic 1 bit ! sll %i4,31,%o1 ! o1 <- bit0 of u[2] ! srl %i4,1,%i4 ! u[2] shift R logical 1 bit ! or %i4,%o0,%i4 ! u[2] bit31 <- u[3] bit0 ! sll %i3,31,%o0 ! o0 <- bit0 of u[1] ! srl %i3,1,%i3 ! u[1] shift R logical 1 bit ! or %i3,%o1,%i3 ! u[1] bit31 <- u[2] bit0 ! sll %i2,31,%o1 ! o1 <- bit0 of u[0] ! srl %i2,1,%i2 ! u[0] shift R logical 1 bit ! or %i2,%o0,%i2 ! u[0] bit31 <- u[1] bit0 ! orcc %o1,%o1,%g0 ! u[0] bit0 (before R shift) == '1'? ! bz .2 ! no, goto .2. Dividend even so skip rounding ! nop ! addcc %i2,1,%i2 ! dividend += 1; round -> 0 ! addxcc %i3,0,%i3 ! addxcc %i4,0,%i4 ! addx %i5,0,%i5 ! ! .2: ! andcc %l4,0x00000001,%g0 ! B[0] bit0 == 1 ? ! bz .3 ! no, goto .3 ! nop ! ! /** sub128(B,B,p) **/ ! ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! set 0x6D9FB599,%o1 ! o1 <- p0 ! subcc %l4,%o1,%l4 ! B[0] <- B[0] - p0 ! set 0x0B5B521E,%o1 ! o1 <- p1 ! subxcc %l5,%o1,%l5 ! B[1] <- B[1] - p1 - borrow ! set 0x79792B38,%o1 ! o1 <- p2 ! subxcc %l6,%o1,%l6 ! B[2] <- B[2] - p2 - borrow ! set 0x00001BD5,%o1 ! o1 <- p3 ! subx %l7,%o1,%l7 ! B[3] <- B[3] - p3 - borrow ! ! .3: ! ! /** DIV2(B) **/ ! ! sll %l7,31,%o0 ! o0 <- bit0 of B[3] ! sra %l7,1,%l7 ! B[3] shift R arithmetic 1 bit ! sll %l6,31,%o1 ! o1 <- bit0 of B[2] ! srl %l6,1,%l6 ! B[2] shift R logical 1 bit ! or %l6,%o0,%l6 ! B[2] bit31 <- B[3] bit0 ! sll %l5,31,%o0 ! o0 <- bit0 of B[1] ! srl %l5,1,%l5 ! B[1] shift R logical 1 bit ! or %l5,%o1,%l5 ! B[1] bit31 <- B[2] bit0 ! sll %l4,31,%o1 ! o1 <- bit0 of B[0] ! srl %l4,1,%l4 ! B[0] shift R logical 1 bit ! or %l4,%o0,%l4 ! B[0] bit31 <- B[1] bit0 ! orcc %o1,%o1,%g0 ! B[0] bit0 (before R shift) == '1'? ! bz .1 ! no, goto .1. 
Dividend even so skip rounding ! nop ! addcc %l4,1,%l4 ! dividend += 1; round -> 0 ! addxcc %l5,0,%l5 ! addxcc %l6,0,%l6 ! addx %l7,0,%l7 ! ba .1 ! nop ! ! .4: ! andcc %o2,0x00000001,%g0 ! v[0] bit0 == 1 ? ! bnz .7 ! yes, goto label .7 ! nop ! ! /** DIV2(v) **/ ! ! sll %o5,31,%o0 ! o0 <- bit0 of v[3] ! sra %o5,1,%o5 ! v[3] shift R arithmetic 1 bit ! sll %o4,31,%o1 ! o1 <- bit0 of v[2] ! srl %o4,1,%o4 ! v[2] shift R logical 1 bit ! or %o4,%o0,%o4 ! v[2] bit31 <- v[3] bit0 ! sll %o3,31,%o0 ! o0 <- bit0 of v[1] ! srl %o3,1,%o3 ! v[1] shift R logical 1 bit ! or %o3,%o1,%o3 ! v[1] bit31 <- v[2] bit0 ! sll %o2,31,%o1 ! o1 <- bit0 of v[0] ! srl %o2,1,%o2 ! v[0] shift R logical 1 bit ! or %o2,%o0,%o2 ! v[0] bit31 <- v[1] bit0 ! orcc %o1,%o1,%g0 ! v[0] bit0 (before R shift) == '1'? ! bz .5 ! no, goto .5. Dividend even so skip rounding ! nop ! addcc %o2,1,%o2 ! dividend += 1; round -> 0 ! addxcc %o3,0,%o3 ! addxcc %o4,0,%o4 ! addx %o5,0,%o5 ! ! .5: ! andcc %l0,0x00000001,%g0 ! D[0] bit0 == 1 ? ! bz .6 ! no, goto label .6 ! nop ! ! /** sub128(D,D,p) **/ ! ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! set 0x6D9FB599,%o1 ! o1 <- p0 ! subcc %l0,%o1,%l0 ! D[0] <- D[0] - p0 ! set 0x0B5B521E,%o1 ! o1 <- p1 ! subxcc %l1,%o1,%l1 ! D[1] <- D[1] - p1 - borrow ! set 0x79792B38,%o1 ! o1 <- p2 ! subxcc %l2,%o1,%l2 ! D[2] <- D[2] - p2 - borrow ! set 0x00001BD5,%o1 ! o1 <- p3 ! subx %l3,%o1,%l3 ! D[3] <- D[3] - p3 - borrow ! ! .6: ! ! /** DIV2(D) **/ ! ! sll %l3,31,%o0 ! o0 <- bit0 of D[3] ! sra %l3,1,%l3 ! D[3] shift R arithmetic 1 bit ! sll %l2,31,%o1 ! o1 <- bit0 of D[2] ! srl %l2,1,%l2 ! D[2] shift R logical 1 bit ! or %l2,%o0,%l2 ! D[2] bit31 <- D[3] bit0 ! sll %l1,31,%o0 ! o0 <- bit0 of D[1] ! srl %l1,1,%l1 ! D[1] shift R logical 1 bit ! or %l1,%o1,%l1 ! D[1] bit31 <- D[2] bit0 ! sll %l0,31,%o1 ! o1 <- bit0 of D[0] ! srl %l0,1,%l0 ! D[0] shift R logical 1 bit ! or %l0,%o0,%l0 ! D[0] bit31 <- D[1] bit0 ! 
orcc %o1,%o1,%g0 ! D[0] bit0 (before R shift) == '1'? ! bz .4 ! no, goto .4. Dividend even so skip rounding ! nop ! addcc %l0,1,%l0 ! dividend += 1; round -> 0 ! addxcc %l1,0,%l1 ! addxcc %l2,0,%l2 ! addx %l3,0,%l3 ! ba .4 ! nop ! ! .7: ! subcc %i2,%o2,%g0 ! u[0] - v[0] ! subxcc %i3,%o3,%g0 ! u[1] - v[1] - borrow ! subxcc %i4,%o4,%g0 ! u[2] - v[2] - borrow ! subxcc %i5,%o5,%g0 ! u[3] - v[3] - borrow ! bl .8 ! branch if u < v ! nop ! ! subcc %i2,%o2,%i2 ! u[0] <- u[0] - v[0] ! subxcc %i3,%o3,%i3 ! u[1] <- u[1] - v[1] - borrow ! subxcc %i4,%o4,%i4 ! u[2] <- u[2] - v[2] - borrow ! subx %i5,%o5,%i5 ! u[3] <- u[3] - v[3] - borrow ! subcc %l4,%l0,%l4 ! B[0] <- B[0] - D[0] ! subxcc %l5,%l1,%l5 ! B[1] <- B[1] - D[1] - borrow ! subxcc %l6,%l2,%l6 ! B[2] <- B[2] - D[2] - borrow ! subx %l7,%l3,%l7 ! B[3] <- B[3] - D[3] - borrow ! ba .9 ! nop ! ! .8: ! subcc %o2,%i2,%o2 ! v[0] <- v[0] - u[0] ! subxcc %o3,%i3,%o3 ! v[1] <- v[1] - u[1] - borrow ! subxcc %o4,%i4,%o4 ! v[2] <- v[2] - u[2] - borrow ! subx %o5,%i5,%o5 ! v[3] <- v[3] - u[3] - borrow ! subcc %l0,%l4,%l0 ! D[0] <- D[0] - B[0] ! subxcc %l1,%l5,%l1 ! D[1] <- D[1] - B[1] - borrow ! subxcc %l2,%l6,%l2 ! D[2] <- D[2] - B[2] - borrow ! subx %l3,%l7,%l3 ! D[3] <- D[3] - B[3] - borrow ! ! .9: ! mov %i2,%o0 ! or %i3,%o0,%o0 ! or %i4,%o0,%o0 ! orcc %i5,%o0,%o0 ! bz .10 ! branch if u == 0 ! nop ! ba .1 ! u > 0 ! nop ! ! .10: ! orcc %l3,%l3,%g0 ! D[3] bit 31 == 0? ! bpos .12 ! yes, so return D. ! nop ! ! .11: ! /** ! ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! .11a: ! set 0x6D9FB599,%o1 ! o1 <- p0 ! addcc %l0,%o1,%l0 ! D[0] <- D[0] + p0 ! set 0x0B5B521E,%o1 ! o1 <- p1 ! addxcc %l1,%o1,%l1 ! D[1] <- D[1] + p1 + carry ! set 0x79792B38,%o1 ! o1 <- p2 ! addxcc %l2,%o1,%l2 ! D[2] <- D[2] + p2 + carry ! set 0x00001BD5,%o1 ! o1 <- p3 ! addxcc %l3,%o1,%l3 ! D[3] <- D[3] + p3 + carry ! bneg .11a ! nop ! ! .12: ! st %l0,[%i0] ! RES[0] <- D[0] ! st %l1,[%i0+4] ! st %l2,[%i0+8] ! 
st %l3,[%i0+12] ! RES[3] <- D[3] ! ! mov 1,%o0 ! return 1 ! ret ! restore %o0,0,%o0 diff -cr eccp109-132-2/asm/sparc/mulmod_p109.S sos4/asm/sparc/mulmod_p109.S *** eccp109-132-2/asm/sparc/mulmod_p109.S Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/mulmod_p109.S Mon Oct 7 20:59:29 2002 *************** *** 1,604 **** ! /******************************************************************/ ! /* mulmod_p109.s */ ! /* Based on the AT&T version of mulmod_p109.s by Chris Monico */ ! /* which had some good optimizations by Brian Gladman. */ ! /* This is a cut down version to just compute the product of two */ ! /* 109-bit integers without the reduction of the product mod p. */ ! /******************************************************************/ ! /* One or both input args may be the same as the output arg. */ ! /******************************************************************/ ! ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001, **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! .section ".text" ! ! .global mulmod_p109 ! ! .align 4 ! mulmod_p109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! /******************************************************************/ ! /** Register mappings: **/ ! /** %eax: %o0 t[0]: %l0 **/ ! /** %ebx: %o2 t[1]: %l1 **/ ! /** %ecx: %o3 t[2]: %l2 **/ ! /** %edx: %y t[3]: %l3 **/ ! /** %ebp: %o4 t[4]: %l4 **/ ! /** %esi: %i1 t[5]: %l5 **/ ! /** %edi: %i2 t[6]: %l6 **/ ! /******************************************************************/ ! ! xor %o3,%o3,%o3 ! xor %o4,%o4,%o4 ! ! ld [%i1],%o0 ! o0 <- A[0]; R[0] ! ld [%i2],%o1 ! o1 <- B[0] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[0] ! mov %o0,%l0 ! LS part of result ! ! mov %y,%o2 ! MS part of result; R[1] -> (o2,o3,o4) ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2],%o1 ! o1 <- B[0] ! 
umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[0] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! ld [%i1],%o0 ! o0 <- A[0] ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! addxcc %o4,0,%o4 ! mov %o2,%l1 ! xor %o2,%o2,%o2 ! ! ld [%i1+8],%o0 ! o0 <- A[2]; R[2] -> (o3,o4,o2) ! ld [%i2],%o1 ! o1 <- B[0] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[0] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! addxcc %o2,0,%o2 ! ld [%i1],%o0 ! o0 <- A[0] ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! addxcc %o2,0,%o2 ! mov %o3,%l2 ! t[2] ! xor %o3,%o3,%o3 ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[3] -> (o4,o2,o3) ! ld [%i2],%o1 ! o1 <- B[0] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! ld [%i1+8],%o0 ! o0 <- A[2] ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! addxcc %o3,0,%o3 ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! addxcc %o3,0,%o3 ! ld [%i1],%o0 ! o0 <- A[0] ! ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! addxcc %o3,0,%o3 ! 
mov %o4,%l3 ! t[3] ! xor %o4,%o4,%o4 ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[4] -> (o2,o3,o4) ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[3] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! ld [%i1+8],%o0 ! o0 <- A[2] ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! addxcc %o4,0,%o4 ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! addxcc %o4,0,%o4 ! mov %o2,%l4 ! t[4] ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[5] -> (o3,o4) ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[3] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! ld [%i1+8],%o0 ! o0 <- A[2] ! ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! mov %o3,%l5 ! t[5] ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[6] -> (ro4) ! ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[3] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! mov %o4,%l6 ! t[6] ! ! /***********************************************************************/ ! /* The 218-bit product is now stored in t[0],...,t[6]. ! /* Our next mission, should we choose to accept it, is to /* reduce this number mod 'p'. ! /* We proceed in several steps: /* 1) Notice that we can write: ! /* t = (a 128-bit number) + 2^{128}t4 + 2^{160}t5 + 2^{196}t6 ! /* Where t4,t5,t6 each have 32-bits. ! /* Furthermore, we've precomputed the reductions : ! /* r4 := 2^128 mod p = 7f0 55455b5c 27a9ef92 540f3986 (107 bits) ! /* r5 := 2^160 mod p = acc f5a4e730 b2ec7424 32d0a6bf (108 bits) ! 
/* r6 := 2^192 mod p = a7c a3ea9cfb 531af0b6 b786c901 (108 bits) ! /* ! /* So that ! /* r4*t4 will have 32+107 = 139 bits ! /* r5*t5 will have 32+108 = 140 bits ! /* r6*t6 will have 22+108 = 130 bits ! /* ! /* Let's look carefully at what we'll do: ! /* t = (a 128-bit number) + 2^{128}t4 + 2^{160}t5 + 2^{196}t6 ! /* = (128 bits) + (139 bits) + (140 bits) + (130 bits) ! /* = (128 bits + 139 bits) + (140 bits + 130 bits) ! /* <= (140 bits) + (141 bits) ! /* <= (142 bits) ! /* /* ! /* 2) Use the precomputed tables (T) to reduce it to a number ! /* in [0,2p) ! /* 3) Perform a single subtraction, if necessary, to get it into ! /* [0,p) ! /***********************************************************************/ ! ! /*** compute [t3,t2,t1,t0] += r4*t4 ***/ ! ! xor %o2,%o2,%o2 ! mov %l4,%o3 ! o3 <- t[4] ! mov %o2,%l4 ! t[4] <- 0 ! ! set 0x540f3986,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[4] * 0x540f3986 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! t[2] <- t[2] + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x27a9ef92,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[4] * 0x27a9ef92 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x55455b5c,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[4] * 0x55455b5c ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addxcc %l3,%o1,%l3 ! t[3] <- t[3] + MS part of result + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x000007f0,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[4] * 0x000007f0 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! addx %l4,%o1,%l4 ! t[4] <- t[4] + MS part of result + carry ! ! 
/*** compute [t4,t3,t2,t1,t0] += r5*t5 ***/ ! ! mov %l5,%o3 ! o3 <- t[5] ! ! set 0x32d0a6bf,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0x32d0a6bf ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! t[2] <- t[2] + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0xb2ec7424,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0xb2ec7424 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0xf5a4e730,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0xf5a4e730 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addxcc %l3,%o1,%l3 ! t[3] <- t[3] + MS part of result + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x00000acc,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0x00000acc ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! addx %l4,%o1,%l4 ! t[4] <- t[4] + MS part of result + carry ! ! /*** compute [t4,t3,t2,t1,t0] += r6*t6 ***/ ! ! mov %l6,%o3 ! o3 <- t[6] ! ! set 0xb786c901,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[6] * 0xb786c901 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! t[2] <- t[2] + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x531af0b6,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[6] * 0x531af0b6 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0xa3ea9cfb,%o0 ! umul %o0,%o3,%o0 ! 
o0 <- t[6] * 0xa3ea9cfb ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addxcc %l3,%o1,%l3 ! t[3] <- t[3] + MS part of result + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x00000a7c,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[6] * 0x00000a7c ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! addx %l4,%o1,%l4 ! t[4] <- t[4] + MS part of result + carry ! ! /***************************************************/ ! /* We now have a 142 bit partially reduced product */ ! /* stored in bits 0-141 of [t4,t3,t2,t1,t0]. */ ! /***************************************************/ ! /* Now, 't4' has at most 142-128 = 14 bits (i.e. */ ! /* bits 128-141). */ ! /* So here's what we're gonna do: Write */ ! /* t = (bits 0-125) + 2^{126}(a 16-bit #) */ ! /* As before, we've precomputed r126 = 2^{126}mod p*/ ! /* Which has 108 bits. So, we do */ ! /* (bits 0-125) += (16 bits)*(108 bits) */ ! /* Which gives a result that will fit in bits 0-126*/ ! /***************************************************/ ! /* By abuse of notation, 't4' will now be the */ ! /* 16-bit number that's in bits 126-141 now. */ ! /***************************************************/ ! ! /************************************************************/ ! /* The notation: 47(%esp), etc, refers to that used in the */ ! /* ATT/Intel assembler versions. See diagram at end of this */ ! /* file for mapping to corresponding SPARC registers. Note */ ! /* in the SPARC architecture ld/st commands must access */ ! /* word-aligned operands hence cumbersome sequence of loads */ ! /* and shifts. */ ! /* */ ! /* Grab the 4 bytes 47(%esp), 48(%esp), 49(%esp), 50(%esp) */ ! /* Remember that t[0] -> t[6] are in registers l0 -> l6 */ ! /* Thus byte offset 47 is in l3 while 48, 49 & 50 are in l4 */ ! /* Need to use shifts and ORs to get the bytes where we */ ! /* want them in a single register. */ ! 
/************************************************************/ ! ! /** l3 BEFORE shift: [B47:B46:B45:B44] **/ ! srl %l3,24,%o1 ! o1 <- [ : : :B47] ! /** l4 BEFORE shift: [B51:B50:B49:B48] **/ ! sll %l4,8,%o3 ! o3 <- [B50:B49:B48: ] ! or %o3,%o1,%o3 ! o3 <- [B50:B49:B48:B47]; bits 120-141[151] ! srl %o3,6,%o3 ! o3 <- bits 126-141[151] ! /* ! o3 now contains the precise number we want to multiply ! by the pre-computed number 2^{126} ! */ ! /***************************************************/ ! /* Look at mapping diagram at end of this file for */ ! /* location of the bits and their corresponding */ ! /* register in this SPARC implementation. */ ! /* */ ! /* Zero out bits 126-141 of the product; bits 142- */ ! /* 151 already zero, so we're keeping only bits */ ! /* 120-025. So byte 32 through byte 47 is exactly */ ! /* the 126-bit number we'll add the product to. */ ! /* That is, it's exactly equal to (bits 0-125). */ ! /***************************************************/ ! ! set 0x3fffffff,%o0 ! o0 <- 0011111..11111111111 ! and %l3,%o0,%l3 ! zero bits 126 and 127 ! /* ! set 0xffffc000,%o0 ! o0 <- 11..1100000000000000 ! and %l4,%o0,%l4 ! zero bits 128 to 141 ! */ ! mov 0,%l4 ! ! /* Do not need bits 128 to 159 in l4/t[4] anymore */ ! ! /***************************************************/ ! /* 2^{126} == fe6 d20dec73 0f9824f3 cbd3a92e mod p */ ! /***************************************************/ ! /* Remember: the product of o3 by the above will */ ! /* have at most 16+108 = 124 bits. Since we're */ ! /* adding it to a 126 bit number, the result will */ ! /* have at most 127 bits, and hence, it will fit */ ! /* nicely into the four 32-bit words */ ! /* t0,t1,t2,t3. */ ! /***************************************************/ ! ! set 0xcbd3a92e,%o0 ! umul %o0,%o3,%o0 ! o0 <- 03 * 0xcbd3a92e ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! 
t[2] <- t[2] + carry ! addx %l3,0,%l3 ! t[3] <- t[3] + carry ! ! set 0x0f9824f3,%o0 ! umul %o0,%o3,%o0 ! o0 <- 03 * 0x0f9824f3 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addx %l3,0,%l3 ! t[3] <- t[3] + carry ! ! set 0xd20dec73,%o0 ! umul %o0,%o3,%o0 ! o0 <- 03 * 0xd20dec73 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addx %l3,%o1,%l3 ! t[2] <- t[2] + MS part of result + carry ! ! set 0x00000fe6,%o0 ! umul %o0,%o3,%o0 ! o0 <- 03 * 0x00000fe6 ! mov %y,%o1 ! o1 <- MS part of result ! add %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! ! mov %l3,%l4 ! t[4] <- "old" t[3] for table indexing ! and %l3,0x000000ff,%l3 ! why? ! /***************************************************/ ! /* So now we have only bits 0-126 to worry about */ ! /***************************************************/ ! ! sethi %hi(_T),%i1 ! reduction table as an external ! or %i1,%lo(_T),%i1 ! i1 <- base address of T ! ! /***************************************************/ ! /* Use T2 to cancel bits 120-127 (byte 15) */ ! /***************************************************/ ! ! set 0xff000000,%o0 ! mask ! and %l4,%o0,%o1 ! o1 <- bits 0-7 of old t3 (byte 47) ! srl %o1,24,%o1 ! sll %o1,4,%o1 ! add %i1,%o1,%o2 ! o2 <- T2 + offset ! ! ld [%o2],%o1 ! addcc %l0,%o1,%l0 ! t[0] <- t[0] + T2[offset] ! ld [%o2+4],%o1 ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + T2[offset + 4] ! ld [%o2+8],%o1 ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + T2[offset + 8] ! ld [%o2+12],%o1 ! addx %l3,%o1,%l3 ! t[3] <- t[3] + T2[offset + 12] ! ! /***************************************************/ ! /* Use T1 to cancel bits 112-119 (byte 14) */ ! /***************************************************/ ! ! add %i1,4095,%i1 ! address of T + 4096 ! inc %i1 ! ! set 0x00ff0000,%o0 ! mask ! and %l4,%o0,%o1 ! ! srl %o1,16,%o1 ! o1 <- bits 16-23 of old t3 (byte 46) ! sll %o1,4,%o1 ! 
add %i1,%o1,%o2 ! o2 <- T1 + offset ! ! ld [%o2],%o1 ! addcc %l0,%o1,%l0 ! t[0] <- t[0] + T1[offset] ! ld [%o2+4],%o1 ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + T1[offset + 4] ! ld [%o2+8],%o1 ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + T1[offset + 8] ! ld [%o2+12],%o1 ! addx %l3,%o1,%l3 ! t[3] <- t[3] + T1[offset + 12] ! ! /***************************************************/ ! /* Use T0 to cancel bits 104-111 (byte 13) */ ! /***************************************************/ ! ! add %i1,4095,%i1 ! T0 <- base address of T1 + 4096 ! inc %i1 ! ! set 0x0000ff00,%o0 ! mask ! and %l4,%o0,%o1 ! ! srl %o1,8,%o1 ! o1 <- bits 16-24 of old t3 (byte 45) ! sll %o1,4,%o1 ! byte 45 x 16; offset into T0 ! add %i1,%o1,%o2 ! o2 <- T0 + offset ! ! ld [%o2],%o1 ! addcc %l0,%o1,%l0 ! t[0] <- t[0] + T0[offset] ! ld [%o2+4],%o1 ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + T0[offset + 4] ! ld [%o2+8],%o1 ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + T0[offset + 8] ! ld [%o2+12],%o1 ! addx %l3,%o1,%l3 ! t[3] <- t[3] + T0[offset + 12] ! ! /*********************************************************/ ! /* At long last, we're almost done. [C0,C1,C2,C3] is now */ ! /* in [0,2p), so we need at most one more subtraction */ ! /* to be done. */ ! /* */ ! /* Remember: This is another place where we've hardcoded */ ! /* the modulus: */ ! /* p = 1BD5 79792B38 0B5B521E 6D9FB599 */ ! /*********************************************************/ ! ! set 0x6D9FB599,%o0 ! set 0x0B5B521E,%o1 ! set 0x79792B38,%o2 ! set 0x00001BD5,%o3 ! ! .STORE: ! st %l0,[%i0] ! RES(0) <- C(0) ! st %l1,[%i0+4] ! RES(1) <- C(1) ! st %l2,[%i0+8] ! RES(2) <- C(2) ! st %l3,[%i0+12] ! RES(3) <- C(3) ! ! /** ! NB: At this point the values of C0,C1,C2,C3 are held in a ! temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! is MS word of C < MS word of p? ! blu .DONE_M ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_M ! no, C > p so reduction needed ! ! cmp %l2,%o2 ! blu .DONE_M ! nop ! bgu .SUBTRACT_M ! ! cmp %l1,%o1 ! blu .DONE_M ! 
nop ! bgu .SUBTRACT_M ! ! cmp %l0,%o0 ! is LS word of C < LS word of p? ! blu .DONE_M ! yes, C < p so no reduction needed ! nop ! bgu .SUBTRACT_M ! no, C > p so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our prime modulus, p. **/ ! /******************************************************************/ ! mov 0,%l0 ! C(0) <- 0 ! mov 0,%l1 ! C(1) <- 0 ! mov 0,%l2 ! C(2) <- 0 ! mov 0,%l3 ! C(3) <- 0 ! ba .STORE ! nop ! ! /******************************************************************/ ! /** Here when C is geater than our prime modulus, p. **/ ! /******************************************************************/ ! .SUBTRACT_M: ! ! subcc %l0,%o0,%l0 ! C(0) <- C(0) mod p(0) ! subxcc %l1,%o1,%l1 ! C(1) <- C(1) mod p(1) ! subxcc %l2,%o2,%l2 ! C(2) <- C(2) mod p(2) ! subx %l3,%o3,%l3 ! C(3) <- C(3) mod p(3) ! ba .STORE ! nop ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_M: ! ! ret ! restore ! ! /**************************************************************/ ! /** The temporary locations t[0] - t[6] are the local ! /** registers l0 - l6 in the SPARC implementation. Thus ! /** the LS byte of l0/t[0] corresponds to memory offset ! /** 32; l1/t[1] corresponds to offset 40 and so on in /** the ATT and Intel implementations. In particular, ! /** byte offsets 45, 46 and 47 used in the ATT/Intel ! /** implementations are in register l3/t[3]. ! /** ! /** The bit positions of the 218 bit product are shown. ! /** ! /** ! /** SPARC [ATT/Intel byte offset 32] ! /** reg.+--------+--------+--------+--------+ ! /** |31 24|23 16|15 8|7 0| bit ! /** l0 | | | | | ! /** | | | | byte 0| ! /** +--------+--------+--------+--------+ ! /** /** 36 ! /** +--------+--------+--------+--------+ ! /** |63 | | | 32| bit ! /** l1 | | | | | ! /** | | | | byte 4| ! 
/** +--------+--------+--------+--------+ ! /** /** 40 ! /** +--------+--------+--------+--------+ ! /** |95 | | | 64| bit ! /** l2 | | | | | ! /** | | | | byte 8| ! /** +--------+--------+--------+--------+ ! /** /** 47 46 45 44 ! /** +--------+--------+--------+--------+ ! /** |127 120|119 112|111 104|103 96| bit ! /** l3 | | | | | ! /** | byte 15| byte 14| byte 12| byte 12| ! /** +--------+--------+--------+--------+ ! /** /** 48 ! /** +--------+--------+--------+--------+ ! /** |159 152|151 144|143 136|135 128| bit ! /** l4 | | | | | ! /** | | | | byte 16| ! /** +--------+--------+-+------+--------+ ! /** ^ ! /** | ! /** +----- bit 142 ! /** /** 52 ! /** +--------+--------+--------+--------+ ! /** |191 | | | 160| bit ! /** l5 | | | | | ! /** | | | | | ! /** +--------+--------+--------+--------+ ! /** /** 56 ! /** +--------+--------+--------+--------+ ! /** |223 | | | 192| bit ! /** l6 | | | | | ! /** | | | | | ! /** +--------+--------+--------+--------+ ! /** /**************************************************************/ --- 1,604 ---- ! /******************************************************************/ ! /* mulmod_p109.s */ ! /* Based on the AT&T version of mulmod_p109.s by Chris Monico */ ! /* which had some good optimizations by Brian Gladman. */ ! /* This is a cut down version to just compute the product of two */ ! /* 109-bit integers without the reduction of the product mod p. */ ! /******************************************************************/ ! /* One or both input args may be the same as the output arg. */ ! /******************************************************************/ ! ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001, **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! /*.section ".text"*/ ! ! .global mulmod_p109 ! ! .align 4 ! mulmod_p109: ! ! 
save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! /******************************************************************/ ! /** Register mappings: **/ ! /** %eax: %o0 t[0]: %l0 **/ ! /** %ebx: %o2 t[1]: %l1 **/ ! /** %ecx: %o3 t[2]: %l2 **/ ! /** %edx: %y t[3]: %l3 **/ ! /** %ebp: %o4 t[4]: %l4 **/ ! /** %esi: %i1 t[5]: %l5 **/ ! /** %edi: %i2 t[6]: %l6 **/ ! /******************************************************************/ ! ! xor %o3,%o3,%o3 ! xor %o4,%o4,%o4 ! ! ld [%i1],%o0 ! o0 <- A[0]; R[0] ! ld [%i2],%o1 ! o1 <- B[0] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[0] ! mov %o0,%l0 ! LS part of result ! ! mov %y,%o2 ! MS part of result; R[1] -> (o2,o3,o4) ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2],%o1 ! o1 <- B[0] ! umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[0] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! ld [%i1],%o0 ! o0 <- A[0] ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! addxcc %o4,0,%o4 ! mov %o2,%l1 ! xor %o2,%o2,%o2 ! ! ld [%i1+8],%o0 ! o0 <- A[2]; R[2] -> (o3,o4,o2) ! ld [%i2],%o1 ! o1 <- B[0] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[0] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! addxcc %o2,0,%o2 ! ld [%i1],%o0 ! o0 <- A[0] ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! addxcc %o2,0,%o2 ! mov %o3,%l2 ! t[2] ! xor %o3,%o3,%o3 ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[3] -> (o4,o2,o3) ! ld [%i2],%o1 ! o1 <- B[0] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[3] ! mov %y,%o1 ! 
copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! ld [%i1+8],%o0 ! o0 <- A[2] ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! addxcc %o3,0,%o3 ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! addxcc %o3,0,%o3 ! ld [%i1],%o0 ! o0 <- A[0] ! ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[0] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! addxcc %o1,%o2,%o2 ! addxcc %o3,0,%o3 ! mov %o4,%l3 ! t[3] ! xor %o4,%o4,%o4 ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[4] -> (o2,o3,o4) ! ld [%i2+4],%o1 ! o1 <- B[1] ! umul %o0,%o1,%o0 ! y:o0 <- A[3] * B[1] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! ld [%i1+8],%o0 ! o0 <- A[2] ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! addxcc %o4,0,%o4 ! ld [%i1+4],%o0 ! o0 <- A[1] ! ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[1] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o2,%o2 ! addxcc %o1,%o3,%o3 ! addxcc %o4,0,%o4 ! mov %o2,%l4 ! t[4] ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[5] -> (o3,o4) ! ld [%i2+8],%o1 ! o1 <- B[2] ! umul %o0,%o1,%o0 ! y:o0 <- A[3] * B[2] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! ld [%i1+8],%o0 ! o0 <- A[2] ! ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[2] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o3,%o3 ! addxcc %o1,%o4,%o4 ! mov %o3,%l5 ! t[5] ! ! ld [%i1+12],%o0 ! o0 <- A[3]; R[6] -> (ro4) ! 
ld [%i2+12],%o1 ! o1 <- B[3] ! umul %o0,%o1,%o0 ! y:o0 <- A[3] * B[3] ! mov %y,%o1 ! copy MS word of product to o1 for the addition ! addcc %o0,%o4,%o4 ! mov %o4,%l6 ! t[6] ! ! /***********************************************************************/ ! /* The 218-bit product is now stored in t[0],...,t[6]. ! /* Our next mission, should we choose to accept it, is to /* reduce this number mod 'p'. ! /* We proceed in several steps: /* 1) Notice that we can write: ! /* t = (a 128-bit number) + 2^{128}t4 + 2^{160}t5 + 2^{192}t6 ! /* Where t4,t5 each have 32 bits and t6 has at most 26 bits (218-192). ! /* Furthermore, we've precomputed the reductions : ! /* r4 := 2^128 mod p = 7f0 55455b5c 27a9ef92 540f3986 (107 bits) ! /* r5 := 2^160 mod p = acc f5a4e730 b2ec7424 32d0a6bf (108 bits) ! /* r6 := 2^192 mod p = a7c a3ea9cfb 531af0b6 b786c901 (108 bits) ! /* ! /* So that ! /* r4*t4 will have 32+107 = 139 bits ! /* r5*t5 will have 32+108 = 140 bits ! /* r6*t6 will have 26+108 = 134 bits ! /* ! /* Let's look carefully at what we'll do: ! /* t = (a 128-bit number) + 2^{128}t4 + 2^{160}t5 + 2^{192}t6 ! /* = (128 bits) + (139 bits) + (140 bits) + (134 bits) ! /* = (128 bits + 139 bits) + (140 bits + 134 bits) ! /* <= (140 bits) + (141 bits) ! /* <= (142 bits) ! /* /* ! /* 2) Use the precomputed tables (T) to reduce it to a number ! /* in [0,2p) ! /* 3) Perform a single subtraction, if necessary, to get it into ! /* [0,p) ! /***********************************************************************/ ! ! /*** compute [t3,t2,t1,t0] += r4*t4 ***/ ! ! xor %o2,%o2,%o2 ! mov %l4,%o3 ! o3 <- t[4] ! mov %o2,%l4 ! t[4] <- 0 ! ! set 0x540f3986,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[4] * 0x540f3986 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! t[2] <- t[2] + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x27a9ef92,%o0 ! umul %o0,%o3,%o0 !
o0 <- t[4] * 0x27a9ef92 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x55455b5c,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[4] * 0x55455b5c ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addxcc %l3,%o1,%l3 ! t[3] <- t[3] + MS part of result + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x000007f0,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[4] * 0x000007f0 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! addx %l4,%o1,%l4 ! t[4] <- t[4] + MS part of result + carry ! ! /*** compute [t4,t3,t2,t1,t0] += r5*t5 ***/ ! ! mov %l5,%o3 ! o3 <- t[5] ! ! set 0x32d0a6bf,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0x32d0a6bf ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! t[2] <- t[2] + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0xb2ec7424,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0xb2ec7424 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0xf5a4e730,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0xf5a4e730 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addxcc %l3,%o1,%l3 ! t[3] <- t[3] + MS part of result + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x00000acc,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[5] * 0x00000acc ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! addx %l4,%o1,%l4 ! t[4] <- t[4] + MS part of result + carry ! ! 
/*** compute [t4,t3,t2,t1,t0] += r6*t6 ***/ ! ! mov %l6,%o3 ! o3 <- t[6] ! ! set 0xb786c901,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[6] * 0xb786c901 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! t[2] <- t[2] + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x531af0b6,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[6] * 0x531af0b6 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addxcc %l3,0,%l3 ! t[3] <- t[3] + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0xa3ea9cfb,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[6] * 0xa3ea9cfb ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addxcc %l3,%o1,%l3 ! t[3] <- t[3] + MS part of result + carry ! addx %l4,0,%l4 ! t[4] <- t[4] + carry ! ! set 0x00000a7c,%o0 ! umul %o0,%o3,%o0 ! o0 <- t[6] * 0x00000a7c ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! addx %l4,%o1,%l4 ! t[4] <- t[4] + MS part of result + carry ! ! /***************************************************/ ! /* We now have a 142 bit partially reduced product */ ! /* stored in bits 0-141 of [t4,t3,t2,t1,t0]. */ ! /***************************************************/ ! /* Now, 't4' has at most 142-128 = 14 bits (i.e. */ ! /* bits 128-141). */ ! /* So here's what we're gonna do: Write */ ! /* t = (bits 0-125) + 2^{126}(a 16-bit #) */ ! /* As before, we've precomputed r126 = 2^{126}mod p*/ ! /* Which has 108 bits. So, we do */ ! /* (bits 0-125) += (16 bits)*(108 bits) */ ! /* Which gives a result that will fit in bits 0-126*/ ! /***************************************************/ ! /* By abuse of notation, 't4' will now be the */ ! /* 16-bit number that's in bits 126-141 now. */ ! 
/***************************************************/ ! ! /************************************************************/ ! /* The notation: 47(%esp), etc, refers to that used in the */ ! /* ATT/Intel assembler versions. See diagram at end of this */ ! /* file for mapping to corresponding SPARC registers. Note */ ! /* in the SPARC architecture ld/st commands must access */ ! /* word-aligned operands hence cumbersome sequence of loads */ ! /* and shifts. */ ! /* */ ! /* Grab the 4 bytes 47(%esp), 48(%esp), 49(%esp), 50(%esp) */ ! /* Remember that t[0] -> t[6] are in registers l0 -> l6 */ ! /* Thus byte offset 47 is in l3 while 48, 49 & 50 are in l4 */ ! /* Need to use shifts and ORs to get the bytes where we */ ! /* want them in a single register. */ ! /************************************************************/ ! ! /** l3 BEFORE shift: [B47:B46:B45:B44] **/ ! srl %l3,24,%o1 ! o1 <- [ : : :B47] ! /** l4 BEFORE shift: [B51:B50:B49:B48] **/ ! sll %l4,8,%o3 ! o3 <- [B50:B49:B48: ] ! or %o3,%o1,%o3 ! o3 <- [B50:B49:B48:B47]; bits 120-141[151] ! srl %o3,6,%o3 ! o3 <- bits 126-141[151] ! /* ! o3 now contains the precise number we want to multiply ! by the pre-computed number 2^{126} mod p ! */ ! /***************************************************/ ! /* Look at mapping diagram at end of this file for */ ! /* location of the bits and their corresponding */ ! /* register in this SPARC implementation. */ ! /* */ ! /* Zero out bits 126-141 of the product; bits 142- */ ! /* 151 already zero, so we're keeping only bits */ ! /* 0-125. So byte 32 through byte 47 is exactly */ ! /* the 126-bit number we'll add the product to. */ ! /* That is, it's exactly equal to (bits 0-125). */ ! /***************************************************/ ! ! set 0x3fffffff,%o0 ! o0 <- 0011111..11111111111 ! and %l3,%o0,%l3 ! zero bits 126 and 127 ! /* ! set 0xffffc000,%o0 ! o0 <- 11..1100000000000000 ! and %l4,%o0,%l4 ! zero bits 128 to 141 ! */ ! mov 0,%l4 ! !
/* Do not need bits 128 to 159 in l4/t[4] anymore */ ! ! /***************************************************/ ! /* 2^{126} == fe6 d20dec73 0f9824f3 cbd3a92e mod p */ ! /***************************************************/ ! /* Remember: the product of o3 by the above will */ ! /* have at most 16+108 = 124 bits. Since we're */ ! /* adding it to a 126 bit number, the result will */ ! /* have at most 127 bits, and hence, it will fit */ ! /* nicely into the four 32-bit words */ ! /* t0,t1,t2,t3. */ ! /***************************************************/ ! ! set 0xcbd3a92e,%o0 ! umul %o0,%o3,%o0 ! o0 <- o3 * 0xcbd3a92e ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l0,%o0,%l0 ! t[0] <- t[0] + LS part of result ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + MS part of result + carry ! addxcc %l2,0,%l2 ! t[2] <- t[2] + carry ! addx %l3,0,%l3 ! t[3] <- t[3] + carry ! ! set 0x0f9824f3,%o0 ! umul %o0,%o3,%o0 ! o0 <- o3 * 0x0f9824f3 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l1,%o0,%l1 ! t[1] <- t[1] + LS part of result ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + MS part of result + carry ! addx %l3,0,%l3 ! t[3] <- t[3] + carry ! ! set 0xd20dec73,%o0 ! umul %o0,%o3,%o0 ! o0 <- o3 * 0xd20dec73 ! mov %y,%o1 ! o1 <- MS part of result ! addcc %l2,%o0,%l2 ! t[2] <- t[2] + LS part of result ! addx %l3,%o1,%l3 ! t[3] <- t[3] + MS part of result + carry ! ! set 0x00000fe6,%o0 ! umul %o0,%o3,%o0 ! o0 <- o3 * 0x00000fe6 ! mov %y,%o1 ! o1 <- MS part of result ! add %l3,%o0,%l3 ! t[3] <- t[3] + LS part of result ! ! mov %l3,%l4 ! t[4] <- "old" t[3] for table indexing ! and %l3,0x000000ff,%l3 ! keep only byte 44 (bits 96-103); bytes 45-47 are added back via the T tables below ! /***************************************************/ ! /* So now we have only bits 0-126 to worry about */ ! /***************************************************/ ! ! sethi %hi(_T),%i1 ! reduction table as an external ! or %i1,%lo(_T),%i1 ! i1 <- base address of T ! ! /***************************************************/ ! /* Use T2 to cancel bits 120-127 (byte 15) */ !
/***************************************************/ ! ! set 0xff000000,%o0 ! mask ! and %l4,%o0,%o1 ! o1 <- bits 24-31 of old t3 (byte 47) ! srl %o1,24,%o1 ! sll %o1,4,%o1 ! add %i1,%o1,%o2 ! o2 <- T2 + offset ! ! ld [%o2],%o1 ! addcc %l0,%o1,%l0 ! t[0] <- t[0] + T2[offset] ! ld [%o2+4],%o1 ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + T2[offset + 4] ! ld [%o2+8],%o1 ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + T2[offset + 8] ! ld [%o2+12],%o1 ! addx %l3,%o1,%l3 ! t[3] <- t[3] + T2[offset + 12] ! ! /***************************************************/ ! /* Use T1 to cancel bits 112-119 (byte 14) */ ! /***************************************************/ ! ! add %i1,4095,%i1 ! address of T + 4096 ! inc %i1 ! ! set 0x00ff0000,%o0 ! mask ! and %l4,%o0,%o1 ! ! srl %o1,16,%o1 ! o1 <- bits 16-23 of old t3 (byte 46) ! sll %o1,4,%o1 ! add %i1,%o1,%o2 ! o2 <- T1 + offset ! ! ld [%o2],%o1 ! addcc %l0,%o1,%l0 ! t[0] <- t[0] + T1[offset] ! ld [%o2+4],%o1 ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + T1[offset + 4] ! ld [%o2+8],%o1 ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + T1[offset + 8] ! ld [%o2+12],%o1 ! addx %l3,%o1,%l3 ! t[3] <- t[3] + T1[offset + 12] ! ! /***************************************************/ ! /* Use T0 to cancel bits 104-111 (byte 13) */ ! /***************************************************/ ! ! add %i1,4095,%i1 ! T0 <- base address of T1 + 4096 ! inc %i1 ! ! set 0x0000ff00,%o0 ! mask ! and %l4,%o0,%o1 ! ! srl %o1,8,%o1 ! o1 <- bits 8-15 of old t3 (byte 45) ! sll %o1,4,%o1 ! byte 45 x 16; offset into T0 ! add %i1,%o1,%o2 ! o2 <- T0 + offset ! ! ld [%o2],%o1 ! addcc %l0,%o1,%l0 ! t[0] <- t[0] + T0[offset] ! ld [%o2+4],%o1 ! addxcc %l1,%o1,%l1 ! t[1] <- t[1] + T0[offset + 4] ! ld [%o2+8],%o1 ! addxcc %l2,%o1,%l2 ! t[2] <- t[2] + T0[offset + 8] ! ld [%o2+12],%o1 ! addx %l3,%o1,%l3 ! t[3] <- t[3] + T0[offset + 12] ! ! /*********************************************************/ ! /* At long last, we're almost done. [C0,C1,C2,C3] is now */ !
/* in [0,2p), so we need at most one more subtraction */ ! /* to be done. */ ! /* */ ! /* Remember: This is another place where we've hardcoded */ ! /* the modulus: */ ! /* p = 1BD5 79792B38 0B5B521E 6D9FB599 */ ! /*********************************************************/ ! ! set 0x6D9FB599,%o0 ! set 0x0B5B521E,%o1 ! set 0x79792B38,%o2 ! set 0x00001BD5,%o3 ! ! .STORE: ! st %l0,[%i0] ! RES(0) <- C(0) ! st %l1,[%i0+4] ! RES(1) <- C(1) ! st %l2,[%i0+8] ! RES(2) <- C(2) ! st %l3,[%i0+12] ! RES(3) <- C(3) ! ! /** ! NB: At this point the values of C0,C1,C2,C3 are held in a ! temporary location, the registers l0,l1,l2,l3. ! **/ ! ! cmp %l3,%o3 ! is MS word of C < MS word of p? ! blu .DONE_M ! yes, so no reduction needed ! nop ! bgu .SUBTRACT_M ! no, C > p so reduction needed (the next cmp sits in this branch's delay slot) ! ! cmp %l2,%o2 ! blu .DONE_M ! nop ! bgu .SUBTRACT_M ! ! cmp %l1,%o1 ! blu .DONE_M ! nop ! bgu .SUBTRACT_M ! ! cmp %l0,%o0 ! is LS word of C < LS word of p? ! blu .DONE_M ! yes, C < p so no reduction needed ! nop ! bgu .SUBTRACT_M ! no, C > p so reduction needed ! nop ! ! /******************************************************************/ ! /** Here when C is equal to our prime modulus, p. **/ ! /******************************************************************/ ! mov 0,%l0 ! C(0) <- 0 ! mov 0,%l1 ! C(1) <- 0 ! mov 0,%l2 ! C(2) <- 0 ! mov 0,%l3 ! C(3) <- 0 ! ba .STORE ! nop ! ! /******************************************************************/ ! /** Here when C is greater than our prime modulus, p. **/ ! /******************************************************************/ ! .SUBTRACT_M: ! ! subcc %l0,%o0,%l0 ! C(0) <- C(0) - p(0) ! subxcc %l1,%o1,%l1 ! C(1) <- C(1) - p(1) - borrow ! subxcc %l2,%o2,%l2 ! C(2) <- C(2) - p(2) - borrow ! subx %l3,%o3,%l3 ! C(3) <- C(3) - p(3) - borrow ! ba .STORE ! nop ! ! /******************************************************************/ ! /** All that is left to do now is return C. **/ ! /******************************************************************/ ! .DONE_M: ! ! ret !
restore ! ! /**************************************************************/ ! /** The temporary locations t[0] - t[6] are the local ! /** registers l0 - l6 in the SPARC implementation. Thus ! /** the LS byte of l0/t[0] corresponds to memory offset ! /** 32; l1/t[1] corresponds to offset 36 and so on in /** the ATT and Intel implementations. In particular, ! /** byte offsets 45, 46 and 47 used in the ATT/Intel ! /** implementations are in register l3/t[3]. ! /** ! /** The bit positions of the 218 bit product are shown. ! /** ! /** ! /** SPARC [ATT/Intel byte offset 32] ! /** reg.+--------+--------+--------+--------+ ! /** |31 24|23 16|15 8|7 0| bit ! /** l0 | | | | | ! /** | | | | byte 0| ! /** +--------+--------+--------+--------+ ! /** /** 36 ! /** +--------+--------+--------+--------+ ! /** |63 | | | 32| bit ! /** l1 | | | | | ! /** | | | | byte 4| ! /** +--------+--------+--------+--------+ ! /** /** 40 ! /** +--------+--------+--------+--------+ ! /** |95 | | | 64| bit ! /** l2 | | | | | ! /** | | | | byte 8| ! /** +--------+--------+--------+--------+ ! /** /** 47 46 45 44 ! /** +--------+--------+--------+--------+ ! /** |127 120|119 112|111 104|103 96| bit ! /** l3 | | | | | ! /** | byte 15| byte 14| byte 13| byte 12| ! /** +--------+--------+--------+--------+ ! /** /** 48 ! /** +--------+--------+--------+--------+ ! /** |159 152|151 144|143 136|135 128| bit ! /** l4 | | | | | ! /** | | | | byte 16| ! /** +--------+--------+-+------+--------+ ! /** ^ ! /** | ! /** +----- bit 142 ! /** /** 52 ! /** +--------+--------+--------+--------+ ! /** |191 | | | 160| bit ! /** l5 | | | | | ! /** | | | | | ! /** +--------+--------+--------+--------+ ! /** /** 56 ! /** +--------+--------+--------+--------+ ! /** |223 | | | 192| bit ! /** l6 | | | | | ! /** | | | | | ! /** +--------+--------+--------+--------+ !
/** /**************************************************************/ diff -cr eccp109-132-2/asm/sparc/sub128.s sos4/asm/sparc/sub128.s *** eccp109-132-2/asm/sparc/sub128.s Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/sub128.s Mon Oct 7 20:55:52 2002 *************** *** 1,47 **** ! /*******************************************************************/ ! /* sub128.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the difference of two <128-bit integers.*/ ! /*******************************************************************/ ! /* An input arg may be the same as an output arg. */ ! /*******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /*******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! .global sub128 ! ! .align 4 ! sub128: ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! subcc %l7,%l6,%l5 ! l5 <- A[0] + B[0] ! st %l5,[%i0] ! RES[0] <- l5 ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! subxcc %l7,%l6,%l5 ! l5 <- A[1] + B[1] + carry ! st %l5,[%i0+4] ! RES[1] <- l5 ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! subxcc %l7,%l6,%l5 ! l5 <- A[2] + B[2] + carry ! st %l5,[%i0+8] ! RES[2] <- l5 ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! subx %l7,%l6,%l5 ! l5 <- A[3] + B[3] + carry ! st %l5,[%i0+12] ! RES[3] <- l5 ! ! ret ! restore --- 1,47 ---- ! /*******************************************************************/ ! /* sub128.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the difference of two <128-bit integers.*/ ! /*******************************************************************/ ! /* An input arg may be the same as an output arg. */ ! /*******************************************************************/ ! /** **/ ! 
/** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /*******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! .global sub128 ! ! .align 4 ! sub128: ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! ld [%i1],%l7 ! l7 <- A[0] ! ld [%i2],%l6 ! l6 <- B[0] ! subcc %l7,%l6,%l5 ! l5 <- A[0] - B[0] ! st %l5,[%i0] ! RES[0] <- l5 ! ! ld [%i1+4],%l7 ! l7 <- A[1] ! ld [%i2+4],%l6 ! l6 <- B[1] ! subxcc %l7,%l6,%l5 ! l5 <- A[1] - B[1] - borrow ! st %l5,[%i0+4] ! RES[1] <- l5 ! ! ld [%i1+8],%l7 ! l7 <- A[2] ! ld [%i2+8],%l6 ! l6 <- B[2] ! subxcc %l7,%l6,%l5 ! l5 <- A[2] - B[2] - borrow ! st %l5,[%i0+8] ! RES[2] <- l5 ! ! ld [%i1+12],%l7 ! l7 <- A[3] ! ld [%i2+12],%l6 ! l6 <- B[3] ! subx %l7,%l6,%l5 ! l5 <- A[3] - B[3] - borrow ! st %l5,[%i0+12] ! RES[3] <- l5 ! ! ret ! restore diff -cr eccp109-132-2/asm/sparc/submod_p109.s sos4/asm/sparc/submod_p109.s *** eccp109-132-2/asm/sparc/submod_p109.s Sat Dec 22 03:16:08 2001 --- sos4/asm/sparc/submod_p109.s Mon Oct 7 20:59:37 2002 *************** *** 1,138 **** ! /******************************************************************/ ! /* submod_p109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the sum of two 109-bit integers */ ! /* modulo a fixed 109-bit prime, p. Specifically, the fixed prime */ ! /* is the number over which the Certicom ECCP-109 Challenge curve */ ! /* is defined: */ ! /* p = 1BD5 79792B38 0B5B521E 6D9FB599 */ ! /******************************************************************/ ! /* Chris Monico remarks: */ ! /* There is certainly a better way to do this, but I do not want */ ! /* to be bothered with all the signed/unsigned issues right now, */ ! /* so we'll proceed as follows: */ ! /* 1) Compute A+p */ ! /* 2) Subtract B from (A+p) and reduce the result if necessary */ ! /* This insures that the difference computed is always positive */ !
/******************************************************************/ ! /* An input arg may be the same as an output arg. */ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! .section ".text" ! ! .global submod_p109 ! ! .align 4 ! submod_p109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! /******************************************************************/ ! /** A & B are loaded into registers for efficiency and to deal **/ ! /** with the case where an input arg is the same as an output **/ ! /** arg. The registers holding A also become the TEMP registers. **/ ! /******************************************************************/ ! ! ld [%i1],%l0 ! l0 <- A[0] ! ld [%i1+4],%l1 ! l1 <- A[1] ! ld [%i1+8],%l2 ! l2 <- A[2] ! ld [%i1+12],%l3 ! l3 <- A[3] ! ! ld [%i2],%l4 ! l4 <- B[0] ! ld [%i2+4],%l5 ! l5 <- B[1] ! ld [%i2+8],%l6 ! l6 <- B[2] ! ld [%i2+12],%l7 ! l7 <- B[3] ! ! /******************************************************************/ ! /** Is B > A? **/ ! /******************************************************************/ ! ! cmp %l7,%l3 ! B[3] > A[3]? ! bgu .APLUSP ! yes, B[3] > A[3] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! cmp %l6,%l2 ! B[2] > A[2]? ! bgu .APLUSP ! yes, B[2] > A[2] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! cmp %l5,%l1 ! B[1] > A[1]? ! bgu .APLUSP ! yes, B[1] > A[1] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! cmp %l4,%l0 ! B[0] > A[0]? ! bgu .APLUSP ! yes, B[0] > A[0] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! /******************************************************************/ ! /** Here when B = A; RES <- 0 **/ ! /******************************************************************/ ! ! mov 0,%l0 ! st %l0,[%i0] ! RES[0] <- 0 ! st %l0,[%i0+4] ! RES[1] <- 0 ! st %l0,[%i0+8] ! 
RES[2] <- 0 ! st %l0,[%i0+12] ! RES[3] <- 0 ! ! ret ! restore ! ! /**********************************/ ! /** TEMP <- A+p **/ ! /**********************************/ ! /** ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! .APLUSP: ! ! set 0x6D9FB599,%o0 ! set 0x0B5B521E,%o1 ! set 0x79792B38,%o2 ! set 0x00001BD5,%o3 ! ! addcc %l0,%o0,%l0 ! temp[0] <- A[0] + p0 ! addxcc %l1,%o1,%l1 ! temp[1] <- A[1] + p1 + carry ! addxcc %l2,%o2,%l2 ! temp[2] <- A[2] + p2 + carry ! addx %l3,%o3,%l3 ! temp[3] <- A[3] + p3 + carry ! ! /**********************************/ ! /** TEMP <- TEMP-B **/ ! /**********************************/ ! /** ! Note: TEMP/l0-l3 holds A if A > B ! TEMP/l0-l3 holds A+p if B >= A ! **/ ! ! .AMINUSB: ! ! subcc %l0,%l4,%l0 ! TEMP[0] <- TEMP[0] - B[0] ! st %l0,[%i0] ! RES[0] <- TEMP[0] ! subxcc %l1,%l5,%l1 ! TEMP[1] <- TEMP[1] - B[1] + carry ! st %l1,[%i0+4] ! RES[1] <- TEMP[1] ! subxcc %l2,%l6,%l2 ! TEMP[2] <- TEMP[2] - B[2] + carry ! st %l2,[%i0+8] ! RES[2] <- TEMP[2] ! subx %l3,%l7,%l3 ! TEMP[3] <- TEMP[3] - B[3] + carry ! st %l3,[%i0+12] ! RES[3] <- TEMP[3] ! ! ret ! restore --- 1,138 ---- ! /******************************************************************/ ! /* submod_p109.s */ ! /* Chris Monico, 1/5/00 */ ! /* This is code to compute the difference of two 109-bit integers */ ! /* modulo a fixed 109-bit prime, p. Specifically, the fixed prime */ ! /* is the number over which the Certicom ECCP-109 Challenge curve */ ! /* is defined: */ ! /* p = 1BD5 79792B38 0B5B521E 6D9FB599 */ ! /******************************************************************/ ! /* Chris Monico remarks: */ ! /* There is certainly a better way to do this, but I do not want */ ! /* to be bothered with all the signed/unsigned issues right now, */ ! /* so we'll proceed as follows: */ ! /* 1) Compute A+p */ ! /* 2) Subtract B from (A+p) and reduce the result if necessary */ !
/* This insures that the difference computed is always positive */ ! /******************************************************************/ ! /* An input arg may be the same as an output arg. */ ! /******************************************************************/ ! /** **/ ! /** This SPARC implementation by Quentin Campbell, 14/Jun/2001 **/ ! /** **/ ! /******************************************************************/ ! ! /** ! ARGS: ! RES: %i0 ! A: %i1 ! B: %i2 ! **/ ! ! /*.section ".text"*/ ! ! .global submod_p109 ! ! .align 4 ! submod_p109: ! ! save %sp,-(64+4+24) & -8,%sp ! create new stack frame & register window ! ! /******************************************************************/ ! /** A & B are loaded into registers for efficiency and to deal **/ ! /** with the case where an input arg is the same as an output **/ ! /** arg. The registers holding A also become the TEMP registers. **/ ! /******************************************************************/ ! ! ld [%i1],%l0 ! l0 <- A[0] ! ld [%i1+4],%l1 ! l1 <- A[1] ! ld [%i1+8],%l2 ! l2 <- A[2] ! ld [%i1+12],%l3 ! l3 <- A[3] ! ! ld [%i2],%l4 ! l4 <- B[0] ! ld [%i2+4],%l5 ! l5 <- B[1] ! ld [%i2+8],%l6 ! l6 <- B[2] ! ld [%i2+12],%l7 ! l7 <- B[3] ! ! /******************************************************************/ ! /** Is B > A? **/ ! /******************************************************************/ ! ! cmp %l7,%l3 ! B[3] > A[3]? ! bgu .APLUSP ! yes, B[3] > A[3] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! cmp %l6,%l2 ! B[2] > A[2]? ! bgu .APLUSP ! yes, B[2] > A[2] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! cmp %l5,%l1 ! B[1] > A[1]? ! bgu .APLUSP ! yes, B[1] > A[1] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! cmp %l4,%l0 ! B[0] > A[0]? ! bgu .APLUSP ! yes, B[0] > A[0] ! nop ! blu .AMINUSB ! no, A > B ! nop ! ! /******************************************************************/ ! /** Here when B = A; RES <- 0 **/ ! /******************************************************************/ ! ! mov 0,%l0 ! 
st %l0,[%i0] ! RES[0] <- 0 ! st %l0,[%i0+4] ! RES[1] <- 0 ! st %l0,[%i0+8] ! RES[2] <- 0 ! st %l0,[%i0+12] ! RES[3] <- 0 ! ! ret ! restore ! ! /**********************************/ ! /** TEMP <- A+p **/ ! /**********************************/ ! /** ! Remember: This is another place where we have hardcoded ! the modulus p = 1BD5 79792B38 0B5B521E 6D9FB599 ! **/ ! ! .APLUSP: ! ! set 0x6D9FB599,%o0 ! set 0x0B5B521E,%o1 ! set 0x79792B38,%o2 ! set 0x00001BD5,%o3 ! ! addcc %l0,%o0,%l0 ! temp[0] <- A[0] + p0 ! addxcc %l1,%o1,%l1 ! temp[1] <- A[1] + p1 + carry ! addxcc %l2,%o2,%l2 ! temp[2] <- A[2] + p2 + carry ! addx %l3,%o3,%l3 ! temp[3] <- A[3] + p3 + carry ! ! /**********************************/ ! /** TEMP <- TEMP-B **/ ! /**********************************/ ! /** ! Note: TEMP/l0-l3 holds A if A > B ! TEMP/l0-l3 holds A+p if B >= A ! **/ ! ! .AMINUSB: ! ! subcc %l0,%l4,%l0 ! TEMP[0] <- TEMP[0] - B[0] ! st %l0,[%i0] ! RES[0] <- TEMP[0] ! subxcc %l1,%l5,%l1 ! TEMP[1] <- TEMP[1] - B[1] + carry ! st %l1,[%i0+4] ! RES[1] <- TEMP[1] ! subxcc %l2,%l6,%l2 ! TEMP[2] <- TEMP[2] - B[2] + carry ! st %l2,[%i0+8] ! RES[2] <- TEMP[2] ! subx %l3,%l7,%l3 ! TEMP[3] <- TEMP[3] - B[3] + carry ! st %l3,[%i0+12] ! RES[3] <- TEMP[3] ! ! ret ! restore diff -cr eccp109-132-2/eccp109/logerr.c sos4/eccp109/logerr.c *** eccp109-132-2/eccp109/logerr.c Sat Dec 22 03:16:08 2001 --- sos4/eccp109/logerr.c Mon Oct 7 21:04:43 2002 *************** *** 1,14 **** ! #include ! #include ! #include ! ! ! int logError(FILE *fp, char *str, int errNum) ! { time_t now; ! ! time(&now); ! fprintf(fp, "%s\t%s\n", asctime(localtime(&now)), str); ! if (errNum) ! fprintf(fp, " %s\n", strerror(errNum)); ! return 1; ! } --- 1,15 ---- ! #include ! #include ! #include ! #include ! char *sys_errlist[]; ! ! int logError(FILE *fp, char *str, int errNum) ! { time_t now; ! ! time(&now); ! fprintf(fp, "%s\t%s\n", asctime(localtime(&now)), str); ! if (errNum) ! fprintf(fp, " %s\n", sys_errlist[errNum]); ! return 1; ! }