! Copyright distributed.net 2003 - All Rights Reserved
! For use in distributed.net projects only.
! Any other distribution or use of this source violates copyright.
!
! Author: Andreas Beckmann <andreasb@distributed.net>
!
! $Id: r72-anbe1.gcc.S,v 1.1.2.7 2004/03/12 07:13:02 snikkel Exp $
!
! This 1 pipeline core is optimized for a low instruction count by storing
! parts of S[] in registers to avoid ld/st S[i] pairs.
! This works best for non-superscalar CPUs and/or CPUs with a small instruction
! cache. The main loop (1020 instructions) is slightly smaller than 4 KByte.
! Tested on a SS5 clone with a microSPARC II processor and 8 KB L1I.
!

// use the following command to build r72-anbe1.s from this file:
// (heavy use of macros requires the GNU cpp for preprocessing)
//     gcc -E -P r72-anbe1.gcc.S |
//       perl -pe 's/\s*;\s*/\n\t/g; s/^\s*(\S*:)\s*/$1\n\t/gm; 
//                 s/^(\s+(!+\s*)?\w+\b)\s+(?!$)/$1\t/gm; 
//                 s/^\s+$/\n/gm; s/\n\n+/\n/gm; 
//                 $a=/^$/; $b=$a*($b+$a); $_="" if $b>2;' > r72-anbe1.s
 
/* The following comment is for r72-anbe1.s, generated from this file */
!
! *****  Do NOT edit THIS file, edit r72-anbe1.gcc.S instead!  *****
!
! This file was generated from r72-anbe1.gcc.S using the command
!   gcc -E -P r72-anbe1.gcc.S | perl -pe '... see r72-anbe1.gcc.S ...' > r72-anbe1.s
! (the full perl formatting expression can be viewed in r72-anbe1.gcc.S)


/* Number of keys handled per pass through the main loop: the key and the
   remaining-iterations counter are both stepped by this amount. */
#define PIPELINES	1
/* Result codes handed back to the caller in %i0. */
#define RESULT_NOTHING	1
#define RESULT_FOUND	2


/* Entry point of the core.  The function body runs from this label to the
   ret/restore pair at the end of the file; everything in between is one
   straight-line routine built from cpp macros. */
.section	".text"

! extern "C" s32 rc5_72_unit_func_anbe_1( RC5_72UnitWork *, u32 *, void *);
.global rc5_72_unit_func_anbe_1
rc5_72_unit_func_anbe_1:

! Input:
! %i0 = RC5_72UnitWork *
! %i1 = u32 *iterations
! %i2 = (unused)

! Output:
! [rc5_72unitwork]
! [iterations]
! %i0 = RESULT_{NOTHING|FOUND}

/* Byte offsets of the 32-bit members of RC5_72UnitWork as they are accessed
   by the ld/st instructions of this file.
   NOTE(review): these have to agree with the C declaration of the struct,
   which is not part of this file - confirm when the struct changes. */
#define r72unitwork_plain_hi	0
#define r72unitwork_plain_lo	4
#define r72unitwork_cypher_hi	8
#define r72unitwork_cypher_lo	12
#define r72unitwork_L0_hi	16
#define r72unitwork_L0_mid	20
#define r72unitwork_L0_lo	24
#define r72unitwork_check_count	28
#define r72unitwork_check_hi	32
#define r72unitwork_check_mid	36
#define r72unitwork_check_lo	40
/* Total size in bytes; used to reserve the stack copy of the struct. */
#define sizeof_r72unitwork	44


// Stack frame description.  All s_* macros expand to an address expression
// (base register plus constant byte offset) meant to be used inside [ ].
//
// default_stackframe is the area directly above %sp that is left untouched by
// this core: 16 words register window save area, 1 word and 6 words as
// described in the comment at the bottom of this list (92 bytes in total).
#define default_stackframe	(4*(16+1+6))

// Base address of the %sp relative part of the private data.
#define stack			%sp+default_stackframe

// stack layout:

// <<<=== %fp points here
//   <--- stack copy of r72unitwork (%fp relative, dword aligned)
// s_save_r72unitworkP: caller supplied RC5_72UnitWork pointer
// s_uw(offset):        member at byte offset 'offset' of the stack copy
// s_save_iterationsP:  caller supplied pointer to the iteration count
#define s_save_r72unitworkP	%fp-4
#define s_uw(offset)		%fp-4-sizeof_r72unitwork+(offset)
#define s_save_iterationsP	%fp-4-sizeof_r72unitwork-4
// Each of the following slots is one word, placed 4 bytes below the previous
// one: precalculated S[0], S[1], S[2], the three key words, the remaining
// iteration count and the precalculated sums and L values of round 1.
#define s_CS0			s_save_iterationsP-4
#define s_GS1			s_CS0-4
#define s_GS2			s_GS1-4
#define s_Llo			s_GS2-4
#define s_Lmid			s_Llo-4
#define s_Lhi			s_Lmid-4
#define s_iterations		s_Lhi-4
#define s_GL1S2			s_iterations-4
#define s_GS2S3			s_GL1S2-4
#define s_GL0			s_GS2S3-4
#define s_GL1			s_GL0-4
// Bytes used by the %fp relative part: the saved struct pointer, the struct
// copy and the 1+3+3+1+2+2 single word slots defined above.
#define fp_relative_size	(4+sizeof_r72unitwork+4*(1+3+3+1+2+2))
//   <--? here could be a filler for dword alignment
// Whole frame: default frame, %fp relative part, and 1+26+26 words for
// s_save_i7, s_S1[] and s_Sinit[].
#define stackframe_size		(default_stackframe+fp_relative_size+4*(1+26+26))
// s_Sinit[26]
// The index is parenthesized so that an expression argument (e.g. i-1)
// is scaled as a whole, in the same way as s_S1(i) does it.
#define s_Sinit(i)		stack+4+4*26+(4*(i))
// s_S1[26]
// The index is reduced modulo 26; adding 26 first keeps the dividend
// non-negative for arguments such as i-1.
#define s_S1(i)			stack+4+4*(((i)+26) % 26)
// <- dword aligned
#define s_save_i7		stack+0
// <<<=== 'stack' points here
// (default stackframe 6+1+16 words)
// <<<=== %sp points here


	/* Open a new register window and reserve the complete frame.  The size
	   expression pads a frame that is 4 modulo 8 up to a multiple of 8. */
	save	%sp,-(stackframe_size+(stackframe_size % 8)),%sp

	/* Park the return address and both argument pointers in the frame;
	   their registers are reused as working registers further down. */
	st	%i7,[s_save_i7]
	st	%i0,[s_save_r72unitworkP]
	st	%i1,[s_save_iterationsP]

	/* Mirror the plain text and cypher text words of the caller struct
	   into the private stack copy, one word at a time. */
	ld	[%i0+r72unitwork_plain_hi],%l0
	st	%l0,[s_uw(r72unitwork_plain_hi)]
	ld	[%i0+r72unitwork_plain_lo],%l1
	st	%l1,[s_uw(r72unitwork_plain_lo)]
	ld	[%i0+r72unitwork_cypher_hi],%l2
	st	%l2,[s_uw(r72unitwork_cypher_hi)]
	ld	[%i0+r72unitwork_cypher_lo],%l3
	st	%l3,[s_uw(r72unitwork_cypher_lo)]

	/* Same for the check_* bookkeeping words; they are updated in the
	   stack copy and written back when the function returns. */
	ld	[%i0+r72unitwork_check_count],%l4
	st	%l4,[s_uw(r72unitwork_check_count)]
	ld	[%i0+r72unitwork_check_hi],%l5
	st	%l5,[s_uw(r72unitwork_check_hi)]
	ld	[%i0+r72unitwork_check_mid],%l6
	st	%l6,[s_uw(r72unitwork_check_mid)]
	ld	[%i0+r72unitwork_check_lo],%l7
	st	%l7,[s_uw(r72unitwork_check_lo)]

/* pred3(j) yields the index preceding j in the cyclic order 0,1,2 of the
   three key words L[0..2] (0 -> 2, 1 -> 0, 2 -> 1).  The extra _pred3 level
   makes cpp expand the argument before the token paste, so the nested form
   pred3(pred3(j)) works. */
#define pred3(j)	_pred3(j)
#define _pred3(j)	pred3_##j
#define pred3_0		2
#define pred3_1		0
#define pred3_2		1

/* P is the start value of the S[] initialisation and Q its increment
   (Sinit[i] = Sinit[i-1] + Q).  P_ROTL3 is P rotated left by 3 bits; it is
   used as the round 1 value of S[0] and as the rotation count for L[0]. */
#define P		0xB7E15163
#define Q		0x9E3779B9
#define P_ROTL3		0xBF0A8B1D


/* Register allocation.  R1S<n> is the register that carries S[n] between the
   three key expansion rounds.  The t-prefixed names further down are aliases
   of some of these registers, so each alias is only usable while the S[]
   value of the aliased register is not needed. */
! about registers
! do not use %sp, %fp, %g0, %g5, %g6, %g7
! save before use: %i7
! all other registers are available for free use unless we are going to call 
!   some functions

! free registers: none

/* S[6] and S[7] share their registers with the cypher words A and B, which
   are only needed in round 3. */
#define R1S6		R1A // only round 1/2
#define R1S7		R1B // only round 1/2
/* perhaps another 2 instructions could be gained if R1A and R1B would be used
   for R1S0 and R1S1 and plain_{lo,hi} loaded into a temp and added to R1{A,B}
   after they have completed their ROUND3 calculation */

#define R1S8		%l6
#define R1S9		%l7
#define R1S10		%i0 // do not reuse before mainloop
#define R1S11		%i1 // do not reuse before mainloop
#define R1S12		%i2
#define R1S13		%i3
#define R1S14		%i4
#define R1S15		%i5
#define R1S16		%i7
#define R1S17		%o0
#define R1S18		%o1
#define R1S19		%o2
#define R1S20		%o3
#define R1S21		%o4
#define R1S22		%o5
#define R1S23		%o7
#define R1S24		%l5

/* Aliases used by the test/increment code after round 3: cypher_lo, the
   remaining iteration count, the three key words and two precalculated sums. */
#define tRGcypher_lo	R1S10
#define tRGiterations	R1S12
#define tRGLlo		R1S13
#define tRGLmid		R1S14
#define	tRGLhi		R1S15
#define tRGL1S2		R1S16
#define tRGS2S3		R1S17

/* Aliases used while (re)calculating the key dependent round 1 values. */
#define tRCS0		R1S18
#define tRGS1		R1S19
#define tRGS2		R1S20
#define tRGL0		R1S21
#define tRGL1		R1S22

/* Aliases used by KEYINIT only: running Sinit value and the constant Q. */
#define tRGSinit	R1S23
#define tRCQ		R1S24

/* R1SS holds the most recently calculated S[] value whenever that value has
   no dedicated R1S<n> register.  R1L(j) is the register holding L[j]; the
   extra _R1L level lets a macro argument such as pred3(j) expand first. */
#define R1SS		%g1
#define R1L(j)		_R1L(j)
#define _R1L(j)		R1L_##j
#define R1L_0		%l0
#define R1L_1		%l1
#define R1L_2		%l2

/* Cypher words A and B of the encryption in round 3. */
#define R1A		%l3
#define R1B		%l4

/* Scratch registers of the round macros. */
#define T1a		%g2
#define	T1b		%g3
#define Tc		%g4

	/* Fetch the iteration count and the three words of the start key from
	   the caller (%i0 = unit work struct, %i1 = iteration count pointer)
	   and keep them in their stack slots.  The main loop works on these
	   stack copies; the key words are written back at finish. */
	ld	[%i1],tRGiterations
	ld	[%i0+r72unitwork_L0_hi],tRGLhi
	ld	[%i0+r72unitwork_L0_mid],tRGLmid
	ld	[%i0+r72unitwork_L0_lo],tRGLlo

	st	tRGLlo,[s_Llo]
	st	tRGLmid,[s_Lmid]
	st	tRGLhi,[s_Lhi]
	st	tRGiterations,[s_iterations]

	/* The main loop tests a key before it decrements and checks the
	   counter, so a count of 0 would wrap around and search 2^32 keys.
	   Leave with RESULT_NOTHING instead.  Everything the exit code reads
	   (key words, check_* words, saved pointers, saved %i7) has already
	   been stored in the frame at this point. */
	tst	tRGiterations
	bz	end_of_mainloop
	 nop

/* Fill the key independent table s_Sinit[0..25]: entry 0 is P, every further
   entry is its predecessor plus Q.  Done once per call, before the main loop. */
KEYINIT:
!! S[i] = P + i*Q
!! Sinit[0] = P
!! Sinit[i] = Sinit[i-1]+Q

/* KEY_INIT(i): advance the running value in tRGSinit by tRCQ and store it
   as s_Sinit[i].  Must be expanded with ascending i, right after entry i-1. */
#define KEY_INIT(i) 				\
	add	tRGSinit,tRCQ,tRGSinit		;\
	st	tRGSinit,[s_Sinit(i)]		;\

	set	P,tRGSinit
	set	Q,tRCQ
	st	tRGSinit,[s_Sinit(0)]
	
        KEY_INIT(1)
        KEY_INIT(2)
        KEY_INIT(3)
        KEY_INIT(4)
        KEY_INIT(5)
        KEY_INIT(6)
        KEY_INIT(7)
        KEY_INIT(8)
        KEY_INIT(9)
        KEY_INIT(10)
        KEY_INIT(11)
        KEY_INIT(12)
        KEY_INIT(13)
        KEY_INIT(14)
        KEY_INIT(15)
        KEY_INIT(16)
        KEY_INIT(17)
        KEY_INIT(18)
        KEY_INIT(19)
        KEY_INIT(20)
        KEY_INIT(21)
        KEY_INIT(22)
        KEY_INIT(23)
        KEY_INIT(24)
        KEY_INIT(25)

/* Precalculation of the first steps of round 1.  These values depend only on
   the low and middle key words, which change only when the key increment
   carries out of the high word, so they are kept in stack slots and reused
   by every pass through the main loop. */
! preset constant values: s0
ROUND1_S_i0_j0:
	!! S[0] = ROTL3(S[0])
	set	P_ROTL3,tRCS0
	st	tRCS0,[s_CS0]

! precalculate seldom changing values in ROUND1: s1, s2, l0, l1

/* CALCULATE_RGLlo_DEPS(x): recalculate everything that depends on the low
   key word.  Reads s_CS0, s_Llo and s_Sinit[1]; writes
     s_GL0 = (Llo + CS0) rotated left by (P_ROTL3 % 32) bits
     s_GS1 = ROTL3(Sinit[1] + CS0 + GL0)
   x is only pasted into the label to keep it unique per expansion.
   Clobbers tRCS0, tRGLlo, R1L(0), T1a, tRGL0, Tc, R1SS and tRGS1. */
#define CALCULATE_RGLlo_DEPS(x) 			\
CALCULATE_RGLlo_DEPS_##x:				\
	/* ROUND1L(0,0) */				\
	/* !! L[0] = ROTL(L[0]+S[0],S[0]); */		\
	ld	[s_CS0],tRCS0				;\
	ld	[s_Llo],tRGLlo				;\
	add	tRGLlo,tRCS0,R1L(0)			;\
	sll	R1L(0),(P_ROTL3 % 32),T1a		;\
	srl	R1L(0),(32 - (P_ROTL3 % 32)),R1L(0)	;\
	or	R1L(0),T1a,tRGL0			;\
	st	tRGL0,[s_GL0]				;\
							\
	/* ROUND1S(1,1) */				\
	/* !! S[1] = ROTL3(S[1]+(S[0]+L[0])); */	\
	ld	[s_Sinit(1)],Tc			/* precalc S[0]+S[1] ? */	;\
	add	tRCS0,tRGL0,R1SS			;\
	add	R1SS,Tc,R1SS			/* precalc S[0]+S[1] ? */	;\
	sll	R1SS,3,T1a				;\
	srl	R1SS,29,R1SS				;\
	or	R1SS,T1a,tRGS1				;\
	st	tRGS1,[s_GS1] /*###*/			;\

/* CALCULATE_RGLmid_DEPS(x): recalculate everything that depends on the
   middle key word.  Reads s_GS1, s_Lmid, s_GL0, s_Sinit[2] and s_Sinit[3];
   writes
     s_GL1   = ROTL(Lmid + (GS1 + GL0), GS1 + GL0)
     s_GS2   = ROTL3(Sinit[2] + GS1 + GL1)
     s_GL1S2 = GL1 + GS2
     s_GS2S3 = GS2 + Sinit[3]
   The variable rotate is built from sll, neg and srl; this relies on the
   shift instructions using only the low 5 bits of the count register.
   x is only pasted into the label.  Clobbers tRGS1, tRGLmid, tRGL0, T1a,
   T1b, R1L(1), tRGL1, Tc, R1SS, tRGS2, tRGL1S2 and tRGS2S3. */
#define CALCULATE_RGLmid_DEPS(x)			\
CALCULATE_RGLmid_DEPS_##x:				\
	/* ROUND1L(1,1) */				\
	/* !! L[1] = ROTL(L[1]+(S[1]+L[0]),(S[1]+L[0]));*/	\
	ld	[s_GS1],tRGS1 /*###*/			;\
	ld	[s_Lmid],tRGLmid			;\
	ld	[s_GL0],tRGL0				;\
	add	tRGS1,tRGL0,T1a				;\
	add	tRGLmid,T1a,R1L(1)			;\
	sll	R1L(1),T1a,T1b				;\
	neg	T1a					;\
	srl	R1L(1),T1a,R1L(1)			;\
	or	R1L(1),T1b,tRGL1			;\
	st	tRGL1,[s_GL1]				;\
							\
	/* ROUND1S(2,2) */				\
	/* !! S[2] = ROTL3(S[2]+(S[1]+L[1])); */	\
	ld	[s_Sinit(2)],Tc				;\
	add	tRGS1,tRGL1,R1SS			;\
	add	R1SS,Tc,R1SS				;\
	sll	R1SS,3,T1a				;\
	srl	R1SS,29,R1SS				;\
	or	R1SS,T1a,tRGS2				;\
	ld	[s_Sinit(3)],Tc				;\
	add	tRGL1,tRGS2,tRGL1S2			;\
	add	tRGS2,Tc,tRGS2S3			;\
	st	tRGS2,[s_GS2] /*###*/			;\
	st	tRGL1S2,[s_GL1S2]			;\
	st	tRGS2S3,[s_GS2S3]			;\

/* Initial precalculation for the start key.  The low word part has to come
   first because the middle word part reads s_GS1 and s_GL0. */
CALCULATE_RGLlo_DEPS(init)
CALCULATE_RGLmid_DEPS(init)

! ##########################################################################

	/* Jump over whatever padding the alignment directive inserts, so that
	   the padding is never executed.  The nop fills the branch delay slot.
	   On entry to mainloop tRGLhi still holds the high key word loaded
	   before KEYINIT. */
	b	mainloop
	nop
.align	16
mainloop:

! ROUND 1 - key expansion

/* Front-end macros of round 1.  i is the index of the S[] element produced
   by the S step.  From j they derive the two L registers involved:
   Lcurr = pred3(j) is updated by the L step and then consumed by the S step,
   Lpred = pred3(pred3(j)) only contributes to the sum that is added to L and
   used as rotation count.
   Variants:
     ROUND1_START  S step only, previous and new S[] value in R1SS
     ROUND1_prec   like ROUND1, but the old L value is taken from inR1Lj
     ROUND1        previous and new S[] value in R1SS
     ROUND1_MR     previous S[] value in R1SS, new one in register R1SSout
     ROUND1_RR     previous S[] value in register R1SSin, new one in R1SSout
     ROUND1_RM     previous S[] value in register R1SSin, new one in R1SS
     ROUND1_END    L step only
   Whenever the previous S[] value lives in R1SS, the macro also stores it
   to s_S1[i-1].  ROUND1_START and ROUND1 are not expanded in the code shown
   in this file.  All variants clobber T1a, T1b and (except the L only one) Tc. */
#define ROUND1_START(i,j)       ROUND1_S(i,j,pred3(pred3(j)),pred3(j))
#define ROUND1_prec(i,j,inR1Lj) ROUND1_L_S(i,j,inR1Lj,pred3(pred3(j)),pred3(j))
#define ROUND1(i,j)             ROUND1_L_S(i,j,R1L(pred3(j)),pred3(pred3(j)),pred3(j))
#define ROUND1_MR(i,j,R1SSout)  ROUND1_L_S_MR(i,j,R1L(pred3(j)),R1SSout,pred3(pred3(j)),pred3(j))
#define ROUND1_RR(i,j,R1SSin,R1SSout)  ROUND1_L_S_RR(i,j,R1L(pred3(j)),R1SSin,R1SSout,pred3(pred3(j)),pred3(j))
#define ROUND1_RM(i,j,R1SSin)   ROUND1_L_S_RM(i,j,R1L(pred3(j)),R1SSin,pred3(pred3(j)),pred3(j))
#define ROUND1_END(i,j)         ROUND1_L(i,j,pred3(pred3(j)),pred3(j))

/* S step: R1SS = ROTL3(R1SS + L[Lcurr] + Sinit[i]).  The result stays in
   R1SS; the macro expanded next is expected to store it. */
#define ROUND1_S(i,j,Lpred,Lcurr)		\
ROUND1_S_i##i##_j##j:				\
/*S*/	ld	[s_Sinit(i)],Tc			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	add	R1SS,Tc,R1SS			;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	/* st	R1SS,[s_S1(i)] */ /* to next */	;\

/* L step followed by S step, S[] value in R1SS before and after:
     t        = R1SS + L[Lpred]
     L[Lcurr] = ROTL(inR1Lj + t, t)
     s_S1[i-1] = R1SS
     R1SS     = ROTL3(R1SS + Sinit[i] + L[Lcurr])
   The rotate by t uses sll, neg and srl with the full register as count. */
#define ROUND1_L_S(i,j,inR1Lj,Lpred,Lcurr)	\
ROUND1_L_S_i##i##_j##j:				\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*L*/	/* add	R1L(Lcurr),T1a,R1L(Lcurr) */	;\
/*L*/	add	inR1Lj,T1a,R1L(Lcurr)		;\
/*S*/	st	R1SS,[s_S1(i-1)] /* from prev */;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*S*/	ld	[s_Sinit(i)],Tc			;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	add	R1SS,Tc,R1SS			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	/* st	R1SS,[s_S1(i)] */ /* to next */	;\

/* Same as ROUND1_L_S, except that the new S[i] is produced in outR1SS and
   R1SS is left holding S[i-1] (which has been stored to s_S1[i-1]). */
#define ROUND1_L_S_MR(i,j,inR1Lj,outR1SS,Lpred,Lcurr)	\
ROUND1_L_S_MR_i##i##_j##j:			\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*L*/	/* add	R1L(Lcurr),T1a,R1L(Lcurr) */	;\
/*L*/	add	inR1Lj,T1a,R1L(Lcurr)		;\
/*S*/	st	R1SS,[s_S1(i-1)] /* from prev */;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*S*/	ld	[s_Sinit(i)],Tc			;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	add	R1SS,Tc,outR1SS			;\
/*S*/	add	outR1SS,R1L(Lcurr),outR1SS	;\
/*S*/	sll	outR1SS,3,T1a			;\
/*S*/	srl	outR1SS,29,outR1SS		;\
/*S*/	or	outR1SS,T1a,outR1SS		;\
/*S*/	/* st	outR1SS,[s_S1(i)] */ /* to next */ ;\

/* Register to register variant: S[i-1] is read from inR1SS (and not stored
   to memory), S[i] is produced in outR1SS.  inR1SS is left unchanged. */
#define ROUND1_L_S_RR(i,j,inR1Lj,inR1SS,outR1SS,Lpred,Lcurr)	\
ROUND1_L_S_RR_i##i##_j##j:				\
/*L*/	add	inR1SS,R1L(Lpred),T1a		;\
/*L*/	/* add	R1L(Lcurr),T1a,R1L(Lcurr) */	;\
/*L*/	add	inR1Lj,T1a,R1L(Lcurr)		;\
/*S*/	/* st	inR1SS,[s_S1(i-1)]*/ /* from prev */;/****/\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*S*/	ld	[s_Sinit(i)],Tc			;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	add	inR1SS,Tc,outR1SS			;\
/*S*/	add	outR1SS,R1L(Lcurr),outR1SS	;\
/*S*/	sll	outR1SS,3,T1a			;\
/*S*/	srl	outR1SS,29,outR1SS		;\
/*S*/	or	outR1SS,T1a,outR1SS		;\
/*S*/	/* st	outR1SS,[s_S1(i)] */ /* to next */ ;\

/* S[i-1] is read from inR1SS (not stored), S[i] is produced in R1SS so that
   the macro expanded next can store it. */
#define ROUND1_L_S_RM(i,j,inR1Lj,inR1SS,Lpred,Lcurr)	\
ROUND1_L_S_RM_i##i##_j##j:				\
/*L*/	add	inR1SS,R1L(Lpred),T1a		;\
/*L*/	/* add	R1L(Lcurr),T1a,R1L(Lcurr) */	;\
/*L*/	add	inR1Lj,T1a,R1L(Lcurr)		;\
/*S*/	/* st	inR1SS,[s_S1(i-1)] */ /* from prev */;/****/\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*S*/	ld	[s_Sinit(i)],Tc			;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	add	inR1SS,Tc,R1SS			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	/* st	R1SS,[s_S1(i)] */ /* to next */	;\

/* Final L step of the round: L[Lcurr] = ROTL(L[Lcurr] + t, t) with
   t = R1SS + L[Lpred]; also stores R1SS to s_S1[i-1].  R1SS is unchanged. */
#define ROUND1_L(i,j,Lpred,Lcurr)		\
ROUND1_L_i##i##_j##j:				\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*S*/	st	R1SS,[s_S1(i-1)] /* from prev */;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\

	/* Round 1 of the key expansion for the current key.  S[0..2], L[0],
	   L[1] and the sums GL1S2 and GS2S3 are taken from the precalculated
	   stack slots, so the per-key work starts with L[2] and S[3]. */
	! Precalculated:
	!! S[0] = ROTL3(S[0]);
	!  RCS0 = RCP_ROTL3
	!! L[0] = ROTL(L[0]+S[0],S[0]);
	!  RGL0 = ROTL(RGLlo+RCS0,P_ROTL3)
	!! S[1] = ROTL3(S[1]+(S[0]+L[0]));
	!  RGS1 = ROTL3(Sinit[1]+RCS0+RGL0)
	!! L[1] = ROTL(L[1]+(S[1]+L[0]),(S[1]+L[0]));
	!  RGL1 = ROTL(RGLmid+(RGS1+RGL0),(RGS1+RGL0))
	!! S[2] = ROTL3(S[2]+S[1]+L[1]);
	!  RGS2 = ROTL3(Sinit[2]+RGS1+RGL1)
	!  RGL1S2 = RGL1+RGS2
	!  RGS2S3 = RGS2+Sinit[3]

	/* Hand written L step for L[2]: tRGLhi is expected to be valid on
	   entry (loaded before the loop resp. reloaded before the cypher_lo
	   test of the previous pass); the result goes to R1L(2). */
	/* ROUND1L(2,2) */
!INLINE_ROUND1_L_i2_j2:
	!! L[2] = ROTL(L[2]+(S[2]+L[1]),(S[2]+L[1]));
	!! L[2] = ROTL(RGLhi+(RGL1S2),(RGL1S2));
	!ld	[s_Lhi],tRGLhi ! keep tRGLhi from end of last iteration
	ld	[s_GL1S2],tRGL1S2
/*2-2*/	add	tRGLhi,tRGL1S2,Tc
/*2-2*/	sll	Tc,tRGL1S2,T1b
/*2-2*/	neg	tRGL1S2,T1a			// constant, but slot is free anyway
/*2-2*/	srl	Tc,T1a,R1L(2)
/*2-2*/	or	R1L(2),T1b,R1L(2)

	/* Hand written S step for S[3], using the precalculated sum of S[2]
	   and Sinit[3]; the result is left in R1SS. */
INLINE_ROUND1_S_i3_j0:
	!! S[3] = ROTL3(S[3]+S[2]+L[2]);
	!! S[3] = ROTL3(RGS2S3+L[2]);
	ld	[s_GS2S3],tRGS2S3
/*S*/	add	tRGS2S3,R1L(2),R1SS
/*S*/	sll	R1SS,3,T1a
/*S*/	srl	R1SS,29,R1SS
/*S*/	or	R1SS,T1a,R1SS
/*S*/	/* st	R1SS,[s_S1(i)] */ /* to next */

	!! S[i] = ROTL3(S[i]+S[i-1]+L[j-1])
	!! L[j] = ROTL(L[j]+(S[i]+L[j-1]),(S[i]+L[j-1]))
	
	!! the ROUND1*(i,j,*) macros calculate the values reordered:
	!! L[j-1] = ROTL(L[j-1]+(S[i-1]+L[j-2]),(S[i-1]+L[j-2]))
	!! S[i]   = ROTL3(S[i]+S[i-1]+L[j-1])

	/* The first two steps take the old L[0] and L[1] from the precalculated
	   slots instead of the L registers.  S[3], S[4] and S[5] pass through
	   R1SS and are stored to s_S1[]; S[6] to S[24] stay in their R1S<n>
	   registers; S[25] ends up in R1SS and is stored by ROUND1_END, which
	   also performs the last L step of the round. */
	ld	[s_GL0],tRGL0
	ROUND1_prec(4,1,tRGL0)
	ld	[s_GL1],tRGL1
	ROUND1_prec(5,2,tRGL1)
	ROUND1_MR(6,0,       R1S6)
	ROUND1_RR(7,1, R1S6, R1S7)
	ROUND1_RR(8,2, R1S7, R1S8)
	ROUND1_RR(9,0, R1S8, R1S9)
	ROUND1_RR(10,1,R1S9, R1S10)
	ROUND1_RR(11,2,R1S10,R1S11)
	ROUND1_RR(12,0,R1S11,R1S12)
	ROUND1_RR(13,1,R1S12,R1S13)
	ROUND1_RR(14,2,R1S13,R1S14)
	ROUND1_RR(15,0,R1S14,R1S15)
	ROUND1_RR(16,1,R1S15,R1S16)
	ROUND1_RR(17,2,R1S16,R1S17)
	ROUND1_RR(18,0,R1S17,R1S18)
	ROUND1_RR(19,1,R1S18,R1S19)
	ROUND1_RR(20,2,R1S19,R1S20)
	ROUND1_RR(21,0,R1S20,R1S21)
	ROUND1_RR(22,1,R1S21,R1S22)
	ROUND1_RR(23,2,R1S22,R1S23)
	ROUND1_RR(24,0,R1S23,R1S24)
	ROUND1_RM(25,1,R1S24)
	ROUND1_END(26,2)

! ##########################################################################

! ROUND 2 - key expansion

/* Front-end macros of round 2.  Same parameter scheme as in round 1:
   Lcurr = pred3(j) is updated by the L step and consumed by the S step,
   Lpred = pred3(pred3(j)) only contributes to the rotation sum.  The
   difference to round 1 is that the S step adds the round 1 value of S[i]
   instead of Sinit[i], and that the L step always uses the L register.
   The _prec forms read the round 1 value from an explicitly named slot
   (loadS1i), the others from s_S1[i] or from the R1S<n> register.
   ROUND2_START is not expanded in the code shown in this file. */
#define ROUND2_START_prec(i,j,loadS1i)   ROUND2_S(i,j,loadS1i,pred3(pred3(j)),pred3(j))
#define ROUND2_START(i,j)        ROUND2_S(i,j,s_S1(i),pred3(pred3(j)),pred3(j))
#define ROUND2_prec(i,j,loadS1i) ROUND2_L_S(i,j,loadS1i,pred3(pred3(j)),pred3(j))
#define ROUND2(i,j)              ROUND2_L_S(i,j,s_S1(i),pred3(pred3(j)),pred3(j))
#define ROUND2_MR(i,j,outR1SS)   ROUND2_L_S_MR(i,j,s_S1(i),outR1SS,pred3(pred3(j)),pred3(j))
#define ROUND2_RR(i,j,inR1SS,outR1SS)   ROUND2_L_S_RR(i,j,s_S1(i),inR1SS,outR1SS,pred3(pred3(j)),pred3(j))
#define ROUND2_RM(i,j,inR1SS)    ROUND2_L_S_RM(i,j,s_S1(i),inR1SS,pred3(pred3(j)),pred3(j))
#define ROUND2_END(i,j)          ROUND2_L(i,j,pred3(pred3(j)),pred3(j))

/* S step only: R1SS = ROTL3(R1SS + L[Lcurr] + [loadS1i]); the result is
   kept in R1SS and stored to s_S1[i].  Clobbers T1a. */
#define ROUND2_S(i,j,loadS1i,Lpred,Lcurr)	\
ROUND2_S_i##i##_j##j:				\
/*S*/	ld	[loadS1i],T1a			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	add	R1SS,T1a,R1SS			;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	st	R1SS,[s_S1(i)]			;\

/* L step followed by S step, S[] value in R1SS before and after:
     t        = R1SS + L[Lpred]
     L[Lcurr] = ROTL(L[Lcurr] + t, t)
     R1SS     = ROTL3(R1SS + L[Lcurr] + [loadS1i])
     s_S1[i]  = R1SS
   Clobbers T1a and T1b. */
#define ROUND2_L_S(i,j,loadS1i,Lpred,Lcurr)	\
ROUND2_L_S_i##i##_j##j:				\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	ld	[loadS1i],T1a			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	add	R1SS,T1a,R1SS			;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	st	R1SS,[s_S1(i)]			;\

/* S[i-1] is read from R1SS; outR1SS holds the round 1 value of S[i] on
   entry and the round 2 value on exit.  Nothing is loaded from or stored to
   s_S1[]; loadS1i is unused.  Clobbers T1a and T1b. */
#define ROUND2_L_S_MR(i,j,loadS1i,outR1SS,Lpred,Lcurr)	\
ROUND2_L_S_MR_i##i##_j##j:			\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	/* ld	[loadS1i],T1a */ /*****/	;\
/*S*/	/* add	R1SS,T1a,outR1SS */ /*****/	;\
/*S*/	add	R1SS,outR1SS,outR1SS		;\
/*S*/	add	outR1SS,R1L(Lcurr),outR1SS	;\
/*S*/	sll	outR1SS,3,T1a			;\
/*S*/	srl	outR1SS,29,outR1SS		;\
/*S*/	or	outR1SS,T1a,outR1SS		;\
/*S*/	/* st	outR1SS,[s_S1(i)] */ /*****/	;\

/* Register to register variant: S[i-1] is read from inR1SS; outR1SS holds
   the round 1 value of S[i] on entry and the round 2 value on exit.  No
   memory access at all; loadS1i is unused.  Clobbers T1a and T1b. */
#define ROUND2_L_S_RR(i,j,loadS1i,inR1SS,outR1SS,Lpred,Lcurr)	\
ROUND2_L_S_RR_i##i##_j##j:			\
/*L*/	add	inR1SS,R1L(Lpred),T1a		;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	/* ld	[loadS1i],T1a */ /*****/	;\
/*S*/	/* add	inR1SS,T1a,outR1SS */ /*****/	;\
/*S*/	add	inR1SS,outR1SS,outR1SS		;\
/*S*/	add	outR1SS,R1L(Lcurr),outR1SS	;\
/*S*/	sll	outR1SS,3,T1a			;\
/*S*/	srl	outR1SS,29,outR1SS		;\
/*S*/	or	outR1SS,T1a,outR1SS		;\
/*S*/	/* st	outR1SS,[s_S1(i)] */ /*****/	;\

/* S[i-1] is read from inR1SS, the round 1 value of S[i] from [loadS1i];
   the round 2 value is produced in R1SS and stored to s_S1[i].
   Clobbers T1a and T1b. */
#define ROUND2_L_S_RM(i,j,loadS1i,inR1SS,Lpred,Lcurr)	\
ROUND2_L_S_RM_i##i##_j##j:			\
/*L*/	add	inR1SS,R1L(Lpred),T1a		;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	ld	[loadS1i],T1a			;\
/*S*/	add	inR1SS,R1L(Lcurr),R1SS		;\
/*S*/	add	R1SS,T1a,R1SS			;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	st	R1SS,[s_S1(i)]			;\

/* Final L step of the round: L[Lcurr] = ROTL(L[Lcurr] + t, t) with
   t = R1SS + L[Lpred].  R1SS is unchanged.  Clobbers T1a and T1b. */
#define ROUND2_L(i,j,Lpred,Lcurr)		\
ROUND2_L_i##i##_j##j:				\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\

	/* Round 2 of the key expansion.  On entry R1SS holds S[25] of round 1
	   and R1L(0..2) the L values after the last L step of round 1. */
	!! S[i] = ROTL3(S[i]+S[i-1]+L[j-1])
	!! L[j] = ROTL(L[j]+(S[i]+L[j-1]),(S[i]+L[j-1]))
	
	!! the ROUND2*(i,j,*) macros calculate the values reordered:
	!! L[j-1] = ROTL(L[j-1]+(S[i-1]+L[j-2]),(S[i-1]+L[j-2]))
	!! S[i]   = ROTL3(S[i]+S[i-1]+L[j-1])

	/* The round 1 values of S[0], S[1] and S[2] are read from the
	   precalculated slots s_CS0, s_GS1 and s_GS2; the round 2 results of
	   S[0] to S[5] are written to s_S1[]. */
	ROUND2_START_prec(0,2,s_CS0)
	ROUND2_prec(1,0,s_GS1)
	ROUND2_prec(2,1,s_GS2)
	ROUND2(3,2)
	ROUND2(4,0)
	ROUND2(5,1)
	/* S[6] and S[7] are computed in the registers shared with A and B and
	   are therefore spilled to s_S1[] for round 3.  S[8] to S[24] stay in
	   their registers. */
	ROUND2_MR(6,2,       R1S6)
	st	R1S6,[s_S1(6)]
	ROUND2_RR(7,0, R1S6, R1S7)
	st	R1S7,[s_S1(7)]
	ROUND2_RR(8,1, R1S7, R1S8)
	ROUND2_RR(9,2, R1S8, R1S9)
	ROUND2_RR(10,0,R1S9, R1S10)
	ROUND2_RR(11,1,R1S10,R1S11)
	ROUND2_RR(12,2,R1S11,R1S12)
	ROUND2_RR(13,0,R1S12,R1S13)
	ROUND2_RR(14,1,R1S13,R1S14)
	ROUND2_RR(15,2,R1S14,R1S15)
	ROUND2_RR(16,0,R1S15,R1S16)
	ROUND2_RR(17,1,R1S16,R1S17)
	ROUND2_RR(18,2,R1S17,R1S18)
	ROUND2_RR(19,0,R1S18,R1S19)
	ROUND2_RR(20,1,R1S19,R1S20)
	ROUND2_RR(21,2,R1S20,R1S21)
	ROUND2_RR(22,0,R1S21,R1S22)
	ROUND2_RR(23,1,R1S22,R1S23)
	ROUND2_RR(24,2,R1S23,R1S24)
	/* S[25] is read from and written back to s_S1[25]; ROUND2_END does
	   the last L step of the round. */
	ROUND2_RM(25,0,R1S24)
	ROUND2_END(26,1)

! ##########################################################################

! ROUND 3 - key expansion and encryption

/* Front-end macros of round 3.  Same parameter scheme as in rounds 1 and 2
   (Lcurr = pred3(j), Lpred = pred3(pred3(j))).  The EVEN and ODD forms add
   one encryption half round to the L and S step; they differ only in the
   order of the two letters that select the cypher registers: EVEN updates
   R1A using R1B, ODD updates R1B using R1A.  The _MR, _RR and _RM suffixes
   have the same meaning as in round 2. */
#define ROUND3_i0(i,j)  ROUND3_S(i,j,pred3(pred3(j)),pred3(j))
#define ROUND3(i,j)     ROUND3_L_S(i,j,pred3(pred3(j)),pred3(j))
#define ROUND3EVEN(i,j) ROUND3_L_S_A(i,j,A,B,pred3(pred3(j)),pred3(j))
#define ROUND3ODD(i,j)  ROUND3_L_S_A(i,j,B,A,pred3(pred3(j)),pred3(j))
#define ROUND3EVEN_MR(i,j,outR1SS)        ROUND3_L_S_A_MR(i,j,outR1SS,A,B,pred3(pred3(j)),pred3(j))
#define ROUND3EVEN_RR(i,j,inR1SS,outR1SS) ROUND3_L_S_A_RR(i,j,inR1SS,outR1SS,A,B,pred3(pred3(j)),pred3(j))
#define ROUND3EVEN_RM(i,j,inR1SS)         ROUND3_L_S_A_RM(i,j,inR1SS,A,B,pred3(pred3(j)),pred3(j))
#define ROUND3ODD_MR(i,j,outR1SS)         ROUND3_L_S_A_MR(i,j,outR1SS,B,A,pred3(pred3(j)),pred3(j))
#define ROUND3ODD_RR(i,j,inR1SS,outR1SS)  ROUND3_L_S_A_RR(i,j,inR1SS,outR1SS,B,A,pred3(pred3(j)),pred3(j))
#define ROUND3ODD_RM(i,j,inR1SS)          ROUND3_L_S_A_RM(i,j,inR1SS,B,A,pred3(pred3(j)),pred3(j))

/* S step only: R1SS = ROTL3(R1SS + L[Lcurr] + s_S1[i]).  The result is not
   stored to memory any more.  Clobbers T1a. */
#define ROUND3_S(i,j,Lpred,Lcurr)		\
ROUND3_S_i##i##_j##j:				\
/*S*/	ld	[s_S1(i)],T1a			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	add	R1SS,T1a,R1SS			;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	/* !st	R1SS,[s_S1(i)]	*/		;\

/* L step followed by S step without encryption, S[] value in R1SS before
   and after; the round 2 value of S[i] is read from s_S1[i].
   Clobbers T1a and T1b. */
#define ROUND3_L_S(i,j,Lpred,Lcurr)		\
ROUND3_L_S_i##i##_j##j:				\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	ld	[s_S1(i)],T1a			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	add	R1SS,T1a,R1SS			;\
/*S*/	sll	R1SS,3,T1a			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	/* !st	R1SS,[s_S1(i)]	*/		;\

/* L step, S step and one encryption half round, interleaved:
     t        = R1SS + L[Lpred]
     L[Lcurr] = ROTL(L[Lcurr] + t, t)
     R1SS     = ROTL3(R1SS + s_S1[i] + L[Lcurr])
     R1<A>    = ROTL(R1<A> ^ R1<B>, R1<B>) + R1SS
   The parameters A and B are the letters pasted onto R1 to select the
   cypher registers.  T1a is reused for several purposes, so the instruction
   order matters.  Clobbers T1a, T1b and Tc. */
#define ROUND3_L_S_A(i,j,A,B,Lpred,Lcurr)	\
ROUND3_L_S_A_i##i##_j##j:			\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*A*/	xor	R1##A,R1##B,R1##A		;\
						\
/*A*/	sll	R1##A,R1##B,Tc			;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
						\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
						\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*A*/	neg	R1##B,T1a			;\
						\
/*A*/	srl	R1##A,T1a,R1##A			;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	ld	[s_S1(i)],T1a			;\
						\
/*S*/	add	R1SS,T1a,R1SS			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	sll	R1SS,3,T1a			;\
/*A*/	or	R1##A,Tc,R1##A			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	/* !st	R1SS,[s_S1(i)]	*/		;\
/*A*/	add	R1##A,R1SS,R1##A		;\

/* Like ROUND3_L_S_A, but S[i-1] is read from R1SS and outR1SS holds the
   round 2 value of S[i] on entry and the round 3 value on exit; s_S1[] is
   not accessed.  Clobbers T1a, T1b and Tc. */
#define ROUND3_L_S_A_MR(i,j,outR1SS,A,B,Lpred,Lcurr)	\
ROUND3_L_S_A_MR_i##i##_j##j:			\
/*L*/	add	R1SS,R1L(Lpred),T1a		;\
/*A*/	xor	R1##A,R1##B,R1##A		;\
						\
/*A*/	sll	R1##A,R1##B,Tc			;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
						\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
						\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*A*/	neg	R1##B,T1a			;\
/*S*/	/* ld	[s_S1(i)],T1a */ /*****/	;\
						\
/*A*/	srl	R1##A,T1a,R1##A			;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
						\
/*S*/	/* add	R1SS,T1a,outR1SS */ /*****/	;\
/*S*/	add	R1SS,outR1SS,outR1SS		;\
/*S*/	add	outR1SS,R1L(Lcurr),outR1SS	;\
/*S*/	sll	outR1SS,3,T1a			;\
/*A*/	or	R1##A,Tc,R1##A			;\
/*S*/	srl	outR1SS,29,outR1SS		;\
/*S*/	or	outR1SS,T1a,outR1SS		;\
/*S*/	/* !st	R1SS,[s_S1(i)]	*/		;\
/*A*/	add	R1##A,outR1SS,R1##A		;\

/* Register to register variant: S[i-1] is read from inR1SS; outR1SS holds
   the round 2 value of S[i] on entry and the round 3 value on exit.  No
   memory access.  Clobbers T1a, T1b and Tc. */
#define ROUND3_L_S_A_RR(i,j,inR1SS,outR1SS,A,B,Lpred,Lcurr)	\
ROUND3_L_S_A_RR_i##i##_j##j:			\
/*L*/	add	inR1SS,R1L(Lpred),T1a		;\
/*A*/	xor	R1##A,R1##B,R1##A		;\
						\
/*A*/	sll	R1##A,R1##B,Tc			;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
						\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
						\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*A*/	neg	R1##B,T1a			;\
/*S*/	/* ld	[s_S1(i)],T1a */ /*****/	;\
						\
/*A*/	srl	R1##A,T1a,R1##A			;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
						\
/*S*/	/* add	inR1SS,T1a,outR1SS */ /*****/	;\
/*S*/	add	inR1SS,outR1SS,outR1SS		;\
/*S*/	add	outR1SS,R1L(Lcurr),outR1SS	;\
/*S*/	sll	outR1SS,3,T1a			;\
/*A*/	or	R1##A,Tc,R1##A			;\
/*S*/	srl	outR1SS,29,outR1SS		;\
/*S*/	or	outR1SS,T1a,outR1SS		;\
/*S*/	/* !st	R1SS,[s_S1(i)]	*/		;\
/*A*/	add	R1##A,outR1SS,R1##A		;\

/* S[i-1] is read from inR1SS, the round 2 value of S[i] from s_S1[i]; the
   round 3 value is produced in R1SS.  Clobbers T1a, T1b and Tc. */
#define ROUND3_L_S_A_RM(i,j,inR1SS,A,B,Lpred,Lcurr)	\
ROUND3_L_S_A_RM_i##i##_j##j:			\
/*L*/	add	inR1SS,R1L(Lpred),T1a		;\
/*A*/	xor	R1##A,R1##B,R1##A		;\
						\
/*A*/	sll	R1##A,R1##B,Tc			;\
/*L*/	add	R1L(Lcurr),T1a,R1L(Lcurr)	;\
						\
/*L*/	sll	R1L(Lcurr),T1a,T1b		;\
/*L*/	neg	T1a				;\
						\
/*L*/	srl	R1L(Lcurr),T1a,R1L(Lcurr)	;\
/*A*/	neg	R1##B,T1a			;\
						\
/*A*/	srl	R1##A,T1a,R1##A			;\
/*L*/	or	R1L(Lcurr),T1b,R1L(Lcurr)	;\
/*S*/	ld	[s_S1(i)],T1a			;\
						\
/*S*/	add	inR1SS,T1a,R1SS			;\
/*S*/	add	R1SS,R1L(Lcurr),R1SS		;\
/*S*/	sll	R1SS,3,T1a			;\
/*A*/	or	R1##A,Tc,R1##A			;\
/*S*/	srl	R1SS,29,R1SS			;\
/*S*/	or	R1SS,T1a,R1SS			;\
/*S*/	/* !st	R1SS,[s_S1(i)]	*/		;\
/*A*/	add	R1##A,R1SS,R1##A		;\

	!! S[i] = ROTL3(S[i]+S[i-1]+L[j-1])
	!! L[j] = ROTL(L[j]+(S[i]+L[j-1]),(S[i]+L[j-1]))
	!! i even:
	!!    A = ROTL(A^B,B)+S[i]
	!! i odd:
	!!    B = ROTL(B^A,A)+S[i]

	!! the ROUND3*(i,j,A,B,*) macros calculate the values reordered:
	!! L[j-1] = ROTL(L[j-1]+(S[i-1]+L[j-2]),(S[i-1]+L[j-2]))
	!! S[i]   = ROTL3(S[i]+S[i-1]+L[j-1])
	!! A      = ROTL(A^B,B)+S[i]

	/* Round 3: finish the key expansion and encrypt the plain text on the
	   fly.  R1A and R1B are free now because S[6] and S[7] were spilled to
	   s_S1[] in round 2. */
	ld	[s_uw(r72unitwork_plain_lo)],R1A
	ld	[s_uw(r72unitwork_plain_hi)],R1B

	/* A = plain_lo + S[0] */
	ROUND3_i0(0,1)
	add	R1A,R1SS,R1A

	/* B = plain_hi + S[1] */
	ROUND3(1,2)
	add	R1B,R1SS,R1B

	/* Half rounds 2 to 7 read the round 2 values from s_S1[]; from S[8]
	   on the values come from the R1S<n> registers. */
	ROUND3EVEN(2,0)
	ROUND3ODD (3,1)
	ROUND3EVEN(4,2)
	ROUND3ODD (5,0)
	ROUND3EVEN(6,1)
	ROUND3ODD (7,2)
	ROUND3EVEN_MR(8,0,       R1S8)
	ROUND3ODD_RR (9,1, R1S8, R1S9)
	ROUND3EVEN_RR(10,2,R1S9, R1S10)
	ROUND3ODD_RR (11,0,R1S10,R1S11)
	ROUND3EVEN_RR(12,1,R1S11,R1S12)
	ROUND3ODD_RR (13,2,R1S12,R1S13)
	ROUND3EVEN_RR(14,0,R1S13,R1S14)
	ROUND3ODD_RR (15,1,R1S14,R1S15)
	ROUND3EVEN_RR(16,2,R1S15,R1S16)
	ROUND3ODD_RR (17,0,R1S16,R1S17)
	ROUND3EVEN_RR(18,1,R1S17,R1S18)
	ROUND3ODD_RR (19,2,R1S18,R1S19)
	ROUND3EVEN_RR(20,0,R1S19,R1S20)
	ROUND3ODD_RR (21,1,R1S20,R1S21)
	ROUND3EVEN_RR(22,2,R1S21,R1S22)
	ROUND3ODD_RR (23,0,R1S22,R1S23)
	ROUND3EVEN_RR(24,1,R1S23,R1S24)
	/* R1A is final after half round 24.  The last half round, which only
	   changes R1B, is expanded at partial_success. */
! delay ROUND3ODD_RM_i25_j2 (calculates cypher_hi) after a successful
! cypher_lo test

! ############################################################################

	/* Compare the low cypher word and reload the high key word, which is
	   needed by the increment below and by the next pass of the loop. */
	ld	[s_uw(r72unitwork_cypher_lo)],tRGcypher_lo
	ld	[s_Lhi],tRGLhi
test_pipe_1:
	cmp	R1A,tRGcypher_lo
	be	partial_success
	/* delay slot: executed on both paths, so tRGiterations is valid at
	   partial_success as well as at increment_key_done */
	 ld	[s_iterations],tRGiterations /* from below */
	
	/* Advance the key: the high word is kept within 8 bits.  When it wraps
	   to zero the carry is handled by complex_increment, which returns to
	   increment_key_done.  The store in the delay slot is executed on both
	   paths. */
increment_key:
	add	tRGLhi,PIPELINES,tRGLhi
	andcc	tRGLhi,0xff,tRGLhi
	bz	complex_increment
	 st	tRGLhi,[s_Lhi]
	 	! keep the value of tRGLhi til the beginning of next iteration
increment_key_done:

	/* Count the key just tested and loop while iterations remain; the
	   store in the delay slot is executed whether or not the branch is
	   taken. */
	/* ld	[s_iterations],tRGiterations */ /* to up */
	subcc	tRGiterations,PIPELINES,tRGiterations
	bnz	mainloop
	 st	tRGiterations,[s_iterations]

! ############################################################################

	/* All iterations done without a full match: the result code is set in
	   the delay slot of the branch to the common exit. */
end_of_mainloop:
	b	finish
	 mov	RESULT_NOTHING,%i0

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

/* Reached when the high key word wrapped to zero.  Propagates the carry into
   the middle and, if necessary, the low key word, then recalculates the
   precalculated round 1 values that depend on the changed words and returns
   to increment_key_done.
   Each word is incremented byte by byte starting with its most significant
   byte: 1 is added to the byte; if the byte is non-zero afterwards the
   increment is complete, otherwise the carry goes to the next lower byte.
   NOTE(review): this matches a key whose bytes are stored in reversed order
   within each word - confirm against the key format of the client.
   The RI* names below are mask and increment constants kept in registers
   that hold no live value at this point (A, B, T1a and the L registers). */
complex_increment:
#define RI0x01000000 R1A
#define RI0xFF000000 R1B
#define RI0x00010000 T1a
//#define RI0x00FFFFFF
#define RI0x00FF0000 R1L(0)
#define RI0x0000FFFF R1L(1)
#define RI0x0000FF00 R1L(2)

	/* Some constants are loaded in branch delay slots; they are then valid
	   on both paths, but are only used on the fall-through path. */
	! increment RGLmid
	ld	[s_Lmid],tRGLmid
	set	0x01000000,RI0x01000000
	set	0xFF000000,RI0xFF000000
	add	tRGLmid,RI0x01000000,tRGLmid
	btst	RI0xFF000000,tRGLmid
	bnz	update_RGLmid
	 set	0x00010000,RI0x00010000
	!set	0x00FFFFFF,RI0x00FFFFFF			!!! 2 ops
	!sub	RI0x01000000,1,RI0x00FFFFFF
	add	tRGLmid,RI0x00010000,tRGLmid
	set	0x00FF0000,RI0x00FF0000
	!and	tRGLmid,RI0x00FFFFFF,tRGLmid
	andn	tRGLmid,RI0xFF000000,tRGLmid
	btst	RI0x00FF0000,tRGLmid
	bnz	update_RGLmid
	 sub	RI0x00010000,1,RI0x0000FFFF
	add	tRGLmid,0x0100,tRGLmid
	!set	0x0000FF00,RI0x0000FF00			!!! 2 ops
	andn	RI0x0000FFFF,0xff,RI0x0000FF00
	and	tRGLmid,RI0x0000FFFF,tRGLmid
	btst	RI0x0000FF00,tRGLmid
	bnz	update_RGLmid
	 nop
	add	tRGLmid,0x01,tRGLmid
	andcc	tRGLmid,0xff,tRGLmid
	bnz	update_RGLmid
	 nop

	/* The middle word wrapped to zero as well: same scheme for the low
	   word, reusing the constants set up above.  A wrap of the low word
	   itself is not propagated any further. */
	! increment RGLlo
	ld	[s_Llo],tRGLlo
	add	tRGLlo,RI0x01000000,tRGLlo
	btst	RI0xFF000000,tRGLlo
	bnz	update_RGLlo
	 nop
	add	tRGLlo,RI0x00010000,tRGLlo
	!and	tRGLlo,RI0x00FFFFFF,tRGLlo
	andn	tRGLlo,RI0xFF000000,tRGLlo
	btst	RI0x00FF0000,tRGLlo
	bnz	update_RGLlo
	 nop
	add	tRGLlo,0x0100,tRGLlo
	and	tRGLlo,RI0x0000FFFF,tRGLlo
	btst	RI0x0000FF00,tRGLlo
	bnz	update_RGLlo
	 nop
	add	tRGLlo,0x01,tRGLlo
	and	tRGLlo,0xff,tRGLlo
	!b	update_RGLlo
	! nop

update_RGLlo:
	st	tRGLlo,[s_Llo]

	! update precalculated values RGL0, RGS1, RGL1, RGS2
	! first calculate RGL0, RGS1
	CALCULATE_RGLlo_DEPS(increment)
	! and fall through to calculate RGL1, RGS2

	/* tRGLmid is still valid here on every path (zero when coming from the
	   low word increment), so the store below is always correct. */
update_RGLmid:
	st	tRGLmid,[s_Lmid]

	! update precalculated values RGL1, RGS2
	CALCULATE_RGLmid_DEPS(increment)
	b	increment_key_done
	 nop

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

/* Reached when the low cypher word matched.  Completes the encryption with
   the deferred half round 25 (which only changes R1B), records the current
   key in the check_* words of the stack copy and then compares the high
   cypher word.  tRGLhi and tRGiterations are valid on entry and are not
   modified here, so a mismatch can continue directly at increment_key. */
partial_success:
	ROUND3ODD_RM (25,2,R1S24)
	!ld	[s_uw(r72unitwork_cypher_lo)],RGcypher_lo
	!cmp	R1A,RGcypher_lo
	!be	test_pipe_1_lo_success
	! nop
test_pipe_1_lo_success:
	/* check_count is incremented and check_hi/mid/lo are set to the
	   current key on every low word match. */
	ld	[s_uw(r72unitwork_check_count)],T1a
	ld	[s_Llo],tRGLlo
	ld	[s_Lmid],tRGLmid
	inc	T1a
	st	tRGLhi,[s_uw(r72unitwork_check_hi)]
	st	tRGLmid,[s_uw(r72unitwork_check_mid)]
	st	tRGLlo,[s_uw(r72unitwork_check_lo)]
	st	T1a,[s_uw(r72unitwork_check_count)]
	ld	[s_uw(r72unitwork_cypher_hi)],T1b
	cmp	R1B,T1b
	bne	increment_key
	 nop ! mov	RCP,RGSinit
	/* Full match: subtract the number of iterations still remaining
	   (including the current key) from the caller's iteration count, so
	   that it tells how many keys were tested before the matching one.
	   The key words in the stack slots still describe the matching key. */
test_pipe_1_lohi_success:
	ld	[s_save_iterationsP],T1a
	ld	[s_iterations],tRGiterations
	ld	[T1a],T1b
	sub	T1b,tRGiterations,T1b
	st	T1b,[T1a]
	/* No branch here: the code falls through into finish, so the mov
	   below is an ordinary instruction and not a delay slot. */
	!b	finish
	 mov	RESULT_FOUND,%i0

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

/* Common exit.  %i0 already holds the result code and is not touched here.
   Writes the current key and the check_* bookkeeping words from the stack
   frame back into the caller's unit work struct, restores the return address
   and returns.  The plain and cypher words are never modified and are
   therefore not written back. */
finish:
	ld	[s_save_r72unitworkP],%g1
	/* %i7 served as a working register, get the return address back */
	ld	[s_save_i7],%i7

	/* bookkeeping of low word matches */
	ld	[s_uw(r72unitwork_check_count)],%l0
	st	%l0,[%g1+r72unitwork_check_count]
	ld	[s_uw(r72unitwork_check_hi)],%l1
	st	%l1,[%g1+r72unitwork_check_hi]
	ld	[s_uw(r72unitwork_check_mid)],%l2
	st	%l2,[%g1+r72unitwork_check_mid]
	ld	[s_uw(r72unitwork_check_lo)],%l3
	st	%l3,[%g1+r72unitwork_check_lo]

	/* key at which the search stopped */
	ld	[s_Lhi],tRGLhi
	st	tRGLhi,[%g1+r72unitwork_L0_hi]
	ld	[s_Lmid],tRGLmid
	st	tRGLmid,[%g1+r72unitwork_L0_mid]
	ld	[s_Llo],tRGLlo
	st	tRGLlo,[%g1+r72unitwork_L0_lo]

	/* the restore in the delay slot hands %i0 to the caller as %o0 */
	ret
	restore
