/*
//########################################################
//
//
//  MIPS3 / irix RC5 optimised Core.
//
// Assumes GNU assembler 
//	{ possibly version 2.7 (mips-sgi-irix5.3), using BFD version 2.7}
//
//
//  Author:	 Simon Fenney  (simonf@videologic.com)
// 
//
// Based on code originally generated by
//
//		GNU C++ 2.7.2.2 [AL 1.1, MM 40] SGI running IRIX 5.x compiled by CC
//		Cc1 arguments (-G value = 0, Cpu = 3000, ISA = 1):
//		-mabicalls
//		-quiet -dumpbase -O6 -Wall -fomit-frame-pointer -fno-inline-functions -o
//
// This was then modified to be compatible with the "mips-crunch.cpp" version.
// This has a call format of...
//
//	u32 crunch(register RC5UnitWork *rc5UnitWork, u32 iterations) ;
//
// ..where the value returned is (PipelineCount * num_pipelines) if the key is
// not found, and a smaller value if it is. Updated key values are returned in
// RC5UnitWork.
//
//	BUILD NOTE:
//	###########
//
//	The file uses a ".S" ending rather than a ".s".  This instructs gcc (or g++) to
//	first put the file through the C preprocessor before assembling. In other words,
//	it just has to be built with something like...
//	
//			gcc -c mips-irix.S {options}
//
//	This source can build one of two possible versions depending on a preprocessor
//	macro that MUST be passed in from the command line.
//
//	-DSUPERSCALAR=0
//		This is optimised for R4x00, R5000, and R3000 processors, and only assumes
//		a single integer pipeline. It makes quite effective use of the available
//		registers.
//
//	-DSUPERSCALAR=1
//		This is 'optimised' for the R10000 mips processor. It calculates 2 keys
//		per loop to make better use of the multiple pipelines.
//	
//	
//	
//	 NOTE: The code only uses R3000 compatible instructions.
//
//
//	IMPORTANT NOTE:
//	##############
//   I'm assuming that  ((iterations % pipeline_count) == 0) 
//	and that the corresponding bits in key_hi are also Zero.
//	(This does appear to be the case (having looked in problem.cpp))	
//
//   If that is not the case, the call terminates early with a return value of -1.
//
//	This makes the core much faster as I can save tests in the 'inner' loop, by
//	setting pipeline_count to something very high!!! (probably 256)
//  
//
   Revision 1.24  1998/11/26  17:09:32  sjf
   1) Fixed a VERY nasty bug with the incrementing of keys (well only if
      you were to increment the high key, get an overflow, and then have
      to increment the low key)
   2) Cleaned up some more comments.
 
   Revision 1.23  1998/11/26  15:11:20  sjf
   Rewrote the 2nd pipeline code so that the first 2 S values are
   now stored in registers. (These are shared with eA_P2 and eB_P2).
   Amazingly, it seemed to work first time, and also runs a smidgeon
   faster.
 
   NOTE: There are some further optimisations that could be done with
   the dual pipeline version, but as I don't have access to an R10000
   to try them out, I'm not currently that interested in doing it :)

   Revision 1.22  1998/11/26  13:28:44  sjf
   Just a tidy up of the comments.
 
   Revision 1.21  1998/11/26  09:50:51  sjf
   1) Tidied up some comments and added some more
   2) Added SUPERSCALAR macro as "a command line build parameter"
   3) Moved some 'constant' stuff out of a loop.
 
   Revision 1.20  1998/11/25  16:16:41  sjf
   Fixed some silly errors - it APPEARS to be working. i.e.
   it passes all the self tests that the mips-crunch passed.
   However, since mips-crunch still had a very serious bug
   I'm still going to do some more testing :)

   Revision 1.19  1998/11/24  14:07:56  sjf
   UNTESTED/UNASSEMBLED: In the process of changing over to the MIPS-Crunch
   calling style.

   Revision 1.18  1998/11/17  15:47:10  sjf
   Got both the single and dual pipelined versions running. Currently
   the dual pipe version is 4% slower on the Mips R4400, but hopefully
   would be faster on an R10k. There are several improvements that could
   be made to this. The main one would be to define S0_P2 and S1_P2
   registers and share THESE with the eA_P2 and eB_P2 registers.

   Revision 1.17  1998/11/17  15:03:24  sjf
   I think this is a version that has both pipelines working... but it
   needs tidying up.

   Revision 1.11  1998/11/16  11:17:14  sjf
   This is an 'in transition' source - it's in the process of
   being changed over to support either 1 or 2 pipelines.

   Revision 1.9  1998/11/11  10:35:45  sjf
   Removed an Addu per key iteration by noting that KeyHi+S1+B0 was constant
   for all keys we want to do.  (This refers to the INITIAL keyHi)

   Revision 1.8  1998/11/11  09:54:48  sjf
   I think I removed one lw stall at the loop test PLUS tidied
   up the comments a bit.

   Revision 1.7  1998/11/10  10:53:36  sjf
   Removed a few 'lw' stalls from the Round3 key generation pipeline, by
   reusing some of the earlier 'S' registers.

   Revision 1.6  1998/11/09  16:53:18  sjf
   1) Saved input params on stack so that workspace register could
      be re-used as an S value
   2) Got rid of a couple of minor load stalls

   Revision 1.5  1998/11/09  15:17:23  sjf
   Reused Sinit and Q registers as S24 and S25 so that we do fewer
   reads and writes from the stack.
 
   Revision 1.4  1998/11/09  13:50:33  sjf
   Shared S0 and eA, and S1 and eB.
*/
	
	/*
	// Assembler options carried over from the compiler-generated original.
	// The original author did not know what they do and left them in anyway.
	//
	// NOTE(review): ".set nobopt" is believed to be the switch that turns off
	// the assembler's branch optimisation, and ".option pic2" to select
	// position independent (abicalls style) code. Confirm against the
	// assembler manual before removing either of them.
	*/
	.set	nobopt
	.option pic2


/*
// Define if we do a single key per loop or 2 at the same time.
// The 2 key version is intended for the R10000.
*/

/*
// Map the mandatory command-line SUPERSCALAR setting onto TWO_PARALLEL_KEYS:
//   SUPERSCALAR=0  ->  TWO_PARALLEL_KEYS 0 (one key per loop pass)
//   SUPERSCALAR=1  ->  TWO_PARALLEL_KEYS 1 (two keys per loop pass)
// A missing definition, or any other value, stops the build with an error.
*/
#if !defined(SUPERSCALAR)
#error The Macro SUPERSCALAR must be defined on the command line
#elif (SUPERSCALAR == 1)
#define TWO_PARALLEL_KEYS 1
#elif (SUPERSCALAR == 0)
#define TWO_PARALLEL_KEYS 0
#else
#error The Macro SUPERSCALAR set to an invalid value
#endif
/*
//########################################################
//
// Notes on optimisation.
//   MIPS4x00:	A load instruction (lw) will cause a delay if the target of the load
//				is used in any of the next _3_ instructions on a R4000 or R4400.
//				A mips 4600 only has a 2 instruction delay as it has a shorter pipeline.
//	
//				Load immediates don't have this problem.
//
//				Branch instructions delay 3 cycles (according to the 4400 manual).
//				I assume this means a delay on TAKEN branches as opposed to non
//				taken branches. I've tried measuring this and it does indeed
//				appear better on the 'straight through' case - we thus make the
//				the branch go to the unlikely case.
//
//	MIPS10k:	These can perform out of order execution, and can keep a look-ahead
//				buffer of  16 instructions. It can execute up to 2 integer and one
//				load instruction at a time.
//	
//########################################################
*/

/*	
// Define the number of keys we check with each call.
// We choose 256 as it makes the auto key incrementing cheaper. This
// value cannot be changed without seriously modifying the code!
//
// Note that this value MUST be the same as the makefile -DPIPELINE_COUNT value.
*/
	
/* Number of keys checked per call (see the note above); fixed at 256. */
.EQU NumIterations, 256

/*
// Build-time cross-check against the makefile's -DPIPELINE_COUNT value.
// An undefined PIPELINE_COUNT evaluates as 0 here and so also trips the error.
*/
#if PIPELINE_COUNT != 256
#error Pipeline count MUST be 256
#endif


/*
// If we do 2 keys in parallel, then we want to increment the inner loop by 2
*/
#if TWO_PARALLEL_KEYS
	.EQU LoopIncrement, 2
#else
	.EQU LoopIncrement, 1
#endif

/*
// NumIterations less one step: 255 for the single key build, 254 for the
// two key build.
// NOTE(review): presumably the inner loop counter value at which the last
// pass is made - the loop that consumes it is not in this part of the file.
*/
.EQU  LoopLimit, (NumIterations - LoopIncrement)
	
/*
//
// Define the offsets into the Work structure. These are determined from
// the "RC5UnitWork" structure in "rc5.h"
//
// These get copied onto the stack so that we can re-use the register that
// initially points to them.
//
*/
/*
// Byte offsets of the six consecutive 32-bit fields of the work unit, with
// the "hi" word of each pair placed first. Used as displacements from the
// WorkUnit pointer, e.g. "lw Temp, key_hi(WorkUnit)".
// These must be kept in step with the RC5UnitWork declaration, which is not
// part of this file.
*/
.EQU	plain_hi,		0
.EQU	plain_lo,		4
.EQU	cipher_hi,		8
.EQU	cipher_lo,		12
.EQU	key_hi,			16
.EQU	key_lo,			20


/*
// Constants for the key generation
*/
/*
// P_RC5_VAL and Q_VAL are the two 32-bit RC5 key-schedule magic constants.
*/
.EQU	P_RC5_VAL,      0xB7E15163
.EQU	Q_VAL,		    0x9E3779B9
	
/*
// Get the P_RC5 value rotated left by 3. The ">>" may be an arithmetic
// (sign extending) shift, hence the "& 0x7" to keep only the three bits that
// wrap around.
//
// Both derived constants are masked to 32 bits: an assembler that evaluates
// expressions in more than 32 bits would otherwise keep the bits shifted or
// carried out of bit 31 and produce a value that is not a valid 32-bit
// immediate. On an assembler with 32-bit arithmetic the mask changes nothing.
*/
.EQU	P_ROTL3_VAL,   (((P_RC5_VAL << 3) | ((P_RC5_VAL >> (32-3)) & 0x7)) & 0xFFFFFFFF)
.EQU	S1_INITIAL_VAL, ((P_RC5_VAL + Q_VAL) & 0xFFFFFFFF)


/*
//
// Define Register assignment
// We need to do this via the C pre-processor
//
// Registers to avoid:	
//     29		- It's the stack pointer!
//	   27 and 26- produce random results when used! The disassembler refers to
//				  these as k1 and k0: they are the registers reserved for the kernel.
//
// Registers that don't have to be preserved across
// function calls- 25,24, 15 through to 1
//
// Input Parameter  (determined by looking at output of compiler)
*/
/*
// $4 and $5 carry the two call arguments (the work unit pointer and the
// iteration count). Both registers are re-used later on ($4 as an S value or
// B1_P2, $5 as Temp), so the entry code stores them on the stack first.
*/
#define	WorkUnit $4      /*NOTE this gets destroyed and re-used as an S val.*/
#define InIterations $5  /* Ditto. Save it!*/

/*
// Key generation Pipeline.. The B/L Values
//
// $31 and $30 are among the registers spilled by SaveALLRegs and reloaded by
// RestoreALLRegs, so they are free to use in between.
*/
#define	B0 $31
#define	B1 $30

/*
// Define as many of the 26 S vals we can comfortably store 
// (for the first key pipeline)
//
// Do the remaining ones on the stack 
//
// Of the registers below, $28 and $23..$16 are preserved through
// SaveALLRegs/RestoreALLRegs; $25, $24 and $15..$12 are in the "don't have to
// be preserved" set listed above and are simply overwritten.
*/	
#define	S0	$28		/*also shared with eA*/
#define	S1	$25		/*also shared with eB*/
#define	S2	$24
#define	S3	$23
#define	S4	$22		/* also S21_R3  */
#define	S5	$21		/* also S22_R3  */
#define	S6	$20		/* also S23_R3  */
#define	S7	$19
#define	S8	$18
#define	S9	$17
#define	S10 $16		/* S10, S11, S12 also used as temp registers right at the very end*/
#define	S11 $15
#define	S12 $14
#define	S13 $13
#define	S14 $12

/*
// If we are only doing a single pipeline, then use another 6 registers
// (S15 to S20) to store the S values directly
//
// NOTE S20 is WorkUnit. This means we must save all the work unit values before
// we stomp all over its original contents!!
*/

/*
// Registers $11, $10, $9, $8, $7 and $4 change role with the build:
//   single pipeline : they hold S15..S20 of the one key schedule
//   two pipelines   : they are the working registers of the second pipeline,
//                     and S15 onwards of the first pipeline live on the stack
*/
#if !TWO_PARALLEL_KEYS

#define	S15 $11
#define	S16 $10
#define	S17 $9
#define	S18 $8
#define	S19 $7
#define	S20 $4
	
/*
// Define the first SVAL we must keep on the stack, and the number of stack
// entries needed for the other pipeline
//
// With these values SvalsSize comes to (26 - 21 + 0) * 4 = 20 bytes.
*/
.EQU	FIRST_STACK_SVAL, 21
.EQU	PIPE2_STACK,	  0

/*
// Else we are doing 2 pipelines at the same time. In this case we need 6 registers
// as temporary working variables. This is less efficient than it need be, but will
// do for the moment.
*/
#else
	
#define	S0_P2 $11				/*also shared with eA_P2 */
#define	S1_P2 $10				/*also shared with eB_P2 */
#define	S_READ_AHEAD_P2 $9
#define	A_P2	$8
#define	B0_P2	$7
#define	B1_P2	$4


/*
// Define the first SVAL we must keep on the stack, and the number of stack
// entries needed for the other pipeline
//
// With these values SvalsSize comes to (26 - 15 + 26) * 4 = 148 bytes: the
// 26 second-pipeline words come first, then the first pipeline's S15..S25.
*/
.EQU	FIRST_STACK_SVAL, 15
.EQU	PIPE2_STACK,	  26
	
#endif


/*
// Working registers shared by all the macros below.
// NOTE: "A", "Temp", "Q" and "Scratch" are C pre-processor names, so those
// words are replaced by the register wherever they appear as a token in the
// code. Temp is the same register ($5) as InIterations.
*/
#define	A	  $6	/*'A' Is used between macros when we've run out of S Registers
					  It's also used to temporarily store plain text words*/
#define	Temp  $5	/*Temp is used within the processing macros*/

/* SKIP $4 */
	
/*
// Values used only in the beginning of Round 1 of key generation. 
// They then become S24 and S25 at the very end of Round 1
*/
#define		S_INITIAL	$3
#define		Q			$2 /*NOTE $2 is also the RETURN value*/

#define S24 S_INITIAL
#define S25 Q
	
/*
// Values used only in Round 3. NOTE eA and eB are S0 and S1 so we must
// take care for the first few steps of Round 3
// (Ditto for the second pipeline if any)
*/
#define		eA			S0
#define		eB			S1

#if TWO_PARALLEL_KEYS
  #define	eA_P2	S0_P2
  #define	eB_P2	S1_P2
#endif

/*
// Also in the latter part of round 3, to reduce the problems of load stalls,
// re-use some of the earlier S registers to do some read-aheads.
*/
#define S_ODD_R3	 S4
#define S_EVEN_R3	 S5

	
/*
// Values used in the middle of "rotates". This can be used elsewhere  
// but only if we're _very_ careful! (Note that $1 is the assembler temporary
// variable, "at", and so we must disable it with ".set noat").
//
// Every rotate macro (MyROL, MyROL3, ROLby_P_ROTL3) overwrites Scratch.
*/
.set noat
#define Scratch		$1

	
# ###################################
# ###################################
/*
// Define the Stack requirements...
//
// We actually are reserving more space for Svals and registers than we actually
// need, but who cares?
*/
/*
// Frame layout, from $sp upwards:
//   SvalsSize  bytes : spilled S values (second pipeline first, if present)
//   Regspace   bytes : register save area, indexed by register number
//   LocalSpace bytes : local variables (the Saved... and LoopCount slots)
*/
.EQU	SvalsSize,		 (((26 - FIRST_STACK_SVAL) + PIPE2_STACK)*4)
.EQU	Regspace,		 (32 * 4)   /*for coding simplicity, allow space for all*/
.EQU	LocalSpace,		 (16*4)		/* For local variables */
/*
// The raw sum is 212 bytes (one pipeline) or 340 bytes (two pipelines),
// neither of which is a multiple of 8. Round the frame up so that $sp keeps
// the doubleword alignment the calling convention expects. All offsets are
// measured up from the new $sp, so the padding lands unused at the top.
*/
.EQU	TotalStackSpace, ((SvalsSize + Regspace + LocalSpace + 7) & ~7)

	
/*
// Define the accessing of the Svals array for the first pipeline
*/	
/*
// SaveSVal / GetSVal: store / load first-pipeline S value number Index1
// to / from its stack slot.
//   Reg    - register to store from or load into
//   Index1 - S value number; must be >= FIRST_STACK_SVAL (lower numbered
//            values live in registers and have no slot)
// The slots sit above the PIPE2_STACK words reserved for the second pipeline.
*/
.MACRO SaveSVal Reg, Index1
	sw \Reg,  (((\Index1 - FIRST_STACK_SVAL) + PIPE2_STACK)*4)($sp)
	.ENDM
	
.MACRO GetSVal Reg, Index1
	lw \Reg,  (((\Index1 - FIRST_STACK_SVAL) + PIPE2_STACK)*4)($sp)
	.ENDM

/*
// define accessing of the SVALS for 'pipeline' 2
*/

/*
// SaveSVal_P2 / GetSVal_P2: store / load second-pipeline S value number
// Index1. The second pipeline keeps its S values at the very bottom of the
// frame, one word per index starting at offset 0.
// The pre-processor removes the instruction in a single pipeline build, so
// both macros then expand to nothing.
*/
.MACRO SaveSVal_P2 Reg, Index1
	#if TWO_PARALLEL_KEYS
		sw \Reg,  ((\Index1)*4)($sp)
	#endif
	.ENDM
	
.MACRO GetSVal_P2 Reg, Index1
	#if TWO_PARALLEL_KEYS
		lw \Reg,  ((\Index1)*4)($sp)
	#endif
	.ENDM
	

   
/*
// Define local stack variable value offsets
//
// Every local is one word in the LocalSpace area, which starts at
// (SvalsSize + Regspace) above $sp. Word 0 of that area is not assigned;
// words 1 to 14 are used below, which fits the 16 words LocalSpace reserves.
*/
.EQU	SavedB0,	(SvalsSize+Regspace+ 1 * 4)
.EQU	SavedS1,	(SvalsSize+Regspace+ 2 * 4)
.EQU	LoopCount,	(SvalsSize+Regspace+ 3 * 4)

.EQU	Saved_P_ROTL3,	(SvalsSize+Regspace+ 4 * 4)
.EQU	Saved_S2_INIT,	(SvalsSize+Regspace+ 5 * 4)
.EQU	Saved_Q,		(SvalsSize+Regspace+ 6 * 4)

/* 
// The 'work unit' values we need to store before stomping on the workunit pointer
//
// We don't need to save the low bits of the key as these only get used once,
// while we store a modified "key high" which is added to some other constants computed
// from the low key.
*/
.EQU	Saved_plain_hi,	   (SvalsSize+Regspace+  7 * 4)
.EQU	Saved_plain_lo,	   (SvalsSize+Regspace+  8 * 4)
.EQU	Saved_cipher_hi,   (SvalsSize+Regspace+  9 * 4)
.EQU	Saved_cipher_lo,   (SvalsSize+Regspace+ 10 * 4)
.EQU	Saved_key_hi_S1B0, (SvalsSize+Regspace+ 11 * 4)

	
/*
// Saved Outer Loop control values.
//
// The entry code stores the incoming work unit pointer and iteration count
// in the first two, and (iterations - 1) in OuterLoopCount.
*/
.EQU	SavedWorkUnit,	(SvalsSize+Regspace+ 12 * 4)
.EQU	SavedIterations,(SvalsSize+Regspace+ 13 * 4) /*Num outer loop iterations*/
.EQU	OuterLoopCount, (SvalsSize+Regspace+ 14 * 4) /*The Outer loop counter   */
	
	
# ###################################
# ###################################

/*
// Define the register save/restore
*/
/*
// SaveReg / RestoreReg: spill / reload one register.
//   Reg - the bare register NUMBER (e.g. 31, not $31). It is used twice: to
//         name the register and to pick its word in the save area, which
//         begins SvalsSize bytes above $sp.
*/
.MACRO	SaveReg Reg
	sw $\Reg, SvalsSize + \Reg * 4($sp)
    .ENDM
	
.MACRO	RestoreReg Reg
	lw $\Reg, SvalsSize + \Reg * 4($sp)
    .ENDM

	
/*
# ###################################
# Save/restore register macros
# ###################################
*/
/*
// SaveALLRegs / RestoreALLRegs: spill / reload the registers this routine
// overwrites and has to hand back intact: $31, $30, $28 and $23 down to $16.
// The two lists must be kept identical. $sp must already point at the new
// frame when SaveALLRegs runs, and must still do so for RestoreALLRegs.
*/
.MACRO SaveALLRegs
	SaveReg 31
	SaveReg 30
	
	SaveReg 28
	
	SaveReg 23
	SaveReg 22
	SaveReg 21
	SaveReg 20
	SaveReg 19
	SaveReg 18
	SaveReg 17
	SaveReg 16

   	.ENDM

.MACRO RestoreALLRegs
	RestoreReg 31
	RestoreReg 30
	
	RestoreReg 28
	
	RestoreReg 23
	RestoreReg 22
	RestoreReg 21
	RestoreReg 20
	RestoreReg 19
	RestoreReg 18
	RestoreReg 17
	RestoreReg 16
	.ENDM
	
	
/*
# ###################################
#  Define the ROTATE equivalents
#  Uses the 'scratch' temporary variable
#
#  This is a pain on the MIPS as the shifts can't go
# in parallel even on an R10000, and it takes 2 cycles to
# do a variable shift on an R4400.
# ###################################
*/
/*
// MyROL: Dst = Src rotated left by the amount held in register Amnt.
//   Dst  - destination register (may be the same register as Src)
//   Src  - value to rotate; must not be Scratch
//   Amnt - rotate count register; must not be Scratch
// Overwrites Scratch.
//
// The right-shift count is formed as (0 - Amnt). This relies on the variable
// shift instructions looking only at the low 5 bits of the count register,
// which turns (0 - Amnt) into (32 - Amnt) modulo 32.
*/
.MACRO MyROL Dst, Src, Amnt
	subu	Scratch, $0, \Amnt
	srlv	Scratch, \Src, Scratch
	sllv	\Dst,\Src, \Amnt
	or		\Dst,\Dst, Scratch
	.ENDM
	
	
/*
# ###################################
# Rol by 3
# ###################################
*/	
/*
// MyROL3: Dst = Src rotated left by the constant 3.
// Dst may be the same register as Src; Src must not be Scratch.
// Overwrites Scratch.
*/
.MACRO MyROL3 Dst, Src
	srl		Scratch, \Src, (32-3)
	sll		\Dst,\Src, 3
	or		\Dst,\Dst, Scratch
	.ENDM

/*
# ###################################
# Rol by P_ROTL3
#
# Since this rotate is a constant, and we don't have
# a real rotate instruction, do it by hand.
# ###################################
*/	
/*
// ROL_LSHIFT is the rotate count: the low 5 bits of P_ROTL3_VAL, which is 29
// for the constants defined in this file.
// NOTE: the macro shifts right by (32 - ROL_LSHIFT), so it is only valid
// while ROL_LSHIFT is not zero.
*/
.EQU	ROL_LSHIFT,	(P_ROTL3_VAL & 31)

/*
// ROLby_P_ROTL3: rotate register Val left, in place, by ROL_LSHIFT bits.
// Overwrites Scratch.
*/
.MACRO ROLby_P_ROTL3 Val

	srl		Scratch, \Val, (32 - ROL_LSHIFT)
	sll		\Val,\Val, ROL_LSHIFT
	or		\Val,\Val, Scratch
	.ENDM

/*	
# ######################################################################
# ######################################################################
#  Standard ROUND X B Macro
#  This is the same for all rounds and both Pipes.
# ######################################################################
# ######################################################################
*/
/*
// ROUND_X_B: update one of the two B (key word) registers.
//   A_VAL  - register holding the S value just produced
//   B_PREV - the other B register (read only)
//   B_CURR - the B register to update in place
// Overwrites Temp (left holding A_VAL + B_PREV) and Scratch.
// Only the low 5 bits of the sum act as the rotate count (see MyROL).
*/
.MACRO ROUND_X_B  A_VAL, B_PREV, B_CURR
	#
	# B_CURR = ((B_CURR + A_VAL + B_PREV) <<< (A_VAL + B_PREV))
	#
	addu  Temp,   \A_VAL, \B_PREV
	addu  \B_CURR, Temp, \B_CURR
	MyROL \B_CURR,\B_CURR, Temp
	.ENDM

/*	
# ######################################################################
# ######################################################################
# Secondary Pipeline Macros. These are
#  'included' by the main Macros below...
# ######################################################################
# ######################################################################
*/

/*
// define the common B macro
*/
/*
// ROUND_X_B_PIPE2: the B update for the second pipeline. The S value is
// always taken from A_P2. Expands to nothing in a single pipeline build.
*/
.MACRO ROUND_X_B_PIPE2  B_PREV, B_CURR
#if TWO_PARALLEL_KEYS
	ROUND_X_B  A_P2, \B_PREV, \B_CURR	
#endif
	.ENDM

/* Special macro for Round 2 step 0 */	
/*
// Same as ROUND_X_B_PIPE2 except that the caller names the register holding
// the S value (CURR_A) instead of it being taken from A_P2.
*/
.MACRO ROUND_X_B_PIPE2_REG CURR_A, B_PREV, B_CURR
#if TWO_PARALLEL_KEYS
	ROUND_X_B  \CURR_A, \B_PREV, \B_CURR	
#endif
	.ENDM
	
/*
// Round 1 key generation S-vals for PIPE 2
//
// The S_INITIAL value we pinch directly from the 'main pipeline', and
// is supplied in S_IN
//
//   Step   - S value number; the result is stored in that pipe-2 stack slot
//   S_IN   - register holding the initial S value for this step (read only)
//   B_PREV - pipe-2 B register used in the sum
//   B_CURR - not used here; kept so the argument list matches the callers
// A_P2 is both the previous S value on entry and the new one on exit.
// Overwrites Scratch. Expands to nothing in a single pipeline build.
*/	
.MACRO ROUND_1_PIPE_2_JUST_A Step, S_IN, B_PREV, B_CURR
	
	#if TWO_PARALLEL_KEYS
		#
		# S_OUT = (A_P2) = (S_IN + A_P2 + B_PREV) <<< 3
		#
		addu   A_P2, \S_IN, A_P2
		addu   A_P2, A_P2, \B_PREV
		MyROL3 A_P2, A_P2

		#
		# Save the result
		#
		SaveSVal_P2 A_P2, \Step
	#endif
	.ENDM

	

/*
// Round 2 key generation S-vals for PIPE 2
//
// We want to read in the next S value in advance so that we don't stall the
// pipeline. This gets loaded into S_READ_AHEAD_P2
*/	
/*
// ROUND_2_3_PRELOAD_S: start loading pipe-2 S value number Step from the
// stack into S_READ_AHEAD_P2. Callers issue it ahead of the first pipeline's
// own arithmetic so that the loaded value is not needed straight away.
// Expands to nothing in a single pipeline build.
*/
.MACRO ROUND_2_3_PRELOAD_S Step
	#if TWO_PARALLEL_KEYS
		GetSVal_P2 S_READ_AHEAD_P2, \Step
	#endif
	.ENDM

/*
// Processing of S value. It assumes we have already got the input S value
// from Round 0, either by pre-loading it, or because it is the same as pipeline 1.
*/
/*
//   Step   - S value number; the result is written back to that pipe-2 slot
//   S_IN   - register holding this step's S value from the previous round
//   B_PREV - pipe-2 B register used in the sum
//   B_CURR - not used here; kept so the argument list matches the callers
// A_P2 is both the previous S value on entry and the new one on exit.
// Overwrites Scratch. Expands to nothing in a single pipeline build.
*/
.MACRO ROUND_2_PIPE_2_JUST_A Step, S_IN, B_PREV, B_CURR
	#if TWO_PARALLEL_KEYS
		#
		# (A_P2) = (S_IN + A_P2 + B_PREV) <<< 3
		#
		addu   A_P2, \S_IN, A_P2
		addu   A_P2, A_P2, \B_PREV
		MyROL3 A_P2, A_P2

		#
		# Save the result
		#
		SaveSVal_P2 A_P2, \Step
	#endif
	.ENDM

/*
// Round 2 for the cases where we don't have to store to the stack
//
//	S_VAL is the value from the previous round
//	S_PREV is the value from the previous STEP
//	S_OUT is new output value for this step
//
//   S_VAL and S_OUT can be the same
*/
/*
// Register-only pipe-2 S update: nothing is read from or written to the
// stack and A_P2 is not touched unless passed in as an argument.
// B_CURR is not used here. Overwrites Scratch.
// Expands to nothing in a single pipeline build.
*/
.MACRO	ROUND_2_PIPE_2_JUST_A_REG  S_VAL, S_PREV, S_OUT, B_PREV, B_CURR
	#if TWO_PARALLEL_KEYS
		
		#
		# S_OUT = (S_VAL + S_PREV + B_PREV) <<< 3
		#
		addu   \S_OUT, \S_VAL, \S_PREV
		addu   \S_OUT, \S_OUT, \B_PREV
		MyROL3 \S_OUT, \S_OUT

	#endif
	.ENDM

	
		
/*
// Round 3 key generation S-vals for PIPE 2
//
// This is the same as for ROUND_2_PIPE_2_JUST_A, except that we don't have to
// save the result back to the stack.
*/	
/*
//   Step   - not used here (there is no store); kept for a uniform signature
//   S_IN   - register holding this step's S value from the previous round
//   B_PREV - pipe-2 B register used in the sum
//   B_CURR - not used here
// The new S value is left in A_P2 only. Overwrites Scratch.
// Expands to nothing in a single pipeline build.
*/
.MACRO ROUND_3_PIPE_2_JUST_A Step, S_IN, B_PREV, B_CURR
	#if TWO_PARALLEL_KEYS

		#
		# (A_P2) = (S_IN + A_P2 + B_PREV) <<< 3
		#
		addu   A_P2, \S_IN, A_P2
		addu   A_P2, A_P2, \B_PREV
		MyROL3 A_P2, A_P2

	#endif
	.ENDM


/*
// Round 3 for the cases where we don't need the stack
*/
/*
// Emits the same three-operation sequence as ROUND_2_PIPE_2_JUST_A_REG.
//   S_VAL  - this step's S value from the previous round
//   S_PREV - S value produced by the previous step
//   S_OUT  - receives the new S value (may be the same register as S_VAL)
//   B_PREV - pipe-2 B register used in the sum
//   B_CURR - not used here
// Overwrites Scratch. Expands to nothing in a single pipeline build.
*/
.MACRO ROUND_3_PIPE_2_JUST_A_REG  S_VAL, S_PREV, S_OUT,  B_PREV, B_CURR
	#if TWO_PARALLEL_KEYS
		#
		# S_OUT = (S_VAL + S_PREV + B_PREV) <<< 3
		#
		addu   \S_OUT, \S_VAL, \S_PREV
		addu   \S_OUT, \S_OUT, \B_PREV
		MyROL3 \S_OUT, \S_OUT
	
	#endif
	.ENDM
		

/*	
# ###################################
#  ROUND 1 Macros
#  These are split into those that save S values on the stack
#  and those that just use registers.
#
# To make the odd and even bit easier, use two layers of macros.
# ###################################
*/
/*
// ROUND_1_REGS_REAL: one Round 1 step whose first-pipeline S value stays in
// a register.
//   Step      - S value number (only used for the pipe-2 stack store)
//   A_VAL     - register holding the previous step's S value
//   S_OUT     - register that receives this step's S value
//   B_PREV, B_CURR       - first pipeline B registers (read, updated)
//   B_PREV_P2, B_CURR_P2 - the same pair for the second pipeline
// Reads S_INITIAL without changing it. Overwrites Temp and Scratch, plus
// A_P2 in a two pipeline build.
// The work of the two pipelines is deliberately interleaved.
*/
.MACRO ROUND_1_REGS_REAL Step, A_VAL, S_OUT, B_PREV, B_CURR, B_PREV_P2, B_CURR_P2
	#
	# S_OUT = (S_INITIAL + A_VAL + B_PREV) <<< 3
	#
	addu   \S_OUT, \A_VAL,    \B_PREV
	addu   \S_OUT, S_INITIAL, \S_OUT
	MyROL3 \S_OUT, \S_OUT
	
	#
	# Do the same with the second pipeline (if any)
	#
	ROUND_1_PIPE_2_JUST_A \Step, S_INITIAL, \B_PREV_P2, \B_CURR_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_OUT, \B_PREV, \B_CURR

	#
	# And again with the second pipeline
	#
	ROUND_X_B_PIPE2  \B_PREV_P2, \B_CURR_P2
	
	.ENDM

/*
// Get the order of B's correct - we have odd and even cases...
//
// ODD  steps read B0 and update B1; EVEN steps read B1 and update B0.
*/
.MACRO ROUND_1_REGS_ODD Step, A_VAL, S_OUT
		ROUND_1_REGS_REAL	\Step, \A_VAL, \S_OUT, B0, B1, B0_P2, B1_P2
	.ENDM
	
.MACRO ROUND_1_REGS_EVEN Step, A_VAL, S_OUT
		ROUND_1_REGS_REAL	\Step, \A_VAL, \S_OUT, B1, B0, B1_P2, B0_P2
	.ENDM




	
/*
//  Special case for Step 2 of round 1. S1 (i.e. A_VAL) and B0 are identical for 
//  both pipelines, so take advantage of it.
//
//  NOTE B_PREV = B1, B_CURR = B0
*/
/*
// ROUND_1_REGS_STEP2: Round 1, step 2, for both pipelines at once.
//   A_VAL - register holding S1 (identical for both pipelines)
//   S_OUT - register that receives the first pipeline's S2
// The shared part of the sum (S_INITIAL + A_VAL) is formed once in Temp.
// The second pipeline's S2 is left in A_P2 and stored to its stack slot 2.
// B1/B1_P2 are read and B0/B0_P2 are updated.
// Overwrites Temp and Scratch, plus A_P2 in a two pipeline build.
*/
.MACRO ROUND_1_REGS_STEP2 A_VAL, S_OUT

	#
	# S_INITIAL and A_VAL are the same for both pipes, so save some work..
	#
	addu Temp, S_INITIAL, \A_VAL

	#
	# S_OUT = (S_INITIAL + A_VAL + B_PREV(B1)) <<< 3
	#
	addu	 \S_OUT, Temp, B1
	MyROL3	 \S_OUT, \S_OUT
	
	#
	# And for the second pipeline as well.
	#
	#if TWO_PARALLEL_KEYS
		#
		# S_OUT = (A_P2) = (Temp + B_PREV(B1_P2)) <<< 3
		#
		addu   A_P2, Temp, B1_P2
		MyROL3 A_P2, A_P2

		#
		# Save the result
		#
		SaveSVal_P2 A_P2, 2
	#endif
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_OUT, B1, B0
	
	#
	# And again with the second pipeline
	#
	ROUND_X_B_PIPE2  B1_P2, B0_P2
	
	.ENDM


	
/*
// The following is the similar to the above but assumes S_INITIAL
// has already been stored in S_VAL
*/
/*
// ROUND_1_REGS_NO_SINIT_REAL: Round 1 step for an S register that already
// contains the initial S value on entry and is updated in place.
//   Step  - S value number (only used for the pipe-2 stack store)
//   A_VAL - register holding the previous step's S value
//   S_VAL - in: initial S value for this step; out: the computed S value
//   B_PREV, B_CURR, B_PREV_P2, B_CURR_P2 - as for ROUND_1_REGS_REAL
// The second pipeline must go first because it also needs the initial value
// held in S_VAL. Overwrites Temp and Scratch, plus A_P2 in a two pipeline
// build.
*/
.MACRO ROUND_1_REGS_NO_SINIT_REAL Step,A_VAL, S_VAL, B_PREV, B_CURR, B_PREV_P2,B_CURR_P2

	#
	# Do the second pipeline first (before we destroy S_VAL)
	#
	ROUND_1_PIPE_2_JUST_A \Step, \S_VAL, \B_PREV_P2, \B_CURR_P2
	
	#
	# S_VAL = (S_VAL + A_VAL + B_PREV) <<< 3
	#
	addu	\S_VAL, \S_VAL, \A_VAL
	addu	\S_VAL, \S_VAL, \B_PREV
	MyROL3	\S_VAL, \S_VAL
	
	#
	# And again with the second pipeline
	#
	ROUND_X_B_PIPE2  \B_PREV_P2, \B_CURR_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_VAL, \B_PREV, \B_CURR
	
	.ENDM

	
			
/* ODD steps read B0 and update B1; EVEN steps read B1 and update B0. */
.MACRO ROUND_1_REGS_NO_SINIT_ODD Step,  A_VAL, S_VAL
	 ROUND_1_REGS_NO_SINIT_REAL	\Step, \A_VAL, \S_VAL, B0, B1, B0_P2, B1_P2
	.ENDM													               
															               
.MACRO ROUND_1_REGS_NO_SINIT_EVEN Step,  A_VAL, S_VAL		               
	 ROUND_1_REGS_NO_SINIT_REAL	\Step, \A_VAL, \S_VAL, B1, B0, B1_P2, B0_P2
	.ENDM

		
/*
// Finally, the Stack-based version of Round 1.
*/
	
/*
// ROUND_1_STACK_REAL: as ROUND_1_REGS_REAL, but the first pipeline's result
// is also written to its stack slot because no register is kept for it.
//   Step  - S value number; selects the stack slot for both pipelines
//   A_VAL - register holding the previous step's S value
//   S_OUT - register that receives the new S value before it is stored
//   B_PREV, B_CURR, B_PREV_P2, B_CURR_P2 - as for ROUND_1_REGS_REAL
// Overwrites Temp and Scratch, plus A_P2 in a two pipeline build.
*/
.MACRO ROUND_1_STACK_REAL Step, A_VAL, S_OUT, B_PREV, B_CURR, B_PREV_P2, B_CURR_P2

	#
	# S_OUT = (S_INITIAL + A_VAL + B_PREV) <<< 3
	#
	addu \S_OUT, \A_VAL, \B_PREV
	addu \S_OUT, S_INITIAL, \S_OUT
	MyROL3	 \S_OUT, \S_OUT
	
	#
	# Do the same with the second pipeline (if any)
	#
	ROUND_1_PIPE_2_JUST_A \Step, S_INITIAL, \B_PREV_P2, \B_CURR_P2

	#
	# Store S_OUT
	#
	SaveSVal \S_OUT, \Step
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_OUT, \B_PREV, \B_CURR
	
	#
	# And again with the second pipeline
	#
	ROUND_X_B_PIPE2  \B_PREV_P2, \B_CURR_P2
	
	.ENDM

		
/* ODD steps read B0 and update B1; EVEN steps read B1 and update B0. */
.MACRO ROUND_1_STACK_ODD Step, A_VAL, S_OUT
	 ROUND_1_STACK_REAL	\Step, \A_VAL, \S_OUT, B0, B1, B0_P2, B1_P2
	.ENDM											               
													               
.MACRO ROUND_1_STACK_EVEN Step, A_VAL, S_OUT		               
	 ROUND_1_STACK_REAL	\Step, \A_VAL, \S_OUT, B1, B0, B1_P2, B0_P2
	.ENDM

	
# ###################################
#  ROUND 2 Macros
#
# Again these are divided into a few groups.
#
# The first assumes all input/output values are in registers
# while the second assumes it must read and write to the stack.
# ###################################
	
/*
// ROUND_2_REGS_REAL: one Round 2 step whose first-pipeline S value lives in
// a register and is updated in place.
//   Step  - S value number; selects the pipe-2 stack slot that is read and
//           then rewritten
//   A_VAL - register holding the previous step's S value
//   S_VAL - in: this step's Round 1 value; out: its Round 2 value
//   B_PREV, B_CURR       - first pipeline B registers (read, updated)
//   B_PREV_P2, B_CURR_P2 - the same pair for the second pipeline
// Overwrites Temp and Scratch, plus S_READ_AHEAD_P2 and A_P2 in a two
// pipeline build. The pipe-2 load is issued first so that its result is not
// needed until after the first pipeline's arithmetic.
*/
.MACRO ROUND_2_REGS_REAL Step, A_VAL, S_VAL, B_PREV, B_CURR, B_PREV_P2, B_CURR_P2

	#
	# Pre-load the 2nd pipeline Value
	#
	ROUND_2_3_PRELOAD_S \Step
	
	#
	# S_VAL = (S_VAL + A_VAL + B_PREV) <<< 3
	#
	addu \S_VAL, \S_VAL, \A_VAL
	addu \S_VAL, \S_VAL, \B_PREV
	MyROL3	 \S_VAL, \S_VAL

	#
	# Do the second key pipeline (if any).
	#
	ROUND_2_PIPE_2_JUST_A \Step, S_READ_AHEAD_P2, \B_PREV_P2, \B_CURR_P2

	#
	# Do B vals
	#
	ROUND_X_B  \S_VAL, \B_PREV, \B_CURR


	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2  \B_PREV_P2, \B_CURR_P2
	
	.ENDM

/*
// Get the order of B's correct - we have odd and even cases...
//
// ODD  steps read B0 and update B1; EVEN steps read B1 and update B0.
*/
.MACRO ROUND_2_REGS_ODD Step, A_VAL, S_OUT
		ROUND_2_REGS_REAL	\Step, \A_VAL, \S_OUT, B0, B1, B0_P2, B1_P2
	.ENDM												               
														               
.MACRO ROUND_2_REGS_EVEN Step, A_VAL, S_OUT				               
		ROUND_2_REGS_REAL	\Step, \A_VAL, \S_OUT, B1, B0, B1_P2, B0_P2
	.ENDM



/*
// Special cases for Step 0 and Step 1. S0 and S1 (on input!) are the same 
// for both pipelines.
//
// Step 0:	 B_PREV=B1, B_CURR = B0
*/
/*
// ROUND_2_REGS_STEP0: Round 2, step 0.
//   A_VAL - register holding the first pipeline's previous S value
//   S_VAL - in: the Round 1 S0 (shared by both pipelines); out: the first
//           pipeline's Round 2 S0
// The second pipeline goes first, while S_VAL still holds the shared input;
// its result goes into the S0_P2 register, not the stack.
// B1/B1_P2 are read and B0/B0_P2 are updated.
// Overwrites Temp and Scratch, plus S0_P2 in a two pipeline build.
*/
.MACRO ROUND_2_REGS_STEP0  A_VAL, S_VAL

	#
	# Do the second key pipeline (if any).
	#
	ROUND_2_PIPE_2_JUST_A_REG \S_VAL, A_P2, S0_P2, B1_P2, B0_P2
	
	#
	# S_VAL = (S_VAL + A_VAL + B_PREV(B1)) <<< 3
	#
	addu   \S_VAL, \S_VAL, \A_VAL
	addu   \S_VAL, \S_VAL, B1
	MyROL3 \S_VAL, \S_VAL
	
	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2_REG S0_P2, B1_P2, B0_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_VAL, B1, B0
	
	.ENDM


	
/*
// ROUND_2_REGS_STEP1: Round 2, step 1.
//   A_VAL - register holding the first pipeline's previous S value
//   S_VAL - in: the Round 1 S1 (shared by both pipelines); out: the first
//           pipeline's Round 2 S1
// The second pipeline goes first, while S_VAL still holds the shared input;
// its result goes into S1_P2 and is then copied to A_P2, where
// ROUND_X_B_PIPE2 and the following step look for it.
// B0/B0_P2 are read and B1/B1_P2 are updated.
// Overwrites Temp and Scratch, plus S1_P2 and A_P2 in a two pipeline build.
*/
.MACRO ROUND_2_REGS_STEP1  A_VAL, S_VAL
	#
	# Do the second key pipeline (if any).
	#
	ROUND_2_PIPE_2_JUST_A_REG \S_VAL, S0_P2, S1_P2, B0_P2, B1_P2
	
	#
	# S_VAL = (S_VAL + A_VAL + B_PREV(B0)) <<< 3
	#
	addu   \S_VAL, \S_VAL, \A_VAL
	addu   \S_VAL, \S_VAL, B0
	MyROL3 \S_VAL, \S_VAL

	
	/*
	// To save having to re-write the step 3 macro (but at the
	// expense of one instruction) copy S1_P2 to A_P2
	*/
	#if TWO_PARALLEL_KEYS
		move A_P2, S1_P2
	#endif
		
	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2  B0_P2, B1_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_VAL, B0, B1
	
	.ENDM

	
/*
// Stack based version.
//
*/ 	
/*
// ROUND_2_STACK_REAL: one Round 2 step whose first-pipeline S value lives on
// the stack: it is loaded, updated and stored back.
//   Step  - S value number; selects the stack slot for both pipelines
//   A_VAL - register holding the previous step's S value
//   S_OUT - register that receives the new S value before it is stored
//   B_PREV, B_CURR       - first pipeline B registers (read, updated)
//   B_PREV_P2, B_CURR_P2 - the same pair for the second pipeline
// Overwrites Temp and Scratch, plus S_READ_AHEAD_P2 and A_P2 in a two
// pipeline build. The first addu does not depend on the loaded value, which
// gives the load a little time to complete.
*/
.MACRO ROUND_2_STACK_REAL Step,  A_VAL, S_OUT, B_PREV, B_CURR, B_PREV_P2, B_CURR_P2

	#
	# Grab the S_IN value from the stack
	#
	# ARGH! we have a stall here on an R4000 because the result is used
	# within the next 3 instructions. This is equivalent to one lost instruction
	#
	GetSVal Temp, \Step
	
	#
	# Pre-load the 2nd pipeline Value
	#
	ROUND_2_3_PRELOAD_S \Step
	
	#
	# S_OUT = (S_IN(temp) + A_VAL + B_PREV) <<< 3
	#
	addu	\S_OUT, \A_VAL, \B_PREV
	addu	\S_OUT, Temp, \S_OUT
	MyROL3	\S_OUT, \S_OUT
	
	#
	# Do the second key pipeline (if any).
	#
	ROUND_2_PIPE_2_JUST_A \Step, S_READ_AHEAD_P2, \B_PREV_P2, \B_CURR_P2
	
	#
	# Save the value back to the stack
	#
	SaveSVal \S_OUT, \Step
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_OUT, \B_PREV, \B_CURR
	
	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2  \B_PREV_P2, \B_CURR_P2
	
	.ENDM


/* ODD steps read B0 and update B1; EVEN steps read B1 and update B0. */
.MACRO ROUND_2_STACK_ODD Step,  A_VAL, S_OUT
		ROUND_2_STACK_REAL	\Step, \A_VAL, \S_OUT, B0, B1, B0_P2, B1_P2
	.ENDM												               
														               
.MACRO ROUND_2_STACK_EVEN Step,  A_VAL, S_OUT			               
		ROUND_2_STACK_REAL	\Step, \A_VAL, \S_OUT, B1, B0, B1_P2, B0_P2
	.ENDM

	
		
# ###################################
#  ROUND 3 Macros
#
# Note that we never have to save the S values on the stack
# as this is the last round of key generation.
# ################################### 

/*
// ROUND_3_REGS_REAL: one complete Round 3 step (S update then B update) for
// both pipelines.
//   Step  - S value number; selects the pipe-2 stack slot that is read
//   A_VAL - register holding the previous step's S value
//   S_VAL - in: this step's Round 2 value; out: its Round 3 value
//   B_PREV, B_CURR       - first pipeline B registers (read, updated)
//   B_PREV_P2, B_CURR_P2 - the same pair for the second pipeline
// Nothing is stored; the second pipeline's result is left in A_P2.
// Overwrites Temp and Scratch, plus S_READ_AHEAD_P2 and A_P2 in a two
// pipeline build.
*/
.MACRO ROUND_3_REGS_REAL Step, A_VAL, S_VAL, B_PREV, B_CURR, B_PREV_P2, B_CURR_P2
	#
	# Pre-load the 2nd pipeline Value
	#
	ROUND_2_3_PRELOAD_S \Step
	
	#
	# S_VAL = (S_VAL + A_VAL + B_PREV) <<< 3
	#
	addu   \S_VAL, \S_VAL, \A_VAL
	addu   \S_VAL, \S_VAL, \B_PREV
	MyROL3 \S_VAL, \S_VAL
	
	#
	# Do the second key pipeline (if any).
	#
	ROUND_3_PIPE_2_JUST_A \Step, S_READ_AHEAD_P2, \B_PREV_P2, \B_CURR_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_VAL, \B_PREV, \B_CURR

	
	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2  \B_PREV_P2, \B_CURR_P2
	
	.ENDM

/*
// Get the order of B's correct - we have odd and even cases...
//
// ODD  steps read B0 and update B1; EVEN steps read B1 and update B0.
*/
.MACRO ROUND_3_REGS_ODD Step, A_VAL, S_VAL
		ROUND_3_REGS_REAL	\Step, \A_VAL, \S_VAL, B0, B1, B0_P2, B1_P2
	.ENDM												               
														               
.MACRO ROUND_3_REGS_EVEN Step, A_VAL, S_VAL				               
		ROUND_3_REGS_REAL	\Step, \A_VAL, \S_VAL, B1, B0, B1_P2, B0_P2
	.ENDM

/**************************/	
/**************************/	
	
/*
//
// Special Cases for Steps 0 and 1 - the 2nd pipeline is using registers
//
*/
/*
// ROUND_3_REGS_STEP0: Round 3, step 0.
//   A_VAL - register holding the first pipeline's previous S value
//   S_VAL - first pipeline S0, updated in place
// The second pipeline updates its S0_P2 register in place, using A_P2 as the
// previous S value. B1/B1_P2 are read and B0/B0_P2 are updated.
// Overwrites Temp and Scratch, plus S0_P2 in a two pipeline build.
*/
.MACRO ROUND_3_REGS_STEP0	A_VAL, S_VAL
	
	#
	# S_VAL = (S_VAL + A_VAL + B_PREV) <<< 3
	#
	addu   \S_VAL, \S_VAL, \A_VAL
	addu   \S_VAL, \S_VAL, B1
	MyROL3 \S_VAL, \S_VAL
	
	#
	# Do the second key pipeline (if any).
	#
	ROUND_3_PIPE_2_JUST_A_REG  S0_P2, A_P2, S0_P2, B1_P2, B0_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_VAL, B1, B0

	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2_REG S0_P2, B1_P2, B0_P2
	
	.ENDM

	
	
/*
// ROUND_3_REGS_STEP1: Round 3, step 1.
//   A_VAL - register holding the first pipeline's previous S value
//   S_VAL - first pipeline S1, updated in place
// The second pipeline updates its S1_P2 register in place from S0_P2, then
// copies the result to A_P2, where ROUND_X_B_PIPE2 and the following step
// look for it. B0/B0_P2 are read and B1/B1_P2 are updated.
// Overwrites Temp and Scratch, plus S1_P2 and A_P2 in a two pipeline build.
*/
.MACRO ROUND_3_REGS_STEP1	A_VAL, S_VAL
	
	#
	# S_VAL = (S_VAL + A_VAL + B_PREV) <<< 3
	#
	addu   \S_VAL, \S_VAL, \A_VAL
	addu   \S_VAL, \S_VAL, B0
	MyROL3 \S_VAL, \S_VAL
	
	#
	# Do the second key pipeline (if any).
	#
	ROUND_3_PIPE_2_JUST_A_REG  S1_P2, S0_P2, S1_P2, B0_P2, B1_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \S_VAL, B0, B1
	
	/*
	// To save having to re-write the step 3 macro (but at the
	// expense of one instruction) copy S1_P2 to A_P2
	*/
	#if TWO_PARALLEL_KEYS
		move A_P2, S1_P2
	#endif

	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2  B0_P2, B1_P2
	.ENDM


/**************************/	
/**************************/	

/*
// ROUND_3_REGS_JUST_A_REAL: the S-value half of a Round 3 step only; no B
// register is changed.
//   Step   - S value number; selects the pipe-2 stack slot that is read
//   A_VAL  - register holding the previous step's S value
//   S_VAL  - in: this step's Round 2 value; out: its Round 3 value
//   B_PREV, B_PREV_P2 - B registers used in the sums (read only)
//   B_CURR, B_CURR_P2 - not used by the emitted code
// The second pipeline's result is left in A_P2.
// Overwrites Scratch, plus S_READ_AHEAD_P2 and A_P2 in a two pipeline build.
*/
.MACRO ROUND_3_REGS_JUST_A_REAL Step,  A_VAL, S_VAL, B_PREV, B_CURR, B_PREV_P2, B_CURR_P2
	
	#
	# Pre-load the 2nd pipeline Value
	#
	ROUND_2_3_PRELOAD_S \Step

	#
	# S_VAL = (S_VAL + A_VAL + B_PREV) <<< 3
	#
	addu	\S_VAL, \S_VAL, \A_VAL
	addu	\S_VAL, \S_VAL, \B_PREV
	MyROL3	\S_VAL, \S_VAL
	
	#
	# Do the second key pipeline (if any).
	#
	ROUND_3_PIPE_2_JUST_A \Step, S_READ_AHEAD_P2, \B_PREV_P2, \B_CURR_P2
	
	.ENDM

	
	
/* ODD steps take B0 as B_PREV; EVEN steps take B1 as B_PREV. */
.MACRO ROUND_3_REGS_JUST_A_ODD  Step, A_VAL, S_VAL
		ROUND_3_REGS_JUST_A_REAL  \Step, \A_VAL, \S_VAL, B0, B1, B0_P2, B1_P2
	.ENDM													                 
															                 
.MACRO ROUND_3_REGS_JUST_A_EVEN  Step, A_VAL, S_VAL			                 
		ROUND_3_REGS_JUST_A_REAL  \Step, \A_VAL, \S_VAL, B1, B0, B1_P2, B0_P2
	.ENDM
	

/**************************/	
/**************************/	
		
/*
// ROUND_3_REGS_JUST_B_REAL: the B half of a Round 3 step only, i.e. the
// counterpart of ROUND_3_REGS_JUST_A_REAL.
//   A_VAL - register holding the first pipeline's freshly computed S value
//   B_PREV, B_CURR       - first pipeline B registers (read, updated)
//   B_PREV_P2, B_CURR_P2 - the same pair for the second pipeline, whose S
//                          value is taken from A_P2
// Overwrites Temp and Scratch.
*/
.MACRO ROUND_3_REGS_JUST_B_REAL  A_VAL, B_PREV, B_CURR, B_PREV_P2, B_CURR_P2
	
	#
	# Do B vals
	#
	ROUND_X_B  \A_VAL, \B_PREV, \B_CURR

	#
	# And the second pipeline as well
	#
	ROUND_X_B_PIPE2  \B_PREV_P2, \B_CURR_P2
	.ENDM

/* The ODD variant is compiled out below; only the EVEN variant is defined. */
#if 0	
.MACRO ROUND_3_REGS_JUST_B_ODD Step, A_VAL
		ROUND_3_REGS_JUST_B_REAL    \A_VAL, B0, B1, B0_P2, B1_P2
	.ENDM										                
#endif											                
												                
/* Step is accepted but not used: reads B1/B1_P2 and updates B0/B0_P2. */
.MACRO ROUND_3_REGS_JUST_B_EVEN     Step, A_VAL	  
		ROUND_3_REGS_JUST_B_REAL	\A_VAL, B1, B0, B1_P2, B0_P2
	.ENDM

	
#######
#
#	Round 3 Encryption operations.
#
#######
/*
// ENCRYPT_REAL
//
// One encryption half-round, applied to the main pipeline and, when
// TWO_PARALLEL_KEYS is set, to the second pipeline as well:
//
//      V1 = MyROL(V1 xor V2, V2) + Key
//
// (MyROL is defined elsewhere; the rest of this file uses it as
//  "destination, value, rotate amount" for a rotate left.)
//
//   V1, V2         main pipeline data words; V1 is updated, V2 is only read
//   Key            register holding the main pipeline round key
//   V1_P2, V2_P2   the same pair for the second pipeline
//
// The second pipeline does not take a key argument: it always adds A_P2.
// NOTE(review): this presumably relies on the round 3 step macros leaving
// the second pipeline S value in A_P2 - confirm against those macros.
*/
.MACRO	ENCRYPT_REAL V1, V2, Key, V1_P2, V2_P2
	xor   \V1, \V1, \V2
	MyROL \V1, \V1, \V2
	
	addu  \V1, \V1, \Key
	
#if TWO_PARALLEL_KEYS
	xor   \V1_P2, \V1_P2, \V2_P2
	MyROL \V1_P2, \V1_P2, \V2_P2
	
	addu  \V1_P2, \V1_P2, A_P2
#endif
	.ENDM

	
		
/*
// Odd-step encryption: updates eB (and eB_P2) from eA (and eA_P2) using Key.
*/
.MACRO	ENCRYPT_ODD Key
	ENCRYPT_REAL	eB, eA, \Key, eB_P2, eA_P2
	.ENDM
	
/*
// Even-step encryption: updates eA (and eA_P2) from eB (and eB_P2) using Key.
*/
.MACRO	ENCRYPT_EVEN Key
	ENCRYPT_REAL	eA, eB, \Key, eA_P2, eB_P2
	.ENDM
		

# ####################################################################
# ####################################################################
# ####################################################################
	
# ##################################		
# ##################################
	
# u32 rc5_unit_func_mips_crunch_asm(register RC5UnitWork * rc5unitwork, u32 iterations )

# ##################################		
# ##################################		

# ####################################################################		
# ####################################################################		
# ####################################################################		

	
	
/*
// rc5_unit_func_mips_crunch_asm - function entry.
//
// The two incoming arguments are referred to through the names WorkUnit
// (pointer to the work unit) and InIterations (outer iteration count).
// The result is returned in $2.
//
// This part reserves the stack frame, saves registers, validates the
// arguments and copies the plain text / cipher text words from the work
// unit onto the stack.
//
// Early exits taken here (both go to BYE):
//   $2 =  0   when InIterations - 1 is negative (e.g. a zero count)
//   $2 = -1   when any of the top 8 bits of key_hi is set
*/
gcc2_compiled.:
__gnu_compiled_cplusplus:
	.text
	.align	2
	.globl	rc5_unit_func_mips_crunch_asm
	.ent	rc5_unit_func_mips_crunch_asm


rc5_unit_func_mips_crunch_asm:
	#
	# Reserve our stack space
	#
	subu	$sp,$sp,TotalStackSpace
	#.cprestore 0
	SaveALLRegs


	/*
	// Save the Outer loop parameters
	*/
	sw	InIterations, SavedIterations($sp)
	sw  WorkUnit,	  SavedWorkUnit($sp)

	/*
	// Set up the outer loop counter value - we are counting down
	*/
	addiu InIterations, -1
	sw	  InIterations,  OuterLoopCount($sp)

	/*
	// Just test we have been given something to actually do
	// If the iteration count is zero, then get out of here.
	//
	// The move sits in the branch delay slot, so $2 is cleared whether
	// or not the branch is taken.
	*/
	.set noreorder
	bltz InIterations, BYE
	move $2, $0 
	.set reorder

	/*
	// Now test that we have a sensible key hi value. We assume that
	// the most sig 8 bits are all zero, so just double check. I suspect
	// that this is completely unnecessary, but the overhead is small.
	*/
	lw  Temp,    key_hi(WorkUnit)
	lui Scratch, 0xFF00
	and Temp, Temp, Scratch
	
	.set noreorder
	bne Temp, $0, BYE
	li	$2, -1			#The -1 should flag an error in the caller.
	.set reorder

	/*
	// Ok if we've got this far, it should all be safe.
	*/
	
	/*
	// Grab the Key Bits. These may get updated in the outer loop.
	//
	// B0 = keyLow
    // B1 = keyHi
	*/
	lw  B0,		key_lo(WorkUnit)
	lw	B1,		key_hi(WorkUnit)

	
	/*
	//
	// Save the parameters from the WorkUnit structure before we destroy the
	// pointer (use some of the S vals as temporary variables)
	//
	// This extra work is only worthwhile if we are doing a reasonable number
    // of iterations.
	//
	// Note we don''t need to save keyLow
	//
	*/
	lw	S6,	plain_hi(WorkUnit)
	lw	S7,	plain_lo(WorkUnit)
	lw	S8,	cipher_hi(WorkUnit)
	lw	S9,	cipher_lo(WorkUnit)

	sw	S6,	Saved_plain_hi($sp)
	sw	S7,	Saved_plain_lo($sp)
	sw	S8,	Saved_cipher_hi($sp)
	sw	S9,	Saved_cipher_lo($sp)
   

OuterLoopMain:		
	/*
	// Entered with B0 = key_lo and B1 = key_hi, either from the function
	// entry code or from NextOuterLoop after the key has been advanced.
	//
	// This section computes everything that is identical for all keys of
	// one outer pass and parks it on the stack (SavedB0, SavedS1,
	// Saved_S2_INIT, Saved_key_hi_S1B0, Saved_P_ROTL3, Saved_Q), then
	// produces the first B1 value(s) and falls through into R1Step2.
	*/
	/*
	// Start of the inner loop. This just runs though PIPELINE_COUNT
	// number of keys. Because we know the start and end values of the
	// key, we can save on the increment costs
	*/
	
	#
	# Set up the inner loop count
	#
	sw $0, LoopCount($sp)

	/*	
	#
	# Set up constants we will need a number of times. Ideally these''d
	# be passed to us along with the other parameters, as a lw is quicker
	# than a load immediate of a 32 bit value, but for the moment
	# this will have to do.
	#
	# NOTE:	 because these are 32 bit constants, the assembler must change
	# each 'load' into 2 actual instructions.(lui and ori)
	*/
	li	S0,		   P_ROTL3_VAL
	li	S_INITIAL, S1_INITIAL_VAL
	li  Q,		   Q_VAL

	#
	# Since we are doing a few iterations, save the const values on the stack.
	# This only takes one instruction each, and so (overall) will be cheaper
	# than re-loading via the lui (load upper immediate) and ori instructions.
	#
	sw S0,		  Saved_P_ROTL3($sp)
	sw Q,		  Saved_Q($sp)

	#########################
	#
	# Round 1 of expanded key generation for first iteration
	#
	#########################

	#
	# Round 1: i==0, j==0
	#
	# S0= A = P_ROTL3			(Already done)
	# B = (keyLo + A) <<< A
	#
	addu  B0, B0, S0
	ROLBY_P_ROTL3 B0

	#
	# Both A/S0 and B0 will be the same for all pipes. Save B0 so that
	# we don''t have to recompute it for the other pipelines/iterations 
	#
	sw	B0,	SavedB0($sp)
	
   	#
	# Round 1: i==1, j==1
	#
	# The input and Output values for A are still all identical
	# A = SVal[1]= (Sinitial + A + B) <<< 3
	#
	addu	S1, S_INITIAL, S0		#HMMM this is also a constant value. But a load
									# is more expensive.
	addu	S1, S1, B0
	MyROL3	S1, S1

	#
	# Save S1 for the other keys we''ll try
	#
	sw	S1, SavedS1($sp)

	#
	# Increment the Initial S Value to get the next (i.e. S2 value), and save it
	#
	addu	S_INITIAL, S_INITIAL, Q
	sw		S_INITIAL, Saved_S2_INIT($sp)

	#
	# Begin to evaluate B1 value (B1 currently = keyHi)
	# B1 = (keyHi + (S1 + B0)) <<< (S1 + B0)
	#
	# Note that S1+B0 is constant for all the keys, so we may as well add this on
	# to keyHi and save that rather than KeyHi. It''s not worth trying to save
	# S1+B0 though, since the extra load will cost as much as the addition.
	#
	addu Temp, S1, B0
	addu B1,   B1, Temp

	# Save it
	sw	B1,	Saved_key_hi_S1B0($sp)


#if TWO_PARALLEL_KEYS
	/*
	# Do the other key calculation as well. It amounts to adding on 1<<24
	# to the Pipe 0 B1 value, before we destroy it
	*/
	li	 B1_P2, (1<<24)

	addu B1_P2, B1_P2, B1
	
	MyROL B1_P2, B1_P2, Temp

	move  B0_P2, B0		#This is a waste!
#endif
	
	# and do the rest of the B1 calculation
	MyROL B1, B1, Temp

		
R1Step2:
	/*
	// Inner loop entry point. Reached by falling through from
	// OuterLoopMain for the first key of a pass and by the branch at the
	// end of PrepareNextIteration for every following key.
	//
	// On entry B0, B1 and S1 (plus B0_P2, B1_P2 when TWO_PARALLEL_KEYS is
	// set) are valid for the key being tested, S_INITIAL holds the initial
	// S value for step 2 and Q the per-step increment.
	//
	// Between steps S_INITIAL is advanced by Q. Steps that run out of
	// registers use the ROUND_1_STACK_* macros and chain through A; which
	// steps those are depends on TWO_PARALLEL_KEYS.
	*/

	##########################
	#
	# Do the remaining steps of Round 1
	#
	##########################
	ROUND_1_REGS_STEP2 S1, S2		/* Step 2   (Even Step)*/

	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   3,  S2, S3   /* Step 3   (Odd Step)*/
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  4, S3, S4
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   5, S4, S5
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  6, S5, S6
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   7, S6, S7
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  8, S7, S8
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   9, S8, S9
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  10, S9,  S10
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   11, S10, S11
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  12, S11, S12
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   13, S12, S13
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  14,	S13, S14 

/*
//	If we have 1 pipeline then we can do more register operations, otherwise
//	we have to rely on the stack
*/
#if TWO_PARALLEL_KEYS
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_ODD   15,	S14, A

	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_EVEN  16,	A, A
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_ODD   17,	A, A
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_EVEN  18,	A, A
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_ODD   19,	A, A

	/*
	// The macro name below is spelled exactly like every other
	// invocation, so the build does not depend on the assembler
	// ignoring case when it looks up macro names.
	*/
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_EVEN  20, A, A
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_ODD  21,  A, A
#else
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   15,	S14, S15 

	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  16,	S15, S16
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   17,	S16, S17 
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  18,	S17, S18
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_ODD   19,	S18, S19

	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_REGS_EVEN  20, S19, S20
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_ODD  21, S20,   A
#endif
	
	
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_EVEN 22, A, A
	addu	S_INITIAL, S_INITIAL, Q
	ROUND_1_STACK_ODD  23, A, A

	/*
	// S_INITIAL and Q now become
	// S24 and S25, since we don't need these
	// explicit values anymore.
	*/
	addu	S24, S_INITIAL, Q
	addu    S25, S24, Q
	
	ROUND_1_REGS_NO_SINIT_EVEN 24, A,   S24
	ROUND_1_REGS_NO_SINIT_ODD  25, S24, S25

	##########################
	#
	#	Round 2
	#
	##########################
	/*
	// Second pass over S0..S25. Each macro call takes the step number,
	// the register holding the previous S value and the register that
	// receives the S value of this step. As in round 1, the steps that
	// have no register of their own use the ROUND_2_STACK_* forms and
	// chain through A.
	*/
	
	ROUND_2_REGS_STEP0   S25, S0  /* Step 0   (Even Step)*/
	ROUND_2_REGS_STEP1    S0, S1  /* Step 1   (Odd Step)*/

	ROUND_2_REGS_EVEN 2,  S1, S2  /* Step 2   (Even Step)*/
	ROUND_2_REGS_ODD  3,  S2, S3  /* Step 3   (Odd Step)*/
								
	ROUND_2_REGS_EVEN 4,  S3, S4
	ROUND_2_REGS_ODD  5,  S4, S5
								
	ROUND_2_REGS_EVEN 6,  S5, S6
	ROUND_2_REGS_ODD  7,  S6, S7
								
	ROUND_2_REGS_EVEN 8,  S7, S8
	ROUND_2_REGS_ODD  9,  S8, S9
	
	ROUND_2_REGS_EVEN 10, S9,  S10
	ROUND_2_REGS_ODD  11, S10, S11
								  
	ROUND_2_REGS_EVEN 12, S11, S12
	ROUND_2_REGS_ODD  13, S12, S13
								  
	ROUND_2_REGS_EVEN 14, S13, S14
	
/*
//	If we have 1 pipeline then we can do more register operations, otherwise
//	we have to rely on the stack
*/
#if TWO_PARALLEL_KEYS
	ROUND_2_STACK_ODD  15, S14, A
								  
	ROUND_2_STACK_EVEN 16, A, A
	ROUND_2_STACK_ODD  17, A, A
								  
	ROUND_2_STACK_EVEN 18, A, A
	ROUND_2_STACK_ODD  19, A, A
								  
	ROUND_2_STACK_EVEN 20, A, A
	ROUND_2_STACK_ODD  21, A, A
	
#else
	ROUND_2_REGS_ODD  15, S14, S15
								  
	ROUND_2_REGS_EVEN 16, S15, S16
	ROUND_2_REGS_ODD  17, S16, S17
								  
	ROUND_2_REGS_EVEN 18, S17, S18
	ROUND_2_REGS_ODD  19, S18, S19
								  
	ROUND_2_REGS_EVEN 20, S19, S20
	ROUND_2_STACK_ODD 21, S20, A
#endif	
		
	ROUND_2_STACK_EVEN 22, A,  A
	ROUND_2_STACK_ODD  23, A,  A

	#
	# Back to registers again.
	#
	ROUND_2_REGS_EVEN 24,  A,   S24
	ROUND_2_REGS_ODD  25,  S24, S25
	
	
	/*
	###########################
	#
	# ROUND 3 of key generation along with the Encryption Rounds
	#
	# Note we must take care with the sharing of registers for
	# values eA, eB, and S0 and S1. 
	###########################
	*/
	/*
	// Layout of this section: each key-schedule step (ROUND_3_REGS_*) is
	// followed directly by the encryption half-round (ENCRYPT_*) that
	// consumes the S value it produced. Steps 0..23 are completed here;
	// step 24 is split and handled by the first-word test that follows.
	//
	// For the steps whose S values live on the stack, GetSVal fetches the
	// next value into S_ODD_R3 / S_EVEN_R3 one step ahead of its use.
	*/
	#
	# Grab the first word of the plain text. (Reg A is free)
	#
	lw A,	Saved_plain_lo($sp)

	#
	# Do Step 0 of key generation
	#
	ROUND_3_REGS_STEP0 S25, S0
	
	#
	# Do Step 1 of key generation
	#
	ROUND_3_REGS_STEP1 S0, S1

	#
	# Do Step 0 encryption of the main pipeline -
	# (Note that eA and S0 are the same, but we no longer
	# need S0, and similarly for the other pipeline as well)
	#
	# Also Grab the high word of the plain text
	# S0 is now available
	#
	addu eA, A, S0
#if  TWO_PARALLEL_KEYS
	addu eA_P2, A, S0_P2
#endif
	
	lw	 A,	Saved_plain_hi($sp)

	#
	# Do step2
	#
	ROUND_3_REGS_EVEN 2, S1, S2
	
	#
	# Do step 1 of pipe 1 encryption. Note S1 and eB are the same
	# register but we no longer need S1
	#
	addu eB, A, S1
#if  TWO_PARALLEL_KEYS
	addu eB_P2, A, S1_P2
#endif
	
	#
	# We can now do step 2 of encryption for both pipelines
	#
	ENCRYPT_EVEN S2

	#
	# Do MOST of the remaining rounds
	#
	ROUND_3_REGS_ODD 3,   S2, S3
	ENCRYPT_ODD S3
	
	ROUND_3_REGS_EVEN 4,  S3, S4
	ENCRYPT_EVEN S4
	ROUND_3_REGS_ODD  5,  S4, S5
	ENCRYPT_ODD S5
	
	ROUND_3_REGS_EVEN 6,  S5, S6
	ENCRYPT_EVEN S6
	ROUND_3_REGS_ODD  7,  S6, S7
	ENCRYPT_ODD S7
	
	ROUND_3_REGS_EVEN 8,  S7, S8
	ENCRYPT_EVEN S8
	ROUND_3_REGS_ODD  9,  S8, S9
	ENCRYPT_ODD S9
	
	ROUND_3_REGS_EVEN 10, S9,  S10
	ENCRYPT_EVEN S10
	ROUND_3_REGS_ODD  11, S10, S11
	ENCRYPT_ODD S11
	
	ROUND_3_REGS_EVEN 12, S11, S12
	ENCRYPT_EVEN S12
	ROUND_3_REGS_ODD  13, S12, S13
	ENCRYPT_ODD S13
	
	
/*
//	If we have 1 pipeline then we can do more register operations, otherwise
//	we have to rely on the stack
*/
#if TWO_PARALLEL_KEYS
	/*
	// So that we don't get load stalls, grab the values
	// that are on the stack early, and shove 'em into spare
	// registers
	*/
	ROUND_3_REGS_EVEN 14, S13, S14
	GetSVal S_ODD_R3, 15
	ENCRYPT_EVEN S14

	ROUND_3_REGS_ODD  15, S14, S_ODD_R3
	GetSVal S_EVEN_R3, 16
	ENCRYPT_ODD S_ODD_R3
	
	ROUND_3_REGS_EVEN 16, S_ODD_R3, S_EVEN_R3
	GetSVal S_ODD_R3, 17
	ENCRYPT_EVEN S_EVEN_R3
	
	ROUND_3_REGS_ODD  17, S_EVEN_R3, S_ODD_R3
	GetSVal S_EVEN_R3, 18
	ENCRYPT_ODD S_ODD_R3
	
	ROUND_3_REGS_EVEN 18, S_ODD_R3, S_EVEN_R3
	GetSVal S_ODD_R3, 19
	ENCRYPT_EVEN S_EVEN_R3
	
	ROUND_3_REGS_ODD  19, S_EVEN_R3, S_ODD_R3
	GetSVal S_EVEN_R3, 20
	ENCRYPT_ODD S_ODD_R3
	
	ROUND_3_REGS_EVEN 20, S_ODD_R3, S_EVEN_R3
	GetSVal S_ODD_R3, 21
	ENCRYPT_EVEN S_EVEN_R3
	
	ROUND_3_REGS_ODD  21, S_EVEN_R3, S_ODD_R3
	GetSVal S_EVEN_R3, 22
	ENCRYPT_ODD S_ODD_R3

#else
	ROUND_3_REGS_EVEN 14, S13, S14
	ENCRYPT_EVEN S14
	ROUND_3_REGS_ODD  15, S14, S15
	ENCRYPT_ODD S15
	
	ROUND_3_REGS_EVEN 16, S15, S16
	ENCRYPT_EVEN S16
	ROUND_3_REGS_ODD  17, S16, S17
	ENCRYPT_ODD S17
	
	ROUND_3_REGS_EVEN 18, S17, S18
	ENCRYPT_EVEN S18
	ROUND_3_REGS_ODD  19, S18, S19
	ENCRYPT_ODD S19
	/*
	// So that we don't get load stalls, grab the values
	// that are on the stack early, and shove 'em into spare
	// registers
	*/
	ROUND_3_REGS_EVEN 20, S19, S20
	GetSVal S_ODD_R3, 21
	ENCRYPT_EVEN S20
	
	ROUND_3_REGS_ODD  21, S20, S_ODD_R3
	GetSVal S_EVEN_R3, 22
	ENCRYPT_ODD S_ODD_R3
	
#endif		
	
	ROUND_3_REGS_EVEN 22, S_ODD_R3, S_EVEN_R3
	GetSVal S_ODD_R3, 23
	ENCRYPT_EVEN S_EVEN_R3

	
	ROUND_3_REGS_ODD  23, S_EVEN_R3, S_ODD_R3
	ENCRYPT_ODD S_ODD_R3
	
	#
	# At this point check if we have ANY chance of a success by only
	# producing the first word and testing it.
	#
	# S10 is no longer needed so load into that.
	#
	lw S10,	Saved_cipher_lo($sp)

	#
	# Do just the first part of the penultimate key generation
	#
	ROUND_3_REGS_JUST_A_EVEN 24, S_ODD_R3, S24
	
	ENCRYPT_EVEN S24

	#
	# In the very unlikely event that we have a match with the first word, 
	# continue to encode the other 32 bits.
	# Since this is VERY unlikely, load up the loop count value since we are more
	# likely to want that.
	#
	# Note the different code for the two pipes.
	#  For the dual key method, we use S11 and S12 as temporaries that tell us if we
	#  had success with a particular pipeline. 
	#
#if TWO_PARALLEL_KEYS
	/*
	// S11 / S12 start as (low cipher word XOR expected) for pipe 1 / pipe 2
	// and are then reduced to a 0 or 1 match flag each. Every flag has to
	// be derived from its own XOR result: FirstWordOK later ANDs S11 and
	// S12 with the high-word results, so a flag built from the wrong
	// register would both defeat this early-out and mis-report pipe 2.
	*/
	xor S11, eA,    S10
	xor S12, eA_P2, S10
	
	sltiu S11, S11, 1	# Set S11 to 1 if pipe 1 has a match
	sltiu S12, S12, 1	# Set S12 to 1 if pipe 2 has a match
	or    S10, S11, S12
	
	/*
	// Branch only when at least one pipe matched. The load of the loop
	// count sits in the delay slot and is executed on both paths.
	*/
	.set noreorder	
	bne	S10, $0, FirstWordOK
	lw Scratch, LoopCount($sp)
	.set reorder
#else	
	/*
	// Single pipeline: compare directly. The load of the loop count sits
	// in the delay slot and is executed on both paths.
	*/
	.set noreorder	
	beq	eA, S10, FirstWordOK
	lw Scratch, LoopCount($sp)
	.set reorder
#endif
		
PrepareNextIteration:
	/*
	// Reached when the key(s) just tested did not match. Expects Scratch
	// to hold the inner loop count already (every path that gets here has
	// loaded it from LoopCount).
	//
	// Either leaves the inner loop for NextOuterLoop when the count equals
	// LoopLimit, or stores the count advanced by LoopIncrement, rebuilds
	// B0, B1, S1, S_INITIAL and Q (plus the second pipeline registers) for
	// the next key and branches back to R1Step2.
	*/
	#
	# Begin to check the loop count, if we''ve done it all, then exit. (S6 is spare)
	#
	li S6, LoopLimit
	
	#
	# To stop a load stall from "lw Scratch, LoopCount($sp)" occurring with the
	# branch, move in some load instructions that are needed for the next iteration
	#
	# Start to generate the new KeyHi by adding on the loop count shifted up by the
	# appropriate amount. Store it in B1. NOTE we have already pre-added on S1 and B0
	# which are constant for all the keys we are doing.
	#
	#
	lw	 B1, Saved_key_hi_S1B0($sp)
	lw	 S0, Saved_P_ROTL3($sp)

	#
	# Exit the inner loop when we have finished, but Increment the loop count
	# anyway, since we want to do this most of the time.
	#	
	.set noreorder
	beq  S6, Scratch, NextOuterLoop
	addi Scratch, Scratch, LoopIncrement
	.set reorder
	
	#
	# Set up for another iteration. Reload the first parts of the key generation.
	# B0, S1
	#
	lw	B0,	SavedB0($sp)
	lw	S1, SavedS1($sp)

	#
	# Save the loop counter
	#
	sw   Scratch, LoopCount($sp)
	
	#
	# Finish off computing the  _New_  keyHi + (S1 + B0) (for pipeline 1)
	#
	# We increment by 1<<24 so just shift up the current loop count value.
	# (this works for single or dual keys)
	#
	sll  Scratch, Scratch, 24
	
	addu  B1, B1, Scratch		#lw of B1 was done >3 instructions earlier
	
#if TWO_PARALLEL_KEYS
	#
	# Do the other key calculation as well. It amounts to adding on 1<<24
	# to the Pipe 0   B1 value
	#
	li	 B1_P2, (1<<24)
	addu B1_P2, B1_P2, B1
#endif	

	#
	# Get the other consts
	#	
	lw S_INITIAL, Saved_S2_INIT($sp)
	lw Q,		  Saved_Q($sp)

	#
	# Evaluate the correct B1 value (B1 currently = keyHi+S1+B0)
	# B1 = (keyHi + (S1 + B0)) <<< (S1 + B0)
	#
	addu  Temp, S1, B0
	MyROL B1,   B1, Temp
	
#if TWO_PARALLEL_KEYS
	# and do the rest of the B1 calculation
	MyROL B1_P2, B1_P2, Temp

	move B0_P2, B0            #Another waste. We should redo a macro to save this copy.
#endif

	#
	# continue with the other iterations
	#
	b R1Step2
	
/*
//  Code to check the other cipher text word
*/	
FirstWordOK:
	/*
	// Reached when the low cipher text word matched (for at least one
	// pipe in the dual key build). Completes step 24, runs step 25 and
	// the last encryption half-round, then compares the high word.
	//
	// Leaves through SUCCESS with $2 = inner loop count of the matching
	// key (loop count + 1 when the match is on the second pipe), or goes
	// back to PrepareNextIteration with Scratch = inner loop count.
	*/
	#
	# Ok we are still in with a chance, finish off the previous key generation 
	# step (24) and start on 25
	#
	#
	# Note that we could save a tiny amount of work in the double key method by
	# only doing the key we might have success with, but, lets face it, the chance
	# of being here in that case was only 2 in 2^32.
	#
	ROUND_3_REGS_JUST_B_EVEN  24, S24
	
	ROUND_3_REGS_JUST_A_ODD   25, S24, S25
		
	#
	# S10 is no longer needed so load into that.
	#
	lw S10,	Saved_cipher_hi($sp)

	ENCRYPT_ODD S25

	#
	# If we haven''t got a match, load up the loop count, and try again.
	#
	# NOTE we _could_ do the same trick as above and have the loop branch only if
	# we DO get a match as this is so much less likely. However, since we are unlikely
	# to be here in the first place, its not worth while duplicating the code.
	#
#if TWO_PARALLEL_KEYS
	/*
	// DUAL PIPELINE CASE
	*/
	#
	# load up the loop count. We will always need it.
	#
	lw Scratch, LoopCount($sp)
	
	#
	# Compute results for Pipes 1 and 2. A 1 indicates success.
	#
	# Note that S11 and S12 contained the partial results for eA and eA_p2
	#
	xor eB,    eB,    S10
	xor eB_P2, eB_P2, S10

	sltiu eB,	eB,		1		# Set eB to 1 if we have a match
	sltiu eB_P2,eB_P2, 	1		# ditto for second pipe

	and	 eB,	eB,	   S11		# See if we had success with both words of key 1
	and  eB_P2, eB_P2, S12		# check key 2

	#
	# check for success with pipeline 1
	#
	beq eB, $0, NoPipe1Success

	#
	# Mark that we have success with Pipe 1
	#
	.set noreorder	
	b SUCCESS
	move $2, Scratch
	.set reorder
	
NoPipe1Success:
	/*
	// check other Pipe then. If no luck, then set up the
	// next iteration. (Hmm can't find anything to put in the delay slot)
	*/
	beq eB_P2, $0, PrepareNextIteration

	/*
	// Else we have success with pipe 2
	*/
	.set noreorder	
	b SUCCESS
	addiu $2, Scratch, 1
	.set reorder
	
#else
	/*
	//SINGLE PIPELINE CASE
	*/
	.set noreorder	
	bne	eB, S10, PrepareNextIteration
	lw Scratch, LoopCount($sp)
	.set reorder

	#
	# Else we have success!!!!
	#
	.set noreorder
	b SUCCESS
	move $2, Scratch
	.set reorder
#endif



/*******************************************
// Prepare for another outer loop iteration....
//
// This will be 'relatively' slow because of all the loads...
********************************************/
NextOuterLoop:
	/*
	// Reached from PrepareNextIteration once the inner loop count equals
	// LoopLimit. Advances the key held in the work unit by one step of
	// the byte-reversed counter, writes it back, and then either starts
	// another outer pass (OuterLoopMain) or goes to FAILED when the outer
	// loop count has reached zero.
	//
	// Carry order:  key_hi byte 2 -> byte 1 -> byte 0 ->
	//               key_lo byte 3 -> byte 2 -> byte 1 -> byte 0
	// (bytes are counted from 0 = least significant). A byte that wraps
	// is cleared and the next byte in the chain is incremented.
	//
	// All additions use the non-trapping addu / addiu forms: these are
	// plain modulo 2^32 counter updates, and key_lo may hold any 32 bit
	// pattern, so a trapping add could raise an integer overflow
	// exception (e.g. when its top byte steps from 0x7F to 0x80).
	//
	// Leaves B0 / B1 holding the new key_lo / key_hi; uses Temp, Scratch
	// and S0 as work registers.
	*/

	/*
	// get the outer loop count, and the WorkUnit Pointer.
	*/
	lw WorkUnit, SavedWorkUnit($sp)
	lw S0, OuterLoopCount($sp)


	/*
	// We need to increment the key values irrespective of whether the
	// the loop terminates or not, so do so
	*/
	lw	B1,		key_hi(WorkUnit)   /*We have a stall with this load!*/
	lw  B0,		key_lo(WorkUnit)   /*and with this one too*/


	/*
	// The incrementing of this is a complete pain in the arse. We are incrementing
	// in reverse byte order, and so have to do loads of tests...
	//
	// Increment Byte 2 (counting from 0) of the high key, BUT check for an overflow
	*/
	li  Temp,        0x00010000		#Increment amount
	li  Scratch,     0x00FF0000		#Mask

	addu Temp,   B1, Temp			#'increment' the key
	
	and Scratch, Scratch, Temp		#check for an overflow
	beq Scratch, $0, OverflowIntoB1_1

	/*
	// No overflow, we can keep the incremented key.
	*/
	move B1, Temp
	b	 DoneKeyIncrement

OverflowIntoB1_1:
	/*
	// Else we have an 'overflow' into Byte	1
	*/
	addiu Temp,   B1,	0x00000100		#'increment' the key

	andi Scratch, Temp, 0x0000FF00		#check for an overflow
	beq  Scratch, $0,   OverflowIntoB1_0

	/*
	// No overflow. Mask the appropriate bits and save B1
	*/
	andi B1, Temp, 0xFFFF
	b	 DoneKeyIncrement
	
OverflowIntoB1_0:
	/*
	// Else we have an 'overflow' into Byte	0
	*/
	addiu Temp,   B1,	0x00000001		#'increment' the key

	andi Scratch, Temp, 0x000000FF		#check for an overflow
	beq  Scratch, $0, OverflowIntoB0_3
	
	/*
	// No overflow. Mask the appropriate bits and save B1
	*/
	andi B1, Temp, 0x00FF
	b	 DoneKeyIncrement

OverflowIntoB0_3:
	/*
	// Else we have an 'overflow' into the low key word.
	// First clear the hi key, since we've now wrapped around to zero
	*/
	move B1, $0

	li  Temp,        0x01000000		#Increment amount
	li  Scratch,     0xFF000000		#Mask
	
	addu Temp,   B0, Temp			#'increment' the low key (must not trap)
	
	and Scratch, Scratch, Temp		#check for an overflow
	beq Scratch, $0, OverflowIntoB0_2
	
	/*
	// No overflow. Save B0
	*/
	move B0, Temp
	b	 DoneKeyIncrement
	
OverflowIntoB0_2:
	/*
	// Else we have an 'overflow' into byte 2
	*/
	li  Temp,        0x00010000		#Increment amount
	li  Scratch,     0x00FF0000		#Mask
	
	addu Temp,   B0, Temp			#'increment' the low key
	
	and Scratch, Scratch, Temp		#check for an overflow
	beq Scratch, $0, OverflowIntoB0_1


	/*
	// No extra overflow. Mask the appropriate bits and save B0
	// (we know the top byte should be cleared)
	*/
	li  Scratch,     0x00FFFFFF		#Mask  - this is a SLOW load.
	and B0, Temp,    Scratch	
	b	DoneKeyIncrement
	
OverflowIntoB0_1:
	/*
	// Else we have an 'overflow' into byte 1
	*/
	addiu Temp,     B0, 0x00000100		#'increment' the key

	andi Scratch, Temp, 0x0000FF00		#check for an overflow
	beq  Scratch, $0, OverflowIntoB0_0
	
	/*
	// No overflow. Mask the appropriate bits and save B0
	*/
	andi B0, Temp, 0xFFFF
	b	 DoneKeyIncrement
	
OverflowIntoB0_0:
	/*
	// Else we have an 'overflow' into byte 0. This is as far
	// as we go....
	//
	// Bytes 3, 2 and 1 of B0 are all 0xFF at this point (that is how we
	// got here), so they have to be cleared just like in every other
	// carry step. If byte 0 itself wraps, the whole key returns to zero.
	*/
	addiu B0,     B0, 0x01		#'increment' the key
	andi  B0,     B0, 0x00FF	#clear the wrapped bytes 3..1
	
DoneKeyIncrement:
	/*
	// Save the incremented key values. These are reported back to
	// the caller so that we can start where we left on on the subsequent
	// call.
	*/
	sw	B1,		key_hi(WorkUnit)
	sw  B0,		key_lo(WorkUnit)

	/*
	// Check the outer loop count (already loaded into S0)
	// If its hit zero, then we've finished.
	*/
	beq S0, $0, FAILED

	/*
	// Decrement the loop counter, and save it again
	*/
	addiu S0, S0, -1
	sw	  S0, OuterLoopCount($sp)
	

	/*
	// Jump back to do another outer loop
	*/	
	b OuterLoopMain


/*
// SUCCESS. $2 contains the inner loop count value. We need to add this to
//  "pipelinecount" times the outer loop count value. Note that the outer
//	 loop count value is counting down, so it's a little tricky.
*/
SUCCESS:
	/*
	// Entered with $2 = inner loop count of the matching key.
	//
	// Writes key_hi + ($2 << 24) back to the work unit and converts $2
	// into the value returned to the caller:
	//     $2 + 256 * (SavedIterations - OuterLoopCount - 1)
	// then leaves through BYE. Uses B1, Temp and S0 as work registers.
	*/
	/*
	// First prepare the key_hi value. We need to add on the loop count to the
	// top byte. We don't give a damn about processor stalls now.
	*/
	lw  WorkUnit, SavedWorkUnit($sp)
	lw	B1,		key_hi(WorkUnit)

	sll	 Temp, $2, 24				#Move the loop count into the top byte
	addu B1, B1, Temp				#add it on to the key
	sw	B1,		key_hi(WorkUnit)	#and save it
	
	/*
	// We just need to compute the returned iteration count. This is the inner
	// loop count (in $2) + 256 * the number of time through the outer loop.
	//
	// To calc the number of times through the outer loop we should have...
	//    (SavedIterations - OuterLoopCount - 1)
	*/
	lw S0,	 SavedIterations($sp)
	lw Temp, OuterLoopCount($sp)

	subu  S0, S0, Temp
	addiu S0, S0, -1

	/*
	// Multiply up by 256 and add on the inner loop iteration count
	*/
	sll  S0, S0, 8
	addu $2, S0, $2

	/*
	// Exit
	*/
	b BYE   
	
/*
// If we failed to find the magic key ....
*/		
FAILED:	
	/*
	// Reached from NextOuterLoop when the outer loop count has run out
	// without a match. Falls through into BYE.
	*/
	#
	# Returned value should be Iteration count * 256
	#
	lw	$2, SavedIterations($sp)
	sll $2, $2, 8

/*
// Exit code. The return value is assumed to be in $2
//
// Common exit for every path: the early argument checks, SUCCESS and
// FAILED all arrive here with the result already in $2.
*/	
BYE:
	/*
	// Restore all the registers
	*/
	RestoreALLRegs
	
	/*
	// Restore the stack location and return
	*/
	addu	$sp,$sp,TotalStackSpace
	j	$31
	
	/*
	// The symbol named here has to be the one given to the matching .ent
	// directive at the top of the function.
	*/
	.end	rc5_unit_func_mips_crunch_asm

/*	
// #########################
// END OF FILE
// #########################
*/
