#
# RC5-72 core generated by genCore.
# Time stamp : Sun Mar 16 17:17:53 2003
# Targets : MPC603e/MPC740/MPC750
#
# Written by Didier Levet (kakace@wanadoo.fr)
# Copyright distributed.net 1997-2003 - All Rights Reserved.
# For use in distributed.net projects only. Any other distribution
# or use of this source violates copyright.
#
# Dependencies :
#
#	struct rc5_72UnitWork (ccoreio.h) :
#		typedef struct {
#			struct {u32 hi, lo;} plain;
#			struct {u32 hi, lo;} cypher;
#			struct {u32 hi, mid, lo;} L0;
#			struct {u32 count, hi, mid, lo;} check;
#		} RC5_72UnitWork;
#
#	MINIMUM_ITERATIONS (problem.cpp) :
#		The number of iterations to perform is always an even multiple of
#		MINIMUM_ITERATIONS, and the first key to be checked is also an even
#		multiple of this constant.
#		Therefore, it is assumed that the number of iterations is never
#		equal to zero (otherwise it would be interpreted as 2^32).
#		The current value of 24 also ensures that we can process 1, 2, 4 or
#		8 keys at once, all keys (within each group) having the same mid
#		and lo values.
#
# Stats :
#	Clock cycles (inner loop) : 602
#	Estimated keyrate : 3322.25 keys/s/MHz
#
# modified for coff and TOC on AIX by
#  Michael Weiser <michael@weiser.dinsnail.net>
#
# see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12069 for an
# explanation why we need the TOC entries for AIX and maybe other
# architectures
#
# $Id: r72-KKS2pipes.toc.s,v 1.3 2007/10/22 16:48:35 jlawson Exp $
#
# $Log: r72-KKS2pipes.toc.s,v $
# Revision 1.3  2007/10/22 16:48:35  jlawson
# overwrite head with contents of release-2-90xx
#
# Revision 1.1.2.2  2004/05/15 08:31:09  mweiser
# new email address
#
# Revision 1.1.2.1  2003/09/12 13:13:17  mweiser
# add TOC versions of PowerPC cores for AIX and undo changes to mh-2 for
# gas/coff compatibility
#
#
#=============================================================================

		.globl	rc5_72_unit_func_KKS2pipes	# function pointer toc entry
		.globl  .rc5_72_unit_func_KKS2pipes	# function label for coff



# Register aliases

# Each symbol rN is defined as the plain number N (r0 = 0 ... r31 = 31), so
# the instructions in this file can name their GPR operands as rN.
.set    r0,0
.set    r1,1
.set    r2,2
.set    r3,3
.set    r4,4
.set    r5,5
.set    r6,6
.set    r7,7
.set    r8,8
.set    r9,9
.set    r10,10
.set    r11,11
.set    r12,12
.set    r13,13
.set    r14,14
.set    r15,15
.set    r16,16
.set    r17,17
.set    r18,18
.set    r19,19
.set    r20,20
.set    r21,21
.set    r22,22
.set    r23,23
.set    r24,24
.set    r25,25
.set    r26,26
.set    r27,27
.set    r28,28
.set    r29,29
.set    r30,30
.set    r31,31

# Result values (see ccoreio.h)

# Numeric result codes mirrored from ccoreio.h.
# NOTE(review): the code that loads them is outside this part of the file;
# presumably they are the function's return values in r3 - confirm there.
.set	RESULT_NOTHING,	1
.set	RESULT_FOUND,	2


# struct RC5_72UnitWork (see ccoreio.h) :

# Byte offsets of the RC5_72UnitWork members, in declaration order.
# Every member is a u32, so consecutive offsets are 4 bytes apart.
# These must be kept in sync with the C declaration quoted in the file header.
.set	plain_hi,	  0
.set	plain_lo,	  4
.set	cypher_hi,	  8
.set	cypher_lo,	 12
.set	L0_hi,		 16
.set	L0_mid,		 20
.set	L0_lo,		 24
.set	check_count, 28
.set	check_hi,	 32
.set	check_mid,	 36
.set	check_lo,	 40


# RSA constants

# P and Q are the RC5 key-schedule constants for 32-bit words.
# The *_h / *_l symbols are the upper and lower 16-bit halves of the
# corresponding 32-bit value, in the form needed to build the constant with
# a lis/ori instruction pair.
# P2Q and P3Q are P + 2*Q and P + 3*Q reduced modulo 2^32.
.set    P, 0xB7E15163
.set    P_h, 0xB7E1
.set    P_l, 0x5163
.set    Q, 0x9E3779B9
.set    Q_h, 0x9E37
.set    Q_l, 0x79B9
.set    P2Q,0xf45044d5      # P+2Q
.set    P2Q_h,0xf450
.set    P2Q_l,0x44d5
.set    P3Q,0x9287be8e      # P+3Q
.set    P3Q_h,0x9287
.set    P3Q_l,0xbe8e


# RC5 projects
# This core is designed to handle RC5-72, RC5-80, RC5-88
# and RC5-96 projects.

# One shift count per supported key length; each value equals
# 96 - (key length in bits).
.set	RC5_72_KEY_SHIFT,	24
.set	RC5_80_KEY_SHIFT,	16
.set	RC5_88_KEY_SHIFT,	8
.set	RC5_96_KEY_SHIFT,	0

# Shift actually assembled in. It is used by the ADD_KEY / SUB_KEY macros and
# by the iteration-count computation in main_loop_init.
.set	RC5_KEY_SHIFT,	RC5_72_KEY_SHIFT

		# The core handles 2 keys per loop
		# ADD_KEY reg, x : reg += (x << RC5_KEY_SHIFT), i.e. key.hi += x.
		# The amount is encoded as a 16-bit immediate: shifts of 16 or more go
		# through addis (which adds the immediate shifted left by 16), smaller
		# shifts go through addi.
		.macro	ADD_KEY	$0 $1
		.if		RC5_KEY_SHIFT >= 16
		addis	\$0,\$0,(\$1<<(RC5_KEY_SHIFT-16))
		.else
		addi	\$0,\$0,(\$1<<RC5_KEY_SHIFT)
		.endif
		.endm


		# SUB_KEY reg, x : reg -= (x << RC5_KEY_SHIFT), i.e. key.hi -= x.
		# Implemented as an add of the negated amount; addis is selected when
		# the shift is 16 or more, addi otherwise.
		.macro	SUB_KEY	$0 $1
		.if		RC5_KEY_SHIFT >= 16
		addis	\$0,\$0,-(\$1<<(RC5_KEY_SHIFT-16))
		.else
		addi	\$0,\$0,-(\$1<<RC5_KEY_SHIFT)
		.endif
		.endm


# About the stack frame :
# Since the core is a leaf procedure, the stack frame doesn't have
# to follow specific ABI conventions. However, the stack pointer
# shall be aligned on a quad-word (16 bytes) boundary. This alignment
# is enforced at run time when allocating the stack frame.
#
# Conventions :
# The LR register is not used, so it doesn't have to be preserved.
# Register r2, usually used as RTOC, is preserved although it might
# be defined as volatile.
# The CR fields used by the core might be defined as volatile. However,
# the CR register is preserved for convenience.
# The CTR is preserved, although it might be defined as volatile.
#
# Volatile registers (not preserved) :
#  r0, r3-r12

		# The following constants are used to determine the size of
		# the GPR save area. Prolog and epilog code have to be
		# modified if your ABI defines another convention.
# Stack frame layout. All w*/p*/Sa_*/Sb_* symbols are byte offsets from the
# new stack pointer (r1) set up by the prolog. Offset 0 has no symbol: the
# prolog's stwux stores the caller's r1 there.
.set	NV_GPR,		13		# Non-volatile GPR : r13 - r31

.set	wSaveR2,	  4		# r2 backup
.set	wSaveCR,	  8		# CR backup
.set	wSaveCTR,	 12		# CTR backup
.set	wKIter,		 16		# Remaining iterations
.set	pUnitWork,	 20		# struct RC5_72UnitWork * (copy)
.set	pIterations, 24		# u32 * (copy)

# Current key. The three words are contiguous, in hi/mid/lo order.
.set	wKeyHi,		28		# key.hi
.set	wKeyMid,	32		# key.mid
.set	wKeyLo,		36		# key.lo

# Constants and partial key-schedule results that do not depend on key.hi.
.set	wRSA_P,		40		# P
.set	wRSA_Q,		44		# Q
.set	wRSA_S0,	48		# S[0] = ROTL3(P)
.set	wCached_S1,	52		# Sa[1] (cached)
.set	wCached_S2,	56		# Sa[2] (cached)
.set	wCached_L0,	60		# La[0] (cached)
.set	wCached_L1,	64		# La[1] (cached)

# Copies of the check.* members of the work unit.
.set	wChkHi,		68
.set	wChkMid,	72
.set	wChkLo,		76
.set	wChkCnt,	80

# Copies of the plain.* and cypher.* members of the work unit.
.set	wPlainLo,	84
.set	wPlainHi,	88
.set	wCypherLo,	92
.set	wCypherHi,	96

# Spill slots for S[9]..S[25] of both pipes (Sa_nn = pipe a, Sb_nn = pipe b).
# S[0]..S[8] are kept in registers, so they have no slot here.
.set	Sa_09,	100
.set	Sb_09,	104
.set	Sa_10,	108
.set	Sb_10,	112
.set	Sa_11,	116
.set	Sb_11,	120
.set	Sa_12,	124
.set	Sb_12,	128
.set	Sa_13,	132
.set	Sb_13,	136
.set	Sa_14,	140
.set	Sb_14,	144
.set	Sa_15,	148
.set	Sb_15,	152
.set	Sa_16,	156
.set	Sb_16,	160
.set	Sa_17,	164
.set	Sb_17,	168
.set	Sa_18,	172
.set	Sb_18,	176
.set	Sa_19,	180
.set	Sb_19,	184
.set	Sa_20,	188
.set	Sb_20,	192
.set	Sa_21,	196
.set	Sb_21,	200
.set	Sa_22,	204
.set	Sb_22,	208
.set	Sa_23,	212
.set	Sb_23,	216
.set	Sa_24,	220
.set	Sb_24,	224
.set	Sa_25,	228
.set	Sb_25,	232

.set	localTop,	236			# First byte past the local variables
.set	GPRsave,	(32-NV_GPR) * 4	# Size of the GPR save area

		# The prolog code assumes that the size of the stack frame
		# is a multiple of 16.
		# With the values above: (236 + 76 + 15) & -16 = 320 bytes.
.set	FrameSize,	(localTop + GPRsave + 15) & (-16)


#=============================================================================
# u32 (r3) = rc5_72_unit_func_KKS2pipes(RC5_72UnitWork *rc5_72unitwork (r3),
#						u32 *iterations (r4),
#						void * /* memblk (r5) */)

		# Add a new TOC entry for the unit func.
		# The exported (undotted) symbol labels three 32-bit words: the address
		# of the code entry label, the TOC[tc0] symbol, and zero.
		# NOTE(review): presumably this is the descriptor that callers use to
		# reach the function on AIX - confirm against the platform ABI.
		.toc
		.csect .rc5_72_unit_func_KKS2pipes[DS]
rc5_72_unit_func_KKS2pipes:
		# set the TOC anchor
		.long .rc5_72_unit_func_KKS2pipes, TOC[tc0], 0

		# Function prolog. Builds an aligned stack frame, saves r13-r31, r2,
		# CR and CTR, then copies the arguments, the work unit fields and the
		# RC5 constants into frame slots.
		# On entry : r3 = RC5_72UnitWork *, r4 = u32 *iterations.
		# r5 (the unused memblk argument) is overwritten immediately.
		.csect .text[PR]        # start text[program code] section
		.align 4
.rc5_72_unit_func_KKS2pipes:

		# Allocate the stack frame
		# New r1 = old r1 - (old r1 & 31) - FrameSize. FrameSize is a multiple
		# of 16, so the new r1 is at least 16-byte aligned.
		mr		r5,r1				# Caller's stack pointer
		clrlwi	r6,r1,27			# Keep the 5 low-order bits of r1 (clears the upper 27)
		subfic	r6,r6,-FrameSize	# r6 = -(FrameSize + padding) : negated total frame size
		stwux	r1,r1,r6			# Create the stack frame (old r1 is stored at the new r1).

		# Save non-volatile registers
		# The GPR save area sits just below the caller's stack pointer.
		# The first register of stmw (r13) has to match NV_GPR.
		stmw	r13,-GPRsave(r5)	# Save r13-r31
		mfcr	r6					# CR register
		mfctr	r7					# CTR register
		stw		r2,wSaveR2(r1)		# Save r2
		stw		r6,wSaveCR(r1)
		stw		r7,wSaveCTR(r1)

		# Copy the arguments (we'll need all registers)
		stw		r3,pUnitWork(r1)	# RC5_72UnitWork *
		stw		r4,pIterations(r1)	# u32 *

		# Initialize local variables
		lwz		r5,L0_hi(r3)		# Dispatch first key's components
		lwz		r6,L0_mid(r3)
		lwz		r7,L0_lo(r3)
		stw		r5,wKeyHi(r1)
		stw		r6,wKeyMid(r1)
		stw		r7,wKeyLo(r1)

		lwz		r5,check_count(r3)	# Prepare check data
		lwz		r6,check_hi(r3)
		lwz		r7,check_mid(r3)
		lwz		r8,check_lo(r3)
		stw		r5,wChkCnt(r1)
		stw		r6,wChkHi(r1)
		stw		r7,wChkMid(r1)
		stw		r8,wChkLo(r1)

		lwz		r5,plain_lo(r3)		# Prepare plain/cypher data
		lwz		r6,plain_hi(r3)
		lwz		r7,cypher_lo(r3)
		lwz		r8,cypher_hi(r3)
		stw		r5,wPlainLo(r1)
		stw		r6,wPlainHi(r1)
		stw		r7,wCypherLo(r1)
		stw		r8,wCypherHi(r1)

		# Initialize RSA constants
		# Each 32-bit constant is built from its two halves with lis + ori.
		lis		r5,P_h
		lis		r6,Q_h
		ori		r5,r5,P_l			# r5 = P
		ori		r6,r6,Q_l			# r6 = Q
		rotlwi	r7,r5,3				# == S[0]
		stw		r5,wRSA_P(r1)
		stw		r6,wRSA_Q(r1)
		stw		r7,wRSA_S0(r1)

		# Initialize key.hi offset
		# r31 is the index operand of the lwbrx in main_loop_init.
		li		r31,wKeyHi


#=============================================================================
# Main loop implementation :
# Compute how many iterations to perform in the inner loop.
# The inner loop shall exit when the iteration count becomes
# zero, or when key.hi becomes zero (in the latter case, we
# have to increment key.mid and maybe key.lo).
# NOTE : The code below relies on MINIMUM_ITERATIONS and assumes
#        that (key % MINIMUM_ITERATIONS) == 0. Said otherwise,
#        key.hi can be incremented at least 7 times without
#        causing an overflow.

# Sets CTR to the number of inner-loop passes to run before either the
# requested iterations are exhausted or key.hi wraps, and stores the number
# of passes still to do afterwards in wKIter. Also loads Q into r2.
# Counts are halved because each pass handles two keys.
main_loop_init:

		lwz		r12,pIterations(r1)	# r12 = pointer to the iteration count
		lwbrx	r13,r31,r1			# Load key.hi (byte reversed) from wKeyHi(r1)
		lwz		r12,0(r12)			# r12 = *iterations
		lwz		r2,wRSA_Q(r1)		# r2 = Q

		# Compute how many iterations to perform in the inner loop
		# until key.hi == 0 or the total number of iterations is reached.
		# r11 = (0x80000000 - (byte-reversed key.hi >> 1)) >> RC5_KEY_SHIFT

		lis		r11,32768			# Setup max value (r11 = 0x80000000)
		srwi	r13,r13,1			# key.hi / 2
		srwi	r12,r12,1			# = Total number of iterations / 2
		subf	r11,r13,r11			# = Iterations until key.hi == 0
		srwi	r11,r11,RC5_KEY_SHIFT
		cmplw	r12,r11				# Unsigned compare : total vs. until-wrap
		bgt		set_count1			# total > until-wrap : keep r11 as is

		mr		r11,r12				# Otherwise run all requested passes

		# Here r11 = min(total passes, passes until key.hi wraps)
set_count1:
		subf	r12,r11,r12			# Remaining iterations
		stw		r12,wKIter(r1)
		mtctr	r11					# Cannot be zero


# Handle new key.lo value.
# Load S[0], and compute L[0] and S[1]
# r2  := Q (const)

# First key-schedule step, which depends on key.lo only.
# Computes La[0] and Sa[1] and caches both in the frame.
# On exit : r3 = La[0], r12 = Sa[1], r0 = P + 2Q, r11 = S[0].
new_key_lo:
		lwz		r11,wRSA_S0(r1)		# r11 = S[0] = ROTL3(P)
		lwz		r3,wKeyLo(r1)		# L[0] = key.lo
		lwz		r0,wRSA_P(r1)		# r0 = P

		add		r3,r3,r11			# L = L[0] + S[0]
		add		r0,r0,r2			# S[1] = P + Q
		rotlw	r3,r3,r11			# B = La[0] = ROTL(L, Sa[0])

		add		r12,r11,r3			# A += B  (A = Sa[0] at this point)
		stw		r3,wCached_L0(r1)	# Save La[0]
		add		r12,r12,r0			# S = A + S[1]
		add		r0,r0,r2			# S[2] = P + 2Q
		rotlwi	r12,r12,3			# A = Sa[1] = ROTL3(S)
		stw		r12,wCached_S1(r1)	# Save Sa[1]


# Handle new key.mid value : Compute L[1] and S[2].
# Pre-conditions :
# r0  := S[2] = P + 2Q
# r2  := Q (const)
# r12 := Sa[1] = A
# r3  := La[0] = B

# Second key-schedule step, which depends on key.lo and key.mid only.
# Computes La[1] and Sa[2], caches both in the frame, then branches to the
# per-key.hi code.
# On exit : r4 = La[1], r5 = key.hi, r13 = Sa[2], r0 = P + 3Q.
new_key_mid:
		lwz		r4,wKeyMid(r1)		# L[1] = key.mid
		add		r6,r12,r3			# T = A + B
		lwz		r5,wKeyHi(r1)		# L[2] = key.hi
		add		r4,r4,r6			# L = L[1] + T
		rotlw	r4,r4,r6			# B = La[1] = ROTL(L, T)
		add		r13,r12,r4			# A += B
		stw		r4,wCached_L1(r1)	# Save La[1]
		add		r13,r13,r0			# S = A + S[2]
		add		r0,r0,r2			# S[3] = P + 3Q
		rotlwi	r13,r13,3			# A = Sa[2] = ROTL3(S)
		stw		r13,wCached_S2(r1)	# Save Sa[2]
		b		new_key_hi

		# Not reached by fall-through (follows an unconditional branch) :
		# padding ahead of the alignment directive.
		nop
		.align	4


# Inner loop (processed at most 2^(key_len-66) times).
# Preconditions :
# r0  := S. Also used as a temporary storage
# r2  := Q. Also used as a temporary storage
# r12 := Sa(1)
# r13 := Sa(2) := A
# r4  := La(1) := B
# r5  := La(2) = key.hi
#
# Registers assignments :
# r11-r19 := Sa[0] - Sa[8]
# r21-r29 := Sb[0] - Sb[8]
# r3 -r5  := La[0] - La[2]
# r7 -r9  := Lb[0] - Lb[2]
# r0  := Sn = P + nQ. Also used as a temporary storage.
# r20 := Sa[n]
# r30 := Sb[n]
# r6  := Core #1 temporary storage
# r10 := Core #2 temporary storage
# r31 := Offset to wKeyHi (const)

new_key_hi:

		#----- Stage 2-3 ----------
		addi	r9,r5,1				# Lb[2] = key.hi + 1
		lwz		r11,wRSA_S0(r1)		# Sa(0)

		add		r6,r13,r4			# Ta = Sa[2] + La[1]
		lwz		r3,wCached_L0(r1)	# La(0)

		add		r9,r9,r6			# Lb[2] += (Tb = Ta)
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r6			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r14,r0,r13			# Sa[3] = S3 + Sa[2]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r24,r14,r9			# Sb[3] = Sa[3] + Lb[2]

		add		r14,r14,r5			# Sa[3] += La[2]
		rotlwi	r24,r24,3			# Ab = Sb[3] = ROTL3(Sb[3])

		rotlwi	r14,r14,3			# Aa = Sa[3] = ROTL3(Sa[3])
		#----- Stages 3-4 ----------
		add		r10,r9,r24			# Tb = Lb[2] + Sb[3]

		add		r6,r5,r14			# Ta = La[2] + Sa[3]
		add		r0,r0,r2			# S4 = S3 + Q

		add		r7,r3,r10			# Lb[0] = La[0] + Tb
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r15,r0,r14			# Sa[4] = S4 + Sa[3]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r25,r0,r24			# Sb[4] = S4 + Sb[3]

		add		r25,r25,r7			# Sb[4] += Lb[0]
		add		r0,r0,r2			# S5 = S4 + Q

		add		r15,r15,r3			# Sa[4] += La[0]
		rotlwi	r25,r25,3			# Ab = Sb[4] = ROTL3(Sb[4])

		rotlwi	r15,r15,3			# Aa = Sa[4] = ROTL3(Sa[4])
		#----- Stages 4-5 ----------
		add		r10,r7,r25			# Tb = Lb[0] + Sb[4]

		add		r6,r3,r15			# Ta = La[0] + Sa[4]
		add		r8,r4,r10			# Lb[1] = La[1] + Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r26,r0,r25			# Sb[5] = S5 + Sb[4]

		add		r16,r0,r15			# Sa[5] = S5 + Sa[4]
		add		r26,r26,r8			# Sb[5] += Lb[1]

		add		r16,r16,r4			# Sa[5] += La[1]
		rotlwi	r26,r26,3			# Ab = Sb[5] = ROTL3(Sb[5])

		rotlwi	r16,r16,3			# Aa = Sa[5] = ROTL3(Sa[5])
		#----- Stages 5-6 ----------
		add		r10,r8,r26			# Tb = Lb[1] + Sb[5]

		add		r6,r4,r16			# Ta = La[1] + Sa[5]
		add		r0,r0,r2			# S6 = S5 + Q

		add		r9,r9,r10			# Lb[2] += Tb
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r17,r0,r16			# Sa[6] = S6 + Sa[5]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r27,r0,r26			# Sb[6] = S6 + Sb[5]

		add		r27,r27,r9			# Sb[6] += Lb[2]
		add		r0,r0,r2			# S7 = S6 + Q

		add		r17,r17,r5			# Sa[6] += La[2]
		rotlwi	r27,r27,3			# Ab = Sb[6] = ROTL3(Sb[6])

		rotlwi	r17,r17,3			# Aa = Sa[6] = ROTL3(Sa[6])
		#----- Stages 6-7 ----------
		add		r10,r9,r27			# Tb = Lb[2] + Sb[6]

		add		r6,r5,r17			# Ta = La[2] + Sa[6]
		add		r7,r7,r10			# Lb[0] += Tb

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r28,r0,r27			# Sb[7] = S7 + Sb[6]

		add		r18,r0,r17			# Sa[7] = S7 + Sa[6]
		add		r28,r28,r7			# Sb[7] += Lb[0]

		add		r18,r18,r3			# Sa[7] += La[0]
		rotlwi	r28,r28,3			# Ab = Sb[7] = ROTL3(Sb[7])

		rotlwi	r18,r18,3			# Aa = Sa[7] = ROTL3(Sa[7])
		#----- Stages 7-8 ----------
		add		r10,r7,r28			# Tb = Lb[0] + Sb[7]

		add		r6,r3,r18			# Ta = La[0] + Sa[7]
		add		r0,r0,r2			# S8 = S7 + Q

		add		r8,r8,r10			# Lb[1] += Tb
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r19,r0,r18			# Sa[8] = S8 + Sa[7]

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r29,r0,r28			# Sb[8] = S8 + Sb[7]

		add		r29,r29,r8			# Sb[8] += Lb[1]
		add		r0,r0,r2			# S9 = S8 + Q

		add		r19,r19,r4			# Sa[8] += La[1]
		rotlwi	r29,r29,3			# Ab = Sb[8] = ROTL3(Sb[8])

		rotlwi	r19,r19,3			# Aa = Sa[8] = ROTL3(Sa[8])
		#----- Stages 8-9 ----------
		add		r10,r8,r29			# Tb = Lb[1] + Sb[8]

		add		r6,r4,r19			# Ta = La[1] + Sa[8]
		add		r9,r9,r10			# Lb[2] += Tb

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r0,r29			# Sb[9] = S9 + Sb[8]

		add		r20,r0,r19			# Sa[9] = S9 + Sa[8]
		add		r30,r30,r9			# Sb[9] += Lb[2]

		add		r20,r20,r5			# Sa[9] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[9] = ROTL3(Sb[9])

		rotlwi	r20,r20,3			# Aa = Sa[9] = ROTL3(Sa[9])
		stw		r30,Sb_09(r1)		# Store Sb[9]

		#----- Stages 9-10 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[9]
		stw		r20,Sa_09(r1)		# Store Sa[9]

		add		r6,r5,r20			# Ta = La[2] + Sa[9]
		add		r0,r0,r2			# S10 = S9 + Q

		add		r7,r7,r10			# Lb[0] += Tb
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r20,r0,r20			# Sa[10] = S10 + Sa[9]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r0,r30			# Sb[10] = S10 + Sb[9]

		add		r30,r30,r7			# Sb[10] += Lb[0]
		add		r0,r0,r2			# S11 = S10 + Q

		add		r20,r20,r3			# Sa[10] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[10] = ROTL3(Sb[10])

		rotlwi	r20,r20,3			# Aa = Sa[10] = ROTL3(Sa[10])
		stw		r30,Sb_10(r1)		# Store Sb[10]

		#----- Stages 10-11 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[10]
		stw		r20,Sa_10(r1)		# Store Sa[10]

		add		r6,r3,r20			# Ta = La[0] + Sa[10]
		add		r8,r8,r10			# Lb[1] += Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r0,r30			# Sb[11] = S11 + Sb[10]

		add		r20,r0,r20			# Sa[11] = S11 + Sa[10]
		add		r30,r30,r8			# Sb[11] += Lb[1]

		add		r20,r20,r4			# Sa[11] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[11] = ROTL3(Sb[11])

		rotlwi	r20,r20,3			# Aa = Sa[11] = ROTL3(Sa[11])
		stw		r30,Sb_11(r1)		# Store Sb[11]

		#----- Stages 11-12 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[11]
		stw		r20,Sa_11(r1)		# Store Sa[11]

		add		r6,r4,r20			# Ta = La[1] + Sa[11]
		add		r0,r0,r2			# S12 = S11 + Q

		add		r9,r9,r10			# Lb[2] += Tb
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r20,r0,r20			# Sa[12] = S12 + Sa[11]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r0,r30			# Sb[12] = S12 + Sb[11]

		add		r30,r30,r9			# Sb[12] += Lb[2]
		add		r0,r0,r2			# S13 = S12 + Q

		add		r20,r20,r5			# Sa[12] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[12] = ROTL3(Sb[12])

		rotlwi	r20,r20,3			# Aa = Sa[12] = ROTL3(Sa[12])
		stw		r30,Sb_12(r1)		# Store Sb[12]

		#----- Stages 12-13 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[12]
		stw		r20,Sa_12(r1)		# Store Sa[12]

		add		r6,r5,r20			# Ta = La[2] + Sa[12]
		add		r7,r7,r10			# Lb[0] += Tb

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r0,r30			# Sb[13] = S13 + Sb[12]

		add		r20,r0,r20			# Sa[13] = S13 + Sa[12]
		add		r30,r30,r7			# Sb[13] += Lb[0]

		add		r20,r20,r3			# Sa[13] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[13] = ROTL3(Sb[13])

		rotlwi	r20,r20,3			# Aa = Sa[13] = ROTL3(Sa[13])
		stw		r30,Sb_13(r1)		# Store Sb[13]

		#----- Stages 13-14 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[13]
		stw		r20,Sa_13(r1)		# Store Sa[13]

		add		r6,r3,r20			# Ta = La[0] + Sa[13]
		add		r0,r0,r2			# S14 = S13 + Q

		add		r8,r8,r10			# Lb[1] += Tb
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r20,r0,r20			# Sa[14] = S14 + Sa[13]

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r0,r30			# Sb[14] = S14 + Sb[13]

		add		r30,r30,r8			# Sb[14] += Lb[1]
		add		r0,r0,r2			# S15 = S14 + Q

		add		r20,r20,r4			# Sa[14] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[14] = ROTL3(Sb[14])

		rotlwi	r20,r20,3			# Aa = Sa[14] = ROTL3(Sa[14])
		stw		r30,Sb_14(r1)		# Store Sb[14]

		#----- Stages 14-15 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[14]
		stw		r20,Sa_14(r1)		# Store Sa[14]

		add		r6,r4,r20			# Ta = La[1] + Sa[14]
		add		r9,r9,r10			# Lb[2] += Tb

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r0,r30			# Sb[15] = S15 + Sb[14]

		add		r20,r0,r20			# Sa[15] = S15 + Sa[14]
		add		r30,r30,r9			# Sb[15] += Lb[2]

		add		r20,r20,r5			# Sa[15] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[15] = ROTL3(Sb[15])

		rotlwi	r20,r20,3			# Aa = Sa[15] = ROTL3(Sa[15])
		stw		r30,Sb_15(r1)		# Store Sb[15]

		#----- Stages 15-16 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[15]
		stw		r20,Sa_15(r1)		# Store Sa[15]

		add		r6,r5,r20			# Ta = La[2] + Sa[15]
		add		r0,r0,r2			# S16 = S15 + Q

		add		r7,r7,r10			# Lb[0] += Tb
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r20,r0,r20			# Sa[16] = S16 + Sa[15]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r0,r30			# Sb[16] = S16 + Sb[15]

		add		r30,r30,r7			# Sb[16] += Lb[0]
		add		r0,r0,r2			# S17 = S16 + Q

		add		r20,r20,r3			# Sa[16] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[16] = ROTL3(Sb[16])

		rotlwi	r20,r20,3			# Aa = Sa[16] = ROTL3(Sa[16])
		stw		r30,Sb_16(r1)		# Store Sb[16]

		#----- Stages 16-17 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[16]
		stw		r20,Sa_16(r1)		# Store Sa[16]

		add		r6,r3,r20			# Ta = La[0] + Sa[16]
		add		r8,r8,r10			# Lb[1] += Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r0,r30			# Sb[17] = S17 + Sb[16]

		add		r20,r0,r20			# Sa[17] = S17 + Sa[16]
		add		r30,r30,r8			# Sb[17] += Lb[1]

		add		r20,r20,r4			# Sa[17] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[17] = ROTL3(Sb[17])

		rotlwi	r20,r20,3			# Aa = Sa[17] = ROTL3(Sa[17])
		stw		r30,Sb_17(r1)		# Store Sb[17]

		#----- Stages 17-18 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[17]
		stw		r20,Sa_17(r1)		# Store Sa[17]

		add		r6,r4,r20			# Ta = La[1] + Sa[17]
		add		r0,r0,r2			# S18 = S17 + Q

		add		r9,r9,r10			# Lb[2] += Tb
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r20,r0,r20			# Sa[18] = S18 + Sa[17]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r0,r30			# Sb[18] = S18 + Sb[17]

		add		r30,r30,r9			# Sb[18] += Lb[2]
		add		r0,r0,r2			# S19 = S18 + Q

		add		r20,r20,r5			# Sa[18] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[18] = ROTL3(Sb[18])

		rotlwi	r20,r20,3			# Aa = Sa[18] = ROTL3(Sa[18])
		stw		r30,Sb_18(r1)		# Store Sb[18]

		#----- Stages 18-19 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[18]
		stw		r20,Sa_18(r1)		# Store Sa[18]

		add		r6,r5,r20			# Ta = La[2] + Sa[18]
		add		r7,r7,r10			# Lb[0] += Tb

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r0,r30			# Sb[19] = S19 + Sb[18]

		add		r20,r0,r20			# Sa[19] = S19 + Sa[18]
		add		r30,r30,r7			# Sb[19] += Lb[0]

		add		r20,r20,r3			# Sa[19] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[19] = ROTL3(Sb[19])

		rotlwi	r20,r20,3			# Aa = Sa[19] = ROTL3(Sa[19])
		stw		r30,Sb_19(r1)		# Store Sb[19]

		#----- Stages 19-20 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[19]
		stw		r20,Sa_19(r1)		# Store Sa[19]

		add		r6,r3,r20			# Ta = La[0] + Sa[19]
		add		r0,r0,r2			# S20 = S19 + Q

		add		r8,r8,r10			# Lb[1] += Tb
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r20,r0,r20			# Sa[20] = S20 + Sa[19]

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r0,r30			# Sb[20] = S20 + Sb[19]

		add		r30,r30,r8			# Sb[20] += Lb[1]
		add		r0,r0,r2			# S21 = S20 + Q

		add		r20,r20,r4			# Sa[20] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[20] = ROTL3(Sb[20])

		rotlwi	r20,r20,3			# Aa = Sa[20] = ROTL3(Sa[20])
		stw		r30,Sb_20(r1)		# Store Sb[20]

		#----- Stages 20-21 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[20]
		stw		r20,Sa_20(r1)		# Store Sa[20]

		add		r6,r4,r20			# Ta = La[1] + Sa[20]
		add		r9,r9,r10			# Lb[2] += Tb

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r0,r30			# Sb[21] = S21 + Sb[20]

		add		r20,r0,r20			# Sa[21] = S21 + Sa[20]
		add		r30,r30,r9			# Sb[21] += Lb[2]

		add		r20,r20,r5			# Sa[21] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[21] = ROTL3(Sb[21])

		rotlwi	r20,r20,3			# Aa = Sa[21] = ROTL3(Sa[21])
		stw		r30,Sb_21(r1)		# Store Sb[21]

		#----- Stages 21-22 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[21]
		stw		r20,Sa_21(r1)		# Store Sa[21]

		add		r6,r5,r20			# Ta = La[2] + Sa[21]
		add		r0,r0,r2			# S22 = S21 + Q

		add		r7,r7,r10			# Lb[0] += Tb
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r20,r0,r20			# Sa[22] = S22 + Sa[21]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r0,r30			# Sb[22] = S22 + Sb[21]

		add		r30,r30,r7			# Sb[22] += Lb[0]
		add		r0,r0,r2			# S23 = S22 + Q

		add		r20,r20,r3			# Sa[22] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[22] = ROTL3(Sb[22])

		rotlwi	r20,r20,3			# Aa = Sa[22] = ROTL3(Sa[22])
		stw		r30,Sb_22(r1)		# Store Sb[22]

		#----- Stages 22-23 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[22]
		stw		r20,Sa_22(r1)		# Store Sa[22]

		add		r6,r3,r20			# Ta = La[0] + Sa[22]
		add		r8,r8,r10			# Lb[1] += Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r0,r30			# Sb[23] = S23 + Sb[22]

		add		r20,r0,r20			# Sa[23] = S23 + Sa[22]
		add		r30,r30,r8			# Sb[23] += Lb[1]

		add		r20,r20,r4			# Sa[23] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[23] = ROTL3(Sb[23])

		rotlwi	r20,r20,3			# Aa = Sa[23] = ROTL3(Sa[23])
		stw		r30,Sb_23(r1)		# Store Sb[23]

		#----- Stages 23-24 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[23]
		stw		r20,Sa_23(r1)		# Store Sa[23]

		add		r6,r4,r20			# Ta = La[1] + Sa[23]
		add		r0,r0,r2			# S24 = S23 + Q

		add		r9,r9,r10			# Lb[2] += Tb
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r20,r0,r20			# Sa[24] = S24 + Sa[23]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r0,r30			# Sb[24] = S24 + Sb[23]

		add		r30,r30,r9			# Sb[24] += Lb[2]
		add		r0,r0,r2			# S25 = S24 + Q

		add		r20,r20,r5			# Sa[24] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[24] = ROTL3(Sb[24])

		rotlwi	r20,r20,3			# Aa = Sa[24] = ROTL3(Sa[24])
		stw		r30,Sb_24(r1)		# Store Sb[24]

		#----- Stages 24-25 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[24]
		stw		r20,Sa_24(r1)		# Store Sa[24]

		add		r6,r5,r20			# Ta = La[2] + Sa[24]
		add		r7,r7,r10			# Lb[0] += Tb

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r0,r30			# Sb[25] = S25 + Sb[24]

		add		r20,r0,r20			# Sa[25] = S25 + Sa[24]
		add		r30,r30,r7			# Sb[25] += Lb[0]

		add		r20,r20,r3			# Sa[25] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[25] = ROTL3(Sb[25])

		rotlwi	r20,r20,3			# Aa = Sa[25] = ROTL3(Sa[25])
		stw		r30,Sb_25(r1)		# Store Sb[25]

		# Sb[0], Sb[1] and Sb[2] are not yet initialized.
		#----- Stages 25-26 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[25]
		stw		r20,Sa_25(r1)		# Store Sa[25]

		add		r6,r3,r20			# Ta = La[0] + Sa[25]
		add		r8,r8,r10			# Lb[1] += Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r21,r11,r30			# Sb[0] = Sa[0] + Sb[25]

		add		r11,r11,r20			# Sa[0] += Sa[25]
		add		r21,r21,r8			# Sb[0] += Lb[1]

		add		r11,r11,r4			# Sa[0] += La[1]
		rotlwi	r21,r21,3			# Ab = Sb[0] = ROTL3(Sb[0])

		rotlwi	r11,r11,3			# Aa = Sa[0] = ROTL3(Sa[0])
		#----- Stages 26-27 ----------
		add		r10,r8,r21			# Tb = Lb[1] + Sb[0]

		add		r6,r4,r11			# Ta = La[1] + Sa[0]
		add		r9,r9,r10			# Lb[2] += Tb

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r22,r12,r21			# Sb[1] = Sa[1] + Sb[0]

		add		r12,r12,r11			# Sa[1] += Sa[0]
		add		r22,r22,r9			# Sb[1] += Lb[2]

		add		r12,r12,r5			# Sa[1] += La[2]
		rotlwi	r22,r22,3			# Ab = Sb[1] = ROTL3(Sb[1])

		rotlwi	r12,r12,3			# Aa = Sa[1] = ROTL3(Sa[1])
		#----- Stages 27-28 ----------
		add		r10,r9,r22			# Tb = Lb[2] + Sb[1]

		add		r6,r5,r12			# Ta = La[2] + Sa[1]
		add		r7,r7,r10			# Lb[0] += Tb

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r23,r13,r22			# Sb[2] = Sa[2] + Sb[1]

		add		r13,r13,r12			# Sa[2] += Sa[1]
		add		r23,r23,r7			# Sb[2] += Lb[0]

		add		r13,r13,r3			# Sa[2] += La[0]
		rotlwi	r23,r23,3			# Ab = Sb[2] = ROTL3(Sb[2])

		rotlwi	r13,r13,3			# Aa = Sa[2] = ROTL3(Sa[2])
		#----- Stages 28-29 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[2]

		add		r6,r3,r13			# Ta = La[0] + Sa[2]
		add		r8,r8,r10			# Lb[1] += Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r24,r24,r23			# Sb[3] += Sb[2]

		add		r14,r14,r13			# Sa[3] += Sa[2]
		add		r24,r24,r8			# Sb[3] += Lb[1]

		add		r14,r14,r4			# Sa[3] += La[1]
		rotlwi	r24,r24,3			# Ab = Sb[3] = ROTL3(Sb[3])

		rotlwi	r14,r14,3			# Aa = Sa[3] = ROTL3(Sa[3])
		#----- Stages 29-30 ----------
		add		r10,r8,r24			# Tb = Lb[1] + Sb[3]

		add		r6,r4,r14			# Ta = La[1] + Sa[3]
		add		r9,r9,r10			# Lb[2] += Tb

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r25,r25,r24			# Sb[4] += Sb[3]

		add		r15,r15,r14			# Sa[4] += Sa[3]
		add		r25,r25,r9			# Sb[4] += Lb[2]

		add		r15,r15,r5			# Sa[4] += La[2]
		rotlwi	r25,r25,3			# Ab = Sb[4] = ROTL3(Sb[4])

		rotlwi	r15,r15,3			# Aa = Sa[4] = ROTL3(Sa[4])
		#----- Stages 30-31 ----------
		add		r10,r9,r25			# Tb = Lb[2] + Sb[4]

		add		r6,r5,r15			# Ta = La[2] + Sa[4]
		add		r7,r7,r10			# Lb[0] += Tb

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r26,r26,r25			# Sb[5] += Sb[4]

		add		r16,r16,r15			# Sa[5] += Sa[4]
		add		r26,r26,r7			# Sb[5] += Lb[0]

		add		r16,r16,r3			# Sa[5] += La[0]
		rotlwi	r26,r26,3			# Ab = Sb[5] = ROTL3(Sb[5])

		rotlwi	r16,r16,3			# Aa = Sa[5] = ROTL3(Sa[5])
		#----- Stages 31-32 ----------
		add		r10,r7,r26			# Tb = Lb[0] + Sb[5]

		add		r6,r3,r16			# Ta = La[0] + Sa[5]
		add		r8,r8,r10			# Lb[1] += Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r27,r27,r26			# Sb[6] += Sb[5]

		add		r17,r17,r16			# Sa[6] += Sa[5]
		add		r27,r27,r8			# Sb[6] += Lb[1]

		add		r17,r17,r4			# Sa[6] += La[1]
		rotlwi	r27,r27,3			# Ab = Sb[6] = ROTL3(Sb[6])

		rotlwi	r17,r17,3			# Aa = Sa[6] = ROTL3(Sa[6])
		#----- Stages 32-33 ----------
		add		r10,r8,r27			# Tb = Lb[1] + Sb[6]

		add		r6,r4,r17			# Ta = La[1] + Sa[6]
		add		r9,r9,r10			# Lb[2] += Tb

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r28,r28,r27			# Sb[7] += Sb[6]

		add		r18,r18,r17			# Sa[7] += Sa[6]
		add		r28,r28,r9			# Sb[7] += Lb[2]

		add		r18,r18,r5			# Sa[7] += La[2]
		rotlwi	r28,r28,3			# Ab = Sb[7] = ROTL3(Sb[7])

		rotlwi	r18,r18,3			# Aa = Sa[7] = ROTL3(Sa[7])
		#----- Stages 33-34 ----------
		add		r10,r9,r28			# Tb = Lb[2] + Sb[7]

		add		r6,r5,r18			# Ta = La[2] + Sa[7]
		add		r7,r7,r10			# Lb[0] += Tb

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r29,r29,r28			# Sb[8] += Sb[7]

		add		r19,r19,r18			# Sa[8] += Sa[7]
		add		r29,r29,r7			# Sb[8] += Lb[0]

		add		r19,r19,r3			# Sa[8] += La[0]
		rotlwi	r29,r29,3			# Ab = Sb[8] = ROTL3(Sb[8])

		rotlwi	r19,r19,3			# Aa = Sa[8] = ROTL3(Sa[8])
		#----- Stages 34-35 ----------
		add		r10,r7,r29			# Tb = Lb[0] + Sb[8]

		add		r6,r3,r19			# Ta = La[0] + Sa[8]
		lwz		r2,Sb_09(r1)		# Load Sb[9]

		add		r8,r8,r10			# Lb[1] += Tb
		lwz		r0,Sa_09(r1)		# Load Sa[9]

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r2,r29			# Sb[9] += Sb[8]

		add		r20,r0,r19			# Sa[9] += Sa[8]
		add		r30,r30,r8			# Sb[9] += Lb[1]

		add		r20,r20,r4			# Sa[9] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[9] = ROTL3(Sb[9])

		rotlwi	r20,r20,3			# Aa = Sa[9] = ROTL3(Sa[9])
		stw		r30,Sb_09(r1)		# Store Sb[9]

		#----- Stages 35-36 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[9]
		stw		r20,Sa_09(r1)		# Store Sa[9]

		add		r6,r4,r20			# Ta = La[1] + Sa[9]
		lwz		r2,Sb_10(r1)		# Load Sb[10]

		add		r9,r9,r10			# Lb[2] += Tb
		lwz		r0,Sa_10(r1)		# Load Sa[10]

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r2,r30			# Sb[10] += Sb[9]

		add		r20,r0,r20			# Sa[10] += Sa[9]
		add		r30,r30,r9			# Sb[10] += Lb[2]

		add		r20,r20,r5			# Sa[10] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[10] = ROTL3(Sb[10])

		rotlwi	r20,r20,3			# Aa = Sa[10] = ROTL3(Sa[10])
		stw		r30,Sb_10(r1)		# Store Sb[10]

		#----- Stages 36-37 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[10]
		stw		r20,Sa_10(r1)		# Store Sa[10]

		add		r6,r5,r20			# Ta = La[2] + Sa[10]
		lwz		r2,Sb_11(r1)		# Load Sb[11]

		add		r7,r7,r10			# Lb[0] += Tb
		lwz		r0,Sa_11(r1)		# Load Sa[11]

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r2,r30			# Sb[11] += Sb[10]

		add		r20,r0,r20			# Sa[11] += Sa[10]
		add		r30,r30,r7			# Sb[11] += Lb[0]

		add		r20,r20,r3			# Sa[11] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[11] = ROTL3(Sb[11])

		rotlwi	r20,r20,3			# Aa = Sa[11] = ROTL3(Sa[11])
		stw		r30,Sb_11(r1)		# Store Sb[11]

		#----- Stages 37-38 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[11]
		stw		r20,Sa_11(r1)		# Store Sa[11]

		add		r6,r3,r20			# Ta = La[0] + Sa[11]
		lwz		r2,Sb_12(r1)		# Load Sb[12]

		add		r8,r8,r10			# Lb[1] += Tb
		lwz		r0,Sa_12(r1)		# Load Sa[12]

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r2,r30			# Sb[12] += Sb[11]

		add		r20,r0,r20			# Sa[12] += Sa[11]
		add		r30,r30,r8			# Sb[12] += Lb[1]

		add		r20,r20,r4			# Sa[12] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[12] = ROTL3(Sb[12])

		rotlwi	r20,r20,3			# Aa = Sa[12] = ROTL3(Sa[12])
		stw		r30,Sb_12(r1)		# Store Sb[12]

		#----- Stages 38-39 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[12]
		stw		r20,Sa_12(r1)		# Store Sa[12]

		add		r6,r4,r20			# Ta = La[1] + Sa[12]
		lwz		r2,Sb_13(r1)		# Load Sb[13]

		add		r9,r9,r10			# Lb[2] += Tb
		lwz		r0,Sa_13(r1)		# Load Sa[13]

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r2,r30			# Sb[13] += Sb[12]

		add		r20,r0,r20			# Sa[13] += Sa[12]
		add		r30,r30,r9			# Sb[13] += Lb[2]

		add		r20,r20,r5			# Sa[13] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[13] = ROTL3(Sb[13])

		rotlwi	r20,r20,3			# Aa = Sa[13] = ROTL3(Sa[13])
		stw		r30,Sb_13(r1)		# Store Sb[13]

		#----- Stages 39-40 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[13]
		stw		r20,Sa_13(r1)		# Store Sa[13]

		add		r6,r5,r20			# Ta = La[2] + Sa[13]
		lwz		r2,Sb_14(r1)		# Load Sb[14]

		add		r7,r7,r10			# Lb[0] += Tb
		lwz		r0,Sa_14(r1)		# Load Sa[14]

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r2,r30			# Sb[14] += Sb[13]

		add		r20,r0,r20			# Sa[14] += Sa[13]
		add		r30,r30,r7			# Sb[14] += Lb[0]

		add		r20,r20,r3			# Sa[14] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[14] = ROTL3(Sb[14])

		rotlwi	r20,r20,3			# Aa = Sa[14] = ROTL3(Sa[14])
		stw		r30,Sb_14(r1)		# Store Sb[14]

		#----- Stages 40-41 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[14]
		stw		r20,Sa_14(r1)		# Store Sa[14]

		add		r6,r3,r20			# Ta = La[0] + Sa[14]
		lwz		r2,Sb_15(r1)		# Load Sb[15]

		add		r8,r8,r10			# Lb[1] += Tb
		lwz		r0,Sa_15(r1)		# Load Sa[15]

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r2,r30			# Sb[15] += Sb[14]

		add		r20,r0,r20			# Sa[15] += Sa[14]
		add		r30,r30,r8			# Sb[15] += Lb[1]

		add		r20,r20,r4			# Sa[15] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[15] = ROTL3(Sb[15])

		rotlwi	r20,r20,3			# Aa = Sa[15] = ROTL3(Sa[15])
		stw		r30,Sb_15(r1)		# Store Sb[15]

		#----- Stages 41-42 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[15]
		stw		r20,Sa_15(r1)		# Store Sa[15]

		add		r6,r4,r20			# Ta = La[1] + Sa[15]
		lwz		r2,Sb_16(r1)		# Load Sb[16]

		add		r9,r9,r10			# Lb[2] += Tb
		lwz		r0,Sa_16(r1)		# Load Sa[16]

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r2,r30			# Sb[16] += Sb[15]

		add		r20,r0,r20			# Sa[16] += Sa[15]
		add		r30,r30,r9			# Sb[16] += Lb[2]

		add		r20,r20,r5			# Sa[16] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[16] = ROTL3(Sb[16])

		rotlwi	r20,r20,3			# Aa = Sa[16] = ROTL3(Sa[16])
		stw		r30,Sb_16(r1)		# Store Sb[16]

		#----- Stages 42-43 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[16]
		stw		r20,Sa_16(r1)		# Store Sa[16]

		add		r6,r5,r20			# Ta = La[2] + Sa[16]
		lwz		r2,Sb_17(r1)		# Load Sb[17]

		add		r7,r7,r10			# Lb[0] += Tb
		lwz		r0,Sa_17(r1)		# Load Sa[17]

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r2,r30			# Sb[17] += Sb[16]

		add		r20,r0,r20			# Sa[17] += Sa[16]
		add		r30,r30,r7			# Sb[17] += Lb[0]

		add		r20,r20,r3			# Sa[17] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[17] = ROTL3(Sb[17])

		rotlwi	r20,r20,3			# Aa = Sa[17] = ROTL3(Sa[17])
		stw		r30,Sb_17(r1)		# Store Sb[17]

		#----- Stages 43-44 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[17]
		stw		r20,Sa_17(r1)		# Store Sa[17]

		add		r6,r3,r20			# Ta = La[0] + Sa[17]
		lwz		r2,Sb_18(r1)		# Load Sb[18]

		add		r8,r8,r10			# Lb[1] += Tb
		lwz		r0,Sa_18(r1)		# Load Sa[18]

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r2,r30			# Sb[18] += Sb[17]

		add		r20,r0,r20			# Sa[18] += Sa[17]
		add		r30,r30,r8			# Sb[18] += Lb[1]

		add		r20,r20,r4			# Sa[18] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[18] = ROTL3(Sb[18])

		rotlwi	r20,r20,3			# Aa = Sa[18] = ROTL3(Sa[18])
		stw		r30,Sb_18(r1)		# Store Sb[18]

		#----- Stages 44-45 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[18]
		stw		r20,Sa_18(r1)		# Store Sa[18]

		add		r6,r4,r20			# Ta = La[1] + Sa[18]
		lwz		r2,Sb_19(r1)		# Load Sb[19]

		add		r9,r9,r10			# Lb[2] += Tb
		lwz		r0,Sa_19(r1)		# Load Sa[19]

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r2,r30			# Sb[19] += Sb[18]

		add		r20,r0,r20			# Sa[19] += Sa[18]
		add		r30,r30,r9			# Sb[19] += Lb[2]

		add		r20,r20,r5			# Sa[19] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[19] = ROTL3(Sb[19])

		rotlwi	r20,r20,3			# Aa = Sa[19] = ROTL3(Sa[19])
		stw		r30,Sb_19(r1)		# Store Sb[19]

		#----- Stages 45-46 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[19]
		stw		r20,Sa_19(r1)		# Store Sa[19]

		add		r6,r5,r20			# Ta = La[2] + Sa[19]
		lwz		r2,Sb_20(r1)		# Load Sb[20]

		add		r7,r7,r10			# Lb[0] += Tb
		lwz		r0,Sa_20(r1)		# Load Sa[20]

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r2,r30			# Sb[20] += Sb[19]

		add		r20,r0,r20			# Sa[20] += Sa[19]
		add		r30,r30,r7			# Sb[20] += Lb[0]

		add		r20,r20,r3			# Sa[20] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[20] = ROTL3(Sb[20])

		rotlwi	r20,r20,3			# Aa = Sa[20] = ROTL3(Sa[20])
		stw		r30,Sb_20(r1)		# Store Sb[20]

		#----- Stages 46-47 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[20]
		stw		r20,Sa_20(r1)		# Store Sa[20]

		add		r6,r3,r20			# Ta = La[0] + Sa[20]
		lwz		r2,Sb_21(r1)		# Load Sb[21]

		add		r8,r8,r10			# Lb[1] += Tb
		lwz		r0,Sa_21(r1)		# Load Sa[21]

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r2,r30			# Sb[21] += Sb[20]

		add		r20,r0,r20			# Sa[21] += Sa[20]
		add		r30,r30,r8			# Sb[21] += Lb[1]

		add		r20,r20,r4			# Sa[21] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[21] = ROTL3(Sb[21])

		rotlwi	r20,r20,3			# Aa = Sa[21] = ROTL3(Sa[21])
		stw		r30,Sb_21(r1)		# Store Sb[21]

		#----- Stages 47-48 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[21]
		stw		r20,Sa_21(r1)		# Store Sa[21]

		add		r6,r4,r20			# Ta = La[1] + Sa[21]
		lwz		r2,Sb_22(r1)		# Load Sb[22]

		add		r9,r9,r10			# Lb[2] += Tb
		lwz		r0,Sa_22(r1)		# Load Sa[22]

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r2,r30			# Sb[22] += Sb[21]

		add		r20,r0,r20			# Sa[22] += Sa[21]
		add		r30,r30,r9			# Sb[22] += Lb[2]

		add		r20,r20,r5			# Sa[22] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[22] = ROTL3(Sb[22])

		rotlwi	r20,r20,3			# Aa = Sa[22] = ROTL3(Sa[22])
		stw		r30,Sb_22(r1)		# Store Sb[22]

		#----- Stages 48-49 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[22]
		stw		r20,Sa_22(r1)		# Store Sa[22]

		add		r6,r5,r20			# Ta = La[2] + Sa[22]
		lwz		r2,Sb_23(r1)		# Load Sb[23]

		add		r7,r7,r10			# Lb[0] += Tb
		lwz		r0,Sa_23(r1)		# Load Sa[23]

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r30,r2,r30			# Sb[23] += Sb[22]

		add		r20,r0,r20			# Sa[23] += Sa[22]
		add		r30,r30,r7			# Sb[23] += Lb[0]

		add		r20,r20,r3			# Sa[23] += La[0]
		rotlwi	r30,r30,3			# Ab = Sb[23] = ROTL3(Sb[23])

		rotlwi	r20,r20,3			# Aa = Sa[23] = ROTL3(Sa[23])
		stw		r30,Sb_23(r1)		# Store Sb[23]

		#----- Stages 49-50 ----------
		add		r10,r7,r30			# Tb = Lb[0] + Sb[23]
		stw		r20,Sa_23(r1)		# Store Sa[23]

		add		r6,r3,r20			# Ta = La[0] + Sa[23]
		lwz		r2,Sb_24(r1)		# Load Sb[24]

		add		r8,r8,r10			# Lb[1] += Tb
		lwz		r0,Sa_24(r1)		# Load Sa[24]

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r30,r2,r30			# Sb[24] += Sb[23]

		add		r20,r0,r20			# Sa[24] += Sa[23]
		add		r30,r30,r8			# Sb[24] += Lb[1]

		add		r20,r20,r4			# Sa[24] += La[1]
		rotlwi	r30,r30,3			# Ab = Sb[24] = ROTL3(Sb[24])

		rotlwi	r20,r20,3			# Aa = Sa[24] = ROTL3(Sa[24])
		stw		r30,Sb_24(r1)		# Store Sb[24]

		#----- Stages 50-51 ----------
		add		r10,r8,r30			# Tb = Lb[1] + Sb[24]
		stw		r20,Sa_24(r1)		# Store Sa[24]

		add		r6,r4,r20			# Ta = La[1] + Sa[24]
		lwz		r2,Sb_25(r1)		# Load Sb[25]

		add		r9,r9,r10			# Lb[2] += Tb
		lwz		r0,Sa_25(r1)		# Load Sa[25]

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r30,r2,r30			# Sb[25] += Sb[24]

		add		r20,r0,r20			# Sa[25] += Sa[24]
		add		r30,r30,r9			# Sb[25] += Lb[2]

		add		r20,r20,r5			# Sa[25] += La[2]
		rotlwi	r30,r30,3			# Ab = Sb[25] = ROTL3(Sb[25])

		rotlwi	r20,r20,3			# Aa = Sa[25] = ROTL3(Sa[25])
		stw		r30,Sb_25(r1)		# Store Sb[25]

		#----- Stages 51-52 ----------
		add		r10,r9,r30			# Tb = Lb[2] + Sb[25]
		stw		r20,Sa_25(r1)		# Store Sa[25]

		add		r6,r5,r20			# Ta = La[2] + Sa[25]
		lwz		r2,wPlainHi(r1)		# Load plain.hi

		add		r7,r7,r10			# Lb[0] += Tb
		lwz		r0,wPlainLo(r1)		# Load plain.lo

		add		r3,r3,r6			# La[0] += Ta
		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r21,r21,r30			# Sb[0] += Sb[25]

		add		r11,r11,r20			# Sa[0] += Sa[25]
		add		r21,r21,r7			# Sb[0] += Lb[0]

		add		r11,r11,r3			# Sa[0] += La[0]
		rotlwi	r21,r21,3			# Ab = Sb[0] = ROTL3(Sb[0])

		rotlwi	r11,r11,3			# Aa = Sa[0] = ROTL3(Sa[0])
		#----- Stages 52-53 ----------
		add		r10,r7,r21			# Tb = Lb[0] + Sb[0]

		add		r6,r3,r11			# Ta = La[0] + Sa[0]
		add		r8,r8,r10			# Lb[1] += Tb

		add		r4,r4,r6			# La[1] += Ta
		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r22,r22,r21			# Sb[1] += Sb[0]

		add		r12,r12,r11			# Sa[1] += Sa[0]
		add		r22,r22,r8			# Sb[1] += Lb[1]

		add		r11,r11,r0			# RND_Aa = Sa[0] + plain.lo
		add		r21,r21,r0			# RND_Ab = Sb[0] + plain.lo

		add		r12,r12,r4			# Sa[1] += La[1]
		rotlwi	r22,r22,3			# Ab = Sb[1] = ROTL3(Sb[1])

		rotlwi	r12,r12,3			# Aa = Sa[1] = ROTL3(Sa[1])
		#----- Stages 53-54 ----------
		add		r10,r8,r22			# Tb = Lb[1] + Sb[1]

		add		r6,r4,r12			# Ta = La[1] + Sa[1]
		add		r9,r9,r10			# Lb[2] += Tb

		add		r5,r5,r6			# La[2] += Ta
		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r23,r23,r22			# Sb[2] += Sb[1]

		add		r13,r13,r12			# Sa[2] += Sa[1]
		add		r23,r23,r9			# Sb[2] += Lb[2]

		add		r12,r12,r2			# RND_Ba = Sa[1] + plain.hi
		add		r22,r22,r2			# RND_Bb = Sb[1] + plain.hi

		add		r13,r13,r5			# Sa[2] += La[2]
		rotlwi	r23,r23,3			# Ab = Sb[2] = ROTL3(Sb[2])

		rotlwi	r13,r13,3			# Aa = Sa[2] = ROTL3(Sa[2])

		# Combined key mixing and round stages

		#----- Stages 54-55 ----------
		add		r10,r9,r23			# Tb = Lb[2] + Sb[2]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		add		r6,r5,r13			# Ta = La[2] + Sa[2]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[2]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[2]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r24,r23			# Sb[3] += Sb[2]

		add		r14,r14,r13			# Sa[3] += Sa[2]
		add		r24,r24,r7			# Sb[3] += Lb[0]

		add		r14,r14,r3			# Sa[3] += La[0]
		rotlwi	r23,r24,3			# Ab = Sb[3] = ROTL3(Sb[3])

		rotlwi	r13,r14,3			# Aa = Sa[3] = ROTL3(Sa[3])
		#----- Stages 55-56 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[3]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		add		r6,r3,r13			# Ta = La[0] + Sa[3]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r8,r8,r10			# Lb[1] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[3]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[3]

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r24,r25,r23			# Sb[4] += Sb[3]

		add		r14,r15,r13			# Sa[4] += Sa[3]
		add		r24,r24,r8			# Sb[4] += Lb[1]

		add		r14,r14,r4			# Sa[4] += La[1]
		rotlwi	r23,r24,3			# Ab = Sb[4] = ROTL3(Sb[4])

		rotlwi	r13,r14,3			# Aa = Sa[4] = ROTL3(Sa[4])
		#----- Stages 56-57 ----------
		add		r10,r8,r23			# Tb = Lb[1] + Sb[4]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		add		r6,r4,r13			# Ta = La[1] + Sa[4]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r9,r9,r10			# Lb[2] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[4]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[4]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r24,r26,r23			# Sb[5] += Sb[4]

		add		r14,r16,r13			# Sa[5] += Sa[4]
		add		r24,r24,r9			# Sb[5] += Lb[2]

		add		r14,r14,r5			# Sa[5] += La[2]
		rotlwi	r23,r24,3			# Ab = Sb[5] = ROTL3(Sb[5])

		rotlwi	r13,r14,3			# Aa = Sa[5] = ROTL3(Sa[5])
		#----- Stages 57-58 ----------
		add		r10,r9,r23			# Tb = Lb[2] + Sb[5]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		add		r6,r5,r13			# Ta = La[2] + Sa[5]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[5]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[5]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r27,r23			# Sb[6] += Sb[5]

		add		r14,r17,r13			# Sa[6] += Sa[5]
		add		r24,r24,r7			# Sb[6] += Lb[0]

		add		r14,r14,r3			# Sa[6] += La[0]
		rotlwi	r23,r24,3			# Ab = Sb[6] = ROTL3(Sb[6])

		rotlwi	r13,r14,3			# Aa = Sa[6] = ROTL3(Sa[6])
		#----- Stages 58-59 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[6]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		add		r6,r3,r13			# Ta = La[0] + Sa[6]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r8,r8,r10			# Lb[1] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[6]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[6]

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r24,r28,r23			# Sb[7] += Sb[6]

		add		r14,r18,r13			# Sa[7] += Sa[6]
		add		r24,r24,r8			# Sb[7] += Lb[1]

		add		r14,r14,r4			# Sa[7] += La[1]
		rotlwi	r23,r24,3			# Ab = Sb[7] = ROTL3(Sb[7])

		rotlwi	r13,r14,3			# Aa = Sa[7] = ROTL3(Sa[7])
		#----- Stages 59-60 ----------
		add		r10,r8,r23			# Tb = Lb[1] + Sb[7]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		add		r6,r4,r13			# Ta = La[1] + Sa[7]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r9,r9,r10			# Lb[2] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[7]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[7]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r24,r29,r23			# Sb[8] += Sb[7]

		add		r14,r19,r13			# Sa[8] += Sa[7]
		add		r24,r24,r9			# Sb[8] += Lb[2]

		add		r14,r14,r5			# Sa[8] += La[2]
		rotlwi	r23,r24,3			# Ab = Sb[8] = ROTL3(Sb[8])

		rotlwi	r13,r14,3			# Aa = Sa[8] = ROTL3(Sa[8])
		#----- Stages 60-61 ----------
		add		r10,r9,r23			# Tb = Lb[2] + Sb[8]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_09(r1)		# Load Sb[9]

		add		r6,r5,r13			# Ta = La[2] + Sa[8]
		lwz		r14,Sa_09(r1)		# Load Sa[9]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[8]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[8]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r24,r23			# Sb[9] += Sb[8]

		add		r14,r14,r13			# Sa[9] += Sa[8]
		add		r24,r24,r7			# Sb[9] += Lb[0]

		add		r14,r14,r3			# Sa[9] += La[0]
		rotlwi	r23,r24,3			# Ab = Sb[9] = ROTL3(Sb[9])

		rotlwi	r13,r14,3			# Aa = Sa[9] = ROTL3(Sa[9])
		#----- Stages 61-62 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[9]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_10(r1)		# Load Sb[10]

		add		r6,r3,r13			# Ta = La[0] + Sa[9]
		lwz		r14,Sa_10(r1)		# Load Sa[10]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r8,r8,r10			# Lb[1] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[9]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[9]

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r24,r24,r23			# Sb[10] += Sb[9]

		add		r14,r14,r13			# Sa[10] += Sa[9]
		add		r24,r24,r8			# Sb[10] += Lb[1]

		add		r14,r14,r4			# Sa[10] += La[1]
		rotlwi	r23,r24,3			# Ab = Sb[10] = ROTL3(Sb[10])

		rotlwi	r13,r14,3			# Aa = Sa[10] = ROTL3(Sa[10])
		#----- Stages 62-63 ----------
		add		r10,r8,r23			# Tb = Lb[1] + Sb[10]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_11(r1)		# Load Sb[11]

		add		r6,r4,r13			# Ta = La[1] + Sa[10]
		lwz		r14,Sa_11(r1)		# Load Sa[11]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r9,r9,r10			# Lb[2] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[10]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r9,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[10]

		rotlw	r5,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r24,r24,r23			# Sb[11] += Sb[10]

		add		r14,r14,r13			# Sa[11] += Sa[10]
		add		r24,r24,r9			# Sb[11] += Lb[2]

		add		r14,r14,r5			# Sa[11] += La[2]
		rotlwi	r23,r24,3			# Ab = Sb[11] = ROTL3(Sb[11])

		rotlwi	r13,r14,3			# Aa = Sa[11] = ROTL3(Sa[11])
		#----- Stages 63-64 ----------
		add		r10,r9,r23			# Tb = Lb[2] + Sb[11]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_12(r1)		# Load Sb[12]

		add		r6,r5,r13			# Ta = La[2] + Sa[11]
		lwz		r14,Sa_12(r1)		# Load Sa[12]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[11]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[11]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r24,r23			# Sb[12] += Sb[11]

		add		r14,r14,r13			# Sa[12] += Sa[11]
		add		r24,r24,r7			# Sb[12] += Lb[0]

		add		r14,r14,r3			# Sa[12] += La[0]
		rotlwi	r23,r24,3			# Ab = Sb[12] = ROTL3(Sb[12])

		rotlwi	r13,r14,3			# Aa = Sa[12] = ROTL3(Sa[12])
		#----- Stages 64-65 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[12]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_13(r1)		# Load Sb[13]

		add		r6,r3,r13			# Ta = La[0] + Sa[12]
		lwz		r14,Sa_13(r1)		# Load Sa[13]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r8,r8,r10			# Lb[1] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[12]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r8,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[12]

		rotlw	r4,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r24,r24,r23			# Sb[13] += Sb[12]

		add		r14,r14,r13			# Sa[13] += Sa[12]
		add		r24,r24,r8			# Sb[13] += Lb[1]

		add		r14,r14,r4			# Sa[13] += La[1]
		rotlwi	r23,r24,3			# Ab = Sb[13] = ROTL3(Sb[13])

		rotlwi	r13,r14,3			# Aa = Sa[13] = ROTL3(Sa[13])
		#----- Stages 65-66 ----------
		add		r10,r8,r23			# Tb = Lb[1] + Sb[13]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_14(r1)		# Load Sb[14]

		add		r6,r4,r13			# Ta = La[1] + Sa[13]
		lwz		r14,Sa_14(r1)		# Load Sa[14]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r9,r9,r10			# Lb[2] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[13]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r5,r5,r6			# La[2] += Ta

		rotlw	r26,r9,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)		<<< RENAME >>>
		add		r22,r22,r23			# RND_Bb += Sb[13]

		rotlw	r16,r5,r6			# Ba = La[2] = ROTL(La[2], Ta)		<<< RENAME >>>
		add		r24,r24,r23			# Sb[14] += Sb[13]

		add		r14,r14,r13			# Sa[14] += Sa[13]
		add		r24,r24,r26			# Sb[14] += Lb[2]

		add		r14,r14,r16			# Sa[14] += La[2]
		rotlwi	r23,r24,3			# Ab = Sb[14] = ROTL3(Sb[14])

		rotlwi	r13,r14,3			# Aa = Sa[14] = ROTL3(Sa[14])
		#----- Stages 66-67 ----------
		add		r10,r26,r23			# Tb = Lb[2] + Sb[14]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_15(r1)		# Load Sb[15]

		add		r6,r16,r13			# Ta = La[2] + Sa[14]
		lwz		r14,Sa_15(r1)		# Load Sa[15]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[14]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[14]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r24,r23			# Sb[15] += Sb[14]

		add		r14,r14,r13			# Sa[15] += Sa[14]
		add		r24,r24,r7			# Sb[15] += Lb[0]

		add		r14,r14,r3			# Sa[15] += La[0]
		rotlwi	r23,r24,3			# Ab = Sb[15] = ROTL3(Sb[15])

		rotlwi	r13,r14,3			# Aa = Sa[15] = ROTL3(Sa[15])
		#----- Stages 67-68 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[15]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_16(r1)		# Load Sb[16]

		add		r6,r3,r13			# Ta = La[0] + Sa[15]
		lwz		r14,Sa_16(r1)		# Load Sa[16]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r8,r8,r10			# Lb[1] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[15]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r4,r4,r6			# La[1] += Ta

		rotlw	r25,r8,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)		<<< RENAME >>>
		add		r22,r22,r23			# RND_Bb += Sb[15]

		rotlw	r15,r4,r6			# Ba = La[1] = ROTL(La[1], Ta)		<<< RENAME >>>
		add		r24,r24,r23			# Sb[16] += Sb[15]

		add		r14,r14,r13			# Sa[16] += Sa[15]
		add		r24,r24,r25			# Sb[16] += Lb[1]

		add		r14,r14,r15			# Sa[16] += La[1]
		rotlwi	r23,r24,3			# Ab = Sb[16] = ROTL3(Sb[16])

		rotlwi	r13,r14,3			# Aa = Sa[16] = ROTL3(Sa[16])
		#----- Stages 68-69 ----------
		add		r10,r25,r23			# Tb = Lb[1] + Sb[16]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_17(r1)		# Load Sb[17]

		add		r6,r15,r13			# Ta = La[1] + Sa[16]
		lwz		r14,Sa_17(r1)		# Load Sa[17]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r26,r26,r10			# Lb[2] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[16]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r16,r16,r6			# La[2] += Ta

		rotlw	r26,r26,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[16]

		rotlw	r16,r16,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r24,r24,r23			# Sb[17] += Sb[16]

		add		r14,r14,r13			# Sa[17] += Sa[16]
		add		r24,r24,r26			# Sb[17] += Lb[2]

		add		r14,r14,r16			# Sa[17] += La[2]
		rotlwi	r23,r24,3			# Ab = Sb[17] = ROTL3(Sb[17])

		rotlwi	r13,r14,3			# Aa = Sa[17] = ROTL3(Sa[17])
		#----- Stages 69-70 ----------
		add		r10,r26,r23			# Tb = Lb[2] + Sb[17]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_18(r1)		# Load Sb[18]

		add		r6,r16,r13			# Ta = La[2] + Sa[17]
		lwz		r14,Sa_18(r1)		# Load Sa[18]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[17]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[17]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r24,r23			# Sb[18] += Sb[17]

		add		r14,r14,r13			# Sa[18] += Sa[17]
		add		r24,r24,r7			# Sb[18] += Lb[0]

		add		r14,r14,r3			# Sa[18] += La[0]
		rotlwi	r23,r24,3			# Ab = Sb[18] = ROTL3(Sb[18])

		rotlwi	r13,r14,3			# Aa = Sa[18] = ROTL3(Sa[18])
		#----- Stages 70-71 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[18]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_19(r1)		# Load Sb[19]

		add		r6,r3,r13			# Ta = La[0] + Sa[18]
		lwz		r14,Sa_19(r1)		# Load Sa[19]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r25,r25,r10			# Lb[1] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[18]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r15,r15,r6			# La[1] += Ta

		rotlw	r25,r25,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[18]

		rotlw	r15,r15,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r24,r24,r23			# Sb[19] += Sb[18]

		add		r14,r14,r13			# Sa[19] += Sa[18]
		add		r24,r24,r25			# Sb[19] += Lb[1]

		add		r14,r14,r15			# Sa[19] += La[1]
		rotlwi	r23,r24,3			# Ab = Sb[19] = ROTL3(Sb[19])

		rotlwi	r13,r14,3			# Aa = Sa[19] = ROTL3(Sa[19])
		#----- Stages 71-72 ----------
		add		r10,r25,r23			# Tb = Lb[1] + Sb[19]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_20(r1)		# Load Sb[20]

		add		r6,r15,r13			# Ta = La[1] + Sa[19]
		lwz		r14,Sa_20(r1)		# Load Sa[20]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r26,r26,r10			# Lb[2] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[19]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r16,r16,r6			# La[2] += Ta

		rotlw	r26,r26,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[19]

		rotlw	r16,r16,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r24,r24,r23			# Sb[20] += Sb[19]

		add		r14,r14,r13			# Sa[20] += Sa[19]
		add		r24,r24,r26			# Sb[20] += Lb[2]

		add		r14,r14,r16			# Sa[20] += La[2]
		rotlwi	r23,r24,3			# Ab = Sb[20] = ROTL3(Sb[20])

		rotlwi	r13,r14,3			# Aa = Sa[20] = ROTL3(Sa[20])
		#----- Stages 72-73 ----------
		add		r10,r26,r23			# Tb = Lb[2] + Sb[20]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_21(r1)		# Load Sb[21]

		add		r6,r16,r13			# Ta = La[2] + Sa[20]
		lwz		r14,Sa_21(r1)		# Load Sa[21]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[20]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[20]

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r24,r23			# Sb[21] += Sb[20]

		add		r14,r14,r13			# Sa[21] += Sa[20]
		add		r24,r24,r7			# Sb[21] += Lb[0]

		add		r14,r14,r3			# Sa[21] += La[0]
		rotlwi	r23,r24,3			# Ab = Sb[21] = ROTL3(Sb[21])

		rotlwi	r13,r14,3			# Aa = Sa[21] = ROTL3(Sa[21])
		#----- Stages 73-74 ----------
		add		r10,r7,r23			# Tb = Lb[0] + Sb[21]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_22(r1)		# Load Sb[22]

		add		r6,r3,r13			# Ta = La[0] + Sa[21]
		lwz		r14,Sa_22(r1)		# Load Sa[22]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r25,r25,r10			# Lb[1] += Tb

		add		r12,r12,r13			# RND_Ba += Sa[21]
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r15,r15,r6			# La[1] += Ta

		rotlw	r25,r25,r10			# Bb = Lb[1] = ROTL(Lb[1], Tb)
		add		r22,r22,r23			# RND_Bb += Sb[21]

		rotlw	r15,r15,r6			# Ba = La[1] = ROTL(La[1], Ta)
		add		r24,r24,r23			# Sb[22] += Sb[21]

		add		r14,r14,r13			# Sa[22] += Sa[21]
		add		r24,r24,r25			# Sb[22] += Lb[1]

		add		r14,r14,r15			# Sa[22] += La[1]
		rotlwi	r23,r24,3			# Ab = Sb[22] = ROTL3(Sb[22])

		rotlwi	r13,r14,3			# Aa = Sa[22] = ROTL3(Sa[22])
		#----- Stages 74-75 ----------
		add		r10,r25,r23			# Tb = Lb[1] + Sb[22]

		xor		r11,r11,r12			# RND_Aa ^= RND_Ba
		lwz		r24,Sb_23(r1)		# Load Sb[23]

		add		r6,r15,r13			# Ta = La[1] + Sa[22]
		lwz		r14,Sa_23(r1)		# Load Sa[23]

		rotlw	r11,r11,r12			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		add		r26,r26,r10			# Lb[2] += Tb

		add		r11,r11,r13			# RND_Aa += Sa[22]
		xor		r21,r21,r22			# RND_Ab ^= RND_Bb

		rotlw	r21,r21,r22			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		add		r16,r16,r6			# La[2] += Ta

		rotlw	r26,r26,r10			# Bb = Lb[2] = ROTL(Lb[2], Tb)
		add		r21,r21,r23			# RND_Ab += Sb[22]

		rotlw	r16,r16,r6			# Ba = La[2] = ROTL(La[2], Ta)
		add		r24,r24,r23			# Sb[23] += Sb[22]

		add		r14,r14,r13			# Sa[23] += Sa[22]
		add		r24,r24,r26			# Sb[23] += Lb[2]

		add		r14,r14,r16			# Sa[23] += La[2]
		rotlwi	r23,r24,3			# Ab = Sb[23] = ROTL3(Sb[23])

		rotlwi	r13,r14,3			# Aa = Sa[23] = ROTL3(Sa[23])
		#----- Stages 75-76 ----------
		add		r10,r26,r23			# Tb = Lb[2] + Sb[23]

		xor		r12,r12,r11			# RND_Ba ^= RND_Aa
		lwz		r24,Sb_24(r1)		# Load Sb[24]

		add		r6,r16,r13			# Ta = La[2] + Sa[23]
		lwz		r14,Sa_24(r1)		# Load Sa[24]

		rotlw	r12,r12,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		add		r7,r7,r10			# Lb[0] += Tb

		add		r18,r12,r13			# RND_Ba += Sa[23]			<<< RENAME >>>
		xor		r22,r22,r21			# RND_Bb ^= RND_Ab

		rotlw	r22,r22,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		add		r3,r3,r6			# La[0] += Ta

		rotlw	r7,r7,r10			# Bb = Lb[0] = ROTL(Lb[0], Tb)
		add		r28,r22,r23			# RND_Bb += Sb[23]			 <<< RENAME >>>

		rotlw	r3,r3,r6			# Ba = La[0] = ROTL(La[0], Ta)
		add		r24,r24,r23			# Sb[24] += Sb[23]

		add		r14,r14,r13			# Sa[24] += Sa[23]
		lwbrx	r27,r31,r1			# key.hi (byte reversed)

		add		r24,r24,r7			# Sb[24] += Lb[0]
		lwz		r12,wCached_S1(r1)	# pre-load Sa(1)

		add		r14,r14,r3			# Sa[24] += La[0]
		lwz		r17,wCypherLo(r1)	# cypher.lo

		rotlwi	r29,r24,3			# Ab = Sb[24] = ROTL3(Sb[24])
		lwz		r2,wRSA_Q(r1)		# reload Q for the next iteration

		ADD_KEY	r27,2				# key.hi += 2
		lwz		r4,wCached_L1(r1)	# pre-load La(1)

		rotlwi	r19,r14,3			# Aa = Sa[24] = ROTL3(Sa[24])
		stwbrx	r27,r31,r1			# Store new key.hi value

		xor		r11,r11,r18			# RND_Aa ^= RND_Ba
		lis		r0,P3Q_h		# S3 (hi)

		rotlw	r11,r11,r18			# RND_Aa = ROTL(RND_Aa,RND_Ba)
		ori		r0,r0,P3Q_l	# S3 (lo)

		xor		r21,r21,r28			# RND_Ab ^= RND_Bb
		add		r11,r11,r19			# RND_Aa += Sa[24]

		rotlw	r21,r21,r28			# RND_Ab = ROTL(RND_Ab,RND_Bb)
		cmplw	r11,r17				# RND_Aa == cypher.lo ? (cr0)

		add		r21,r21,r29			# RND_Ab += Sb[24]
		lwz		r5,wKeyHi(r1)		# La(2) = key.hi

		cmplw	cr1,r21,r17			# RND_Ab == cypher.lo ? (cr1)
		lwz		r13,wCached_S2(r1)	# Sa(2) := A
		beq-	check_key1

L1:		beq-	cr1,check_key2
L2:		bdnz	new_key_hi


# The inner loop exits when all iterations have been done or
# when key.hi == 0 (key.hi has already been incremented by 2).
# The (simplified) logic is as follows :
#	if (key.hi != 0)		// then iter == 0
#		exit();
#	else {
#		increment key.mid (and key.lo too, when key.mid wraps to 0)
#		store new key.mid/key.lo values
#		if (iter == 0)
#			exit();
#	}
#
# On entry :
#	r5  := key.hi as loaded from wKeyHi at the end of the inner loop
#	r1  := base of the work area (all w* offsets are relative to it)
# On exit (when the search continues) :
#	CTR := loop count for the next run of the inner loop
#	r0  := P2Q constant, r3 := wCached_L0
#	wKIter has been decreased by the new loop count
# Condition register usage :
#	cr0 := key.hi == 0, then (after addic.) new key.mid == 0
#	cr1 := remaining iterations compared with the preset loop count
#	cr6 := remaining iterations == 0

next_block:
		cmplwi	r5,0				# key.hi == 0 ? (cr0)
		li		r25,wKeyMid			# Index used by lwbrx/stwbrx below
		lwz		r15,wKIter(r1)		# Remaining iterations / 2
		li		r26,wKeyLo			# Index used by lwbrx/stwbrx below
		lwbrx	r17,r25,r1 			# key.mid (byte reversed)
		li		r19,1				# Preset next loop count
		lwbrx	r18,r26,r1 			# key.lo (byte reversed)
		bne-	not_found			# key.hi != 0 means that kiter == 0

		addic.	r17,r17,1			# key.mid += 1 (sets cr0 and the carry)
		slwi	r19,r19,31-RC5_KEY_SHIFT	# Preset count = 1 << (31-RC5_KEY_SHIFT)
		cmplw	cr1,r15,r19			# iter < count ? (cr1)
		addze	r18,r18				# key.lo + carry
		cmplwi	cr6,r15,0			# Iteration count == 0 ? (cr6)
		lis		r0,P2Q_h		# S[2] = P + 2Q
		stwbrx	r17,r25,r1 			# Store updated key.mid
		ori		r0,r0,P2Q_l	# S[2] = P + 2Q
		stwbrx	r18,r26,r1 			# Store updated key.lo
		beq-	cr6,not_found		# Iteration count == 0
		bgt		cr1,set_count2		# jump if iter > count (when equal, the
									# fall-through yields the same count)

		mr		r19,r15				# Loop count = remaining iterations / 2

set_count2:
		subf	r15,r19,r15			# Update iteration count
		lwz		r3,wCached_L0(r1)
		mtctr	r19					# CTR = loop count for the inner loop
		stw		r15,wKIter(r1)
		bne+	new_key_mid			# key.mid != 0 (cr0 still from addic.)
		b		new_key_lo			# otherwise, key.lo has changed

# No matching key in this run : set the return value (r3) to RESULT_NOTHING
# and leave through the common epilog.
not_found:
		li		r3,RESULT_NOTHING
		b		epilog


# Key #1 matches cypher.lo
# r19 := Sa[24] = Aa
# r14 := Sa[25] = Sa (to be loaded)
# r3  := La[0] = Ba
# r15 := La[1] = La
# r11 := Round Aa
# r18 := Round Ba
# cr1   := (RND_Ab == cypher.lo) ?	(result for key #2, set before the
#									 branch here and tested at L1 on return,
#									 so it must not be modified)
#
# Computes the last half-round for pipe A and compares it with cypher.hi.
# The partial match is recorded in the check.* work area (count, hi, mid,
# lo) before the comparison result is used, so it is kept whether or not
# cypher.hi matches.
# Falls into key_found (with r12 = 0) on a full match, otherwise resumes
# the inner loop at L1.

check_key1:
		add		r6,r19,r3			# Ta = Sa[24] + La[0]
		lwz		r14,Sa_25(r1)
		add		r15,r15,r6			# La[1] += Ta
		lwz		r17,wCypherHi(r1)	# Load cypher.hi
		rotlw	r15,r15,r6			# La[1] = ROTL(La[1],Ta)
		lwbrx	r27,r31,r1			# reload key.hi (byte reversed)
		add		r14,r14,r19			# Sa[25] += Sa[24]
		lwz		r20,wKeyMid(r1)
		add		r14,r14,r15			# Sa[25] += La[1]
		lwz		r30,wKeyLo(r1)
		xor		r18,r18,r11			# RND_Ba ^= RND_Aa
		lwz		r6,wChkCnt(r1)
		rotlwi	r14,r14,3			# Sa[25] = ROTL3(Sa[25])
		SUB_KEY	r27,2				# Restore key.hi value (undo the += 2)
		rotlw	r18,r18,r11			# RND_Ba = ROTL(RND_Ba,RND_Aa)
		stw		r20,wChkMid(r1)		# check.mid = key.mid
		add		r18,r18,r14			# RND_Ba += Sa[25]
		stw		r30,wChkLo(r1)		# check.lo = key.lo
		cmplw	r18,r17				# RND_Ba == cypher.hi ? (cr0)
		addi	r6,r6,1				# check.count++
		li		r10,wChkHi
		stw		r6,wChkCnt(r1)
		stwbrx	r27,r10,r1			# check.hi = key #1 hi (byte reversed back)
		bne+	L1					# cypher.hi differs : go test key #2

		li		r12,0				# 0 = First key
		b		key_found


# Key #2 matches cypher.lo
# r29 := Sb[24] = Ab
# r24 := Sb[25] = Sb (to be loaded)
# r7  := Lb[0] = Bb
# r25 := Lb[1] = Lb
# r21 := Round Ab
# r28 := Round Bb
#
# Same processing as check_key1, applied to pipe B. The partial match is
# recorded in the check.* work area before cypher.hi is compared.
# On a full match, execution falls through into key_found with r12 = 1;
# otherwise the inner loop resumes at L2.

check_key2:
		add		r10,r29,r7			# Tb = Sb[24] + Lb[0]
		lwz		r24,Sb_25(r1)
		add		r25,r25,r10			# Lb[1] += Tb
		lwz		r17,wCypherHi(r1)	# Load cypher.hi
		rotlw	r25,r25,r10			# Lb[1] = ROTL(Lb[1],Tb)
		lwbrx	r27,r31,r1			# reload key.hi (byte reversed)
		add		r24,r24,r29			# Sb[25] += Sb[24]
		lwz		r20,wKeyMid(r1)
		add		r24,r24,r25			# Sb[25] += Lb[1]
		lwz		r30,wKeyLo(r1)
		xor		r28,r28,r21			# RND_Bb ^= RND_Ab
		lwz		r6,wChkCnt(r1)
		rotlwi	r24,r24,3			# Sb[25] = ROTL3(Sb[25])
		SUB_KEY	r27,1				# Restore key.hi value (key #2 : only 1
									# of the 2 increments is undone)
		rotlw	r28,r28,r21			# RND_Bb = ROTL(RND_Bb,RND_Ab)
		stw		r20,wChkMid(r1)		# check.mid = key.mid
		add		r28,r28,r24			# RND_Bb += Sb[25]
		stw		r30,wChkLo(r1)		# check.lo = key.lo
		cmplw	r28,r17				# RND_Bb == cypher.hi ? (cr0)
		addi	r6,r6,1				# check.count++
		li		r10,wChkHi
		stw		r6,wChkCnt(r1)
		stwbrx	r27,r10,r1			# check.hi = key #2 hi (byte reversed back)
		bne+	L2					# cypher.hi differs : next loop pass

		li		r12,1				# 1 = Second key


# Key found :
# r12 := Key id (0 = first key, 1 = second key of the current pair)
#
# Rewinds key.hi by 2 and updates the caller's iteration counter
# (through the pointer saved at pIterations) :
#	*iterations = *iterations + key id - 2 * (wKIter + CTR)
# CTR has not been decremented for the current pass yet (this code is
# reached without going through the bdnz at L2), so the subtracted term
# still includes the pair being checked.
# NOTE(review): the stored value therefore looks like the number of keys
# tested before the matching one - confirm against the caller.
# Continues into the epilog with r3 = RESULT_FOUND.

key_found:
		lwbrx	r27,r31,r1			# reload key.hi
		mfctr	r14					# Loop passes left (current one included)
		lwz		r13,wKIter(r1)
		SUB_KEY	r27,2				# Undo key.hi incrementation
		lwz		r15,pIterations(r1)
		add		r14,r14,r13			# KIter + count
		lwz		r13,0(r15)			# Total iterations count
		add		r14,r14,r14			# remaining iterations *= 2
		add 	r13,r12,r13			# Total count += key ID
		stwbrx	r27,r31,r1			# Store key.hi
		subf	r13,r14,r13			# Total count -= remaining iterations
		li		r3,RESULT_FOUND
		stw		r13,0(r15)			# Store the updated count in *iterations


#=============================================================================
# Epilog : Update the RC5_72UnitWork structure, then restore all non-volatile
#		registers.
#		r3 := RESULT_FOUND or RESULT_NOTHING
#
#		The check.* and L0.* fields are copied from the work area (r1 based)
#		to the structure addressed by the pointer saved at pUnitWork.
#		r3 is left untouched and is the value returned to the caller.

epilog:
		lwz		r4,pUnitWork(r1)	# r4 = pointer to the RC5_72UnitWork struct

		# Update rc5_72unitwork->check
		lwz		r5,wChkCnt(r1)
		lwz		r6,wChkHi(r1)
		lwz		r7,wChkMid(r1)
		lwz		r8,wChkLo(r1)
		stw		r5,check_count(r4)
		stw		r6,check_hi(r4)
		stw		r7,check_mid(r4)
		stw		r8,check_lo(r4)

		# Update rc5_72unitwork->L0
		lwz		r5,wKeyHi(r1)
		lwz		r6,wKeyMid(r1)
		lwz		r7,wKeyLo(r1)
		stw		r5,L0_hi(r4)
		stw		r6,L0_mid(r4)
		stw		r7,L0_lo(r4)

		# Restore non-volatile registers
		lwz		r6,wSaveCR(r1)
		lwz		r7,wSaveCTR(r1)
		mtcr	r6					# Restore the condition register
		mtctr	r7					# Restore the count register
		lwz		r2,wSaveR2(r1)		# r2 was used as a scratch register (Q)

		lwz		r5,0(r1)				# Caller's stack pointer
		lmw		r13,-GPRsave(r5)	# Reload r13-r31 from below the caller's SP
		mr		r1,r5				# Drop the local stack frame
		blr
