;#
;# RC5-72 core generated by genCore.
;# Time stamp : Sat Mar  1 17:53:22 2003
;# Target : MPC744x/MPC745x (G4+)
;#
;# Written by Didier Levet (kakace@wanadoo.fr)
;# Copyright distributed.net 1997-2003 - All Rights Reserved.
;# For use in distributed.net projects only. Any other distribution
;# or use of this source violates copyright.
;#
;# Provisions have been made to ease the port of this core to other
;# platforms. All comments begin with ";#" and all constants are defined
;# using the .set directive.
;#
;# Dependencies :
;#
;#	struct rc5_72UnitWork (ccoreio.h) :
;#		typedef struct {
;#			struct {u32 hi, lo;} plain;
;#			struct {u32 hi, lo;} cypher;
;#			struct {u32 hi, mid, lo;} L0;
;#			struct {u32 count, hi, mid, lo;} check;
;#		} RC5_72UnitWork;
;#
;#	MINIMUM_ITERATIONS (problem.cpp) :
;#		The number of iterations to perform is always an even multiple of
;#		MINIMUM_ITERATIONS, and the first key to be checked is also an even
;#		multiple of this constant.
;#		Therefore, it is assumed that the number of iterations is never
;#		equal to zero (otherwise it would be interpreted as 2^32).
;#		The current value of 24 also ensures that we can process 1, 2, 4 or
;#		8 keys at once, all keys (within each group) having the same mid
;#		and lo values.
;#
;# Stats :
;#	Clock cycles (inner loop) : 374
;#	Estimated keyrate : 10695.2 keys/s/MHz
;#	Benchmark : 10687.8 keys/s/MHz (374 clock cycles)
;#
;# $Id: r72-KKS7450.osx.s,v 1.4 2009/01/31 20:18:21 kakace Exp $
;#
;# $Log: r72-KKS7450.osx.s,v $
;# Revision 1.4  2009/01/31 20:18:21  kakace
;# Work around a spurious bug in XCode 3+
;#
;# Revision 1.3  2007/10/22 16:48:36  jlawson
;# overwrite head with contents of release-2-90xx
;#
;# Revision 1.1.2.2  2003/04/03 23:25:57  mfeiri
;# Latest cores by kakace
;#
;# Revision 1.8  2003/03/19 17:56:16  kakace
;# The last pipeline stall has been removed. The core's efficiency is now 93.5 clocks/key.
;#
;# Revision 1.7  2003/03/17 21:46:49  kakace
;# Misc cleanup
;#
;# Revision 1.6  2003/03/12 20:01:55  kakace
;# Modified assembly syntax. Still assemble using Apple's AS, but the source should be easier to assemble using GAS
;#
;#=============================================================================

		.text
		.align	5			;# Align to a 32-byte boundary (performance trick)
		.globl	_rc5_72_unit_func_KKS7450


;# Result values (see ccoreio.h)

;# Status codes defined by ccoreio.h.
;# NOTE(review): presumably the value returned in r3; the epilog that
;# uses them is not part of this section - confirm.
.set	RESULT_NOTHING,	1
.set	RESULT_FOUND,	2


;# struct RC5_72UnitWork (see ccoreio.h) :
;# Byte offsets of each u32 member, in declaration order (4 bytes apiece).

.set	plain_hi,	  0
.set	plain_lo,	  4
.set	cypher_hi,	  8
.set	cypher_lo,	 12
.set	L0_hi,		 16
.set	L0_mid,		 20
.set	L0_lo,		 24
.set	check_count, 28
.set	check_hi,	 32
.set	check_mid,	 36
.set	check_lo,	 40


;# RC5 "magic" constants for 32-bit words (named P and Q in the RC5
;# specification published by RSA Laboratories). The key schedule seeds
;# S[i] with P + i*Q.

.set	P, 0xB7E15163
.set	Q, 0x9E3779B9


;# RC5 projects
;# This core is designed to handle RC5-72, RC5-80, RC5-88
;# and RC5-96 projects.
;# Each shift is the bit position of key.hi's least significant bit once
;# key.hi has been loaded byte-reversed (see lwbrx in main_loop_init).

.set	RC5_72_KEY_SHIFT,	24
.set	RC5_80_KEY_SHIFT,	16
.set	RC5_88_KEY_SHIFT,	8
.set	RC5_96_KEY_SHIFT,	0

.set	RC5_KEY_SHIFT, RC5_72_KEY_SHIFT

		;## The core handles 4 keys per loop iteration, so key.hi advances
		;## by 4 (expressed in the byte-reversed representation).
.set	RC5_KEY_INC, (4<<RC5_KEY_SHIFT)

		;# INC_KEY reg : add RC5_KEY_INC to the byte-reversed key.hi held in reg.
		;# The addi path is only encodable for increments below 0x8000 (addi
		;# takes a signed 16-bit immediate); the addis path is only exact when
		;# the low 16 bits of RC5_KEY_INC are zero. Both hold for the four
		;# supported shifts (increments 4, 0x400, 0x40000 and 0x4000000).
		.macro	INC_KEY		;# key.hi += 4
		.if		RC5_KEY_INC < 0x10000
		addi	$0,$0,RC5_KEY_INC
		.else
		addis	$0,$0,(RC5_KEY_INC>>16)
		.endif
		.endmacro


		;# DEC_KEY reg : subtract RC5_KEY_INC from the byte-reversed key.hi held
		;# in reg (exact inverse of INC_KEY). The same encoding assumptions
		;# apply : the addi path needs an increment below 0x8000, the addis
		;# path needs the low 16 bits of RC5_KEY_INC to be zero.
		.macro	DEC_KEY		;# key.hi -= 4
		.if		RC5_KEY_INC < 0x10000
		addi	$0,$0,-RC5_KEY_INC
		.else
		addis	$0,$0,-(RC5_KEY_INC>>16)
		.endif
		.endmacro


;# About the stack frame :
;# Since the core is a leaf procedure, the stack frame doesn't have
;# to follow specific ABI conventions. However, the stack pointer
;# shall be aligned on a quad-word (16 bytes) boundary to accommodate
;# AltiVec limitations. This alignment is enforced at run time when
;# allocating the stack frame.
;#
;# Conventions :
;# The LR register is not used, so it doesn't have to be preserved.
;# Register r2, usually used as RTOC, is preserved although it might
;# be defined as volatile.
;# The CR fields used by the core might be defined as volatile. However,
;# the CR register is preserved for convenience.
;# The CTR is preserved, although it might be defined as volatile.
;#
;# Volatile registers (not preserved) :
;#  r0, r3-r12, v0-v19

		;# The following constants are used to determine the size of
		;# the GPR and VR save areas. Prolog and epilog code have
		;# to be modified if your ABI defines another convention.
;# Stack frame layout. All offsets below are relative to the new r1.
;# Offset 0 receives the back chain (old r1) written by the stwux that
;# creates the frame in the prolog.
.set	NV_GPR,			13		;# Non-volatile GPR : r13 - r31
.set	NV_VR,			20		;# Non-volatile VR  : v20 - v31

.set	wVRSave,		 4		;# vrsave backup
.set	wSaveR2,		 8		;# r2 backup
.set	wSaveCR,		12		;# CR backup
.set	wSaveCTR,		16		;# CTR backup
.set	wKIter,			20		;# Remaining iterations
.set	pUnitWork,		24		;# struct RC5_72UnitWork * (copy)
.set	pIterations,	28		;# u32 * (copy)

.set	aVectorArea,	32		;# Vector registers save area
								;# (12 registers, 192 bytes)

		;# Vector data : (L[], S[], and misc variables).
		;# All vector data are addressed through 6 base registers
		;# and 6 offsets (including the form (rA|0)).
		;# The combinations base + offset provide direct access
		;# to 36 consecutive vectors. Since we don't have so many
		;# needs for vectors, the remaining entries are used for
		;# integer data that can then be accessed using the indexed
		;# addressing mode (hint : lwbrx / stwbrx).
		;# DO NOT CHANGE THEIR RESPECTIVE ORDER
		;# vBase evaluates to 32 + 12 * 16 = 224, i.e. the first byte after
		;# the vector register save area.
.set	vBase,	aVectorArea + (32 - NV_VR) * 16

		;# Vectors #0 - #2 : L[0..2], one 32-bit lane per key.
.set	vL_00,	vBase
.set	vL_01,	vBase + 16
.set	vL_02,	vBase + 16 *  2

		;# Vectors #3 - #28 : S[0..25], one 32-bit lane per key.
.set	vS_00,	vBase + 16 *  3
.set	vS_01,	vBase + 16 *  4
.set	vS_02,	vBase + 16 *  5
.set	vS_03,	vBase + 16 *  6
.set	vS_04,	vBase + 16 *  7
.set	vS_05,	vBase + 16 *  8
.set	vS_06,	vBase + 16 *  9
.set	vS_07,	vBase + 16 * 10
.set	vS_08,	vBase + 16 * 11
.set	vS_09,	vBase + 16 * 12
.set	vS_10,	vBase + 16 * 13
.set	vS_11,	vBase + 16 * 14
.set	vS_12,	vBase + 16 * 15
.set	vS_13,	vBase + 16 * 16
.set	vS_14,	vBase + 16 * 17
.set	vS_15,	vBase + 16 * 18
.set	vS_16,	vBase + 16 * 19
.set	vS_17,	vBase + 16 * 20
.set	vS_18,	vBase + 16 * 21
.set	vS_19,	vBase + 16 * 22
.set	vS_20,	vBase + 16 * 23
.set	vS_21,	vBase + 16 * 24
.set	vS_22,	vBase + 16 * 25
.set	vS_23,	vBase + 16 * 26
.set	vS_24,	vBase + 16 * 27
.set	vS_25,	vBase + 16 * 28

		;# Vectors #29 - #31 : word 0 holds a key component; the other
		;# words are reused for scalar constants and cached values.
.set	vKeyHi,		vBase + 16 * 29
.set	wKeyHi,		vKeyHi			;# key.hi (integer pass)
.set	wRSA_S0,	vKeyHi +  4		;# S[0] = ROTL3(P)
.set	wCached_L0,	vKeyHi +  8		;# L[0] (cached)
.set	wCached_L1,	vKeyHi + 12		;# L[1] (cached)

.set	vKeyMid,	vBase + 16 * 30
.set	wKeyMid,	vKeyMid			;# key.mid (integer pass)
.set	wFreeSlot1,	vKeyMid +  4
.set	wFreeSlot2,	vKeyMid +  8
.set	wFreeSlot3,	vKeyMid + 12

.set	vKeyLo,		vBase + 16 * 31
.set	wKeyLo,		vKeyLo			;# key.lo  (integer pass)
.set	wRSA_P,		vKeyLo +  4		;# P
.set	wRSA_Q,		vKeyLo +  8		;# Q
.set	wFreeSlot4,	vKeyLo + 12

.set	vCheck,		vBase + 16 * 32	;# vector(hi, mid, lo, count)
.set	wChkHi,		vCheck
.set	wChkMid,	vCheck +  4
.set	wChkLo,		vCheck +  8
.set	wChkCnt,	vCheck + 12

.set	vText,		vBase + 16 * 33	;# vector(plain.lo/hi, cypher.lo/hi)
.set	wPlainLo,	vText
.set	wPlainHi,	vText +  4
.set	wCypherLo,	vText +  8
.set	wCypherHi,	vText + 12

.set	vCst0123,	vBase + 16 * 34	;# vector(0, 1, 2, 3)

.set	vPartial,	vBase + 16 * 35	;# Storage for partial work.

.set	localTop,	vBase + 16 * 36	;# (== 800)
.set	GPRsave,	(32-NV_GPR) * 4	;# Size of the GPR save area (19 words, 76 bytes)

		;# The prolog code assumes that the size of the stack frame
		;# is a multiple of 16
		;# With the values above : (800 + 76 + 15) & -16 = 880.
.set	FrameSize,	(localTop + GPRsave + 15) & (-16)


;#=============================================================================
;# u32 (r3) = rc5_72_unit_func_KKS7450(RC5_72UnitWork *rc5_72unitwork (r3),
;#						u32 *iterations (r4),
;#						void * /* memblk (r5) */)

_rc5_72_unit_func_KKS7450:

		;# Prolog and one-time setup.
		;# In  : r3 = RC5_72UnitWork *, r4 = u32 *iterations
		;# Out : frame allocated, non-volatile state saved, work unit copied
		;#       into the frame, r2-r7 / r8-r12 set up as bases / offsets.
		;# r3 and r4 are reused as base registers below; the original
		;# arguments remain available at pUnitWork(r1) and pIterations(r1).

		;# Allocate the stack frame
		mr		r5,r1				;# Caller's stack pointer
		clrlwi	r6,r1,27			;# Keep the low-order 5 bits (pad = r1 mod 32)
		subfic	r6,r6,-FrameSize	;# r6 = -(FrameSize + pad) : total frame size, including padding.
		stwux	r1,r1,r6			;# Create the stack frame (back chain stored at 0(r1)).

		;# Save non-volatile registers
		;# r13-r31 go just below the caller's stack pointer, i.e. at the
		;# top of the new frame (above localTop).
		stmw	r13,-GPRsave(r5)	;# Save r13-r31
		mfcr	r6					;# CR register
		mfctr	r7					;# CTR register
		stw		r2,wSaveR2(r1)		;# Save r2
		stw		r6,wSaveCR(r1)
		stw		r7,wSaveCTR(r1)

		mfspr	r6,VRsave
		li		r7,-1				;# Use all vector registers
		stw		r6,wVRSave(r1)		;# Keep the caller's VRsave for the epilog
		mtspr	VRsave,r7

		;# Save vector registers
		;# v31 down to v20 are stored at ascending addresses, starting at
		;# aVectorArea (12 vectors, ending right below vBase).
		li		r5,aVectorArea
		stvx	v31,r1,r5
		addi	r5,r5,16
		stvx	v30,r1,r5
		addi	r5,r5,16
		stvx	v29,r1,r5
		addi	r5,r5,16
		stvx	v28,r1,r5
		addi	r5,r5,16
		stvx	v27,r1,r5
		addi	r5,r5,16
		stvx	v26,r1,r5
		addi	r5,r5,16
		stvx	v25,r1,r5
		addi	r5,r5,16
		stvx	v24,r1,r5
		addi	r5,r5,16
		stvx	v23,r1,r5
		addi	r5,r5,16
		stvx	v22,r1,r5
		addi	r5,r5,16
		stvx	v21,r1,r5
		addi	r5,r5,16
		stvx	v20,r1,r5

		;# Copy the arguments (we'll need all registers)
		stw		r3,pUnitWork(r1)	;# RC5_72UnitWork *
		stw		r4,pIterations(r1)	;# u32 *

		;# Initialize local variables
		;# Each key component lands in word 0 of its own vector slot so
		;# that it can later be splatted across all four lanes.
		lwz		r5,L0_hi(r3)		;# Dispatch first key's components
		lwz		r6,L0_mid(r3)
		lwz		r7,L0_lo(r3)
		stw		r5,wKeyHi(r1)
		stw		r6,wKeyMid(r1)
		stw		r7,wKeyLo(r1)

		lwz		r5,check_count(r3)	;# Prepare vCheck vector.
		lwz		r6,check_hi(r3)
		lwz		r7,check_mid(r3)
		lwz		r8,check_lo(r3)
		stw		r5,wChkCnt(r1)
		stw		r6,wChkHi(r1)
		stw		r7,wChkMid(r1)
		stw		r8,wChkLo(r1)

		lwz		r5,plain_lo(r3)		;# Prepare vText vector.
		lwz		r6,plain_hi(r3)
		lwz		r7,cypher_lo(r3)
		lwz		r8,cypher_hi(r3)
		stw		r5,wPlainLo(r1)
		stw		r6,wPlainHi(r1)
		stw		r7,wCypherLo(r1)
		stw		r8,wCypherHi(r1)

		li		r5,0				;# Initialize vCst0123 vector.
		li		r6,1
		li		r7,2
		li		r8,3
		stw		r5,vCst0123(r1)
		stw		r6,vCst0123+4(r1)
		stw		r7,vCst0123+8(r1)
		stw		r8,vCst0123+12(r1)

		;# Initialize RSA constants
		;# P and Q are built with lis/ori and stored next to the key
		;# components, together with S[0] = ROTL3(P).
		lis		r5,hi16(P)
		lis		r6,hi16(Q)
		ori		r5,r5,lo16(P)
		ori		r6,r6,lo16(Q)
		rotlwi	r7,r5,3				;# == S[0]
		stw		r5,wRSA_P(r1)
		stw		r6,wRSA_Q(r1)
		stw		r7,wRSA_S0(r1)

		;# Initialize bases/offsets registers. They are used to
		;# load/store data using the indexed addressing mode.
		;# All base registers but r7 remain constant.
		;# Consecutive bases are 96 bytes (6 vectors) apart, so each base
		;# combined with the offsets (-80 ... -16, 0) covers 6 vectors.
		;# Assignments :
		;# r2 := Points to vS_02 (vector #5)
		;# r3 := Points to vS_08 (vector #11)
		;# r4 := Points to vS_14 (vector #17)
		;# r5 := Points to vS_20 (vector #23)
		;# r6 := Points to vKeyHi (vector #29)
		;# r7 := Points to vPartial (vector #35)
		;#  r8 == -80
		;#  r9 == -64
		;# r10 == -48
		;# r11 == -32
		;# r12 == -16
		addi	r2,r1,vBase+80
		li		r8,-80
		addi	r3,r2,96
		li		r9,-64
		addi	r4,r3,96
		li		r10,-48
		addi	r5,r4,96
		li		r11,-32
		addi	r6,r5,96
		li		r12,-16
		addi	r7,r6,96


;#=============================================================================
;# Inner loop pre-initialization. Compute S[i] and L[j] up to stage 26, step 3
;# Register assignments :
;#  r2 -  r7 := Base pointers (const)
;#  r8 - r12 := Offsets (const)
;# v0  - v25 := S[i]
;# v26 - v28 := L[j]
;# v29 := Temporary register
;# v30 := plain hi/lo and cypher hi/lo
;# v31 := vector (3, 3, 3, 3) (const)

AltiVec_PreInit:

		;# Runs the first key-schedule pass (stages 0 - 25) and the S[0] update
		;# of stage 26 in the vector unit for keys (hi, hi+1, hi+2, hi+3).
		;# Every S[i] is written to its vS_xx slot and the final L[0..2] to
		;# vL_00 - vL_02. v25 holds Q while the seeds S[i] = P + i*Q are
		;# generated, and is only turned into S[25] by the last seed addition.

		lvx		v28,0,r6 			;# vKeyHi
		lvx		v26,r9,r7 			;# vKeyLo
		vspltisw v31,3				;# const (3,3,3,3)
		lvx		v27,r8,r7 			;# vKeyMid
		vspltw	v0,v28,1			;# S[0] = ROTL3(P)
		lvx		v24,r12,r7 			;# const (0,1,2,3)
		vspltw	v1,v26,1			;# load P
		lvx		v30,r11,r7 			;# plain.hi/lo, cypher.hi/lo
		vspltw	v25,v26,2			;# Q (v25 becomes S[25] once all seeds are built)
		vspltw	v28,v28,0			;# L[2] = key.hi
		vadduwm	v1,v1,v25			;# S[1] = P + Q
		vspltw	v27,v27,0			;# L[1] = key.mid
		vadduwm	v28,v28,v24			;# (hi, hi+1, hi+2, hi+3)
		vspltw	v26,v26,0			;# L[0] = key.lo

		;# Terminate stage #0 (S[0] = ROTL3(P), L[0] = key.lo, A = 0, B = 0)
		vadduwm	v26,v26,v0			;# L = key.lo + S[0]
		vrlw	v26,v26,v0			;# B = L[0] = ROTL(L,S[0])
		vadduwm	v2,v1,v25			;# S[2] = S[1] + Q

		;# Stage 1 : Compute S[1] and L[1]
		;# A = S[0], B = L[0]
		vadduwm	v1,v1,v0			;# S = S[1] + A
		vadduwm	v3,v2,v25			;# S[3] = S[2] + Q
		vadduwm	v1,v1,v26			;# S += B
		vadduwm	v4,v3,v25			;# S[4] = S[3] + Q
		vrlw	v1,v1,v31			;# A = S[1] = ROTL3(S)
		stvx	v1,r12,r2 			;# Save S[1]
		vadduwm	v5,v4,v25			;# S[5] = S[4] + Q
		vadduwm	v29,v1,v26			;# T = A + B
		vadduwm	v6,v5,v25			;# S[6] = S[5] + Q
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vadduwm	v7,v6,v25			;# S[7] = S[6] + Q
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		vadduwm	v8,v7,v25			;# S[8] = S[7] + Q

		;# Stage 2 : Compute S[2] and L[2]
		;# A = S[1], B = L[1]
		vadduwm	v2,v2,v1			;# S = S[2] + A
		vadduwm	v9,v8,v25			;# S[9] = S[8] + Q
		vadduwm	v2,v2,v27			;# S += B
		vadduwm	v10,v9,v25			;# S[10] = S[9] + Q
		vrlw	v2,v2,v31			;# A = S[2] = ROTL3(S)
		stvx	v2,0,r2 			;# Save S[2]
		vadduwm	v11,v10,v25			;# S[11] = S[10] + Q
		vadduwm	v29,v2,v27			;# T = A + B
		vadduwm	v12,v11,v25			;# S[12] = S[11] + Q
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vadduwm	v13,v12,v25			;# S[13] = S[12] + Q
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		vadduwm	v14,v13,v25			;# S[14] = S[13] + Q

		;# Stage 3 : Compute S[3] and L[0]
		;# A = S[2], B = L[2]
		vadduwm	v3,v3,v2			;# S = S[3] + A
		vadduwm	v15,v14,v25			;# S[15] = S[14] + Q
		vadduwm	v3,v3,v28			;# S += B
		vadduwm	v16,v15,v25			;# S[16] = S[15] + Q
		vrlw	v3,v3,v31			;# A = S[3] = ROTL3(S)
		stvx	v3,r8,r3 			;# Save S[3]
		vadduwm	v17,v16,v25			;# S[17] = S[16] + Q
		vadduwm	v29,v3,v28			;# T = A + B
		vadduwm	v18,v17,v25			;# S[18] = S[17] + Q
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vadduwm	v19,v18,v25			;# S[19] = S[18] + Q
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		vadduwm	v20,v19,v25			;# S[20] = S[19] + Q

		;# Stage 4 : Compute S[4] and L[1]
		;# A = S[3], B = L[0]
		vadduwm	v4,v4,v3			;# S = S[4] + A
		vadduwm	v21,v20,v25			;# S[21] = S[20] + Q
		vadduwm	v4,v4,v26			;# S += B
		vadduwm	v22,v21,v25			;# S[22] = S[21] + Q
		vrlw	v4,v4,v31			;# A = S[4] = ROTL3(S)
		stvx	v4,r9,r3 			;# Save S[4]
		vadduwm	v23,v22,v25			;# S[23] = S[22] + Q
		vadduwm	v29,v4,v26			;# T = A + B
		vadduwm	v24,v23,v25			;# S[24] = S[23] + Q
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vadduwm	v25,v24,v25			;# S[25] = S[24] + Q (Q is no longer needed in v25)
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)

		;# Stage 5 : Compute S[5] and L[2]
		;# A = S[4], B = L[1]
		vadduwm	v5,v5,v4			;# S = S[5] + A
		vadduwm	v5,v5,v27			;# S += B
		vrlw	v5,v5,v31			;# A = S[5] = ROTL3(S)
		stvx	v5,r10,r3 			;# Save S[5]
		vadduwm	v29,v5,v27			;# T = A + B
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)

		;# Stage 6 : Compute S[6] and L[0]
		;# A = S[5], B = L[2]
		vadduwm	v6,v6,v5			;# S = S[6] + A
		vadduwm	v6,v6,v28			;# S += B
		vrlw	v6,v6,v31			;# A = S[6] = ROTL3(S)
		stvx	v6,r11,r3 			;# Save S[6]
		vadduwm	v29,v6,v28			;# T = A + B
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)

		;# Stage 7 : Compute S[7] and L[1]
		;# A = S[6], B = L[0]
		vadduwm	v7,v7,v6			;# S = S[7] + A
		vadduwm	v7,v7,v26			;# S += B
		vrlw	v7,v7,v31			;# A = S[7] = ROTL3(S)
		stvx	v7,r12,r3 			;# Save S[7]
		vadduwm	v29,v7,v26			;# T = A + B
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)

		;# Stage 8 : Compute S[8] and L[2]
		;# A = S[7], B = L[1]
		vadduwm	v8,v8,v7			;# S = S[8] + A
		vadduwm	v8,v8,v27			;# S += B
		vrlw	v8,v8,v31			;# A = S[8] = ROTL3(S)
		stvx	v8,0,r3 			;# Save S[8]
		vadduwm	v29,v8,v27			;# T = A + B
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)

		;# Stage 9 : Compute S[9] and L[0]
		;# A = S[8], B = L[2]
		vadduwm	v9,v9,v8			;# S = S[9] + A
		vadduwm	v9,v9,v28			;# S += B
		vrlw	v9,v9,v31			;# A = S[9] = ROTL3(S)
		stvx	v9,r8,r4 			;# Save S[9]
		vadduwm	v29,v9,v28			;# T = A + B
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)

		;# Stage 10 : Compute S[10] and L[1]
		;# A = S[9], B = L[0]
		vadduwm	v10,v10,v9			;# S = S[10] + A
		vadduwm	v10,v10,v26			;# S += B
		vrlw	v10,v10,v31			;# A = S[10] = ROTL3(S)
		stvx	v10,r9,r4 			;# Save S[10]
		vadduwm	v29,v10,v26			;# T = A + B
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)

		;# Stage 11 : Compute S[11] and L[2]
		;# A = S[10], B = L[1]
		vadduwm	v11,v11,v10			;# S = S[11] + A
		vadduwm	v11,v11,v27			;# S += B
		vrlw	v11,v11,v31			;# A = S[11] = ROTL3(S)
		stvx	v11,r10,r4 			;# Save S[11]
		vadduwm	v29,v11,v27			;# T = A + B
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)

		;# Stage 12 : Compute S[12] and L[0]
		;# A = S[11], B = L[2]
		vadduwm	v12,v12,v11			;# S = S[12] + A
		vadduwm	v12,v12,v28			;# S += B
		vrlw	v12,v12,v31			;# A = S[12] = ROTL3(S)
		stvx	v12,r11,r4 			;# Save S[12]
		vadduwm	v29,v12,v28			;# T = A + B
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)

		;# Stage 13 : Compute S[13] and L[1]
		;# A = S[12], B = L[0]
		vadduwm	v13,v13,v12			;# S = S[13] + A
		vadduwm	v13,v13,v26			;# S += B
		vrlw	v13,v13,v31			;# A = S[13] = ROTL3(S)
		stvx	v13,r12,r4 			;# Save S[13]
		vadduwm	v29,v13,v26			;# T = A + B
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)

		;# Stage 14 : Compute S[14] and L[2]
		;# A = S[13], B = L[1]
		vadduwm	v14,v14,v13			;# S = S[14] + A
		vadduwm	v14,v14,v27			;# S += B
		vrlw	v14,v14,v31			;# A = S[14] = ROTL3(S)
		stvx	v14,0,r4 			;# Save S[14]
		vadduwm	v29,v14,v27			;# T = A + B
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)

		;# Stage 15 : Compute S[15] and L[0]
		;# A = S[14], B = L[2]
		vadduwm	v15,v15,v14			;# S = S[15] + A
		vadduwm	v15,v15,v28			;# S += B
		vrlw	v15,v15,v31			;# A = S[15] = ROTL3(S)
		stvx	v15,r8,r5 			;# Save S[15]
		vadduwm	v29,v15,v28			;# T = A + B
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)

		;# Stage 16 : Compute S[16] and L[1]
		;# A = S[15], B = L[0]
		vadduwm	v16,v16,v15			;# S = S[16] + A
		vadduwm	v16,v16,v26			;# S += B
		vrlw	v16,v16,v31			;# A = S[16] = ROTL3(S)
		stvx	v16,r9,r5 			;# Save S[16]
		vadduwm	v29,v16,v26			;# T = A + B
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)

		;# Stage 17 : Compute S[17] and L[2]
		;# A = S[16], B = L[1]
		vadduwm	v17,v17,v16			;# S = S[17] + A
		vadduwm	v17,v17,v27			;# S += B
		vrlw	v17,v17,v31			;# A = S[17] = ROTL3(S)
		stvx	v17,r10,r5 			;# Save S[17]
		vadduwm	v29,v17,v27			;# T = A + B
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)

		;# Stage 18 : Compute S[18] and L[0]
		;# A = S[17], B = L[2]
		vadduwm	v18,v18,v17			;# S = S[18] + A
		vadduwm	v18,v18,v28			;# S += B
		vrlw	v18,v18,v31			;# A = S[18] = ROTL3(S)
		stvx	v18,r11,r5 			;# Save S[18]
		vadduwm	v29,v18,v28			;# T = A + B
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)

		;# Stage 19 : Compute S[19] and L[1]
		;# A = S[18], B = L[0]
		vadduwm	v19,v19,v18			;# S = S[19] + A
		vadduwm	v19,v19,v26			;# S += B
		vrlw	v19,v19,v31			;# A = S[19] = ROTL3(S)
		stvx	v19,r12,r5 			;# Save S[19]
		vadduwm	v29,v19,v26			;# T = A + B
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)

		;# Stage 20 : Compute S[20] and L[2]
		;# A = S[19], B = L[1]
		vadduwm	v20,v20,v19			;# S = S[20] + A
		vadduwm	v20,v20,v27			;# S += B
		vrlw	v20,v20,v31			;# A = S[20] = ROTL3(S)
		stvx	v20,0,r5 			;# Save S[20]
		vadduwm	v29,v20,v27			;# T = A + B
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)

		;# Stage 21 : Compute S[21] and L[0]
		;# A = S[20], B = L[2]
		vadduwm	v21,v21,v20			;# S = S[21] + A
		vadduwm	v21,v21,v28			;# S += B
		vrlw	v21,v21,v31			;# A = S[21] = ROTL3(S)
		stvx	v21,r8,r6 			;# Save S[21]
		vadduwm	v29,v21,v28			;# T = A + B
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)

		;# Stage 22 : Compute S[22] and L[1]
		;# A = S[21], B = L[0]
		vadduwm	v22,v22,v21			;# S = S[22] + A
		vadduwm	v22,v22,v26			;# S += B
		vrlw	v22,v22,v31			;# A = S[22] = ROTL3(S)
		stvx	v22,r9,r6 			;# Save S[22]
		vadduwm	v29,v22,v26			;# T = A + B
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)

		;# Stage 23 : Compute S[23] and L[2]
		;# A = S[22], B = L[1]
		vadduwm	v23,v23,v22			;# S = S[23] + A
		vadduwm	v23,v23,v27			;# S += B
		vrlw	v23,v23,v31			;# A = S[23] = ROTL3(S)
		stvx	v23,r10,r6 			;# Save S[23]
		vadduwm	v29,v23,v27			;# T = A + B
		vadduwm	v28,v28,v29			;# L = L[2] + T
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		stvx	v28,r10,r2 			;# Save L[2]

		;# Stage 24 : Compute S[24] and L[0]
		;# A = S[23], B = L[2]
		vadduwm	v24,v24,v23			;# S = S[24] + A
		vadduwm	v24,v24,v28			;# S += B
		vrlw	v24,v24,v31			;# A = S[24] = ROTL3(S)
		stvx	v24,r11,r6 			;# Save S[24]
		vadduwm	v29,v24,v28			;# T = A + B
		vadduwm	v26,v26,v29			;# L = L[0] + T
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		stvx	v26,r8,r2 			;# Save L[0]

		;# Stage 25 : Compute S[25] and L[1]
		;# A = S[24], B = L[0]
		vadduwm	v25,v25,v24			;# S = S[25] + A
		vadduwm	v25,v25,v26			;# S += B
		vrlw	v25,v25,v31			;# A = S[25] = ROTL3(S)
		stvx	v25,r12,r6 			;# Save S[25]
		vadduwm	v29,v25,v26			;# T = A + B
		vadduwm	v27,v27,v29			;# L = L[1] + T
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		stvx	v27,r9,r2 			;# Save L[1]

		;# Stage 26 : Compute S[0] and L[2]
		;# A = S[25], B = L[1]
		;# Only the S[0] half is done here; the L[2] update is finished at
		;# the top of new_key_hi ("Terminate stage 26").
		vadduwm	v0,v0,v25			;# S = S[0] + A
		vadduwm	v0,v0,v27			;# S += B
		vrlw	v0,v0,v31			;# A = S[0] = ROTL3(S)
		stvx	v0,r11,r2 			;# Save S[0]


;#=============================================================================
;# Main loop implementation :
;# While the integer units compute the first steps for a group of 4 successive
;# keys, the AltiVec units are used to terminate the work done during the
;# previous iteration by the integer units.

		;# Compute how many iterations to perform in the inner loop.
		;# The inner loop shall exit when the iteration count becomes
		;# zero, or when key.hi becomes zero (in the latter case, we
		;# have to increment key.mid and maybe key.lo).
		;# NOTE : The code below relies on MINIMUM_ITERATIONS and assumes
		;#        that (key % MINIMUM_ITERATIONS) == 0. Said otherwise,
		;#        key.hi can be incremented at least 7 times without
		;#        causing an overflow.

main_loop_init:

		;# Advances key.hi to the next group of 4 keys (the first group was
		;# consumed by AltiVec_PreInit) and loads CTR with the number of
		;# inner-loop passes to run before either the iteration budget is
		;# exhausted or key.hi wraps around.
		;# Out : r13 = Q, CTR = pass count, wKIter = passes left afterwards.

		lwz		r15,pIterations(r1)	;# r15 = u32 *iterations
		lwbrx	r16,0,r6 			;# Load key.hi (byte reversed)
		lwz		r15,0(r15)			;# r15 = *iterations (number of keys)
		lwz		r13,wRSA_Q(r1)		;# r13 = Q

		INC_KEY	r16					;# key.hi += increment
		subfic	r14,r16,0			;# r14 = 0 - key.hi (distance to wrap-around, mod 2^32)
		srwi	r15,r15,2			;# iterations /= 4
		srwi	r14,r14,RC5_KEY_SHIFT+2	;# ... divided by RC5_KEY_INC (== 1 << (RC5_KEY_SHIFT+2))
		stwbrx	r16,0,r6 			;# Store key.hi

		;# r14 := diff = How many iterations to perform (inner loop,
		;# not counting the next one) until key.hi == 0
		;# The compare/branch/move below computes r14 = min(r14, r15)
		;# as unsigned values.
		cmplw	r15,r14
		bgt		set_count1

		mr		r14,r15

set_count1:		
		subf	r15,r14,r15			;# Remaining iterations
		stw		r15,wKIter(r1)
		mtctr	r14					;# Cannot be zero (relies on the MINIMUM_ITERATIONS assumption)

		
;# Handle new key.lo value.
;# Load S[0], and compute L[0] and S[1]
;# r13 := Q (const)

new_key_lo:
		;# Scalar part of the key schedule that depends on key.lo only :
		;# stage 0 (L[0]) and stage 1 (S[1]). These values are identical for
		;# the 4 keys of a group, so S[1] is replicated into every lane of
		;# vS_01 and L[0] is cached for later key.hi / key.mid changes.
		;# Out : r0 = P + 2Q, r14 = S[1], r18 = L[0].
		lwz		r14,wRSA_S0(r1)		;# A = S[0] = ROTL3(P)
		lwz		r18,wKeyLo(r1)		;# L[0] = key.lo
		lwz		r0,wRSA_P(r1)		;# r0 = P (running S[n] seed)

		add		r18,r18,r14			;# L = L[0] + S[0]
		add		r0,r0,r13			;# S[1] = P + Q
		rotlw	r18,r18,r14			;# B = L0[0] = ROTL(L, S[0])

		add		r14,r14,r18			;# A += B
		stw		r18,wCached_L0(r1)	;# Save L0[0]
		add		r14,r14,r0			;# S = A + S[1]
		add		r0,r0,r13			;# S[2] = P + 2Q
		rotlwi	r14,r14,3			;# A = S0[1] = ROTL3(S)

		;# Initialize vS_01 vector.
		;# (same S[1] for the four keys)
		stw		r14,vS_01(r1)
		stw		r14,vS_01+4(r1)
		stw		r14,vS_01+8(r1)
		stw		r14,vS_01+12(r1)

;# Handle new key.mid value : Compute L[1] and S[2].
;# Pre-conditions :
;# r0  := S[2] = P + 2Q
;# r13 := Q (const)
;# r14 := S0[1] = A
;# r18 := L0[0] = B

new_key_mid:
		;# Scalar stage 1 (L[1]) and stage 2 (S[2]), which depend on key.lo and
		;# key.mid only, followed by the first per-key work : L[2] for keys
		;# hi, hi+1, hi+2, hi+3 and the whole of stage 3 (S[3]).
		;# r7 is used as a scratch register here, so it no longer points to
		;# vPartial afterwards.
		;# Out : r0 = P + 4Q, r14-r17 = s0[3]..s3[3], r18 = L0[0], r19 = L0[1],
		;#       r20/r23/r26/r29 = l0[2]..l3[2].
		lwz		r19,wKeyMid(r1)		;# L[1] = key.mid
		add		r30,r14,r18			;# T = A + B
		lwz		r7,wKeyHi(r1)		;# L[2] = key.hi
		add		r19,r19,r30			;# L = L[1] + T
		rotlw	r19,r19,r30			;# B = L0[1] = ROTL(L, T)
		add		r14,r14,r19			;# A += B
		stw		r19,wCached_L1(r1)	;# Save L0[1]
		add		r14,r14,r0			;# S = A + S[2]
		rotlwi	r14,r14,3			;# A = S0[2] = ROTL3(S)
		add		r0,r0,r13			;# S[3] = P + 3Q
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r14,vS_02(r1)		;# store s0[2]
		add		r20,r7,r30			;# l0 = key.hi + t0
		addi	r26,r20,2			;# l2 = (key.hi + 2) + t0
		stw		r14,vS_02+4(r1)		;# store s1[2]
		addi	r29,r20,3			;# l3 = (key.hi + 3) + t0
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		stw		r14,vS_02+8(r1)		;# store s2[2]
		rotlw	r29,r29,r30			;# b3 = l3[2] = ROTL(l3,t0)
		addi	r23,r20,1			;# l1 = (key.hi + 1) + t0
		stw		r14,vS_02+12(r1)	;# store s3[2]
		add		r17,r14,r0			;# a3 = a0 + S[3]
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		rotlw	r23,r23,r30			;# b1 = l1[2] = ROTL(l1,t0)

	;#-- Integer stage 3. Compute S[3] and L[0]
	;# r17 holds the common term (S0[2] + S[3]); each key adds its own L[2].
		add		r14,r17,r20			;# s0 = a3 + b0
		add		r15,r17,r23			;# s1 = a3 + b1
		add		r16,r17,r26			;# s2 = a3 + b2
		add		r17,r17,r29			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[4] = 4Q + P
		rotlwi	r14,r14,3			;# a0 = s0[3] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[3] = ROTL3(s1)
		rotlwi	r16,r16,3			;# a2 = s2[3] = ROTL3(s2)
		rotlwi	r17,r17,3			;# a3 = s3[3] = ROTL3(s3)
		b		new_key_hi

		;# Padding : never executed (follows an unconditional branch).
		;# The .align puts new_key_hi on a 32-byte boundary.
		nop
		.align	5

;# Inner loop (processed at most 2^(key_len-66) times).
;# Pre-conditions :
;# r0  := S[4] = P + 4Q
;# r13 := Q (const)
;# r14 - r17 := s0[3] - s3[3] = A (one value per key)
;# r18 := L0[0]
;# r19 := L0[1]
;# r20 := b0 = l0[2]
;# r23 := b1 = l1[2]
;# r26 := b2 = l2[2]
;# r29 := b3 = l3[2]
;# v2  := S[2]
;# v0  := S[0] = A
;# v1  := S[1] = S
;# v27 := L[1] = B
;# v28 := L[2] = L
;# v31 := vector (3, 3, 3, 3) (const)
;# v30 := plain hi/lo, cypher hi/lo (vector, const)
;#
;# Register assignments :
;# v29 := Temporary register
;# r0  := S[n] = P + nQ
;#  r2 -  r6 := Base registers
;#  r7 := Temporary register (key.hi incrementation)
;#  r8 - r12 := Offset registers
;# r14 - r17 := S0[n] - S3[n]
;# r18 - r20 := L0[0] - L0[2]
;# r21 - r23 := L1[0] - L1[2]
;# r24 - r26 := L2[0] - L2[2]
;# r27 - r29 := L3[0] - L3[2]
;# r30, r31  := Temporary registers
;# v0  - v25 := S[n]
;# v26 - v28 := L[k]

new_key_hi:

		;# Terminate stage 26
		vadduwm	v29,v0,v27			;# T = A + B
		add		r30,r16,r26			;# t0 = s2 + b2
		lvx		v26,r8,r2 			;# Load L[0]
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r31,r17,r29			;# t1 = s3 + b3
		lvx		v3,r8,r3 			;# Load S[3]
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r24,r18,r30			;# l2 = l0[0] + t0
		lvx		v4,r9,r3 			;# Load S[4]

		;# Stage 27 : Compute S[1] and L[0], A = S[0], B = L[2]
		;# Also stores S[3] (which has not been saved yet).
		vadduwm	v1,v1,v0			;# S = S[1] + A
		add		r27,r18,r31			;# l3 = l0[0] + t1
		lvx		v5,r10,r3 			;# Load S[5]
		vadduwm	v1,v1,v28			;# S += B
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		stw		r14,vS_03(r1)		;# Store s0[3]
		vrlw	v1,v1,v31			;# A = S[1] = ROTL3(S)
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)
		stw		r15,vS_03+4(r1)		;# Store s1[3]
		vadduwm	v29,v1,v28			;# T = A + B
		add		r30,r15,r23			;# t0 = s1 + b1
		stw		r16,vS_03+8(r1)		;# Store s2[3]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r31,r14,r20			;# t1 = s0 + b0
		stw		r17,vS_03+12(r1)	;# Store s3[3]
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r21,r18,r30			;# l1 = l0[0] + t0
		add		r18,r18,r31			;# l0 = l0[0] + t1


		;# Stage 28 : Compute S[2] and L[1], A = S[1], B = L[0]
		vadduwm	v2,v2,v1			;# S = S[2] + A
		rotlw	r21,r21,r30			;# b1 = l1[0] = ROTL(l1,t0)
		rotlw	r18,r18,r31			;# b0 = l0[0] = ROTL(l0,t1)
		vadduwm	v2,v2,v26			;# S += B
		;#-- Integer stage 4. Compute S[4] and L[1]
		add		r14,r14,r0			;# a0 += S[4]
		add		r15,r15,r0			;# a1 += S[4]
		vrlw	v2,v2,v31			;# A = S[2] = ROTL3(S)
		add		r16,r16,r0			;# a2 += S[4]
		add		r17,r17,r0			;# a3 += S[4]
		vadduwm	v29,v2,v26			;# T = A + B
		add		r14,r14,r18			;# s0 = a0 + b0
		add		r15,r15,r21			;# s1 = a1 + b1
		vadduwm	v27,v27,v29			;# L = L[1] + T
		add		r16,r16,r24			;# s2 = a2 + b2
		add		r17,r17,r27			;# s3 = a3 + b3
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r0,r0,r13			;# S[5] = 5Q + P
		rotlwi	r14,r14,3			;# a0 = s0[4] = ROTL3(s0)

		;# Stage 29 : Compute S[3] and L[2], A = S[2], B = L[1]
		vadduwm	v3,v3,v2			;# S = S[3] + A
		rotlwi	r15,r15,3			;# a1 = s1[4] = ROTL3(s1)
		stw		r14,vS_04(r1)		;# Store s0[4]
		vadduwm	v3,v3,v27			;# S += B
		rotlwi	r16,r16,3			;# a2 = s2[4] = ROTL3(s2)
		stw		r15,vS_04+4(r1)		;# Store s1[4]
		vrlw	v3,v3,v31			;# A = S[3] = ROTL3(S)
		rotlwi	r17,r17,3			;# a3 = s3[4] = ROTL3(s3)
		stw		r16,vS_04+8(r1)		;# Store s2[4]
		vadduwm	v29,v3,v27			;# T = A + B
		add		r30,r16,r24			;# t0 = s2 + b2
		stw		r17,vS_04+12(r1)	;# Store s3[4]
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r31,r17,r27			;# t1 = s3 + b3
		add		r25,r19,r30			;# l2 = l0[1] + t0
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r28,r19,r31			;# l3 = l0[1] + t1
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)

		;# Stage 30 : Compute S[4] and L[0], A = S[3], B = L[2]
		vadduwm	v4,v4,v3			;# S = S[4] + A
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		add		r30,r15,r21			;# t0 = s1 + b1
		vadduwm	v4,v4,v28			;# S += B
		add		r31,r14,r18			;# t1 = s0 + b0
		add		r22,r19,r30			;# l1 = l0[1] + t0
		vrlw	v4,v4,v31			;# A = S[4] = ROTL3(S)
		add		r19,r19,r31			;# l0 = l0[1] + t1
		rotlw	r22,r22,r30			;# b1 = l1[1] = ROTL(l1,t0)
		vadduwm	v29,v4,v28			;# T = A + B
		rotlw	r19,r19,r31			;# b0 = l0[1] = ROTL(l0,t1)
		;#-- Integer stage 5. Compute S[5] and L[2]
		add		r14,r14,r0			;# a0 += S[5]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r15,r15,r0			;# a1 += S[5]
		add		r16,r16,r0			;# a2 += S[5]
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r17,r17,r0			;# a3 += S[5]
		add		r14,r14,r19			;# s0 = a0 + b0

		;# Stage 31 : Compute S[5] and L[1], A = S[4], B = L[0]
		vadduwm	v5,v5,v4			;# S = S[5] + A
		add		r15,r15,r22			;# s1 = a1 + b1
		add		r16,r16,r25			;# s2 = a2 + b2
		vadduwm	v5,v5,v26			;# S += B
		add		r17,r17,r28			;# s3 = a3 + b3
		lvx		v6,r11,r3 			;# Load S[6]
		vrlw	v5,v5,v31			;# A = S[5] = ROTL3(S)
		add		r0,r0,r13			;# S[6] = 6Q + P
		lvx		v7,r12,r3 			;# Load S[7]
		vadduwm	v29,v5,v26			;# T = A + B
		rotlwi	r14,r14,3			;# a0 = s0[5] = ROTL3(s0)
		lvx		v8,0,r3 			;# Load S[8]
		vadduwm	v27,v27,v29			;# L = L[1] + T
		rotlwi	r15,r15,3			;# a1 = s1[5] = ROTL3(s1)
		lvx		v9,r8,r4 			;# Load S[9]
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		rotlwi	r16,r16,3			;# a2 = s2[5] = ROTL3(s2)
		stw		r14,vS_05(r1)		;# Store s0[5]

		;# Stage 32 : Compute S[6] and L[2], A = S[5], B = L[1]
		vadduwm	v6,v6,v5			;# S = S[6] + A
		rotlwi	r17,r17,3			;# a3 = s3[5] = ROTL3(s3)
		stw		r15,vS_05+4(r1)		;# Store s1[5]
		vadduwm	v6,v6,v27			;# S += B
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r16,vS_05+8(r1)		;# Store s2[5]
		vrlw	v6,v6,v31			;# A = S[6] = ROTL3(S)
		add		r31,r15,r22			;# t1 = s1 + b1
		stw		r17,vS_05+12(r1)	;# Store s3[5]
		vadduwm	v29,v6,v27			;# T = A + B
		add		r20,r20,r30			;# l0 = l0[2] + t0
		add		r23,r23,r31			;# l1 = l1[2] + t1
		vadduwm	v28,v28,v29			;# L = L[2] + T
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		rotlw	r23,r23,r31			;# b1 = l1[2] = ROTL(l1,t1)
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r30,r16,r25			;# t0 = s2 + b2
		add		r31,r17,r28			;# t1 = s3 + b3

		;# Stage 33 : Compute S[7] and L[0], A = S[6], B = L[2]
		vadduwm	v7,v7,v6			;# S = S[7] + A
		add		r26,r26,r30			;# l2 = l2[2] + t0
		add		r29,r29,r31			;# l3 = l3[2] + t1
		vadduwm	v7,v7,v28			;# S += B
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		rotlw	r29,r29,r31			;# b3 = l3[2] = ROTL(l3,t1)
		vrlw	v7,v7,v31			;# A = S[7] = ROTL3(S)
		;#-- Integer stage 6. Compute S[6] and L[0]
		add		r14,r14,r0			;# a0 += S[6]
		add		r15,r15,r0			;# a1 += S[6]
		vadduwm	v29,v7,v28			;# T = A + B
		add		r16,r16,r0			;# a2 += S[6]
		add		r17,r17,r0			;# a3 += S[6]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r14,r14,r20			;# s0 = a0 + b0
		add		r15,r15,r23			;# s1 = a1 + b1
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r16,r16,r26			;# s2 = a2 + b2
		add		r17,r17,r29			;# s3 = a3 + b3

		;# Stage 34 : Compute S[8] and L[1], A = S[7], B = L[0]
		vadduwm	v8,v8,v7			;# S = S[8] + A
		add		r0,r0,r13			;# S[7] = 7Q + P
		lvx		v10,r9,r4 			;# Load S[10]
		vadduwm	v8,v8,v26			;# S += B
		rotlwi	r14,r14,3			;# a0 = s0[6] = ROTL3(s0)
		lvx		v11,r10,r4 			;# Load S[11]
		vrlw	v8,v8,v31			;# A = S[8] = ROTL3(S)
		rotlwi	r15,r15,3			;# a1 = s1[6] = ROTL3(s1)
		lvx		v12,r11,r4 			;# Load S[12]
		vadduwm	v29,v8,v26			;# T = A + B
		rotlwi	r16,r16,3			;# a2 = s2[6] = ROTL3(s2)
		stw		r14,vS_06(r1)		;# Store s0[6]
		vadduwm	v27,v27,v29			;# L = L[1] + T
		rotlwi	r17,r17,3			;# a3 = s3[6] = ROTL3(s3)
		stw		r15,vS_06+4(r1)		;# Store s1[6]
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r30,r14,r20			;# t0 = s0 + b0
		stw		r16,vS_06+8(r1)		;# Store s2[6]

		;# Stage 35 : Compute S[9] and L[2], A = S[8], B = L[1]
		vadduwm	v9,v9,v8			;# S = S[9] + A
		add		r31,r15,r23			;# t1 = s1 + b1
		stw		r17,vS_06+12(r1)	;# Store s3[6]
		vadduwm	v9,v9,v27			;# S += B
		add		r18,r18,r30			;# l0 = l0[0] + t0
		add		r21,r21,r31			;# l1 = l1[0] + t1
		vrlw	v9,v9,v31			;# A = S[9] = ROTL3(S)
		rotlw	r18,r18,r30			;# b0 = l0[0] = ROTL(l0,t0)
		rotlw	r21,r21,r31			;# b1 = l1[0] = ROTL(l1,t1)
		vadduwm	v29,v9,v27			;# T = A + B
		add		r30,r16,r26			;# t0 = s2 + b2
		add		r31,r17,r29			;# t1 = s3 + b3
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r24,r24,r30			;# l2 = l2[0] + t0
		add		r27,r27,r31			;# l3 = l3[0] + t1
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)

		;# Stage 36 : Compute S[10] and L[0], A = S[9], B = L[2]
		vadduwm	v10,v10,v9			;# S = S[10] + A
		;#-- Integer stage 7. Compute S[7] and L[1]
		add		r14,r14,r0			;# a0 += S[7]
		add		r15,r15,r0			;# a1 += S[7]
		vadduwm	v10,v10,v28			;# S += B
		add		r16,r16,r0			;# a2 += S[7]
		add		r17,r17,r0			;# a3 += S[7]
		vrlw	v10,v10,v31			;# A = S[10] = ROTL3(S)
		add		r14,r14,r18			;# s0 = a0 + b0
		add		r15,r15,r21			;# s1 = a1 + b1
		vadduwm	v29,v10,v28			;# T = A + B
		add		r16,r16,r24			;# s2 = a2 + b2
		add		r17,r17,r27			;# s3 = a3 + b3
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r0,r0,r13			;# S[8] = 8Q + P
		lvx		v13,r12,r4 			;# Load S[13]
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		rotlwi	r14,r14,3			;# a0 = s0[7] = ROTL3(s0)
		lvx		v14,0,r4 			;# Load S[14]

		;# Stage 37 : Compute S[11] and L[1], A = S[10], B = L[0]
		vadduwm	v11,v11,v10			;# S = S[11] + A
		rotlwi	r15,r15,3			;# a1 = s1[7] = ROTL3(s1)
		lvx		v15,r8,r5 			;# Load S[15]
		vadduwm	v11,v11,v26			;# S += B
		rotlwi	r16,r16,3			;# a2 = s2[7] = ROTL3(s2)
		stw		r14,vS_07(r1)		;# Store s0[7]
		vrlw	v11,v11,v31			;# A = S[11] = ROTL3(S)
		rotlwi	r17,r17,3			;# a3 = s3[7] = ROTL3(s3)
		stw		r15,vS_07+4(r1)		;# Store s1[7]
		vadduwm	v29,v11,v26			;# T = A + B
		add		r30,r14,r18			;# t0 = s0 + b0
		stw		r16,vS_07+8(r1)		;# Store s2[7]
		vadduwm	v27,v27,v29			;# L = L[1] + T
		add		r31,r15,r21			;# t1 = s1 + b1
		stw		r17,vS_07+12(r1)	;# Store s3[7]
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r19,r19,r30			;# l0 = l0[1] + t0
		add		r22,r22,r31			;# l1 = l1[1] + t1

		;# Stage 38 : Compute S[12] and L[2], A = S[11], B = L[1]
		vadduwm	v12,v12,v11			;# S = S[12] + A
		rotlw	r19,r19,r30			;# b0 = l0[1] = ROTL(l0,t0)
		rotlw	r22,r22,r31			;# b1 = l1[1] = ROTL(l1,t1)
		vadduwm	v12,v12,v27			;# S += B
		add		r30,r16,r24			;# t0 = s2 + b2
		add		r31,r17,r27			;# t1 = s3 + b3
		vrlw	v12,v12,v31			;# A = S[12] = ROTL3(S)
		add		r25,r25,r30			;# l2 = l2[1] + t0
		add		r28,r28,r31			;# l3 = l3[1] + t1
		vadduwm	v29,v12,v27			;# T = A + B
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		vadduwm	v28,v28,v29			;# L = L[2] + T
		;#-- Integer stage 8. Compute S[8] and L[2]
		add		r14,r14,r0			;# a0 += S[8]
		add		r15,r15,r0			;# a1 += S[8]
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r16,r16,r0			;# a2 += S[8]
		add		r17,r17,r0			;# a3 += S[8]

		;# Stage 39 : Compute S[13] and L[0], A = S[12], B = L[2]
		vadduwm	v13,v13,v12			;# S = S[13] + A
		add		r14,r14,r19			;# s0 = a0 + b0
		add		r15,r15,r22			;# s1 = a1 + b1
		vadduwm	v13,v13,v28			;# S += B
		add		r16,r16,r25			;# s2 = a2 + b2
		add		r17,r17,r28			;# s3 = a3 + b3
		vrlw	v13,v13,v31			;# A = S[13] = ROTL3(S)
		add		r0,r0,r13			;# S[9] = 9Q + P
		lvx		v16,r9,r5 			;# Load S[16]
		vadduwm	v29,v13,v28			;# T = A + B
		rotlwi	r14,r14,3			;# a0 = s0[8] = ROTL3(s0)
		lvx		v17,r10,r5 			;# Load S[17]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		rotlwi	r15,r15,3			;# a1 = s1[8] = ROTL3(s1)
		lvx		v18,r11,r5 			;# Load S[18]
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		rotlwi	r16,r16,3			;# a2 = s2[8] = ROTL3(s2)
		stw		r14,vS_08(r1)		;# Store s0[8]

		;# Stage 40 : Compute S[14] and L[1], A = S[13], B = L[0]
		vadduwm	v14,v14,v13			;# S = S[14] + A
		rotlwi	r17,r17,3			;# a3 = s3[8] = ROTL3(s3)
		stw		r15,vS_08+4(r1)		;# Store s1[8]
		vadduwm	v14,v14,v26			;# S += B
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r16,vS_08+8(r1)		;# Store s2[8]
		vrlw	v14,v14,v31			;# A = S[14] = ROTL3(S)
		add		r31,r15,r22			;# t1 = s1 + b1
		stw		r17,vS_08+12(r1)	;# Store s3[8]
		vadduwm	v29,v14,v26			;# T = A + B
		add		r20,r20,r30			;# l0 = l0[2] + t0
		add		r23,r23,r31			;# l1 = l1[2] + t1
		vadduwm	v27,v27,v29			;# L = L[1] + T
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		rotlw	r23,r23,r31			;# b1 = l1[2] = ROTL(l1,t1)
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r30,r16,r25			;# t0 = s2 + b2
		add		r31,r17,r28			;# t1 = s3 + b3

		;# Stage 41 : Compute S[15] and L[2], A = S[14], B = L[1]
		vadduwm	v15,v15,v14			;# S = S[15] + A
		add		r26,r26,r30			;# l2 = l2[2] + t0
		add		r29,r29,r31			;# l3 = l3[2] + t1
		vadduwm	v15,v15,v27			;# S += B
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		rotlw	r29,r29,r31			;# b3 = l3[2] = ROTL(l3,t1)
		vrlw	v15,v15,v31			;# A = S[15] = ROTL3(S)
		;#-- Integer stage 9. Compute S[9] and L[0]
		add		r14,r14,r0			;# a0 += S[9]
		add		r15,r15,r0			;# a1 += S[9]
		vadduwm	v29,v15,v27			;# T = A + B
		add		r16,r16,r0			;# a2 += S[9]
		add		r17,r17,r0			;# a3 += S[9]
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r14,r14,r20			;# s0 = a0 + b0
		add		r15,r15,r23			;# s1 = a1 + b1
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r16,r16,r26			;# s2 = a2 + b2
		add		r17,r17,r29			;# s3 = a3 + b3

		;# Stage 42 : Compute S[16] and L[0], A = S[15], B = L[2]
		vadduwm	v16,v16,v15			;# S = S[16] + A
		add		r0,r0,r13			;# S[10] = 10Q + P
		lvx		v19,r12,r5 			;# Load S[19]
		vadduwm	v16,v16,v28			;# S += B
		rotlwi	r14,r14,3			;# a0 = s0[9] = ROTL3(s0)
		lvx		v20,0,r5 			;# Load S[20]
		vrlw	v16,v16,v31			;# A = S[16] = ROTL3(S)
		rotlwi	r15,r15,3			;# a1 = s1[9] = ROTL3(s1)
		lvx		v21,r8,r6 			;# Load S[21]
		vadduwm	v29,v16,v28			;# T = A + B
		rotlwi	r16,r16,3			;# a2 = s2[9] = ROTL3(s2)
		stw		r14,vS_09(r1)		;# Store s0[9]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		rotlwi	r17,r17,3			;# a3 = s3[9] = ROTL3(s3)
		stw		r15,vS_09+4(r1)		;# Store s1[9]
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r30,r14,r20			;# t0 = s0 + b0
		stw		r16,vS_09+8(r1)		;# Store s2[9]

		;# Stage 43 : Compute S[17] and L[1], A = S[16], B = L[0]
		vadduwm	v17,v17,v16			;# S = S[17] + A
		add		r31,r15,r23			;# t1 = s1 + b1
		stw		r17,vS_09+12(r1)	;# Store s3[9]
		vadduwm	v17,v17,v26			;# S += B
		add		r18,r18,r30			;# l0 = l0[0] + t0
		add		r21,r21,r31			;# l1 = l1[0] + t1
		vrlw	v17,v17,v31			;# A = S[17] = ROTL3(S)
		rotlw	r18,r18,r30			;# b0 = l0[0] = ROTL(l0,t0)
		rotlw	r21,r21,r31			;# b1 = l1[0] = ROTL(l1,t1)
		vadduwm	v29,v17,v26			;# T = A + B
		add		r30,r16,r26			;# t0 = s2 + b2
		add		r31,r17,r29			;# t1 = s3 + b3
		vadduwm	v27,v27,v29			;# L = L[1] + T
		add		r24,r24,r30			;# l2 = l2[0] + t0
		add		r27,r27,r31			;# l3 = l3[0] + t1
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)

		;# Stage 44 : Compute S[18] and L[2], A = S[17], B = L[1]
		vadduwm	v18,v18,v17			;# S = S[18] + A
		;#-- Integer stage 10. Compute S[10] and L[1]
		add		r14,r14,r0			;# a0 += S[10]
		add		r15,r15,r0			;# a1 += S[10]
		vadduwm	v18,v18,v27			;# S += B
		add		r16,r16,r0			;# a2 += S[10]
		add		r17,r17,r0			;# a3 += S[10]
		vrlw	v18,v18,v31			;# A = S[18] = ROTL3(S)
		add		r14,r14,r18			;# s0 = a0 + b0
		add		r15,r15,r21			;# s1 = a1 + b1
		vadduwm	v29,v18,v27			;# T = A + B
		add		r16,r16,r24			;# s2 = a2 + b2
		add		r17,r17,r27			;# s3 = a3 + b3
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r0,r0,r13			;# S[11] = 11Q + P
		lvx		v22,r9,r6 			;# Load S[22]
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		rotlwi	r14,r14,3			;# a0 = s0[10] = ROTL3(s0)
		lvx		v23,r10,r6 			;# Load S[23]

		;# Stage 45 : Compute S[19] and L[0], A = S[18], B = L[2]
		vadduwm	v19,v19,v18			;# S = S[19] + A
		rotlwi	r15,r15,3			;# a1 = s1[10] = ROTL3(s1)
		lvx		v24,r11,r6 			;# Load S[24]
		vadduwm	v19,v19,v28			;# S += B
		rotlwi	r16,r16,3			;# a2 = s2[10] = ROTL3(s2)
		stw		r14,vS_10(r1)		;# Store s0[10]
		vrlw	v19,v19,v31			;# A = S[19] = ROTL3(S)
		rotlwi	r17,r17,3			;# a3 = s3[10] = ROTL3(s3)
		stw		r15,vS_10+4(r1)		;# Store s1[10]
		vadduwm	v29,v19,v28			;# T = A + B
		add		r30,r14,r18			;# t0 = s0 + b0
		stw		r16,vS_10+8(r1)		;# Store s2[10]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r31,r15,r21			;# t1 = s1 + b1
		stw		r17,vS_10+12(r1)	;# Store s3[10]
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r19,r19,r30			;# l0 = l0[1] + t0
		add		r22,r22,r31			;# l1 = l1[1] + t1

		;# Stage 46 : Compute S[20] and L[1], A = S[19], B = L[0]
		vadduwm	v20,v20,v19			;# S = S[20] + A
		rotlw	r19,r19,r30			;# b0 = l0[1] = ROTL(l0,t0)
		rotlw	r22,r22,r31			;# b1 = l1[1] = ROTL(l1,t1)
		vadduwm	v20,v20,v26			;# S += B
		add		r30,r16,r24			;# t0 = s2 + b2
		add		r31,r17,r27			;# t1 = s3 + b3
		vrlw	v20,v20,v31			;# A = S[20] = ROTL3(S)
		add		r25,r25,r30			;# l2 = l2[1] + t0
		add		r28,r28,r31			;# l3 = l3[1] + t1
		vadduwm	v29,v20,v26			;# T = A + B
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		vadduwm	v27,v27,v29			;# L = L[1] + T
		;#-- Integer stage 11. Compute S[11] and L[2]
		add		r14,r14,r0			;# a0 += S[11]
		add		r15,r15,r0			;# a1 += S[11]
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r16,r16,r0			;# a2 += S[11]
		add		r17,r17,r0			;# a3 += S[11]

		;# Stage 47 : Compute S[21] and L[2], A = S[20], B = L[1]
		vadduwm	v21,v21,v20			;# S = S[21] + A
		add		r14,r14,r19			;# s0 = a0 + b0
		add		r15,r15,r22			;# s1 = a1 + b1
		vadduwm	v21,v21,v27			;# S += B
		add		r16,r16,r25			;# s2 = a2 + b2
		add		r17,r17,r28			;# s3 = a3 + b3
		vrlw	v21,v21,v31			;# A = S[21] = ROTL3(S)
		add		r0,r0,r13			;# S[12] = 12Q + P
		rotlwi	r14,r14,3			;# a0 = s0[11] = ROTL3(s0)
		vadduwm	v29,v21,v27			;# T = A + B
		rotlwi	r15,r15,3			;# a1 = s1[11] = ROTL3(s1)
		lvx		v25,r12,r6 			;# Load S[25]
		vadduwm	v28,v28,v29			;# L = L[2] + T
		rotlwi	r16,r16,3			;# a2 = s2[11] = ROTL3(s2)
		stw		r14,vS_11(r1)		;# Store s0[11]
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		rotlwi	r17,r17,3			;# a3 = s3[11] = ROTL3(s3)
		stw		r15,vS_11+4(r1)		;# Store s1[11]

		;# Stage 48 : Compute S[22] and L[0], A = S[21], B = L[2]
		vadduwm	v22,v22,v21			;# S = S[22] + A
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r16,vS_11+8(r1)		;# Store s2[11]
		vadduwm	v22,v22,v28			;# S += B
		add		r31,r15,r22			;# t1 = s1 + b1
		stw		r17,vS_11+12(r1)	;# Store s3[11]
		vrlw	v22,v22,v31			;# A = S[22] = ROTL3(S)
		add		r20,r20,r30			;# l0 = l0[2] + t0
		add		r23,r23,r31			;# l1 = l1[2] + t1
		vadduwm	v29,v22,v28			;# T = A + B
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		rotlw	r23,r23,r31			;# b1 = l1[2] = ROTL(l1,t1)
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r30,r16,r25			;# t0 = s2 + b2
		add		r31,r17,r28			;# t1 = s3 + b3
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r26,r26,r30			;# l2 = l2[2] + t0
		add		r29,r29,r31			;# l3 = l3[2] + t1

		;# Stage 49 : Compute S[23] and L[1], A = S[22], B = L[0]
		vadduwm	v23,v23,v22			;# S = S[23] + A
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		rotlw	r29,r29,r31			;# b3 = l3[2] = ROTL(l3,t1)
		vadduwm	v23,v23,v26			;# S += B
		;#-- Integer stage 12. Compute S[12] and L[0]
		add		r14,r14,r0			;# a0 += S[12]
		add		r15,r15,r0			;# a1 += S[12]
		vrlw	v23,v23,v31			;# A = S[23] = ROTL3(S)
		add		r16,r16,r0			;# a2 += S[12]
		add		r17,r17,r0			;# a3 += S[12]
		vadduwm	v29,v23,v26			;# T = A + B
		add		r14,r14,r20			;# s0 = a0 + b0
		add		r15,r15,r23			;# s1 = a1 + b1
		vadduwm	v27,v27,v29			;# L = L[1] + T
		add		r16,r16,r26			;# s2 = a2 + b2
		add		r17,r17,r29			;# s3 = a3 + b3
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r0,r0,r13			;# S[13] = 13Q + P
		rotlwi	r14,r14,3			;# a0 = s0[12] = ROTL3(s0)

		;# Stage 50 : Compute S[24] and L[2], A = S[23], B = L[1]
		vadduwm	v24,v24,v23			;# S = S[24] + A
		rotlwi	r15,r15,3			;# a1 = s1[12] = ROTL3(s1)
		stw		r14,vS_12(r1)		;# Store s0[12]
		vadduwm	v24,v24,v27			;# S += B
		rotlwi	r16,r16,3			;# a2 = s2[12] = ROTL3(s2)
		stw		r15,vS_12+4(r1)		;# Store s1[12]
		vrlw	v24,v24,v31			;# A = S[24] = ROTL3(S)
		rotlwi	r17,r17,3			;# a3 = s3[12] = ROTL3(s3)
		stw		r16,vS_12+8(r1)		;# Store s2[12]
		vadduwm	v29,v24,v27			;# T = A + B
		add		r30,r14,r20			;# t0 = s0 + b0
		stw		r17,vS_12+12(r1)	;# Store s3[12]
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r31,r15,r23			;# t1 = s1 + b1
		add		r18,r18,r30			;# l0 = l0[0] + t0
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r21,r21,r31			;# l1 = l1[0] + t1
		rotlw	r18,r18,r30			;# b0 = l0[0] = ROTL(l0,t0)

		;# Stage 51 : Compute S[25] and L[0], A = S[24], B = L[2]
		vadduwm	v25,v25,v24			;# S = S[25] + A
		rotlw	r21,r21,r31			;# b1 = l1[0] = ROTL(l1,t1)
		add		r30,r16,r26			;# t0 = s2 + b2
		vadduwm	v25,v25,v28			;# S += B
		add		r31,r17,r29			;# t1 = s3 + b3
		add		r24,r24,r30			;# l2 = l2[0] + t0
		vrlw	v25,v25,v31			;# A = S[25] = ROTL3(S)
		add		r27,r27,r31			;# l3 = l3[0] + t1
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		vadduwm	v29,v25,v28			;# T = A + B
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)
		;#-- Integer stage 13. Compute S[13] and L[1]
		add		r14,r14,r0			;# a0 += S[13]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r15,r15,r0			;# a1 += S[13]
		add		r16,r16,r0			;# a2 += S[13]
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r17,r17,r0			;# a3 += S[13]
		add		r14,r14,r18			;# s0 = a0 + b0

		;# Stage 52 : Compute S[0] and L[1], A = S[25], B = L[0]
		vadduwm	v0,v0,v25			;# S = S[0] + A
		add		r15,r15,r21			;# s1 = a1 + b1
		add		r16,r16,r24			;# s2 = a2 + b2
		vadduwm	v0,v0,v26			;# S += B
		add		r17,r17,r27			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[14] = 14Q + P
		vrlw	v0,v0,v31			;# A = S[0] = ROTL3(S)
		rotlwi	r14,r14,3			;# a0 = s0[13] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[13] = ROTL3(s1)
		vadduwm	v29,v0,v26			;# T = A + B
		rotlwi	r16,r16,3			;# a2 = s2[13] = ROTL3(s2)
		stw		r14,vS_13(r1)		;# Store s0[13]
		vadduwm	v27,v27,v29			;# L = L[1] + T
		rotlwi	r17,r17,3			;# a3 = s3[13] = ROTL3(s3)
		stw		r15,vS_13+4(r1)		;# Store s1[13]
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r30,r14,r18			;# t0 = s0 + b0
		stw		r16,vS_13+8(r1)		;# Store s2[13]

		;# Stage 53 : Compute S[1] and L[2], A = S[0], B = L[1]
		vadduwm	v1,v1,v0			;# S = S[1] + A
		add		r31,r15,r21			;# t1 = s1 + b1
		stw		r17,vS_13+12(r1)	;# Store s3[13]
		vadduwm	v1,v1,v27			;# S += B
		add		r19,r19,r30			;# l0 = l0[1] + t0
		add		r22,r22,r31			;# l1 = l1[1] + t1
		vrlw	v1,v1,v31			;# A = S[1] = ROTL3(S)
		rotlw	r19,r19,r30			;# b0 = l0[1] = ROTL(l0,t0)
		rotlw	r22,r22,r31			;# b1 = l1[1] = ROTL(l1,t1)
		vadduwm	v29,v1,v27			;# T = A + B
		add		r30,r16,r24			;# t0 = s2 + b2
		add		r31,r17,r27			;# t1 = s3 + b3
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r25,r25,r30			;# l2 = l2[1] + t0
		add		r28,r28,r31			;# l3 = l3[1] + t1
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)
		vspltw	v29,v30,0			;# T = plain.lo

		;# Stage 54 : Compute S[2] and L[0], A = S[1], B = L[2]
		vadduwm	v2,v2,v1			;# S = S[2] + A
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		;#-- Integer stage 14. Compute S[14] and L[2]
		add		r14,r14,r0			;# a0 += S[14]
		vadduwm	v2,v2,v28			;# S += B
		add		r15,r15,r0			;# a1 += S[14]
		add		r16,r16,r0			;# a2 += S[14]
		vrlw	v2,v2,v31			;# A = S[2] = ROTL3(S)
		add		r17,r17,r0			;# a3 += S[14]
		add		r14,r14,r19			;# s0 = a0 + b0
		vadduwm	v0,v0,v29			;# C = S[0] + plain.lo
		add		r15,r15,r22			;# s1 = a1 + b1
		add		r16,r16,r25			;# s2 = a2 + b2
		vadduwm	v29,v2,v28			;# T = A + B
		add		r17,r17,r28			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[15] = 15Q + P
		vadduwm	v26,v26,v29			;# L = L[0] + T
		rotlwi	r14,r14,3			;# a0 = s0[14] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[14] = ROTL3(s1)
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		vspltw	v29,v30,1			;# T = plain.hi
		stw		r14,vS_14(r1)		;# Store s0[14]

		;# Stage 55 : Compute S[3] and L[1], A = S[2], B = L[0]
		vadduwm	v3,v3,v2			;# S = S[3] + A
		rotlwi	r16,r16,3			;# a2 = s2[14] = ROTL3(s2)
		stw		r15,vS_14+4(r1)		;# Store s1[14]
		vadduwm	v3,v3,v26			;# S += B
		rotlwi	r17,r17,3			;# a3 = s3[14] = ROTL3(s3)
		stw		r16,vS_14+8(r1)		;# Store s2[14]
		vrlw	v3,v3,v31			;# A = S[3] = ROTL3(S)
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r17,vS_14+12(r1)	;# Store s3[14]
		vadduwm	v1,v1,v29			;# D = S[1] + plain.hi
		add		r31,r15,r22			;# t1 = s1 + b1
		add		r20,r20,r30			;# l0 = l0[2] + t0
		vadduwm	v29,v3,v26			;# T = A + B
		add		r23,r23,r31			;# l1 = l1[2] + t1
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		vadduwm	v27,v27,v29			;# L = L[1] + T
		rotlw	r23,r23,r31			;# b1 = l1[2] = ROTL(l1,t1)
		add		r30,r16,r25			;# t0 = s2 + b2
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		add		r31,r17,r28			;# t1 = s3 + b3
		add		r26,r26,r30			;# l2 = l2[2] + t0

;# Combine key expansion stages 56-76 with round stages.
;# Some vectors are reallocated (renamed) in order to remove
;# pipeline stalls later.

		;# Rounds 1.0 and 1.1
		vxor	v0,v0,v1			;# C ^ D
		add		r29,r29,r31			;# l3 = l3[2] + t1
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		vrlw	v0,v0,v1			;# C = ROTL(C,D)
		rotlw	r29,r29,r31			;# b3 = l3[2] = ROTL(l3,t1)
		;#-- Integer stage 15. Compute S[15] and L[0]
		add		r14,r14,r0			;# a0 += S[15]
		vadduwm	v0,v0,v2			;# C += S[2]
		add		r15,r15,r0			;# a1 += S[15]
		add		r16,r16,r0			;# a2 += S[15]
		vxor	v1,v1,v0			;# D ^ C
		add		r17,r17,r0			;# a3 += S[15]
		add		r14,r14,r20			;# s0 = a0 + b0
		vrlw	v1,v1,v0			;# D = ROTL(D,C)
		add		r15,r15,r23			;# s1 = a1 + b1
		add		r16,r16,r26			;# s2 = a2 + b2
		vadduwm	v1,v1,v3			;# D += S[3]
		add		r17,r17,r29			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[16] = 16Q + P

		;# Stage 56, round 2.0 : Compute S[4] and L[2], A = S[3], B = L[1]
		vadduwm	v4,v4,v3			;# S = S[4] + A
		rotlwi	r14,r14,3			;# a0 = s0[15] = ROTL3(s0)
		lvx		v2,0,r2 			;# pre-load S[2] for the next iteration.
		vadduwm	v4,v4,v27			;# S += B
		rotlwi	r15,r15,3			;# a1 = s1[15] = ROTL3(s1)
		stw		r14,vS_15(r1)		;# Store s0[15]
		vrlw	v4,v4,v31			;# A = S[4] = ROTL3(S)
		rotlwi	r16,r16,3			;# a2 = s2[15] = ROTL3(s2)
		stw		r15,vS_15+4(r1)		;# Store s1[15]
		vxor	v0,v0,v1			;# C ^ D
		rotlwi	r17,r17,3			;# a3 = s3[15] = ROTL3(s3)
		stw		r16,vS_15+8(r1)		;# Store s2[15]
		vrlw	v0,v0,v1			;# C = ROTL(C,D)
		add		r30,r14,r20			;# t0 = s0 + b0
		stw		r17,vS_15+12(r1)	;# Store s3[15]
		vadduwm	v0,v0,v4			;# C += S[4]
		add		r31,r15,r23			;# t1 = s1 + b1
		add		r18,r18,r30			;# l0 = l0[0] + t0
		vadduwm	v29,v4,v27			;# T = A + B
		add		r21,r21,r31			;# l1 = l1[0] + t1
		rotlw	r18,r18,r30			;# b0 = l0[0] = ROTL(l0,t0)
		vadduwm	v28,v28,v29			;# L = L[2] + T
		rotlw	r21,r21,r31			;# b1 = l1[0] = ROTL(l1,t1)
		add		r30,r16,r26			;# t0 = s2 + b2
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r31,r17,r29			;# t1 = s3 + b3
		add		r24,r24,r30			;# l2 = l2[0] + t0

		;# Stage 57, round 2.1 : Compute S[5] and L[0], A = S[4], B = L[2]
		;# Increment key.hi while the LSU is somewhat idle.
		vadduwm	v5,v5,v4			;# S = S[5] + A
		add		r27,r27,r31			;# l3 = l3[0] + t1
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		vadduwm	v5,v5,v28			;# S += B
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)
		add		r14,r14,r0			;# a0 += S[16]
		vrlw	v5,v5,v31			;# A = S[5] = ROTL3(S)
		;#-- Integer stage 16 (a0 += S[16] was already issued just above). Compute S[16] and L[1]
		add		r15,r15,r0			;# a1 += S[16]
		add		r16,r16,r0			;# a2 += S[16]
		vxor	v1,v1,v0			;# D ^ C
		add		r17,r17,r0			;# a3 += S[16]
		lwbrx	r7,0,r6 			;# Load key.hi
		vrlw	v1,v1,v0			;# D = ROTL(D,C)
		add		r14,r14,r18			;# s0 = a0 + b0
		add		r15,r15,r21			;# s1 = a1 + b1
		vadduwm	v1,v1,v5			;# D += S[5]
		add		r16,r16,r24			;# s2 = a2 + b2
		add		r17,r17,r27			;# s3 = a3 + b3
		vadduwm	v29,v5,v28			;# T = A + B
		add		r0,r0,r13			;# S[17] = 17Q + P
		INC_KEY	r7					;# Increment key.hi
		vadduwm	v26,v26,v29			;# L = L[0] + T
		rotlwi	r14,r14,3			;# a0 = s0[16] = ROTL3(s0)
		stwbrx	r7,0,r6 			;# Store key.hi
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		rotlwi	r15,r15,3			;# a1 = s1[16] = ROTL3(s1)
		stw		r14,vS_16(r1)		;# Store s0[16]

		;# Stage 58, round 3.0 : Compute S[6] and L[1], A = S[5], B = L[0]
		vadduwm	v6,v6,v5			;# S = S[6] + A
		rotlwi	r16,r16,3			;# a2 = s2[16] = ROTL3(s2)
		stw		r15,vS_16+4(r1)		;# Store s1[16]
		vadduwm	v6,v6,v26			;# S += B
		rotlwi	r17,r17,3			;# a3 = s3[16] = ROTL3(s3)
		stw		r16,vS_16+8(r1)		;# Store s2[16]
		vrlw	v6,v6,v31			;# A = S[6] = ROTL3(S)
		add		r30,r14,r18			;# t0 = s0 + b0
		stw		r17,vS_16+12(r1)	;# Store s3[16]
		vxor	v0,v0,v1			;# C ^ D
		add		r31,r15,r21			;# t1 = s1 + b1
		add		r19,r19,r30			;# l0 = l0[1] + t0
		vrlw	v0,v0,v1			;# C = ROTL(C,D)
		add		r22,r22,r31			;# l1 = l1[1] + t1
		rotlw	r19,r19,r30			;# b0 = l0[1] = ROTL(l0,t0)
		;# <<< Assign C to v5 >>>
		vadduwm	v5,v0,v6			;# C += S[6].
		rotlw	r22,r22,r31			;# b1 = l1[1] = ROTL(l1,t1)
		add		r30,r16,r24			;# t0 = s2 + b2
		vadduwm	v29,v6,v26			;# T = A + B
		add		r31,r17,r27			;# t1 = s3 + b3
		add		r25,r25,r30			;# l2 = l2[1] + t0
		vadduwm	v27,v27,v29			;# L = L[1] + T
		add		r28,r28,r31			;# l3 = l3[1] + t1
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)
		vrlw	v27,v27,v29			;# B = L[1] = ROTL(L,T)
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		;#-- Integer stage 17. Compute S[17] and L[2]
		add		r14,r14,r0			;# a0 += S[17]

		;# Stage 59, round 3.1 : Compute S[7] and L[2], A = S[6], B = L[1]
		vadduwm	v7,v7,v6			;# S = S[7] + A
		add		r15,r15,r0			;# a1 += S[17]
		add		r16,r16,r0			;# a2 += S[17]
		vadduwm	v7,v7,v27			;# S += B
		add		r17,r17,r0			;# a3 += S[17]
		add		r14,r14,r19			;# s0 = a0 + b0
		vrlw	v7,v7,v31			;# A = S[7] = ROTL3(S)
		add		r15,r15,r22			;# s1 = a1 + b1
		add		r16,r16,r25			;# s2 = a2 + b2
		vxor	v1,v1,v5			;# D ^ C
		add		r17,r17,r28			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[18] = 18Q + P
		vrlw	v1,v1,v5			;# D = ROTL(D,C)
		rotlwi	r14,r14,3			;# a0 = s0[17] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[17] = ROTL3(s1)
		;# <<< Assign D to v6 >>>
		vadduwm	v6,v1,v7			;# D += S[7].
		rotlwi	r16,r16,3			;# a2 = s2[17] = ROTL3(s2)
		stw		r14,vS_17(r1)		;# Store s0[17]
		vadduwm	v29,v7,v27			;# T = A + B
		rotlwi	r17,r17,3			;# a3 = s3[17] = ROTL3(s3)
		stw		r15,vS_17+4(r1)		;# Store s1[17]
		vadduwm	v28,v28,v29			;# L = L[2] + T
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r16,vS_17+8(r1)		;# Store s2[17]
		vrlw	v28,v28,v29			;# B = L[2] = ROTL(L,T)
		add		r31,r15,r22			;# t1 = s1 + b1
		stw		r17,vS_17+12(r1)	;# Store s3[17]

		;# Stage 60, round 4.0 : Compute S[8] and L[0], A = S[7], B = L[2]
		vadduwm	v8,v8,v7			;# S = S[8] + A
		add		r20,r20,r30			;# l0 = l0[2] + t0
		add		r23,r23,r31			;# l1 = l1[2] + t1
		vadduwm	v8,v8,v28			;# S += B
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		rotlw	r23,r23,r31			;# b1 = l1[2] = ROTL(l1,t1)
		vrlw	v8,v8,v31			;# A = S[8] = ROTL3(S)
		add		r30,r16,r25			;# t0 = s2 + b2
		add		r31,r17,r28			;# t1 = s3 + b3
		vxor	v5,v5,v6			;# C ^ D
		add		r26,r26,r30			;# l2 = l2[2] + t0
		add		r29,r29,r31			;# l3 = l3[2] + t1
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		rotlw	r29,r29,r31			;# b3 = l3[2] = ROTL(l3,t1)
		vadduwm	v5,v5,v8			;# C += S[8]
		;#-- Integer stage 18. Compute S[18] and L[0]
		add		r14,r14,r0			;# a0 += S[18]
		add		r15,r15,r0			;# a1 += S[18]
		vadduwm	v29,v8,v28			;# T = A + B
		add		r16,r16,r0			;# a2 += S[18]
		add		r17,r17,r0			;# a3 += S[18]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r14,r14,r20			;# s0 = a0 + b0
		add		r15,r15,r23			;# s1 = a1 + b1
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r16,r16,r26			;# s2 = a2 + b2
		add		r17,r17,r29			;# s3 = a3 + b3

		;# Stage 61, round 4.1 : Compute S[9] and L[1], A = S[8], B = L[0]
		vadduwm	v9,v9,v8			;# S = S[9] + A
		add		r0,r0,r13			;# S[19] = 19Q + P
		lvx		v1,r12,r2 			;# Pre-load S[1] for the next iteration.
		vadduwm	v9,v9,v26			;# S += B
		rotlwi	r14,r14,3			;# a0 = s0[18] = ROTL3(s0)
		lwz		r7,wKeyHi(r1)		;# Load key.hi
		vrlw	v9,v9,v31			;# A = S[9] = ROTL3(S)
		rotlwi	r15,r15,3			;# a1 = s1[18] = ROTL3(s1)
		stw		r14,vS_18(r1)		;# Store s0[18]
		vxor	v6,v6,v5			;# D ^ C
		rotlwi	r16,r16,3			;# a2 = s2[18] = ROTL3(s2)
		stw		r15,vS_18+4(r1)		;# Store s1[18]
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		rotlwi	r17,r17,3			;# a3 = s3[18] = ROTL3(s3)
		stw		r16,vS_18+8(r1)		;# Store s2[18]
		vadduwm	v6,v6,v9			;# D += S[9]
		add		r30,r14,r20			;# t0 = s0 + b0
		stw		r17,vS_18+12(r1)	;# Store s3[18]
		vadduwm	v29,v9,v26			;# T = A + B
		add		r31,r15,r23			;# t1 = s1 + b1
		add		r18,r18,r30			;# l0 = l0[0] + t0
		vadduwm	v27,v27,v29			;# L = L[1] + T
		add		r21,r21,r31			;# l1 = l1[0] + t1
		rotlw	r18,r18,r30			;# b0 = l0[0] = ROTL(l0,t0)
		;# <<< Rename L[1] to v7 >>>
		vrlw	v7,v27,v29			;# B = L[1] = ROTL(L,T)
		rotlw	r21,r21,r31			;# b1 = l1[0] = ROTL(l1,t1)
		add		r30,r16,r26			;# t0 = s2 + b2

		;# Stage 62, round 5.0 : Compute S[10] and L[2], A = S[9], B = L[1]
		vadduwm	v10,v10,v9			;# S = S[10] + A
		add		r31,r17,r29			;# t1 = s3 + b3
		add		r24,r24,r30			;# l2 = l2[0] + t0
		vadduwm	v10,v10,v7			;# S += B
		add		r27,r27,r31			;# l3 = l3[0] + t1
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		vrlw	v10,v10,v31			;# A = S[10] = ROTL3(S)
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)
		;#-- Integer stage 19. Compute S[19] and L[1]
		add		r14,r14,r0			;# a0 += S[19]
		vxor	v5,v5,v6			;# C ^ D
		add		r15,r15,r0			;# a1 += S[19]
		add		r16,r16,r0			;# a2 += S[19]
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		add		r17,r17,r0			;# a3 += S[19]
		add		r14,r14,r18			;# s0 = a0 + b0
		vadduwm	v5,v5,v10			;# C += S[10]
		add		r15,r15,r21			;# s1 = a1 + b1
		add		r16,r16,r24			;# s2 = a2 + b2
		vadduwm	v29,v10,v7			;# T = A + B
		add		r17,r17,r27			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[20] = 20Q + P
		vadduwm	v28,v28,v29			;# L = L[2] + T
		rotlwi	r14,r14,3			;# a0 = s0[19] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[19] = ROTL3(s1)
		;# <<< Rename L[2] to v8 >>>
		vrlw	v8,v28,v29			;# B = L[2] = ROTL(L,T)
		rotlwi	r16,r16,3			;# a2 = s2[19] = ROTL3(s2)
		stw		r14,vS_19(r1)		;# Store s0[19]

		;# Stage 63, round 5.1 : Compute S[11] and L[0], A = S[10], B = L[2]
		vadduwm	v11,v11,v10			;# S = S[11] + A
		rotlwi	r17,r17,3			;# a3 = s3[19] = ROTL3(s3)
		stw		r15,vS_19+4(r1)		;# Store s1[19]
		vadduwm	v11,v11,v8			;# S += B
		add		r30,r14,r18			;# t0 = s0 + b0
		stw		r16,vS_19+8(r1)		;# Store s2[19]
		vrlw	v11,v11,v31			;# A = S[11] = ROTL3(S)
		add		r31,r15,r21			;# t1 = s1 + b1
		stw		r17,vS_19+12(r1)	;# Store s3[19]
		vxor	v6,v6,v5			;# D ^ C
		add		r19,r19,r30			;# l0 = l0[1] + t0
		add		r22,r22,r31			;# l1 = l1[1] + t1
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		rotlw	r19,r19,r30			;# b0 = l0[1] = ROTL(l0,t0)
		rotlw	r22,r22,r31			;# b1 = l1[1] = ROTL(l1,t1)
		vadduwm	v6,v6,v11			;# D += S[11]
		add		r30,r16,r24			;# t0 = s2 + b2
		add		r31,r17,r27			;# t1 = s3 + b3
		vadduwm	v29,v11,v8			;# T = A + B
		add		r25,r25,r30			;# l2 = l2[1] + t0
		add		r28,r28,r31			;# l3 = l3[1] + t1
		vadduwm	v26,v26,v29			;# L = L[0] + T
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		;#-- Integer stage 20. Compute S[20] and L[2]
		add		r14,r14,r0			;# a0 += S[20]
		add		r15,r15,r0			;# a1 += S[20]

		;# Stage 64, round 6.0 : Compute S[12] and L[1], A = S[11], B = L[0]
		vadduwm	v12,v12,v11			;# S = S[12] + A
		add		r16,r16,r0			;# a2 += S[20]
		add		r17,r17,r0			;# a3 += S[20]
		vadduwm	v12,v12,v26			;# S += B
		add		r14,r14,r19			;# s0 = a0 + b0
		add		r15,r15,r22			;# s1 = a1 + b1
		vrlw	v12,v12,v31			;# A = S[12] = ROTL3(S)
		add		r16,r16,r25			;# s2 = a2 + b2
		add		r17,r17,r28			;# s3 = a3 + b3
		vxor	v5,v5,v6			;# C ^ D
		add		r0,r0,r13			;# S[21] = 21Q + P
		rotlwi	r14,r14,3			;# a0 = s0[20] = ROTL3(s0)
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		rotlwi	r15,r15,3			;# a1 = s1[20] = ROTL3(s1)
		stw		r14,vS_20(r1)		;# Store s0[20]
		vadduwm	v5,v5,v12			;# C += S[12]
		rotlwi	r16,r16,3			;# a2 = s2[20] = ROTL3(s2)
		stw		r15,vS_20+4(r1)		;# Store s1[20]
		vadduwm	v29,v12,v26			;# T = A + B
		rotlwi	r17,r17,3			;# a3 = s3[20] = ROTL3(s3)
		stw		r16,vS_20+8(r1)		;# Store s2[20]
		vadduwm	v7,v7,v29			;# L = L[1] + T
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r17,vS_20+12(r1)	;# Store s3[20]
		vrlw	v7,v7,v29			;# B = L[1] = ROTL(L,T)
		add		r31,r15,r22			;# t1 = s1 + b1
		add		r20,r20,r30			;# l0 = l0[2] + t0

		;# Stage 65, round 6.1 : Compute S[13] and L[2], A = S[12], B = L[1]
		vadduwm	v13,v13,v12			;# S = S[13] + A
		add		r23,r23,r31			;# l1 = l1[2] + t1
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		vadduwm	v13,v13,v7			;# S += B
		rotlw	r23,r23,r31			;# b1 = l1[2] = ROTL(l1,t1)
		add		r30,r16,r25			;# t0 = s2 + b2
		vrlw	v13,v13,v31			;# A = S[13] = ROTL3(S)
		add		r31,r17,r28			;# t1 = s3 + b3
		add		r26,r26,r30			;# l2 = l2[2] + t0
		vxor	v6,v6,v5			;# D ^ C
		add		r29,r29,r31			;# l3 = l3[2] + t1
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		rotlw	r29,r29,r31			;# b3 = l3[2] = ROTL(l3,t1)
		;#-- Integer stage 21. Compute S[21] and L[0]
		add		r14,r14,r0			;# a0 += S[21]
		vadduwm	v6,v6,v13			;# D += S[13]
		add		r15,r15,r0			;# a1 += S[21]
		add		r16,r16,r0			;# a2 += S[21]
		vadduwm	v29,v13,v7			;# T = A + B
		add		r17,r17,r0			;# a3 += S[21]
		add		r14,r14,r20			;# s0 = a0 + b0
		vadduwm	v8,v8,v29			;# L = L[2] + T
		add		r15,r15,r23			;# s1 = a1 + b1
		add		r16,r16,r26			;# s2 = a2 + b2
		vrlw	v8,v8,v29			;# B = L[2] = ROTL(L,T)
		add		r17,r17,r29			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[22] = 22Q + P

		;# Stage 66, round 7.0 : Compute S[14] and L[0], A = S[13], B = L[2]
		vadduwm	v14,v14,v13			;# S = S[14] + A
		rotlwi	r14,r14,3			;# a0 = s0[21] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[21] = ROTL3(s1)
		vadduwm	v14,v14,v8			;# S += B
		rotlwi	r16,r16,3			;# a2 = s2[21] = ROTL3(s2)
		stw		r14,vS_21(r1)		;# Store s0[21]
		vrlw	v14,v14,v31			;# A = S[14] = ROTL3(S)
		rotlwi	r17,r17,3			;# a3 = s3[21] = ROTL3(s3)
		stw		r15,vS_21+4(r1)		;# Store s1[21]
		vxor	v5,v5,v6			;# C ^ D
		add		r30,r14,r20			;# t0 = s0 + b0
		stw		r16,vS_21+8(r1)		;# Store s2[21]
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		add		r31,r15,r23			;# t1 = s1 + b1
		stw		r17,vS_21+12(r1)	;# Store s3[21]
		vadduwm	v5,v5,v14			;# C += S[14]
		add		r18,r18,r30			;# l0 = l0[0] + t0
		add		r21,r21,r31			;# l1 = l1[0] + t1
		vadduwm	v29,v14,v8			;# T = A + B
		rotlw	r18,r18,r30			;# b0 = l0[0] = ROTL(l0,t0)
		rotlw	r21,r21,r31			;# b1 = l1[0] = ROTL(l1,t1)
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r30,r16,r26			;# t0 = s2 + b2
		add		r31,r17,r29			;# t1 = s3 + b3
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		add		r24,r24,r30			;# l2 = l2[0] + t0
		add		r27,r27,r31			;# l3 = l3[0] + t1

		;# Stage 67, round 7.1 : Compute S[15] and L[1], A = S[14], B = L[0]
		vadduwm	v15,v15,v14			;# S = S[15] + A
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)
		vadduwm	v15,v15,v26			;# S += B
		;#-- Integer stage 22. Compute S[22] and L[1]
		add		r14,r14,r0			;# a0 += S[22]
		add		r15,r15,r0			;# a1 += S[22]
		vrlw	v15,v15,v31			;# A = S[15] = ROTL3(S)
		add		r16,r16,r0			;# a2 += S[22]
		add		r17,r17,r0			;# a3 += S[22]
		vxor	v6,v6,v5			;# D ^ C
		add		r14,r14,r18			;# s0 = a0 + b0
		add		r15,r15,r21			;# s1 = a1 + b1
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		add		r16,r16,r24			;# s2 = a2 + b2
		add		r17,r17,r27			;# s3 = a3 + b3
		vadduwm	v6,v6,v15			;# D += S[15]
		add		r0,r0,r13			;# S[23] = 23Q + P
		rotlwi	r14,r14,3			;# a0 = s0[22] = ROTL3(s0)
		vadduwm	v29,v15,v26			;# T = A + B
		rotlwi	r15,r15,3			;# a1 = s1[22] = ROTL3(s1)
		stw		r14,vS_22(r1)		;# Store s0[22]
		vadduwm	v7,v7,v29			;# L = L[1] + T
		rotlwi	r16,r16,3			;# a2 = s2[22] = ROTL3(s2)
		stw		r15,vS_22+4(r1)		;# Store s1[22]
		vrlw	v7,v7,v29			;# B = L[1] = ROTL(L,T)
		rotlwi	r17,r17,3			;# a3 = s3[22] = ROTL3(s3)
		stw		r16,vS_22+8(r1)		;# Store s2[22]

		;# Stage 68, round 8.0 : Compute S[16] and L[2], A = S[15], B = L[1]
		vadduwm	v16,v16,v15			;# S = S[16] + A
		add		r30,r14,r18			;# t0 = s0 + b0
		stw		r17,vS_22+12(r1)	;# Store s3[22]
		vadduwm	v16,v16,v7			;# S += B
		add		r31,r15,r21			;# t1 = s1 + b1
		add		r19,r19,r30			;# l0 = l0[1] + t0
		vrlw	v16,v16,v31			;# A = S[16] = ROTL3(S)
		add		r22,r22,r31			;# l1 = l1[1] + t1
		rotlw	r19,r19,r30			;# b0 = l0[1] = ROTL(l0,t0)
		vxor	v5,v5,v6			;# C ^ D
		rotlw	r22,r22,r31			;# b1 = l1[1] = ROTL(l1,t1)
		add		r30,r16,r24			;# t0 = s2 + b2
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		add		r31,r17,r27			;# t1 = s3 + b3
		add		r25,r25,r30			;# l2 = l2[1] + t0
		vadduwm	v5,v5,v16			;# C += S[16]
		add		r28,r28,r31			;# l3 = l3[1] + t1
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)
		vadduwm	v29,v16,v7			;# T = A + B
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		;#-- Integer stage 23. Compute S[23] and L[2]
		add		r14,r14,r0			;# a0 += S[23]
		vadduwm	v8,v8,v29			;# L = L[2] + T
		add		r15,r15,r0			;# a1 += S[23]
		add		r16,r16,r0			;# a2 += S[23]
		vrlw	v8,v8,v29			;# B = L[2] = ROTL(L,T)
		add		r17,r17,r0			;# a3 += S[23]
		add		r14,r14,r19			;# s0 = a0 + b0

		;# Stage 69, round 8.1 : Compute S[17] and L[0], A = S[16], B = L[2]
		vadduwm	v17,v17,v16			;# S = S[17] + A
		add		r15,r15,r22			;# s1 = a1 + b1
		add		r16,r16,r25			;# s2 = a2 + b2
		vadduwm	v17,v17,v8			;# S += B
		add		r17,r17,r28			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[24] = 24Q + P
		vrlw	v17,v17,v31			;# A = S[17] = ROTL3(S)
		rotlwi	r14,r14,3			;# a0 = s0[23] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[23] = ROTL3(s1)
		vxor	v6,v6,v5			;# D ^ C
		rotlwi	r16,r16,3			;# a2 = s2[23] = ROTL3(s2)
		stw		r14,vS_23(r1)		;# Store s0[23]
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		rotlwi	r17,r17,3			;# a3 = s3[23] = ROTL3(s3)
		stw		r15,vS_23+4(r1)		;# Store s1[23]
		vadduwm	v6,v6,v17			;# D += S[17]
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r16,vS_23+8(r1)		;# Store s2[23]
		vadduwm	v29,v17,v8			;# T = A + B
		add		r31,r15,r22			;# t1 = s1 + b1
		stw		r17,vS_23+12(r1)	;# Store s3[23]
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r20,r20,r30			;# l0 = l0[2] + t0
		add		r23,r23,r31			;# l1 = l1[2] + t1
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		rotlw	r23,r23,r31			;# b1 = l1[2] = ROTL(l1,t1)

		;# Stage 70, round 9.0 : Compute S[18] and L[1], A = S[17], B = L[0]
		vadduwm	v18,v18,v17			;# S = S[18] + A
		add		r30,r16,r25			;# t0 = s2 + b2
		stw		r20,vL_02(r1)		;# Store l0[2]
		vadduwm	v18,v18,v26			;# S += B
		add		r31,r17,r28			;# t1 = s3 + b3
		stw		r23,vL_02+4(r1)		;# Store l1[2]
		vrlw	v18,v18,v31			;# A = S[18] = ROTL3(S)
		add		r26,r26,r30			;# l2 = l2[2] + t0
		add		r29,r29,r31			;# l3 = l3[2] + t1
		vxor	v5,v5,v6			;# C ^ D
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		rotlw	r29,r29,r31			;# b3 = l3[2] = ROTL(l3,t1)
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		;#-- Integer stage 24. Compute S[24] and L[0]
		add		r14,r14,r0			;# a0 += S[24]
		stw		r26,vL_02+8(r1)		;# Store l2[2]
		vadduwm	v5,v5,v18			;# C += S[18]
		add		r15,r15,r0			;# a1 += S[24]
		stw		r29,vL_02+12(r1)	;# Store l3[2]
		vadduwm	v29,v18,v26			;# T = A + B
		add		r16,r16,r0			;# a2 += S[24]
		add		r17,r17,r0			;# a3 += S[24]
		vadduwm	v7,v7,v29			;# L = L[1] + T
		add		r14,r14,r20			;# s0 = a0 + b0
		add		r15,r15,r23			;# s1 = a1 + b1
		vrlw	v7,v7,v29			;# B = L[1] = ROTL(L,T)
		add		r16,r16,r26			;# s2 = a2 + b2
		add		r17,r17,r29			;# s3 = a3 + b3

		;# Stage 71, round 9.1 : Compute S[19] and L[2], A = S[18], B = L[1]
		vadduwm	v19,v19,v18			;# S = S[19] + A
		add		r0,r0,r13			;# S[25] = 25Q + P
		rotlwi	r14,r14,3			;# a0 = s0[24] = ROTL3(s0)
		vadduwm	v19,v19,v7			;# S += B
		rotlwi	r15,r15,3			;# a1 = s1[24] = ROTL3(s1)
		stw		r14,vS_24(r1)		;# Store s0[24]
		vrlw	v19,v19,v31			;# A = S[19] = ROTL3(S)
		rotlwi	r16,r16,3			;# a2 = s2[24] = ROTL3(s2)
		stw		r15,vS_24+4(r1)		;# Store s1[24]
		vxor	v6,v6,v5			;# D ^ C
		rotlwi	r17,r17,3			;# a3 = s3[24] = ROTL3(s3)
		stw		r16,vS_24+8(r1)		;# Store s2[24]
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		add		r30,r14,r20			;# t0 = s0 + b0
		stw		r17,vS_24+12(r1)	;# Store s3[24]
		vadduwm	v6,v6,v19			;# D += S[19]
		add		r31,r15,r23			;# t1 = s1 + b1
		add		r18,r18,r30			;# l0 = l0[0] + t0
		vadduwm	v29,v19,v7			;# T = A + B
		add		r21,r21,r31			;# l1 = l1[0] + t1
		rotlw	r18,r18,r30			;# b0 = l0[0] = ROTL(l0,t0)
		vadduwm	v8,v8,v29			;# L = L[2] + T
		rotlw	r21,r21,r31			;# b1 = l1[0] = ROTL(l1,t1)
		lvx		v28,r10,r2 			;# Pre-load L[2] for the next iteration.
		vrlw	v8,v8,v29			;# B = L[2] = ROTL(L,T)
		add		r30,r16,r26			;# t0 = s2 + b2
		stw		r18,vL_00(r1)		;# Store l0[0]

		;# Stage 72, round 10.0 : Compute S[20] and L[0], A = S[19], B = L[2]
		vadduwm	v20,v20,v19			;# S = S[20] + A
		add		r31,r17,r29			;# t1 = s3 + b3
		stw		r21,vL_00+4(r1)		;# Store l1[0]
		vadduwm	v20,v20,v8			;# S += B
		add		r24,r24,r30			;# l2 = l2[0] + t0
		add		r27,r27,r31			;# l3 = l3[0] + t1
		vrlw	v20,v20,v31			;# A = S[20] = ROTL3(S)
		rotlw	r24,r24,r30			;# b2 = l2[0] = ROTL(l2,t0)
		rotlw	r27,r27,r31			;# b3 = l3[0] = ROTL(l3,t1)
		vxor	v5,v5,v6			;# C ^ D
		;#-- Integer stage 25. Compute S[25] and L[1]
		add		r14,r14,r0			;# a0 += S[25]
		stw		r24,vL_00+8(r1)		;# Store l2[0]
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		add		r15,r15,r0			;# a1 += S[25]
		stw		r27,vL_00+12(r1)	;# Store l3[0]
		vadduwm	v5,v5,v20			;# C += S[20]
		add		r16,r16,r0			;# a2 += S[25]
		add		r17,r17,r0			;# a3 += S[25]
		vadduwm	v29,v20,v8			;# T = A + B
		add		r14,r14,r18			;# s0 = a0 + b0
		add		r15,r15,r21			;# s1 = a1 + b1
		vadduwm	v26,v26,v29			;# L = L[0] + T
		add		r16,r16,r24			;# s2 = a2 + b2
		add		r17,r17,r27			;# s3 = a3 + b3
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		rotlwi	r14,r14,3			;# a0 = s0[25] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[25] = ROTL3(s1)

		;# Stage 73, round 10.1 : Compute S[21] and L[1], A = S[20], B = L[0]
		vadduwm	v21,v21,v20			;# S = S[21] + A
		rotlwi	r16,r16,3			;# a2 = s2[25] = ROTL3(s2)
		stw		r14,vS_25(r1)		;# Store s0[25]
		vadduwm	v21,v21,v26			;# S += B
		rotlwi	r17,r17,3			;# a3 = s3[25] = ROTL3(s3)
		stw		r15,vS_25+4(r1)		;# Store s1[25]
		vrlw	v21,v21,v31			;# A = S[21] = ROTL3(S)
		add		r30,r14,r18			;# t0 = s0 + b0
		stw		r16,vS_25+8(r1)		;# Store s2[25]
		vxor	v6,v6,v5			;# D ^ C
		add		r31,r15,r21			;# t1 = s1 + b1
		stw		r17,vS_25+12(r1)	;# Store s3[25]
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		add		r19,r19,r30			;# l0 = l0[1] + t0
		add		r22,r22,r31			;# l1 = l1[1] + t1
		vadduwm	v6,v6,v21			;# D += S[21]
		rotlw	r19,r19,r30			;# b0 = l0[1] = ROTL(l0,t0)
		lwz		r0,wRSA_S0(r1)
		vadduwm	v29,v21,v26			;# T = A + B
		rotlw	r22,r22,r31			;# b1 = l1[1] = ROTL(l1,t1)
		lwz		r18,wCached_L0(r1)
		vadduwm	v7,v7,v29			;# L = L[1] + T
		add		r30,r16,r24			;# t0 = s2 + b2
		stw		r19,vL_01(r1)		;# Store l0[1]
		vrlw	v7,v7,v29			;# B = L[1] = ROTL(L,T)
		add		r31,r17,r27			;# t1 = s3 + b3
		stw		r22,vL_01+4(r1)		;# Store l1[1]

		;# Stage 74, round 11.0 : Compute S[22] and L[2], A = S[21], B = L[1]
		vadduwm	v22,v22,v21			;# S = S[22] + A
		add		r25,r25,r30			;# l2 = l2[1] + t0
		add		r28,r28,r31			;# l3 = l3[1] + t1
		vadduwm	v22,v22,v7			;# S += B
		rotlw	r25,r25,r30			;# b2 = l2[1] = ROTL(l2,t0)
		rotlw	r28,r28,r31			;# b3 = l3[1] = ROTL(l3,t1)
		vrlw	v22,v22,v31			;# A = S[22] = ROTL3(S)
		;#-- Integer stage 26. Compute S[0] and L[2]
		add		r14,r14,r0			;# a0 += S[0]
		stw		r25,vL_01+8(r1)		;# Store l2[1]
		vxor	v5,v5,v6			;# C ^ D
		add		r15,r15,r0			;# a1 += S[0]
		stw		r28,vL_01+12(r1)	;# Store l3[1]
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		add		r16,r16,r0			;# a2 += S[0]
		add		r17,r17,r0			;# a3 += S[0]
		vadduwm	v5,v5,v22			;# C += S[22]
		add		r14,r14,r19			;# s0 = a0 + b0
		add		r15,r15,r22			;# s1 = a1 + b1
		vadduwm	v29,v22,v7			;# T = A + B
		add		r16,r16,r25			;# s2 = a2 + b2
		rotlwi	r14,r14,3			;# a0 = s0[0] = ROTL3(s0)
		vadduwm	v8,v8,v29			;# L = L[2] + T
		add		r17,r17,r28			;# s3 = a3 + b3
		stw		r14,vS_00(r1)		;# Store s0[0]
		vrlw	v8,v8,v29			;# B = L[2] = ROTL(L,T)
		rotlwi	r15,r15,3			;# a1 = s1[0] = ROTL3(s1)
		lwz		r19,wCached_L1(r1)
		;# Stage 75, round 11.1 : Compute S[23] and L[0], A = S[22], B = L[2]
		vadduwm	v23,v23,v22			;# S = S[23] + A
		rotlwi	r16,r16,3			;# a2 = s2[0] = ROTL3(s2)
		lwz		r14,vS_02(r1)		;# Load s0[2]
		vadduwm	v23,v23,v8			;# S += B
		rotlwi	r17,r17,3			;# a3 = s3[0] = ROTL3(s3)
		stw		r15,vS_00+4(r1)		;# Store s1[0]
		vrlw	v23,v23,v31			;# A = S[23] = ROTL3(S)
		lis		r0,hi16(P+3*Q)
		stw		r16,vS_00+8(r1)		;# Store s2[0]
		vxor	v6,v6,v5			;# D ^ C
		add		r30,r14,r19			;# t0 = s0 + b0
		stw		r17,vS_00+12(r1)	;# Store s3[0]
		vrlw	v6,v6,v5			;# D = ROTL(D,C)
		add		r20,r7,r30			;# l0 = key.hi + t0
		ori		r0,r0,lo16(P+3*Q)
		vadduwm	v6,v6,v23			;# D += S[23]
		addi	r26,r20,2
		addi	r29,r20,3
		vadduwm	v29,v23,v8			;# T = A + B
		rotlw	r26,r26,r30			;# b2 = l2[2] = ROTL(l2,t0)
		rotlw	r29,r29,r30			;# b3 = l3[2] = ROTL(l3,t0)
		vadduwm	v26,v26,v29			;# L = L[0] + T
		addi	r23,r20,1
		rotlw	r20,r20,r30			;# b0 = l0[2] = ROTL(l0,t0)
		vrlw	v26,v26,v29			;# B = L[0] = ROTL(L,T)
		rotlw	r23,r23,r30			;# b1 = l1[2] = ROTL(l1,t0)
		;#-- Integer stage 3. Compute S[3] and L[0]
		add		r17,r14,r0			;# a3 = a0 + S[3]

		;# Stage 76, round 12.0 : Compute S[24] and L[1], A = S[23], B = L[0]
		vadduwm	v24,v24,v23			;# S = S[24] + A
		add		r14,r17,r20			;# s0 = a3 + b0
		add		r15,r17,r23			;# s1 = a3 + b1
		vadduwm	v24,v24,v26			;# S += B
		add		r16,r17,r26			;# s2 = a3 + b2
		vspltw	v4,v30,2			;# cypher.lo
		vrlw	v24,v24,v31			;# A = S[24] = ROTL3(S)
		add		r17,r17,r29			;# s3 = a3 + b3
		add		r0,r0,r13			;# S[4] = 4Q + P
		vxor	v5,v5,v6			;# C ^ D
		rotlwi	r14,r14,3			;# a0 = s0[3] = ROTL3(s0)
		rotlwi	r15,r15,3			;# a1 = s1[3] = ROTL3(s1)
		vrlw	v5,v5,v6			;# C = ROTL(C,D)
		rotlwi	r16,r16,3			;# a2 = s2[3] = ROTL3(s2)
		lvx		v27,r9,r2 			;# Pre-load L[1] for the next iteration.
		vadduwm	v5,v5,v24			;# C += S[24]
		rotlwi	r17,r17,3			;# a3 = s3[3] = ROTL3(s3)
		lvx		v0,r11,r2 			;# Pre-load S[0] for the next iteration.
		vcmpequw. v3,v4,v5
		bne-	cr6,check_keys		;# Complete in the same cycle
		bdnz	new_key_hi			;# +1 cycle (BTIC bubble) => 374 clock cycles


;# Increment key.mid/key.lo
;# 'spare' registers : r21, r22, r24, r25, r27, r28, r30 and r31

inc_key_mid:
		;# Reached when the key.hi loop counter (CTR) is exhausted. Advances
		;# key.mid by one, computes the size of the next key.hi loop as
		;# min(remaining iterations / 4, 1 << (30 - RC5_KEY_SHIFT)), and
		;# either restarts at new_key_mid or falls through to inc_key_lo when
		;# key.mid wrapped to zero. Exits through not_found when no iterations
		;# remain.
		;# r0 (P + 2Q) and r14 (s0[1]) are preloaded here, presumably for the
		;# code at new_key_mid / new_key_lo, which is outside this excerpt.
		;# r8 / r9 are index registers set up elsewhere; with base r7 they
		;# address key.mid / key.lo.
		addi	r7,r6,96			;# Setup the last base register
		lwz		r21,wKIter(r1)		;# Remaining iterations / 4
		li		r25,1				;# Preset next loop count
		lwz		r14,vS_01(r1)		;# Load s0[1]
		slwi	r25,r25,30-RC5_KEY_SHIFT	;# r25 = 1 << (30 - RC5_KEY_SHIFT)
		lwbrx	r22,r8,r7 			;# key.mid (byte reversed)
		lis		r0,hi16(P+2*Q)		;# S[2] = P + 2Q (upper half)
		lwbrx	r24,r9,r7 			;# key.lo (byte reversed)
		cmplwi	r21,0				;# Iteration count == 0 ? (cr0)
		ori		r0,r0,lo16(P+2*Q)	;# S[2] = P + 2Q (lower half)
		addi	r22,r22,1			;# key.mid += 1
		cmplw	cr1,r21,r25			;# cr1 := remaining iterations vs. preset count
		beq-	not_found			;# Nothing left to do (cr0) : terminate.

		stwbrx	r22,r8,r7 			;# Store updated key.mid
		cmplwi	r22,0				;# key.mid == 0 ? (cr0, tested by the final bne)
		bgt		cr1,set_count2		;# remaining > preset : keep the preset count

		mr		r25,r21				;# Loop count = iterations / 4
									;# r25 = min(iterations / 4, loop count)

set_count2:		
		subf	r21,r25,r21			;# Update iteration count (r21 -= r25)
		mtctr	r25					;# CTR = number of key.hi iterations to run
		stw		r21,wKIter(r1)		;# Save what remains after this run
		bne+	new_key_mid			;# key.mid != 0 : no carry into key.lo

inc_key_lo:
		;# Carry from key.mid into key.lo. Entered by fall-through from the
		;# "bne+ new_key_mid" above when the incremented key.mid is zero.
		;# Expects r24 = key.lo (already byte reversed by lwbrx) and r7/r9 as
		;# the base/index pair addressing key.lo.
		addi	r24,r24,1			;# key.lo += 1
		stwbrx	r24,r9,r7 			;# Store updated key.lo
		b		new_key_lo			;# Restart with the new key.lo

;# key.hi has been incremented for the next integer iterations.
;# We have to undo this before exiting. Note that key.mid and
;# key.lo have not been saved after the incrementation of key.hi,
;# so they are still accurate (we have to return the next key to
;# be checked).

not_found:
		;# Exit path taken when the iteration count is exhausted without a
		;# match. Rewinds the stored key.hi by one step and returns
		;# RESULT_NOTHING through the common epilog.
		;# DEC_KEY is a macro defined elsewhere in this file.
		lwbrx	r5,0,r6 			;# Load key.hi (byte reversed) from the address in r6
		DEC_KEY	r5					;# Undo key.hi incrementation
		li		r3,RESULT_NOTHING	;# Return code
		stwbrx	r5,0,r6				;# Store the corrected key.hi back
		b		epilog


;# Check the high word of the cyphered text.
;# 'spare' registers : r21, r22, r24, r25, r27, r28, r30 and r31

check_keys:
		;# Entered from the main loop when at least one of the 4 vector
		;# elements satisfies C == cypher.lo (v3 holds that compare result).
		;# This code :
		;#   1. finishes the last half round to obtain D and compares it
		;#      against cypher.hi,
		;#   2. adds the number of partial matches to check.count and records
		;#      the corresponding key in check.hi / check.mid / check.lo,
		;#   3. branches to key_found on a full match, otherwise resumes the
		;#      search (next key.hi iteration, or inc_key_mid when CTR runs out).
		;# r8, r9, r10 and r12 are index registers set up outside this excerpt.
		addi	r7,r6,96			;# Setup the last base register
		vadduwm	v29,v24,v26			;# T = A + B
		vspltisw v9,-1				;# v9 = (-1,-1,-1,-1)
		vadduwm	v7,v7,v29			;# L = L[1] + T
		vspltisw v10,0				;# v10 = (0,0,0,0)
		vrlw	v7,v7,v29			;# B = L[1] = ROTL(L,T)
		lvx		v11,r12,r7 			;# vector (0,1,2,3)
		vadduwm	v25,v25,v24			;# S = S[25] + A
		vspltw	v24,v30,3			;# cypher.hi
		vadduwm	v25,v25,v7			;# S += B
		vrlw	v25,v25,v31			;# A = S[25] = ROTL3(S)
		vxor	v6,v6,v5			;# D ^ C
		vrlw	v6,v6,v5			;# D = ROTL(D, C)
		vadduwm	v6,v6,v25			;# D += S[25]

		vcmpequw  v5,v24,v6			;# D == cypher.hi ?
		vand	  v5,v5,v3			;# := C == cypher.lo && D == cypher.hi
		vcmpequw. v5,v5,v9			;# v5 = -1 where both match. cr6.eq := no
									;# full match at all (tested by the bne
									;# to key_found below)

		;# Now a bit of trickery :
		;# We have to be prepared to deal with up to 4 potential winning keys, where
		;# only the first one is of interest. Besides that, any partial match (against
		;# cypher.lo) that occurs in elements up to the one that matches both cypher.lo
		;# and cypher.hi has to be registered in the check.count member.
		;# The first vector element in v5 which is equal to -1 identifies the first
		;# matching key. By shifting and or'ing the vector elements, we create a mask
		;# that will be used to flush all potential matches after the first one.
		;# With v10 == 0, "vsldoi vD,v10,v5,SH" yields v5 shifted right by
		;# (16 - SH) bytes. OR'ing the three shifted copies gives, for each
		;# element, the OR of all preceding elements of v5.

		vsldoi	v9,v10,v5,4			;# := match >> 96
		vsldoi	v6,v10,v5,8			;# := match >> 64
		vor		v9,v9,v6
		vsldoi	v6,v10,v5,12		;# := match >> 32
		vnor	v9,v9,v6			;# -1 where no earlier element fully matches
		vand	v5,v3,v9			;# Mask out unwanted matches
		vsumsws	v10,v5,v10			;# v10 last word = -(retained partial
									;# matches), i.e. -1, -2, -3 or -4
		stvx	v10,0,r7 			;# Use vPartial as temp storage

		;# Only the matching key has to be remembered (v11 := vector (0,1,2,3))
		;# The highest retained element index is selected : this is the first
		;# full match if there is one, else the last partial match.
		vand	v11,v11,v5			;# Select matching key increments
		vsldoi	v9,v11,v11,8		;# vector (2,3,0,1)
		vmaxuw	v9,v9,v11
		vsldoi	v11,v9,v9,4
		vmaxuw	v9,v9,v11			;# All elements = matching key increment.
		stvewx	v9,0,r7 			;# Store key increment (vPartial)
									;# vPartial+12 still holds the count increment

		lwbrx	r21,0,r6 			;# Load key.hi (byte reversed)
		lwbrx	r24,r8,r7 			;# Load key.mid (byte reversed)
		lwbrx	r27,r9,r7 			;# Load key.lo (byte reversed)

		lwz		r22,wChkCnt(r1)		;# Current check.count
		DEC_KEY	r21					;# Undo key.hi incrementation.
		addi	r30,r7,wChkMid-vCheck	;# Base for the check.mid store
		cmplwi	r21,0				;# key.hi == 0 ? (cr0)
		addi	r31,r7,wChkLo-vCheck	;# Base for the check.lo store
		DEC_KEY	r21					;# Recover AltiVec base key
									;# NOTE(review): the bne below relies on
									;# cr0 from cmplwi, so DEC_KEY (macro
									;# defined elsewhere) must not alter cr0
									;# - confirm.
		bne		no_carry			;# No need to update key.mid/key.lo

		subic	r24,r24,1			;# Decrement key.mid (CA = 0 on borrow)
		addme	r27,r27				;# key.lo += CA - 1 : decremented only
									;# when key.mid borrowed

no_carry:
		stwbrx	r24,r10,r30			;# Store check.mid
		stwbrx	r27,r10,r31			;# Store check.lo

		lwz		r31,vPartial(r1)	;# Key increment
		lwz		r30,vPartial+12(r1)	;# count increment (negative value)
		slwi	r25,r31,RC5_KEY_SHIFT	;# Scale the increment to key.hi units
		subf	r22,r30,r22			;# New count (count - negative increment)
		add		r21,r21,r25			;# Update key.hi (== key.hi + increment)
		stw		r22,wChkCnt(r1)		;# Store check.count
		stwbrx	r21,r10,r7 			;# Store check.hi

		bne+	cr6,key_found		;# One of the 4 keys matches the cyphered text.
									;# (cr6 comes from the vcmpequw. above)

		bdnz	new_key_hi			;# Next iteration.
		b		inc_key_mid			;# CTR exhausted : advance key.mid

		;# A matching key has been found.
		;# r21 := key.hi of the matching key (base key.hi + scaled increment,
		;#        as computed in check_keys)
		;# r24 := key.mid
		;# r27 := key.lo
		;# r31 := key increment [0; 3]
		;# Converts the remaining work into a "keys checked" count, writes it
		;# back through pIterations, saves the key and falls through into the
		;# epilog with r3 = RESULT_FOUND.

key_found:
		lwz		r22,pIterations(r1)	;# r22 = pointer to the iteration count
		lwz		r25,wKIter(r1)		;# iterations / 4
		mfctr	r28					;# Remaining loop iterations
		add		r25,r25,r28			;# Total remaining (in groups of 4 keys)
		lwz		r28,0(r22)			;# Iteration count currently stored
		slwi	r25,r25,2			;# remaining iterations * 4
		clrrwi	r21,r21,2			;# Clear the 2 low-order bits of key.hi
									;# NOTE(review): presumably strips the
									;# per-element increment - confirm
									;# against RC5_KEY_SHIFT.
		subf	r25,r31,r25			;# remaining -= key increment
		stwbrx	r21,0,r6 			;# Save key.hi
		subf	r25,r25,r28			;# How many keys have been checked.
		stwbrx	r24,r8,r7 			;# Save key.mid
		stw		r25,0(r22)			;# Write back the number of keys checked
		stwbrx	r27,r9,r7 			;# Save key.lo
		li		r3,RESULT_FOUND		;# Return code (falls through to epilog)


;#=============================================================================
;# Epilog : Update the RC5_72UnitWork structure, then restore all non-volatile
;#		registers.
;#		r3 := RESULT_FOUND or RESULT_NOTHING

epilog:
		;# Common exit. On entry r1 is this routine's frame and r3 already
		;# holds the return code; r3 is not modified below. r4-r8 are used
		;# as scratch registers.
		lwz		r4,pUnitWork(r1)		;# r4 = pointer to the RC5_72UnitWork

		;# Copy the local check.* words to rc5_72unitwork->check,
		;# one field at a time.
		lwz		r5,wChkCnt(r1)
		stw		r5,check_count(r4)
		lwz		r6,wChkHi(r1)
		stw		r6,check_hi(r4)
		lwz		r7,wChkMid(r1)
		stw		r7,check_mid(r4)
		lwz		r8,wChkLo(r1)
		stw		r8,check_lo(r4)

		;# Copy the local key words to rc5_72unitwork->L0.
		lwz		r5,wKeyHi(r1)
		stw		r5,L0_hi(r4)
		lwz		r6,wKeyMid(r1)
		stw		r6,L0_mid(r4)
		lwz		r7,wKeyLo(r1)
		stw		r7,L0_lo(r4)

		;# Reload v31 down to v20 from the vector save area. Two offset
		;# cursors (r5 = even slots, r6 = odd slots) walk the area in
		;# steps of 32 bytes, so the slots are read in ascending order,
		;# 16 bytes apart.
		li		r5,aVectorArea			;# slot 0
		addi	r6,r5,16				;# slot 1
		lvx		v31,r1,r5
		lvx		v30,r1,r6
		addi	r5,r5,32
		addi	r6,r6,32
		lvx		v29,r1,r5
		lvx		v28,r1,r6
		addi	r5,r5,32
		addi	r6,r6,32
		lvx		v27,r1,r5
		lvx		v26,r1,r6
		addi	r5,r5,32
		addi	r6,r6,32
		lvx		v25,r1,r5
		lvx		v24,r1,r6
		addi	r5,r5,32
		addi	r6,r6,32
		lvx		v23,r1,r5
		lvx		v22,r1,r6
		addi	r5,r5,32
		addi	r6,r6,32
		lvx		v21,r1,r5
		lvx		v20,r1,r6

		;# Put back the special purpose registers and r2.
		lwz		r5,wVRSave(r1)
		mtspr	VRsave,r5
		lwz		r6,wSaveCR(r1)
		mtcr	r6
		lwz		r7,wSaveCTR(r1)
		mtctr	r7
		lwz		r2,wSaveR2(r1)

		;# Reload r13-r31 from the save area addressed relative to the
		;# caller's stack pointer, then release the frame and return.
		lwz		r5,0(r1)				;# r5 = caller's stack pointer (back chain)
		lmw		r13,-GPRsave(r5)
		mr		r1,r5
		blr
