// dual-key, mixed round 3 and encryption, A1/A2 use for last value,
// non-arrayed S1/S2 tables, run-time generation of S0[]

// Copyright distributed.net 1997 - All Rights Reserved
// For use in distributed.net projects only.
// Any other distribution or use of this source violates copyright.

/*  This file is included from rc5.cpp so we can use __inline__.  */

// Run-time generation of S0[]:
//
//	- loading a large constant on a RISC CPU needs two instructions
//	  (e.g., on SPARC:)
//		sethi %hi(1444465436),%g2
//		or %g2,%lo(1444465436),%g2
//
//	- generating S0[] at run time needs only one instruction per entry,
//	  since S0[n] = S0[n-1] + Q
//	  (i.e., currentS0 += Q)
//
//	- drawback: we need two more registers,
//	  one for 'currentS0' and one for 'Q'
//
// The main crypt routine needs :
//
//	- 26 registers for S00..S25
//
//	- 3 registers for A1,Llo1,Lhi1
//
//	- 2 registers for cS0 and Q
//	- 2 registers for eA1 and eB1
//	 note that they should share registers with cS0 and Q,
//	 since cS0 and Q are only used in round 1
//	 and eA1 and eB1 are only used in round 3
//
//	- 1 more register for ROTL (Llo1 + A1 + Lhi1, A1 + Lhi1),
//	  since a decent compiler should do :
//		tmp1=A1+Lhi1
//		Llo1+=tmp1
//		Llo1=ROTL(Llo1,tmp1)
//	  (this assumes we have a rotate instruction)
//
// So we need 26+3+2+1 = 32 registers
//
// On MIPS, HP-PA and PowerPC, it could be a gain to use S00..S25 instead of 'A1',
// since we would then need 31 registers and these chips have 32 registers.
// (I don't know if we could really use 31 registers in a function, since I can't get
//  this @%! cross-gcc to compile!)
//
// On Sparc we are short of registers.
//
// Rémi Guyomarch - 97/07/19

#include "rc5.h"
#include "rotate.h"


/* This core tests exactly one key per call, so refuse any build that is
   configured for a different pipeline count. */
#if PIPELINE_COUNT != 1
#error "Expecting pipeline count of 1"
#endif

/* The code below is written for the project's 32-bit CPU configuration. */
#if !defined(_CPU_32BIT_)
#error "everything assumes a 32bit CPU..."
#endif

/* On these toolchains __inline__ is defined away to nothing
   (presumably because they do not accept the keyword). */
#if defined(__TURBOC__) || defined(__VMS) || defined(__OS2__) || defined(NTALPHA)
#define __inline__
#endif


// S0[] constants: S0[0] = _P and S0[n] = S0[n-1] + _Q
// (the standard RC5 P32/Q32 values for 32-bit words).
#define _P	 0xB7E15163
#define _Q	 0x9E3779B9
// S_not(n) yields S0[n] directly.  The parameter is parenthesized so that
// expression arguments such as S_not(i+1) expand correctly.
#define S_not(n) (_P+_Q*(n))


// Round 1 macros
// --------------
// First key-expansion pass.  Each invocation advances the running S0 value
// (cS0 += Q), mixes it into A1, stores the result in S1N, and then updates
// one key word: EVEN steps update Llo1 using Lhi1, ODD steps update Lhi1
// using Llo1.
//
// The macros use the caller's locals A1, Llo1, Lhi1, cS0 and Q by name.
// Each body is wrapped in do { } while (0) so that an invocation followed by
// ';' is exactly one statement (safe under if/else without braces).
#define ROUND1EVEN(S1N)				\
do {						\
  cS0 += Q;					\
  (S1N) = A1 = ROTL3 (A1 + Lhi1 + cS0);		\
  Llo1 = ROTL (Llo1 + A1 + Lhi1, A1 + Lhi1);	\
} while (0)

#define  ROUND1ODD(S1N)				\
do {						\
  cS0 += Q;					\
  (S1N) = A1 = ROTL3 (A1 + Llo1 + cS0);		\
  Lhi1 = ROTL (Lhi1 + A1 + Llo1, A1 + Llo1);	\
} while (0)

// Round 2 macros
// --------------
// Second key-expansion pass.  Each invocation re-mixes the stored subkey S1N
// into A1, writes the result back to S1N, and then updates one key word:
// EVEN steps update Llo1 using Lhi1, ODD steps update Lhi1 using Llo1.
//
// The macros use the caller's locals A1, Llo1 and Lhi1 by name.
// Each body is wrapped in do { } while (0) so that an invocation followed by
// ';' is exactly one statement (safe under if/else without braces).
#define ROUND2EVEN(S1N)				\
do {						\
  (S1N) = A1 = ROTL3 (A1 + Lhi1 + (S1N));	\
  Llo1 = ROTL (Llo1 + A1 + Lhi1, A1 + Lhi1);	\
} while (0)

#define  ROUND2ODD(S1N)				\
do {						\
  (S1N) = A1 = ROTL3 (A1 + Llo1 + (S1N));	\
  Lhi1 = ROTL (Lhi1 + A1 + Llo1, A1 + Llo1);	\
} while (0)

// Round 3 macros
// --------------
// Third key-expansion pass, interleaved with encryption.  Each invocation
// computes the final subkey value in A1 from S1N (S1N itself is only read,
// not written back), applies it to one half of the block being encrypted,
// and then updates one key word:
//   EVEN: eA1 = ROTL(eA1 ^ eB1, eB1) + A1, then Llo1 is updated using Lhi1
//   ODD : eB1 = ROTL(eA1 ^ eB1, eA1) + A1, then Lhi1 is updated using Llo1
//
// The macros use the caller's locals A1, Llo1, Lhi1, eA1 and eB1 by name.
// Each body is wrapped in do { } while (0) so that an invocation followed by
// ';' is exactly one statement (safe under if/else without braces).
#define ROUND3EVEN(S1N)				\
do {						\
  A1 = ROTL3 (A1 + Lhi1 + (S1N));		\
  eA1 = ROTL (eA1 ^ eB1, eB1) + A1;		\
  Llo1 = ROTL (Llo1 + A1 + Lhi1, A1 + Lhi1);	\
} while (0)

#define ROUND3ODD(S1N)				\
do {						\
  A1 = ROTL3 (A1 + Llo1 + (S1N));		\
  eB1 = ROTL (eA1 ^ eB1, eA1) + A1;		\
  Lhi1 = ROTL (Lhi1 + A1 + Llo1, A1 + Llo1);	\
} while (0)

// rc5_unit_func - test a single key against a plaintext/ciphertext pair.
//
// rc5_unit will get passed an RC5UnitWork to complete.
// The two key words are read from rc5unitwork->L0 (lo, hi), expanded into
// the 26 subkey words S1_00..S1_25 in three passes, and the block in
// rc5unitwork->plain is encrypted while the third pass is running.
// The result is compared with rc5unitwork->cypher.  Nothing is written
// back through rc5unitwork.
//
// this is where all the actual work occurs, this is where you optimize.
// assembly gurus encouraged.
//
// Returns: 0 - nothing found, 1 - found on pipeline 1,
//   2 - found pipeline 2, 3 - ... etc ...
//   (this core only ever returns 0 or 1)

//static __inline__
u32 rc5_unit_func( RC5UnitWork * rc5unitwork )
{ 
  // One scalar per subkey word (no array), so each can live in a register.
  u32 S1_00,S1_01,S1_02,S1_03,S1_04,S1_05,S1_06,S1_07,S1_08,S1_09,
      S1_10,S1_11,S1_12,S1_13,S1_14,S1_15,S1_16,S1_17,S1_18,S1_19,
      S1_20,S1_21,S1_22,S1_23,S1_24,S1_25;

  // A1 carries the most recently computed subkey value from step to step;
  // Llo1/Lhi1 are the two key words, mixed in place.
  register u32 A1, Llo1, Lhi1;

  Llo1 = rc5unitwork->L0.lo;
  Lhi1 = rc5unitwork->L0.hi;
  
  // cS0 and Q are scoped to round 1 only (see the register notes at the
  // top of the file).
  { register u32 cS0, Q;
      
    /* Begin round 1 of key expansion */

    /*  Special case while A and B are known to be zero:
        the zero terms are simply left out of the first step.  */
    cS0 = _P;
    Q   = _Q;
    S1_00 = A1 = ROTL3(cS0);
    Llo1 = ROTL(Llo1 + A1, A1);

    // Each ROUND1 macro first advances cS0 by Q, then fills one subkey word.
    // ODD steps update Lhi1, EVEN steps update Llo1.
    ROUND1ODD  (S1_01);
    ROUND1EVEN (S1_02);
    ROUND1ODD  (S1_03);
    ROUND1EVEN (S1_04);
    ROUND1ODD  (S1_05);
    ROUND1EVEN (S1_06);
    ROUND1ODD  (S1_07);
    ROUND1EVEN (S1_08);
    ROUND1ODD  (S1_09);
    ROUND1EVEN (S1_10);
    ROUND1ODD  (S1_11);
    ROUND1EVEN (S1_12);
    ROUND1ODD  (S1_13);
    ROUND1EVEN (S1_14);
    ROUND1ODD  (S1_15);
    ROUND1EVEN (S1_16);
    ROUND1ODD  (S1_17);
    ROUND1EVEN (S1_18);
    ROUND1ODD  (S1_19);
    ROUND1EVEN (S1_20);
    ROUND1ODD  (S1_21);
    ROUND1EVEN (S1_22);
    ROUND1ODD  (S1_23);
    ROUND1EVEN (S1_24);
    ROUND1ODD  (S1_25);
  }

  /* Begin round 2 of key expansion */
  
  // Every subkey word is re-mixed in place, continuing with the A1/Llo1/Lhi1
  // state left by round 1.
  ROUND2EVEN (S1_00);
  ROUND2ODD  (S1_01);
  ROUND2EVEN (S1_02);
  ROUND2ODD  (S1_03);
  ROUND2EVEN (S1_04);
  ROUND2ODD  (S1_05);
  ROUND2EVEN (S1_06);
  ROUND2ODD  (S1_07);
  ROUND2EVEN (S1_08);
  ROUND2ODD  (S1_09);
  ROUND2EVEN (S1_10);
  ROUND2ODD  (S1_11);
  ROUND2EVEN (S1_12);
  ROUND2ODD  (S1_13);
  ROUND2EVEN (S1_14);
  ROUND2ODD  (S1_15);
  ROUND2EVEN (S1_16);
  ROUND2ODD  (S1_17);
  ROUND2EVEN (S1_18);
  ROUND2ODD  (S1_19);
  ROUND2EVEN (S1_20);
  ROUND2ODD  (S1_21);
  ROUND2EVEN (S1_22);
  ROUND2ODD  (S1_23);
  ROUND2EVEN (S1_24);
  ROUND2ODD  (S1_25);
  
    /* Begin round 3 of key expansion (and encryption round) */
  
  // eA1/eB1 hold the two halves of the block being encrypted.
  { register u32 eA1, eB1;

    // The first two final subkey values are added to the plaintext halves.
    eA1 = rc5unitwork->plain.lo + (A1 = ROTL3(S1_00 + A1 + Lhi1));
    Llo1 = ROTL(Llo1 + A1 + Lhi1, A1 + Lhi1);
    eB1 = rc5unitwork->plain.hi + (A1 = ROTL3(S1_01 + A1 + Llo1));
    Lhi1 = ROTL(Lhi1 + A1 + Llo1, A1 + Llo1);

    // Each macro computes one final subkey value in A1 and uses it right
    // away: EVEN steps update eA1, ODD steps update eB1.
    ROUND3EVEN (S1_02);
    ROUND3ODD  (S1_03);
    ROUND3EVEN (S1_04);
    ROUND3ODD  (S1_05);
    ROUND3EVEN (S1_06);
    ROUND3ODD  (S1_07);
    ROUND3EVEN (S1_08);
    ROUND3ODD  (S1_09);
    ROUND3EVEN (S1_10);
    ROUND3ODD  (S1_11);
    ROUND3EVEN (S1_12);
    ROUND3ODD  (S1_13);
    ROUND3EVEN (S1_14);
    ROUND3ODD  (S1_15);
    ROUND3EVEN (S1_16);
    ROUND3ODD  (S1_17);
    ROUND3EVEN (S1_18);
    ROUND3ODD  (S1_19);
    ROUND3EVEN (S1_20);
    ROUND3ODD  (S1_21);
    ROUND3EVEN (S1_22);
    ROUND3ODD  (S1_23);
	       
    // Last eA1 update, written out by hand; the matching Llo1 update is
    // deferred into the comparison below.
    eA1 = ROTL(eA1 ^ eB1, eB1) + (A1 = ROTL3(S1_24 + A1 + Lhi1));
	       
    // The low word is compared first; because of the && the final eB1 value
    // (which needs the last Llo1 update and S1_25) is only computed when the
    // low word already matches.
    if (rc5unitwork->cypher.lo == eA1 &&
	    rc5unitwork->cypher.hi == ROTL(eB1 ^ eA1, eA1) +
	      ROTL3(S1_25 + A1 + ROTL(Llo1 + A1 + Lhi1, A1 + Lhi1))) return 1;
	  return 0;	      
  }
}


