; Copyright distributed.net 1997 - All Rights Reserved
; For use in distributed.net projects only.
; Any other distribution or use of this source violates copyright.
;
; $Log: rc5-ssk7.asm,v $
; Revision 1.1.2.1  2000/02/20 22:59:30  jlawson
; added new contributed Athelon core
;
;
; AMD Athlon optimized version
; vulture a.k.a. Sean Stanek <vulture@iastate.edu>
; first version - 19 Feb 2000
;
; I used the Cx core (rg6x86.asm) revision 1.10 as a skeleton for this,
; so partial credit goes to whoever contributed to that one.
;
; Most of the non-core code was from the Cx code, though my core looks very
; much like the Cx code. I removed most of the ALIGNs because they just
; take up extra time on the Athlon in certain situations.
;
; Gets about 10% to 11% over the Cx or P6 cores for the Athlon. Someone
; else tested this and supposedly it works a little better than whatever
; is currently used. Doesn't work better on any Intel chips that I know of.
;
; Here's a cute little screenshot using 'dnetc -bench rc5'
; The class 3/4 scores were run afterwards, so the time will be slightly
; different on it.
;
; [Feb 19 22:49:47 UTC] Automatic processor type detection found
;                       an AMD K7 processor.
; [Feb 19 22:49:47 UTC] RC5: using core #0 (jasonp P5/MMX).
; [Feb 19 22:50:05 UTC] Benchmarking RC5 ... 100.00% done
; [Feb 19 22:50:05 UTC] Completed in 0.00:00:17.71 [473,504.62 keys/sec]
; [Feb 19 23:11:55 UTC] RC5: using core #1 (RG class 3/4).
; [Feb 19 23:12:11 UTC] Benchmarking RC5 ... 100.00% done
; [Feb 19 23:12:11 UTC] Completed in 0.00:00:16.02 [1,439,273.27 keys/sec]
; [Feb 19 22:50:21 UTC] RC5: using core #2 (RG class 6).
; [Feb 19 22:50:38 UTC] Benchmarking RC5 ... 100.00% done
; [Feb 19 22:50:38 UTC] Completed in 0.00:00:16.63 [1,639,385.20 keys/sec]
; [Feb 19 22:50:38 UTC] RC5: using core #3 (RG Cx re-pair).
; [Feb 19 22:50:55 UTC] Benchmarking RC5 ... 100.00% done
; [Feb 19 22:50:55 UTC] Completed in 0.00:00:16.60 [1,641,458.00 keys/sec]
; [Feb 19 22:50:55 UTC] RC5: using core #4 (RG RISC-rotate I).
; [Feb 19 22:51:11 UTC] Benchmarking RC5 ... 100.00% done
; [Feb 19 22:51:11 UTC] Completed in 0.00:00:16.15 [1,557,869.50 keys/sec]
; [Feb 19 22:51:11 UTC] RC5: using core #5 (RG RISC-rotate II).
; [Feb 19 22:51:28 UTC] Benchmarking RC5 ... 100.00% done
; [Feb 19 22:51:28 UTC] Completed in 0.00:00:16.79 [1,311,032.15 keys/sec]
; [Feb 19 22:50:05 UTC] RC5: using core #6 (RG k7-vulture).
; [Feb 19 22:50:21 UTC] Benchmarking RC5 ... 100.00% done
; [Feb 19 22:50:21 UTC] Completed in 0.00:00:16.19 [1,813,025.07 keys/sec]
;
; (1813025.07 - 1641458.00) / 1641458.00 =~ 10.45% gain
;
; Of course, this is subject to random variation on my Athlon-550.
;

; Optional RCS identification string. It is only assembled when
; __showids__ is defined, and is placed in the data section (named
; _DATA when OS2 is defined, .data otherwise).
%ifdef __showids__
%ifdef OS2
[SECTION _DATA USE32 align=16]
%else
[SECTION .data]
%endif
  db  "@(#)$Id: rc5-ssk7.asm,v 1.1.2.1 2000/02/20 22:59:30 jlawson Exp $"
%endif

; The entry point is exported both with and without a leading underscore.
[GLOBAL _rc5_unit_func_k7]
[GLOBAL rc5_unit_func_k7]

; Number of bytes the function prologue subtracts from esp: 16 bytes of
; saved registers plus the work area (the highest field, work_pre3_r1,
; occupies bytes 272..275).
%define work_size       276

; Incoming stack arguments, addressed relative to esp *after* the prologue
; has subtracted work_size. The return address sits at esp+work_size, so
; the first argument is 4 bytes above it and the second 8 bytes above it.
%define RC5UnitWork     esp+work_size+4
%define timeslice       esp+work_size+8


; The S0 values for key expansion round 1 are constants:
; S_not(N) = P + Q*N is the initial (not yet mixed) value of S[N].

%define P         0xB7E15163
%define Q         0x9E3779B9
%define S_not(N)  (P+Q*(N))

; S0_ROTL3 = ROTL3(P) = ((P << 3) | (P >> 29)) truncated to 32 bits
%define S0_ROTL3 0xbf0a8b1d
; FIRST_ROTL = S0_ROTL3 & 0x1f, the rotate count used for the first key word
%define FIRST_ROTL 0x1d
; S1_S0_ROTL3 = S_not(1) + S0_ROTL3, truncated to 32 bits
%define S1_S0_ROTL3 0x15235639

;  Offsets to access work_struct fields.
;
;  Everything is addressed relative to esp after the prologue has reserved
;  work_size bytes. The first 16 bytes hold the caller's ebp/edi/esi/ebx.
;  The remaining offsets are written as (field offset)+16 so the field
;  layout stays readable:
;    work_s1 / work_s2   two 26-word (104-byte) expanded key tables, one
;                        per key of the pair processed in each iteration
;    work_add_iter       0 or 1, added to the returned key count
;    work_P_* / work_C_* plaintext and ciphertext words copied from the
;                        RC5UnitWork structure
;    work_key2_*         Lhi/Llo of the second key saved after round 2
;    work_key_hi/lo      current key being tested
;    work_iterations     remaining loop iterations
;    work_pre*_r1        round 1 values that depend only on key_lo

%define save_ebp   esp+0
%define save_edi   esp+4
%define save_esi   esp+8
%define save_ebx   esp+12
%define work_s1         esp+16
%define work_s2         esp+104+16
%define work_add_iter   esp+208+16
%define work_P_0        esp+212+16
%define work_P_1        esp+216+16
%define work_C_0        esp+220+16
%define work_C_1        esp+224+16
%define work_key2_edi   esp+228+16
%define work_key2_esi   esp+232+16
%define work_key_hi     esp+236+16
%define work_key_lo     esp+240+16
%define work_iterations esp+244+16
%define work_pre1_r1    esp+248+16
%define work_pre2_r1    esp+252+16
%define work_pre3_r1    esp+256+16

; Offsets to access RC5UnitWork fields.
; These are only valid while eax holds the pointer to the structure.

%define RC5UnitWork_plainhi   eax+0
%define RC5UnitWork_plainlo   eax+4
%define RC5UnitWork_cipherhi  eax+8
%define RC5UnitWork_cipherlo  eax+12
%define RC5UnitWork_L0hi      eax+16
%define RC5UnitWork_L0lo      eax+20

  ; Register roles during key expansion rounds 1 and 2, where two keys
  ; (suffix 1 and suffix 2) are expanded side by side:
  ; A1   = %eax  A2   = %ebp
  ; Llo1 = %ebx  Llo2 = %esi
  ; Lhi1 = %edx  Lhi2 = %edi
  ; ecx is scratch; variable rotate counts are taken from cl.

; S1(N) / S2(N): memory operand for word N of the first / second S table.
%define S1(N) [work_s1+((N)*4)]
%define S2(N) [work_s2+((N)*4)]

; ------------------------------------------------------------------
; Round 1 of key expansion, even S index, both keys interleaved.
;
; S1(N) = A1 = ROTL3 (A1 + Lhi1 + S_not(N));
; S2(N) = A2 = ROTL3 (A2 + Lhi2 + S_not(N));
; Llo1 = ROTL (Llo1 + A1 + Lhi1, A1 + Lhi1);
; Llo2 = ROTL (Llo2 + A2 + Lhi2, A2 + Lhi2);
;
; %1 = S index N.
; On entry ebp (A2) must already have S_not(N) added by the preceding
; step; eax (A1) gets S_not(N) added here. On exit both eax and ebp
; already have S_not(N+1) added for the following step.
; ecx is used as scratch for the rotate counts.
%macro ROUND_1_EVEN 1
        add     eax, S_not(%1)
        add     ebp, edi
        add     eax, edx
        rol     ebp, 3
        rol     eax, 3
        mov     ecx,edx
        add     ecx,eax
        mov     S1(%1), eax
        add     eax, S_not((%1)+1)
        add     ebx, ecx
        mov     S2(%1), ebp
        rol     ebx, cl
        lea     ecx, [ebp+edi]
        add     esi, ecx
        add     ebp, S_not((%1)+1)
        rol     esi, cl
%endmacro

; Round 1 of key expansion, odd S index, both keys interleaved.
;
; S1(N) = A1 = ROTL3 (A1 + Llo1 + S_not(N));
; S2(N) = A2 = ROTL3 (A2 + Llo2 + S_not(N));
; Lhi1 = ROTL (Lhi1 + A1 + Llo1, A1 + Llo1);
; Lhi2 = ROTL (Lhi2 + A2 + Llo2, A2 + Llo2);
;
; %1 = S index N.
; On entry eax and ebp must already have S_not(N) added (ROUND_1_EVEN does
; this on exit). On exit only ebp has S_not(N+1) added; the following
; ROUND_1_EVEN adds it to eax itself.
; ecx is used as scratch for the rotate counts.
;
; NOTE: the macro header must be exactly "%macro ROUND_1_ODD 1". The old
; header carried C-preprocessor residue ending in a backslash, which NASM
; treats as a line continuation, so the first instruction (add eax, ebx)
; was joined onto the header line instead of being part of the body.

%macro ROUND_1_ODD 1
        add     eax, ebx
        add     ebp, esi
        rol     eax, 3
        rol     ebp, 3
        mov     ecx, ebx
        add     ecx, eax
        mov     S1(%1), eax
        add     edx, ecx
        mov     S2(%1), ebp
        rol     edx, cl
        lea     ecx, [ebp+esi]
        add     edi, ecx
        add     ebp, S_not((%1)+1)
        rol     edi, cl
%endmacro

; Final odd step of round 1 (used with N = 25), both keys interleaved.
; Performs the same S/Lhi update as an odd round-1 step, but instead of
; pre-adding S_not(N+1) to ebp it pre-adds S0_ROTL3 to eax, ready for the
; S[0] step that opens round 2. The code that follows the round 1
; expansion adds S0_ROTL3 to ebp itself.
; %1 = S index N. ecx is used as scratch for the rotate counts.
%macro ROUND_1_LAST 1
        add     eax, ebx
        add     ebp, esi
        rol     eax, 3
        rol     ebp, 3
        mov     S1(%1), eax
        mov     ecx, eax
        add     ecx, ebx
        add     edx, ecx
        mov     S2(%1), ebp
        rol     edx, cl
        lea     ecx, [ebp+esi]
        add     edi, ecx
        add     eax, S0_ROTL3
        rol     edi, cl
%endmacro

; Two consecutive round 1 steps: the even step for index %1 followed by
; the odd step for index %1+1. %1 must be an even S index.
%macro ROUND_1_EVEN_AND_ODD 1
       ROUND_1_EVEN %1
       ROUND_1_ODD %1+1
%endmacro

; ------------------------------------------------------------------
; Round 2 of key expansion, even S index, both keys interleaved.
; Here the S values produced by the previous round are read back from
; the S1/S2 tables instead of using the S_not constants.
;
; S1N = A1 = ROTL3 (A1 + Lhi1 + S1N);
; S2N = A2 = ROTL3 (A2 + Lhi2 + S2N);
; Llo1 = ROTL (Llo1 + A1 + Lhi1, A1 + Lhi1);
; Llo2 = ROTL (Llo2 + A2 + Lhi2, A2 + Lhi2);
;
; %1 = S index N.
; On entry ebp (A2) must already have S2(N) added by the preceding step;
; eax (A1) gets S1(N) added here. On exit eax has S1(N+1) and ebp has
; S2(N+1) already added for the following step.
; ecx is used as scratch for the rotate counts.

%macro ROUND_2_EVEN 1
        add     eax, S1(%1)
        add     ebp, edi
        add     eax, edx
        rol     ebp, 3
        rol     eax, 3
        mov     ecx, edx
        add     ecx, eax
        mov     S1(%1), eax
        add     ebx, ecx
        add     eax, S1(%1+1)
        mov     S2(%1), ebp
        rol     ebx, cl
        lea     ecx, [ebp+edi]
        add     esi, ecx
        add     ebp, S2(%1+1)
        rol     esi, cl
%endmacro

; Round 2 of key expansion, odd S index, both keys interleaved.
;
; S1N = A1 = ROTL3 (A1 + Llo1 + S1N);
; S2N = A2 = ROTL3 (A2 + Llo2 + S2N);
; Lhi1 = ROTL (Lhi1 + A1 + Llo1, A1 + Llo1);
; Lhi2 = ROTL (Lhi2 + A2 + Llo2, A2 + Llo2);
;
; %1 = S index N.
; On entry eax and ebp must already have the previous-round S value for
; index N added by the preceding step. On exit only ebp has S2(N+1)
; added; the following ROUND_2_EVEN adds S1(N+1) to eax itself.
; ecx is used as scratch for the rotate counts.

%macro ROUND_2_ODD 1
        add     eax, ebx
        add     ebp, esi
        rol     eax, 3
        rol     ebp, 3
        mov     S1(%1), eax
        mov     ecx, eax
        add     ecx, ebx
        add     edx, ecx
        mov     S2(%1), ebp
        rol     edx, cl
        lea     ecx, [ebp+esi]
        add     edi, ecx
        add     ebp, S2(%1+1)
        rol     edi, cl
%endmacro

; Final odd step of round 2 (used with N = 25), both keys interleaved.
; Same S/Lhi update as ROUND_2_ODD, but nothing is pre-added for a next
; step. Instead Llo2 (esi) is saved to work_key2_esi, from where it is
; reloaded when round 3 is run for the second key.
; %1 = S index N. ecx is used as scratch for the rotate counts.
%macro ROUND_2_LAST 1
        add     eax, ebx
        add     ebp, esi
        rol     eax, 3
        rol     ebp, 3
        mov     S1(%1), eax
        mov     ecx, eax
        add     ecx, ebx
        add     edx, ecx
        mov     S2(%1), ebp
        rol     edx, cl
        lea     ecx, [ebp+esi]
        add     edi, ecx
        mov     [work_key2_esi], esi
        rol     edi, cl
%endmacro

; Two consecutive round 2 steps: the even step for index %1 followed by
; the odd step for index %1+1. %1 must be an even S index.
%macro ROUND_2_EVEN_AND_ODD 1
	ROUND_2_EVEN %1
	ROUND_2_ODD %1+1
%endmacro

; ------------------------------------------------------------------
; Round 3 of key expansion mixed with encryption, one key at a time.
; Each invocation handles an even index %1 and the odd index %1+1:
;
; eA1 = ROTL (eA1 ^ eB1, eB1) + (A1 = ROTL3 (A1 + Lhi1 + S1(N)));
; Llo1 = ROTL (Llo1 + A1 + Lhi1, A1 + Lhi1);
; eB1 = ROTL (eA1 ^ eB1, eA1) + (A1 = ROTL3 (A1 + Llo1 + S1(N)));
; Lhi1 = ROTL (Lhi1 + A1 + Llo1, A1 + Llo1);

; A  = %eax  eA = %esi
; L0 = %ebx  eB = %edi
; L1 = %edx  .. = %ebp
; %%ebp is either &S1 or &S2
; ecx is scratch; variable rotate counts are taken from cl.

; S3(N): word N of whichever S table ebp currently points at.
%define S3(N) [(N)*4+ebp]

%macro ROUND_3_EVEN_AND_ODD 1
        add     eax, S3(%1)
        xor     esi, edi
        mov     ecx, edi
        add     eax, edx
        rol     esi, cl
        rol     eax, 3
        mov     ecx, edx
        add     ecx, eax
        add     esi, eax
        add     ebx, ecx
        add     eax, S3(%1+1)
        rol     ebx, cl
        mov     ecx, esi
        xor     edi, esi
        add     eax, ebx
        rol     edi, cl
        rol     eax, 3
        mov     ecx, ebx
        add     ecx, eax
        add     edx, ecx
        add     edi, eax
        rol     edx, cl
%endmacro

; Select the code section (named _TEXT when OS2 is defined, .text otherwise).
%ifdef OS2
[SECTION _TEXT USE32 align=16]
%else
[SECTION .text]
%endif

; ------------------------------------------------------------------
; rc5_unit will get passed an RC5WorkUnit to complete
;
; Returns number of keys checked before a possible good key is found, or
; timeslice*PIPELINE_COUNT if no keys are 'good' keys.
; (ie:      if (result == timeslice*PIPELINE_COUNT) NOTHING_FOUND
;      else if (result < timeslice*PIPELINE_COUNT) SOMETHING_FOUND at result+1
;      else SOMETHING_WENT_WRONG... )
;
; Both arguments are read from the stack. Every loop iteration tests two
; keys: the current one and the one whose L0hi is 0x01000000 higher.
; The value returned in eax is
;     (timeslice - work.iterations) * 2 + work.add_iter
; where work.add_iter is 1 only when the second key of a pair matched.
; ebx, esi, edi and ebp are saved on entry and restored on exit;
; eax, ecx and edx are not preserved.

align 4
_rc5_unit_func_k7:
rc5_unit_func_k7:
;u32 rc5_unit_func_k7( RC5UnitWork * rc5unitwork, u32 timeslice )

     sub esp, work_size ; set up stack

     mov [save_ebp], ebp ; save registers
     mov [save_edi], edi
     mov [save_esi], esi
     mov [save_ebx], ebx

     mov ebp, [timeslice]

     mov dword [work_add_iter], 0x00000000
;    (above) work.add_iter = 0;

     mov [work_iterations], ebp
;    (above) work.iterations = timeslice;

     mov eax, [RC5UnitWork] ; load pointer to rc5unitwork into eax

	; load parameters
        mov     ebx, [RC5UnitWork_L0lo]                 ; ebx = l0 = Llo1
        mov     edx, [RC5UnitWork_L0hi]                 ; edx = l1 = Lhi1
        mov     esi, ebx                                ; esi = l2 = Llo2
        lea     edi, [0x01000000+edx]                   ; edi = l3 = lhi2
        mov     [work_key_lo], ebx
        mov     [work_key_hi], edx

	; Save other parameters
	; (it's faster to do so, since we will only load 1 value
	; each time in RC5_ROUND_3xy, instead of two if we save
	; only the pointer to the RC5 struct)
        mov     ebp, [RC5UnitWork_plainlo]
        mov     [work_P_0], ebp
        mov     ebp, [RC5UnitWork_plainhi]
        mov     [work_P_1], ebp
        mov     ebp, [RC5UnitWork_cipherlo]
        mov     [work_C_0], ebp
        mov     ebp, [RC5UnitWork_cipherhi]
        mov     [work_C_1], ebp

	; Pre-calculate things. Assume work.key_lo won't change in this loop
	; (it's pretty safe to assume that, because we're working on 28 bits
	; blocks)
	; It means also that %%ebx == %%esi (Llo1 == Llo2)

;align 4
; Entered at start-up and again whenever key_lo has changed: recompute the
; round 1 values that depend only on key_lo (ebx) and cache them.
_bigger_loop_k7:
        add     ebx, S0_ROTL3
        rol     ebx, FIRST_ROTL
        mov     [work_pre1_r1], ebx

        lea     eax, [S1_S0_ROTL3+ebx]
        rol     eax, 3
        mov     [work_pre2_r1], eax

        lea     ecx, [eax+ebx]
        mov     [work_pre3_r1], ecx

;align 4
; Per-iteration entry point. Expects edx = Lhi of the first key and
; edi = Lhi of the second key; everything else is reloaded from the cache.
_loaded_k7:
    ; ------------------------------
    ; Begin round 1 of key expansion
    ; ------------------------------

        mov     ebx, [work_pre1_r1]     ; 1
        mov     eax, [work_pre2_r1]     ;
        mov     esi, ebx                ; 1
        mov     ebp, eax                ;

        mov     ecx, [work_pre3_r1]     ; 1
        add     edx, ecx                ;
        rol     edx, cl                 ; 2
        add     edi, ecx                ;
        mov     S1(1), eax              ; 1
        add     ebp, S_not(2)           ;
        rol     edi, cl                 ; 2     sum = 8

	ROUND_1_EVEN_AND_ODD  2
	ROUND_1_EVEN_AND_ODD  4
	ROUND_1_EVEN_AND_ODD  6
	ROUND_1_EVEN_AND_ODD  8
	ROUND_1_EVEN_AND_ODD 10
	ROUND_1_EVEN_AND_ODD 12
	ROUND_1_EVEN_AND_ODD 14
	ROUND_1_EVEN_AND_ODD 16
	ROUND_1_EVEN_AND_ODD 18
	ROUND_1_EVEN_AND_ODD 20
	ROUND_1_EVEN_AND_ODD 22
	ROUND_1_EVEN         24
	ROUND_1_LAST         25
;align 4
_end_round1_k7:
    ; ------------------------------
    ; Begin round 2 of key expansion
    ; ------------------------------
    ; S[0] step for both keys. eax already had S0_ROTL3 added by
    ; ROUND_1_LAST; ebp gets it here. Only S1(1) was stored during
    ; round 1, so it is the value pre-added to both eax and ebp below.

        add     ebp, S0_ROTL3           ;
        add     eax, edx                ; 1
        add     ebp, edi                ;
        rol     eax, 3                  ; 1
        rol     ebp, 3                  ;
        mov     ecx, eax                ; 1
        add      ecx, edx               ;
        mov     S1(0), eax              ; 1
        add     ebx, ecx                ;
        add     eax, S1(1)              ; 1
        mov     S2(0), ebp              ;
        rol     ebx, cl                 ; 2
        lea     ecx, [ebp+edi]          ;
        add     esi, ecx                ; 1
        add     ebp, S1(1)              ;
        rol     esi, cl                 ; 2

	ROUND_2_ODD           1
        ROUND_2_EVEN_AND_ODD  2
        ROUND_2_EVEN_AND_ODD  4
        ROUND_2_EVEN_AND_ODD  6
        ROUND_2_EVEN_AND_ODD  8
        ROUND_2_EVEN_AND_ODD 10
        ROUND_2_EVEN_AND_ODD 12
        ROUND_2_EVEN_AND_ODD 14
        ROUND_2_EVEN_AND_ODD 16
        ROUND_2_EVEN_AND_ODD 18
        ROUND_2_EVEN_AND_ODD 20
        ROUND_2_EVEN_AND_ODD 22
	ROUND_2_EVEN         24
	ROUND_2_LAST         25

    ; Save 2nd key parameters
    ; (Llo2 was already stored to work_key2_esi inside ROUND_2_LAST)
;align 4
_end_round2_k7:
        mov     [work_key2_edi], edi

    ; ----------------------------------------------------
    ; Begin round 3 of key expansion mixed with encryption
    ; ----------------------------------------------------
    ; (first key)

	; A  = %eax  eA = %esi
	; L0 = %ebx  eB = %edi
	; L1 = %edx  .. = %ebp

        lea     ebp, S1(0)              ; S3(N) now addresses the first key's table

	; A = ROTL3(S00 + A + L1);
	; eA = P_0 + A;
	; L0 = ROTL(L0 + A + L1, A + L1);
        add     eax, S3(0)      ;       (pairs with lea)
        add     eax, edx        ; 1
        mov     esi, [work_P_0] ;
        rol     eax, 3          ; 1
        mov     ecx, edx        ;
        add     esi, eax        ; 1
        add     ecx, eax        ;
        add     ebx, ecx        ; 1
        add     eax, S3(1)      ;
        rol     ebx, cl         ; 2
	; A = ROTL3(S01 + A + L0);
	; eB = P_1 + A;
	; L1 = ROTL(L1 + A + L0, A + L0);
        add     eax, ebx        ;
        mov     ecx, ebx        ; 1
        rol     eax, 3          ;
        mov     edi, [work_P_1] ; 1
        add     edi, eax        ;
        add     ecx, eax        ; 1
        add     edx, ecx        ; 1
        rol     edx, cl         ; 2
	ROUND_3_EVEN_AND_ODD  2
	ROUND_3_EVEN_AND_ODD  4
	ROUND_3_EVEN_AND_ODD  6
	ROUND_3_EVEN_AND_ODD  8
	ROUND_3_EVEN_AND_ODD 10
	ROUND_3_EVEN_AND_ODD 12
	ROUND_3_EVEN_AND_ODD 14
	ROUND_3_EVEN_AND_ODD 16
	ROUND_3_EVEN_AND_ODD 18
	ROUND_3_EVEN_AND_ODD 20
	ROUND_3_EVEN_AND_ODD 22

	; early exit: compute only eA first and skip the rest of the
	; last round when it does not match the expected ciphertext word
;align 4
_end_round3_1_k7:
        add     eax, S3(24)     ;       A = ROTL3(S24 + A + L1);
        mov     ecx, edi        ;       eA = ROTL(eA ^ eB, eB) + A;
        add     eax, edx        ;
        xor     esi, edi        ; 1
        rol     eax, 3          ;
        rol     esi, cl         ; 2
        add     esi, eax        ; 1

        cmp     esi, [work_C_0]
        jne     __exit_1_k7

        mov     ecx, eax        ; 1     L0 = ROTL(L0 + A + L1, A + L1);
        add     ecx, edx        ;       A = ROTL3(S25 + A + L0);
        xor     edi, esi        ;       eB = ROTL(eB ^ eA, eA) + A;
        add     ebx, ecx        ;
        rol     ebx, cl         ; 2
        add     eax, S3(25)     ;
        mov     ecx, esi        ; 1
        add     eax, ebx        ;
        rol     edi, cl         ; 2
        rol     eax, 3          ;
        add     edi, eax        ; 1

        cmp     edi, [work_C_1]
        je near _full_exit_k7   ; both words match: first key of the pair found

;align 4
__exit_1_k7:
    ; Restore 2nd key parameters
        mov     edx, [work_key2_edi]
        mov     ebx, [work_key2_esi]
        mov     eax, S2(25)

    ; ----------------------------------------------------
    ; Begin round 3 of key expansion mixed with encryption
    ; ----------------------------------------------------
    ; (second key)

	; A  = %eax  eA = %esi
	; L0 = %ebx  eB = %edi
	; L1 = %edx  .. = %ebp

        lea     ebp, S2(0)              ; S3(N) now addresses the second key's table

	; A = ROTL3(S00 + A + L1);
	; eA = P_0 + A;
	; L0 = ROTL(L0 + A + L1, A + L1);
        add     eax, edx        ; 1
        mov     ecx, edx        ;
        add     eax, S3(0)      ; 1
        rol     eax, 3          ; 1
        mov     esi, [work_P_0] ;
        add     esi, eax        ; 1
        add     ecx, eax        ;
        add     ebx, ecx        ; 1
        add     eax, S3(1)      ;
        rol     ebx, cl         ; 2
	; A = ROTL3(S01 + A + L0);
	; eB = P_1 + A;
	; L1 = ROTL(L1 + A + L0, A + L0);
        add     eax, ebx        ;
        mov     ecx, ebx        ; 1
        rol     eax, 3          ;
        mov     edi, [work_P_1] ; 1
        add     edi, eax        ;
        add     ecx, eax        ; 1
        add     edx, ecx        ; 1
        rol     edx, cl         ; 2
	ROUND_3_EVEN_AND_ODD  2
	ROUND_3_EVEN_AND_ODD  4
	ROUND_3_EVEN_AND_ODD  6
	ROUND_3_EVEN_AND_ODD  8
	ROUND_3_EVEN_AND_ODD 10
	ROUND_3_EVEN_AND_ODD 12
	ROUND_3_EVEN_AND_ODD 14
	ROUND_3_EVEN_AND_ODD 16
	ROUND_3_EVEN_AND_ODD 18
	ROUND_3_EVEN_AND_ODD 20
	ROUND_3_EVEN_AND_ODD 22
	; early exit (same scheme as for the first key)
;align 4
_end_round3_2_k7:
        add     eax, S3(24)     ;       A = ROTL3(S24 + A + L1);
        mov     ecx, edi        ; 1     eA = ROTL(eA ^ eB, eB) + A;
        add     eax, edx        ;
        xor     esi, edi        ; 1
        rol     eax, 3          ;
        rol     esi, cl         ; 2
        add     esi, eax        ; 1

        cmp     esi, [work_C_0]
        jne     __exit_2_k7

        mov     ecx, eax        ; 1     L0 = ROTL(L0 + A + L1, A + L1);
        add     ecx, edx        ;       A = ROTL3(S25 + A + L0);
        xor     edi, esi        ; 1     eB = ROTL(eB ^ eA, eA) + A;
        add     ebx, ecx        ;
        rol     ebx, cl         ; 2
        add     eax, S3(25)     ;
        mov     ecx, esi        ; 1
        add     eax, ebx        ;
        rol     edi, cl         ; 2
        rol     eax, 3          ;
        add     edi, eax        ; 1

        cmp     edi, [work_C_1]
        jne     __exit_2_k7
        mov     dword [work_add_iter], 1        ; match was on the second key of the pair
        jmp     _full_exit_k7

;align 4
; Neither key matched: advance key_hi by two keys. The increment starts in
; the top byte; a carry out of it is handled at _next_inc_k7.
__exit_2_k7:
        mov     edx, [work_key_hi]

; Jumps not taken are faster
        add     edx, 0x02000000
        jc near _next_inc_k7

;align 4
; key_lo unchanged: store the new key_hi, derive the second key's Lhi and
; loop while iterations remain. Otherwise write the current key back to
; the RC5UnitWork structure and leave.
_next_iter_k7:
        mov     [work_key_hi], edx
        lea     edi, [0x01000000+edx]
        dec     dword [work_iterations]
        jg near _loaded_k7
        mov     eax, [RC5UnitWork]                      ; pointer to rc5unitwork
        mov     ebx, [work_key_lo]
        mov     [RC5UnitWork_L0lo], ebx                 ; Update real data
        mov     [RC5UnitWork_L0hi], edx                 ; (used by caller)
        jmp     _full_exit_k7

;align 4
; key_lo changed as well: store both halves and restart from
; _bigger_loop_k7 so that the key_lo dependent values are recomputed.
_next_iter2_k7:
        mov     [work_key_lo], ebx
        mov     [work_key_hi], edx
        lea     edi, [0x01000000+edx]
        mov     esi, ebx
        dec     dword [work_iterations]
        jg near _bigger_loop_k7
        mov     eax, [RC5UnitWork]                      ; pointer to rc5unitwork
        mov     [RC5UnitWork_L0lo], ebx                 ; Update real data
        mov     [RC5UnitWork_L0hi], edx                 ; (used by caller)
        jmp     _full_exit_k7

;align 4
; Carry propagation. The carry out of the top byte is moved into bits
; 16-23, then bits 8-15, then bits 0-7. Each "add 0xFF.." constant both
; undoes the spill of the previous add into the next higher byte and
; increments the next lower byte.
_next_inc_k7:
        add     edx, 0x00010000
        test    edx, 0x00FF0000
        jnz near _next_iter_k7

        add     edx, 0xFF000100
        test    edx, 0x0000FF00
        jnz near _next_iter_k7

        add     edx, 0xFFFF0001
        test    edx, 0x000000FF
        jnz near _next_iter_k7

    ; key_hi wrapped completely: carry on into key_lo in the same way

        mov     ebx, [work_key_lo]

        sub     edx, 0x00000100
        add     ebx, 0x01000000
        jnc near _next_iter2_k7

        add     ebx, 0x00010000
        test    ebx, 0x00FF0000
        jnz near _next_iter2_k7

        add     ebx, 0xFF000100
        test    ebx, 0x0000FF00
        jnz near _next_iter2_k7

        add     ebx, 0xFFFF0001
        test    ebx, 0x000000FF
        jnz near _next_iter2_k7

	; Moo !
	; We have just finished checking the last key
	; of the rc5-64 keyspace...
	; Not much to do here, since we have finished the block ...


;align 4
; Common exit: compute the return value, restore registers and stack.
_full_exit_k7:
mov ebp, [timeslice]
sub ebp, [work_iterations]
mov eax, [work_add_iter]
lea edx, [eax+ebp*2]
mov eax, edx

;    return (timeslice - work.iterations) * 2 + work.add_iter;


      mov ebx, [save_ebx]
      mov esi, [save_esi]
      mov edi, [save_edi]
      mov ebp, [save_ebp]

     add esp, work_size ; restore stack pointer

     ret


