;---------------------
; https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
;
; License information:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; This implementation notably advances the performance of SHA-1 algorithm compared to existing
; implementations. We are encouraging all projects utilizing SHA-1 to integrate this new fast
; implementation and are ready to help if issues or concerns arise (you are welcome to leave
; a comment or write an email to the authors). It is provided 'as is' and free for either
; commercial or non-commercial use.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Syntax: NASM (Intel operand order), 64-bit mode, RIP-relative addressing by default.
;
; This code implements two interfaces of SHA-1 update function: 1) working on a single
; 64-byte block and 2) working on a buffer of multiple 64-byte blocks. Multiple blocks
; version of code is software pipelined and faster overall, it is a default. Assemble
; with -DINTEL_SHA1_SINGLEBLOCK to select single 64-byte block function interface.
;
; C++ prototypes of implemented functions are below:
;
; #ifndef INTEL_SHA1_SINGLEBLOCK
;    // Updates 20-byte SHA-1 record in 'hash' for 'num_blocks' consecutive 64-byte blocks.
;    // The SSSE3 code path returns immediately, touching nothing, when num_blocks == 0.
;    extern "C" void sha1_update_intel(int *hash, const char* input, size_t num_blocks );
; #else
;    // Updates 20-byte SHA-1 record in 'hash' for one 64-byte block pointed by 'input'
;    extern "C" void sha1_update_intel(int *hash, const char* input);
; #endif
;
; Function name 'sha1_update_intel' can be changed in the source or via macro:
;     -DINTEL_SHA1_UPDATE_FUNCNAME=my_sha1_update_func_name
;
; It implements both UNIX(default) and Windows ABIs, use -DWIN_ABI on Windows
;
; Code checks CPU for SSSE3 support via CPUID feature flag (CPUID.1.ECX.SSSE3[bit 9]==1),
; and performs dispatch. Since in most cases the functionality on non-SSSE3 supporting CPUs
; is also required, the default (e.g. one being replaced) function can be provided for
; dispatch on such CPUs, the name of old function can be changed in the source or via macro:
;      -DINTEL_SHA1_UPDATE_DEFAULT_DISPATCH=default_sha1_update_function_name
;
; Authors: Maxim Locktyukhin and Ronen Zohar at Intel.com
;

%ifndef INTEL_SHA1_UPDATE_DEFAULT_DISPATCH
    ;; can be replaced with a default SHA-1 update function name
    %define INTEL_SHA1_UPDATE_DEFAULT_DISPATCH  sha1_intel_non_ssse3_cpu_stub_
%else
    extern  INTEL_SHA1_UPDATE_DEFAULT_DISPATCH
%endif

;; provide alternative SHA-1 update function's name here
%ifndef INTEL_SHA1_UPDATE_FUNCNAME
    %define INTEL_SHA1_UPDATE_FUNCNAME          sha1_update_intel
%endif

global INTEL_SHA1_UPDATE_FUNCNAME

;; multiblock = 1 selects the (hash, input, num_blocks) interface, 0 the single block one
%ifndef INTEL_SHA1_SINGLEBLOCK
    %assign multiblock 1
%else
    %assign multiblock 0
%endif

bits 64
default rel

;; integer argument registers of the selected calling convention
%ifdef WIN_ABI
    %xdefine arg1 rcx
    %xdefine arg2 rdx
    %xdefine arg3 r8
%else
    %xdefine arg1 rdi
    %xdefine arg2 rsi
    %xdefine arg3 rdx
%endif

%xdefine ctx arg1                       ;; pointer to the five 32-bit hash words
%xdefine buf arg2                       ;; pointer to the input block(s)
%xdefine cnt arg3                       ;; number of 64-byte blocks (multi-block interface only)

;; Working registers of the round code.
;; A..E hold the five SHA-1 state words, T1/T2 are scratch.
;; These alias the argument registers, so arguments must be copied away
;; (to HASH_PTR / BUFFER_PTR / BUFFER_END) before REGALLOC names are used.
%macro REGALLOC 0
    %xdefine A  ecx
    %xdefine B  esi
    %xdefine C  edi
    %xdefine D  ebp
    %xdefine E  edx
    %xdefine T1 eax
    %xdefine T2 ebx
%endmacro

%xdefine K_BASE     r8                  ;; address of K_XMM_AR; doubles as "last block" sentinel
%xdefine HASH_PTR   r9                  ;; saved copy of ctx
%xdefine BUFFER_PTR r10                 ;; block the message schedule is currently read from
%xdefine BUFFER_END r11                 ;; one past the last input block (multi-block only)

%xdefine W_TMP  xmm0
%xdefine W_TMP2 xmm9

;; eight XMM registers holding the last 32 message schedule words
%xdefine W0  xmm1
%xdefine W4  xmm2
%xdefine W8  xmm3
%xdefine W12 xmm4
%xdefine W16 xmm5
%xdefine W20 xmm6
%xdefine W24 xmm7
%xdefine W28 xmm8

%xdefine XMM_SHUFB_BSWAP xmm10          ;; pshufb control loaded from bswap_shufb_ctl

;; we keep a window of 16 pre-calculated w[i]+K values (64 bytes) in a circular
;; buffer at the bottom of the stack frame; t is the round number
%xdefine WK(t) (rsp + (t & 15)*4)

;------------------------------------------------------------------------------
;
; macro implements SHA-1 function's body for single or several 64-byte blocks
; first param: function's name
; second param: =0 - function implements single 64-byte block hash
;               =1 - function implements multiple 64-byte blocks hash
;                    3rd function's argument is the number of 64-byte blocks to
;                    calc hash for; zero returns immediately without reading
;                    the input or writing the hash
;
; In:    ctx = hash state, buf = input, cnt = block count (second param = 1 only)
; Saves: rbx, rbp; with WIN_ABI also rdi, rsi and xmm6-xmm10
; Stack: 64 bytes WK window at [rsp], with WIN_ABI 5*16 bytes of XMM save area
;        above it, plus 8 bytes of padding. Together with the pushes and the
;        return address this keeps rsp 16-byte aligned for the movdqa accesses,
;        assuming the caller followed the ABI stack alignment rule.
;
%macro SHA1_VECTOR_ASM 2
align 4096
%1:
    %if (%2 == 1)
        ;; nothing has been pushed yet, so an empty request can go straight
        ;; to the final ret; without this check one block would be hashed
        test    cnt, cnt
        jz      near %%_no_blocks
    %endif

        push    rbx
        push    rbp

    %ifdef WIN_ABI
        push    rdi
        push    rsi

        %xdefine stack_size (16*4 + 16*5 + 8)
    %else
        %xdefine stack_size (16*4 + 8)
    %endif

        sub     rsp, stack_size

    %ifdef WIN_ABI
        %xdefine xmm_save_base (rsp + 16*4)

        xmm_mov [xmm_save_base + 0*16], xmm6
        xmm_mov [xmm_save_base + 1*16], xmm7
        xmm_mov [xmm_save_base + 2*16], xmm8
        xmm_mov [xmm_save_base + 3*16], xmm9
        xmm_mov [xmm_save_base + 4*16], xmm10
    %endif

        ;; move the arguments out of the registers reused as A..E
        mov     HASH_PTR, ctx
        mov     BUFFER_PTR, buf

    %if (%2 == 1)
        shl     cnt, 6                  ;; mul by 64
        add     cnt, buf
        mov     BUFFER_END, cnt         ;; BUFFER_END = buf + 64*num_blocks
    %endif

        lea     K_BASE, [K_XMM_AR]
        xmm_mov XMM_SHUFB_BSWAP, [bswap_shufb_ctl]

        SHA1_PIPELINED_MAIN_BODY %2

    %ifdef WIN_ABI
        xmm_mov xmm6, [xmm_save_base + 0*16]
        xmm_mov xmm7, [xmm_save_base + 1*16]
        xmm_mov xmm8, [xmm_save_base + 2*16]
        xmm_mov xmm9, [xmm_save_base + 3*16]
        xmm_mov xmm10,[xmm_save_base + 4*16]
    %endif

        add     rsp, stack_size

    %ifdef WIN_ABI
        pop     rsi
        pop     rdi
    %endif

        pop     rbp
        pop     rbx

%%_no_blocks:
        ret
%endmacro

;--------------------------------------------
; macro implements 80 rounds of SHA-1, for one 64-byte block or multiple blocks with s/w pipelining
; macro param: =0 - process single 64-byte block
;              =1 - multiple blocks
;
; Expects HASH_PTR, BUFFER_PTR, K_BASE, XMM_SHUFB_BSWAP (and BUFFER_END for the
; multi-block form) to be set up, and the WK window at [rsp] to be available.
;
%macro SHA1_PIPELINED_MAIN_BODY 1

        REGALLOC

        mov     A, [HASH_PTR   ]
        mov     B, [HASH_PTR+ 4]
        mov     C, [HASH_PTR+ 8]
        mov     D, [HASH_PTR+12]

        mov     E, [HASH_PTR+16]

        ;; prime the pipeline: schedule the first W_PRECALC_AHEAD words up front
    %assign i 0
    %rep    W_PRECALC_AHEAD
        W_PRECALC i
        %assign i i+1
    %endrep

    %xdefine F F1

    %if (%1 == 1)                       ;; code loops through more than one block
%%_loop:
        cmp     BUFFER_PTR, K_BASE      ;; we use K_BASE value as a signal of a last block,
        jne     %%_begin                ;; it is set below by: cmovae BUFFER_PTR, K_BASE
        jmp     %%_end

align 32
%%_begin:
    %endif
        RR A,B,C,D,E,0
        RR D,E,A,B,C,2
        RR B,C,D,E,A,4
        RR E,A,B,C,D,6
        RR C,D,E,A,B,8

        RR A,B,C,D,E,10
        RR D,E,A,B,C,12
        RR B,C,D,E,A,14
        RR E,A,B,C,D,16
        RR C,D,E,A,B,18

    %xdefine F F2

        RR A,B,C,D,E,20
        RR D,E,A,B,C,22
        RR B,C,D,E,A,24
        RR E,A,B,C,D,26
        RR C,D,E,A,B,28

        RR A,B,C,D,E,30
        RR D,E,A,B,C,32
        RR B,C,D,E,A,34
        RR E,A,B,C,D,36
        RR C,D,E,A,B,38

    %xdefine F F3

        RR A,B,C,D,E,40
        RR D,E,A,B,C,42
        RR B,C,D,E,A,44
        RR E,A,B,C,D,46
        RR C,D,E,A,B,48

        RR A,B,C,D,E,50
        RR D,E,A,B,C,52
        RR B,C,D,E,A,54
        RR E,A,B,C,D,56
        RR C,D,E,A,B,58

    %xdefine F F4

    ;; From round 64 on, W_PRECALC starts scheduling the NEXT block, so the
    ;; block pointer is advanced here. After the final block it is replaced
    ;; by K_BASE: the look-ahead loads then read the 64-byte constant table
    ;; instead of memory past the input, and the loop test above exits.
    %if (%1 == 1)                       ;; if code loops through more than one block
        add     BUFFER_PTR, 64          ;; move to next 64-byte block
        cmp     BUFFER_PTR, BUFFER_END  ;; check if current block is the last one
        cmovae  BUFFER_PTR, K_BASE      ;; smart way to signal the last iteration
    %else
        %xdefine W_NO_TAIL_PRECALC 1    ;; no software pipelining for single block interface
    %endif

        RR A,B,C,D,E,60
        RR D,E,A,B,C,62
        RR B,C,D,E,A,64
        RR E,A,B,C,D,66
        RR C,D,E,A,B,68

        RR A,B,C,D,E,70
        RR D,E,A,B,C,72
        RR B,C,D,E,A,74
        RR E,A,B,C,D,76
        RR C,D,E,A,B,78

        UPDATE_HASH [HASH_PTR   ],A
        UPDATE_HASH [HASH_PTR+ 4],B
        UPDATE_HASH [HASH_PTR+ 8],C
        UPDATE_HASH [HASH_PTR+12],D
        UPDATE_HASH [HASH_PTR+16],E

    %if (%1 == 1)
        jmp     %%_loop

align 32
%%_end:
    %endif

    %xdefine W_NO_TAIL_PRECALC 0
    %xdefine F %error                   ;; any later use of F outside this body is an error
%endmacro

;; Round functions. Each takes the three state words (b, c, d) and
;; returns its result in T1.

;; F1: ((c xor d) and b) xor d
%macro F1 3
        mov     T1,%2
        xor     T1,%3
        and     T1,%1
        xor     T1,%3
%endmacro

;; F2: b xor c xor d
%macro F2 3
        mov     T1,%3
        xor     T1,%2
        xor     T1,%1
%endmacro

;; F3: ((b or c) and d) or (b and c); also clobbers T2
%macro F3 3
        mov     T1,%2
        mov     T2,%1
        or      T1,%1
        and     T2,%2
        and     T1,%3
        or      T1,T2
%endmacro

%define F4 F2

;; UPDATE_HASH mem, reg: reg += mem, then mem = reg
%macro UPDATE_HASH 2
        add     %2, %1
        mov     %1, %2
%endmacro

;; W_PRECALC t: emits the slice of message scheduling work that belongs to
;; word index t. Indices 80..(80 + W_PRECALC_AHEAD - 1) wrap around to the first
;; words of the next block and are skipped when W_NO_TAIL_PRECALC is set.
%macro W_PRECALC 1
    %xdefine i (%1)

    ;; byte offset of the round constant row inside K_XMM_AR
    %if (i < 20)
        %xdefine K_XMM  0
    %elif (i < 40)
        %xdefine K_XMM  16
    %elif (i < 60)
        %xdefine K_XMM  32
    %else
        %xdefine K_XMM  48
    %endif

    %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))

        %if (W_NO_TAIL_PRECALC == 0)

            %xdefine i ((%1) % 80)      ;; pre-compute for the next iteration

            %if (i == 0)
                W_PRECALC_RESET
            %endif

            W_PRECALC_00_15
        %endif

    %elif (i < 32)
        W_PRECALC_16_31
    %elif (i < 80)                      ;; rounds 32-79
        W_PRECALC_32_79
    %endif
%endmacro

;; bind the rotating W names to their initial XMM registers
%macro W_PRECALC_RESET 0
    %xdefine W          W0
    %xdefine W_minus_04 W4
    %xdefine W_minus_08 W8
    %xdefine W_minus_12 W12
    %xdefine W_minus_16 W16
    %xdefine W_minus_20 W20
    %xdefine W_minus_24 W24
    %xdefine W_minus_28 W28
    %xdefine W_minus_32 W
%endmacro

;; rename the W registers by one position (assembly time only, no code emitted)
%macro W_PRECALC_ROTATE 0
    %xdefine W_minus_32 W_minus_28
    %xdefine W_minus_28 W_minus_24
    %xdefine W_minus_24 W_minus_20
    %xdefine W_minus_20 W_minus_16
    %xdefine W_minus_16 W_minus_12
    %xdefine W_minus_12 W_minus_08
    %xdefine W_minus_08 W_minus_04
    %xdefine W_minus_04 W
    %xdefine W          W_minus_32
%endmacro

%xdefine W_PRECALC_AHEAD   16
%xdefine W_NO_TAIL_PRECALC 0

%xdefine xmm_mov           movdqa

%macro W_PRECALC_00_15 0
    ;; message scheduling pre-compute for rounds 0-15
    %if ((i & 3) == 0)                  ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
        movdqu  W_TMP, [BUFFER_PTR + (i * 4)]
    %elif ((i & 3) == 1)
        pshufb  W_TMP, XMM_SHUFB_BSWAP
        movdqa  W, W_TMP
    %elif ((i & 3) == 2)
        paddd   W_TMP, [K_BASE]
    %elif ((i & 3) == 3)
        movdqa  [WK(i&~3)], W_TMP

        W_PRECALC_ROTATE
    %endif
%endmacro

%macro W_PRECALC_16_31 0
    ;; message scheduling pre-compute for rounds 16-31
    ;; calculating last 32 w[i] values in 8 XMM registers
    ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
    ;;
    ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
    ;;
    %if ((i & 3) == 0)                  ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
        movdqa  W, W_minus_12
        palignr W, W_minus_16, 8        ;; w[i-14]
        movdqa  W_TMP, W_minus_04
        psrldq  W_TMP, 4                ;; w[i-3]
        pxor    W, W_minus_08
    %elif ((i & 3) == 1)
        pxor    W_TMP, W_minus_16
        pxor    W, W_TMP
        movdqa  W_TMP2, W
        movdqa  W_TMP, W
        pslldq  W_TMP2, 12
    %elif ((i & 3) == 2)
        psrld   W, 31
        pslld   W_TMP, 1
        por     W_TMP, W
        movdqa  W, W_TMP2
        psrld   W_TMP2, 30
        pslld   W, 2
    %elif ((i & 3) == 3)
        pxor    W_TMP, W
        pxor    W_TMP, W_TMP2
        movdqa  W, W_TMP
        paddd   W_TMP, [K_BASE + K_XMM]
        movdqa  [WK(i&~3)],W_TMP

        W_PRECALC_ROTATE
    %endif
%endmacro

%macro W_PRECALC_32_79 0
    ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
    ;; instead we do equal:    w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
    ;; allows more efficient vectorization since w[i]=>w[i-3] dependency is broken
    ;;
    %if ((i & 3) == 0)                  ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
        movdqa  W_TMP, W_minus_04
        pxor    W, W_minus_28           ;; W is W_minus_32 before xor
        palignr W_TMP, W_minus_08, 8
    %elif ((i & 3) == 1)
        pxor    W, W_minus_16
        pxor    W, W_TMP
        movdqa  W_TMP, W
    %elif ((i & 3) == 2)
        psrld   W, 30
        pslld   W_TMP, 2
        por     W_TMP, W
    %elif ((i & 3) == 3)
        movdqa  W, W_TMP
        paddd   W_TMP, [K_BASE + K_XMM]
        movdqa  [WK(i&~3)],W_TMP

        W_PRECALC_ROTATE
    %endif
%endmacro

%macro RR 6                             ;; RR does two rounds of SHA-1 back to back with W pre-calculation

    ;;     TEMP = A
    ;;     A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
    ;;     C = ROTATE_LEFT( B, 30 )
    ;;     D = C
    ;;     E = D
    ;;     B = TEMP
    ;;
    ;; The first five parameters are the registers currently playing the roles
    ;; a, b, c, d, e; the sixth is the (even) round number. No values are moved
    ;; between registers: the caller permutes the parameter order on the next
    ;; invocation instead. The instruction order below is hand scheduled,
    ;; keep it as is.

        W_PRECALC (%6 + W_PRECALC_AHEAD)
        F       %2, %3, %4              ;; F returns result in T1
        add     %5, [WK(%6)]
        rol     %2, 30
        mov     T2, %1
        add     %4, [WK(%6 + 1)]
        rol     T2, 5
        add     %5, T1

        W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
        add     T2, %5
        mov     %5, T2
        rol     T2, 5
        add     %4, T2
        F       %1, %2, %3              ;; F returns result in T1
        add     %4, T1
        rol     %1, 30

;; write:  %1, %2
;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
%endmacro

;;----------------------
section .data align=128

%xdefine K1 0x5a827999
%xdefine K2 0x6ed9eba1
%xdefine K3 0x8f1bbcdc
%xdefine K4 0xca62c1d6

;; round constants, one 16-byte row per group of 20 rounds (64 bytes in total)
align 128
K_XMM_AR:
        DD K1, K1, K1, K1
        DD K2, K2, K2, K2
        DD K3, K3, K3, K3
        DD K4, K4, K4, K4

;; pshufb control that reverses the byte order inside each 32-bit lane
align 16
bswap_shufb_ctl:
        DD 00010203h
        DD 04050607h
        DD 08090a0bh
        DD 0c0d0e0fh

;; dispatch pointer, points to the init routine for the first invocation
sha1_update_intel_dispatched:
        DQ      sha1_update_intel_init_

;;----------------------
section .text align=4096

SHA1_VECTOR_ASM     sha1_update_intel_ssse3_, multiblock

align 32
sha1_update_intel_init_:                ;; we get here with the first time invocation
        call    sha1_update_intel_dispatch_init_
INTEL_SHA1_UPDATE_FUNCNAME:             ;; we get here after init
        jmp     qword [sha1_update_intel_dispatched]

;; CPUID feature flag based dispatch
;; Stores the address of the implementation to use in
;; sha1_update_intel_dispatched. All registers written here, including those
;; written by cpuid, are saved and restored, so the caller's argument
;; registers reach the selected function untouched.
sha1_update_intel_dispatch_init_:
        push    rax
        push    rbx
        push    rcx
        push    rdx
        push    rsi

        lea     rsi, [INTEL_SHA1_UPDATE_DEFAULT_DISPATCH]

        mov     eax, 1
        cpuid

        test    ecx, 0200h              ;; SSSE3 support, CPUID.1.ECX[bit 9]
        jz      .done

        lea     rsi, [sha1_update_intel_ssse3_]

.done:
        mov     [sha1_update_intel_dispatched], rsi

        pop     rsi
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
        ret

;;----------------------
;; in the case a default SHA-1 update function implementation was not provided
;; and code was invoked on a non-SSSE3 supporting CPU, dispatch handles this
;; failure in a safest way - jumps to the stub function with UD2 instruction below
sha1_intel_non_ssse3_cpu_stub_:
        ud2                             ;; in the case no default SHA-1 was provided non-SSSE3 CPUs safely fail here
        ret

;; ELF only: declare that this object does not need an executable stack,
;; otherwise GNU ld marks the stack of the whole program executable
%ifidn __OUTPUT_FORMAT__, elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif

; END
;----------------------