cosmopolitan/libc/nexgen32e/sha1.S
Justine Tunney 957c61cbbf
Release Cosmopolitan v3.3
This change upgrades to GCC 12.3 and GNU binutils 2.42. The GNU linker
appears to have changed things so that only a single de-duplicated str
table is present in the binary, and it gets placed wherever the linker
wants, regardless of what the linker script says. To cope with that we
need to stop using .ident to embed licenses. As such, this change does
significant work to revamp how third party licenses are defined in the
codebase, using `.section .notice,"aR",@progbits`.

This new GCC 12.3 toolchain has support for GNU indirect functions. It
lets us support __target_clones__ for the first time. This is used for
optimizing the performance of libc string functions such as strlen and
friends so far on x86, by ensuring AVX systems favor a second codepath
that uses VEX encoding. It shaves some latency off certain operations.
It's a useful feature to have for scientific computing for the reasons
explained by the test/libcxx/openmp_test.cc example which compiles for
fifteen different microarchitectures. Thanks to the upgrades, it's now
also possible to use newer instruction sets, such as AVX512FP16, VNNI.

Cosmo now uses the %gs register on x86 by default for TLS. Doing it is
helpful for any program that links `cosmo_dlopen()`. Such programs had
to recompile their binaries at startup to change the TLS instructions.
That's not great, since it means every page in the executable needs to
be faulted. The work of rewriting TLS-related x86 opcodes is moved to
fixupobj.com instead. This is great news for MacOS x86 users, since we
previously needed to morph the binary every time for that platform but
now that's no longer necessary. The only platforms where we need fixup
of TLS x86 opcodes at runtime are now Windows, OpenBSD, and NetBSD. On
Windows we morph TLS to point deeper into the TIB, based on a TlsAlloc
assignment, and on OpenBSD/NetBSD we morph %gs back into %fs since the
kernels do not allow us to specify a value for the %gs register.

OpenBSD users are now required to use APE Loader to run Cosmo binaries
and assimilation is no longer possible. OpenBSD kernel needs to change
to allow programs to specify a value for the %gs register, or it needs
to stop marking executable pages loaded by the kernel as mimmutable().

This release fixes __constructor__, .ctor, .init_array, and lastly the
.preinit_array so they behave the exact same way as glibc.

We no longer use hex constants to define math.h symbols like M_PI.
2024-02-20 13:27:59 -08:00

652 lines
14 KiB
ArmAsm

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set noet ft=asm ts=8 sw=8 fenc=utf-8 :vi
Copyright 2014 Intel Corporation
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, │
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY │
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "libc/macros.internal.h"
// Third-party licence notice embedded in the binary through the
// .notice section. The string is continued across source lines with
// a trailing backslash, so every continuation line must start in
// column 0 and each visible line break needs an explicit newline
// escape before the continuation.
	.section .notice,"aR",@progbits
	.asciz	"\n\n\
AVX2 SHA-1 (BSD-3 License)\n\
Copyright 2014 Intel Corporation"
	.previous
// Incoming arguments, in the first three System V AMD64 argument
// registers. The entry point copies them into HASH_PTR, BUFFER_PTR,
// BUFFER_PTR2 and BLOCKS_CTR before the round code runs, because
// %rsi, %rdi and %rdx are reused below as round-state registers.
#define CTX		%rdi	/* arg1 */
#define BUF		%rsi	/* arg2 */
#define CNT		%rdx	/* arg3 */

// Initial homes of the round state, given as the 32-bit name followed
// by the 64-bit name of the same physical register. REGALLOC binds
// the rotating symbols A..E, TA, TB (and RA..RE, RTA, RTB) to these.
#define REG_A		%ecx
#define REG_RA		%rcx
#define REG_B		%esi
#define REG_RB		%rsi
#define REG_C		%edi
#define REG_RC		%rdi
#define REG_D		%eax
#define REG_RD		%rax
#define REG_E		%edx
#define REG_RE		%rdx
#define REG_TA		%r12d
#define REG_RTA		%r12
#define REG_TB		%ebx
#define REG_RTB		%rbx

// Scratch register used by the round macros; it is never rotated.
#define REG_T1		%r11d

// Unaligned vector move used to load the byte-swap shuffle mask.
#define xmm_mov		vmovups

// Selector values understood by RND_FUN.
#define RND_F1		1
#define RND_F2		2
#define RND_F3		3
// REGALLOC
// Binds (or re-binds) the rotating round-state symbols to their
// initial physical registers. Only assembler symbols change; no
// instructions are emitted. Each 32-bit symbol is listed next to the
// 64-bit symbol naming the same register.
.macro REGALLOC
	.set A,   REG_A
	.set RA,  REG_RA

	.set B,   REG_B
	.set RB,  REG_RB

	.set C,   REG_C
	.set RC,  REG_RC

	.set D,   REG_D
	.set RD,  REG_RD

	.set E,   REG_E
	.set RE,  REG_RE

	.set TA,  REG_TA
	.set RTA, REG_RTA

	.set TB,  REG_TB
	.set RTB, REG_RTB

	.set T1,  REG_T1
.endm
// General-purpose registers with a fixed role for the whole function.
#define HASH_PTR	%r9	/* copy of CTX: the five hash words */
#define BLOCKS_CTR	%r8	/* copy of CNT: blocks still to do */
#define BUFFER_PTR	%r10	/* input feeding the low vector lane */
#define BUFFER_PTR2	%r13	/* input feeding the high vector lane */
#define PRECALC_BUF	%r14	/* buffer the PRECALC_* macros store to */
#define WK_BUF		%r15	/* buffer the round macros read from */

// Vector temporaries. W_TMP and WY_TMP are the 128-bit and 256-bit
// names of the same register.
#define W_TMP		%xmm0
#define WY_TMP		%ymm0
#define WY_TMP2		%ymm9

// Shuffle mask loaded from BSWAP_SHUFB_CTL at function entry.
#define YMM_SHUFB_BSWAP	%ymm10

// AVX2 variables: the eight registers cycled through by
// PRECALC_RESET_WY / PRECALC_ROTATE_WY.
#define WY0		%ymm3
#define WY4		%ymm5
#define WY08		%ymm7
#define WY12		%ymm8
#define WY16		%ymm12
#define WY20		%ymm13
#define WY24		%ymm14
#define WY28		%ymm15
/*
* Keep 2 iterations precalculated at a time:
* - 80 DWORDs per iteration * 2
*/
// W_SIZE counts 32-bit words, not bytes: the function prologue
// multiplies it by 4 when it computes RESERVE_STACK.
#define W_SIZE (80*2*2 +16)
// WK(t): memory operand, relative to WK_BUF, of the precomputed dword
// for round t, where t runs over 0..159 (two blocks of 80 rounds).
// The buffer is laid out in 32-byte rows of four rounds each:
//   ((t % 80) / 4) * 32   selects the row,
//   (t % 4) * 4           selects the dword inside a 16-byte half,
//   (t / 80) * 16         selects the half: low for t < 80, high after.
#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
// PRECALC_WK(t): memory operand t*4 bytes past PRECALC_BUF. The
// PRECALC_* macros pass i&~7, so each 32-byte store fills one row.
#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF)
// UPDATE_HASH hash, val
// Folds one working register back into the stored hash state.
//   hash  memory operand holding one hash word
//   val   register holding the matching working variable
// On exit both the memory word and the register hold the sum of the
// two. Flags are modified by the add.
.macro UPDATE_HASH hash, val
add \hash, \val
mov \val, \hash
.endm
// PRECALC_RESET_WY
// Puts the WY_nn schedule symbols back on their initial registers
// (WY0 .. WY28) and makes WY_32 name the same register as WY_00.
// Only assembler symbols change; no instructions are emitted.
.macro PRECALC_RESET_WY
	.set WY_28, WY28
	.set WY_24, WY24
	.set WY_20, WY20
	.set WY_16, WY16
	.set WY_12, WY12
	.set WY_08, WY08
	.set WY_04, WY4
	.set WY_00, WY0

	// must follow the WY_00 assignment it copies
	.set WY_32, WY_00
.endm
// PRECALC_ROTATE_WY
// Assembly-time rotation of the schedule registers: every WY_nn
// symbol takes over the register of its predecessor (WY_04 gets the
// old WY_00, WY_08 the old WY_04, and so on) and WY_00 ends up naming
// the register that WY_28 named before the call. The WY and
// WY_minus_NN aliases used by the PRECALC_* macros are then refreshed.
// Only assembler symbols change; no instructions are emitted.
.macro PRECALC_ROTATE_WY
/* Rotate macros */
// Order matters: each .set reads a symbol that the next line overwrites.
.set WY_32, WY_28
.set WY_28, WY_24
.set WY_24, WY_20
.set WY_20, WY_16
.set WY_16, WY_12
.set WY_12, WY_08
.set WY_08, WY_04
.set WY_04, WY_00
.set WY_00, WY_32
/* Define register aliases */
// WY is the register the current vector iteration writes its result to.
.set WY, WY_00
.set WY_minus_04, WY_04
.set WY_minus_08, WY_08
.set WY_minus_12, WY_12
.set WY_minus_16, WY_16
.set WY_minus_20, WY_20
.set WY_minus_24, WY_24
.set WY_minus_28, WY_28
// Same register as WY: PRECALC_32_79 reads it as the 32-back value
// before overwriting it.
.set WY_minus_32, WY
.endm
// PRECALC_00_15
// One step of the message pre-compute; PRECALC dispatches here while
// i < 32. Reads the assembler symbols i and K_XMM set by PRECALC.
// Eight consecutive values of i form one vector iteration that loads
// 16 input bytes from each of two blocks, byte-shuffles them, adds the
// round constant and stores the 32-byte result. Each value of i emits
// at most one instruction; steps 3, 5 and 6 emit nothing.
// Writes W_TMP/WY_TMP and WY, and stores through PRECALC_BUF.
.macro PRECALC_00_15
.if (i == 0) # Initialize and rotate registers
PRECALC_RESET_WY
PRECALC_ROTATE_WY
.endif
/* message scheduling pre-compute for rounds 0-15 */
.if ((i & 7) == 0)
/*
* blended AVX2 and ALU instruction scheduling
* 1 vector iteration per 8 rounds
*/
// low lane: 16 bytes at offset i*2 of the block at BUFFER_PTR
vmovdqu (i * 2)(BUFFER_PTR), W_TMP
.elseif ((i & 7) == 1)
// high lane: the same 16-byte offset of the block at BUFFER_PTR2
vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
// reorder bytes using the mask held in YMM_SHUFB_BSWAP
vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
.elseif ((i & 7) == 4)
// add the round constant row selected by K_XMM
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
.elseif ((i & 7) == 7)
// store the row for the round macros, then move on to the next register
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
// PRECALC_16_31
// One step of the message pre-compute; PRECALC dispatches here while
// 32 <= i < 64. Reads the assembler symbols i and K_XMM set by PRECALC
// and the WY / WY_minus_NN aliases maintained by PRECALC_ROTATE_WY.
// Eight consecutive values of i form one vector iteration; step 6
// emits nothing. Writes WY, WY_TMP and WY_TMP2, and stores through
// PRECALC_BUF.
.macro PRECALC_16_31
/*
* message scheduling pre-compute for rounds 16-31
* calculating last 32 w[i] values in 8 XMM registers
* pre-calculate K+w[i] values and store to mem
* for later load by ALU add instruction
*
* "brute force" vectorization for rounds 16-31 only
* due to w[i]->w[i-3] dependency
*/
.if ((i & 7) == 0)
/*
* blended AVX2 and ALU instruction scheduling
* 1 vector iteration per 8 rounds
*/
/* w[i-14] */
vpalignr $8, WY_minus_16, WY_minus_12, WY
vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */
.elseif ((i & 7) == 1)
// fold in the 8-back and 16-back registers
vpxor WY_minus_08, WY, WY
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 2)
// WY = xor of all four terms; WY_TMP2 = that value shifted up 12 bytes
vpxor WY_TMP, WY, WY
vpslldq $12, WY, WY_TMP2
.elseif ((i & 7) == 3)
// the two halves of a rotate-left-by-1 of WY
vpslld $1, WY, WY_TMP
vpsrld $31, WY, WY
.elseif ((i & 7) == 4)
// finish the rotate-by-1; start a rotate-left-by-2 of WY_TMP2
vpor WY, WY_TMP, WY_TMP
vpslld $2, WY_TMP2, WY
.elseif ((i & 7) == 5)
vpsrld $30, WY_TMP2, WY_TMP2
vpxor WY, WY_TMP, WY_TMP
.elseif ((i & 7) == 7)
// final value into WY, add the round constant, store the row, rotate
vpxor WY_TMP2, WY_TMP, WY
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
// PRECALC_32_79
// One step of the message pre-compute; PRECALC dispatches here while
// 64 <= i < 160. Reads the assembler symbols i and K_XMM set by
// PRECALC and the WY / WY_minus_NN aliases maintained by
// PRECALC_ROTATE_WY. Eight consecutive values of i form one vector
// iteration; step 6 emits nothing. Writes WY and WY_TMP, and stores
// through PRECALC_BUF.
.macro PRECALC_32_79
/*
* in SHA-1 specification:
* w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
* instead we do equal:
* w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
* allows more efficient vectorization
* since w[i]=>w[i-3] dependency is broken
*/
.if ((i & 7) == 0)
/*
* blended AVX2 and ALU instruction scheduling
* 1 vector iteration per 8 rounds
*/
// w[i-6] term, taken across the two most recent registers
vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
.elseif ((i & 7) == 1)
/* W is W_minus_32 before xor */
vpxor WY_minus_28, WY, WY
.elseif ((i & 7) == 2)
vpxor WY_minus_16, WY_TMP, WY_TMP
.elseif ((i & 7) == 3)
// WY = xor of all four terms
vpxor WY_TMP, WY, WY
.elseif ((i & 7) == 4)
// rotate left by 2, built from shift-left 2 ...
vpslld $2, WY, WY_TMP
.elseif ((i & 7) == 5)
// ... and shift-right 30, merged with or
vpsrld $30, WY, WY
vpor WY, WY_TMP, WY
.elseif ((i & 7) == 7)
// add the round constant, store the row, rotate
vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP
vmovdqu WY_TMP, PRECALC_WK(i&~7)
PRECALC_ROTATE_WY
.endif
.endm
// PRECALC r, s
// Emits step r of the interleaved message-schedule pre-compute.
//   r  step number; published as the assembler symbol i
//   s  optional, forwarded untouched to the selected macro
// K_XMM is set to the byte offset inside K_XMM_AR of the 32-byte
// constant row for this step (a new row every 40 steps, the last row
// for anything from 120 up). The step is then handed to the PRECALC_*
// macro that covers its range; nothing is emitted once i reaches 160.
.macro PRECALC r, s
	.set i, \r

	.if (i >= 120)
		.set K_XMM, 32*3
	.elseif (i >= 80)
		.set K_XMM, 32*2
	.elseif (i >= 40)
		.set K_XMM, 32*1
	.else
		.set K_XMM, 32*0
	.endif

	.if (i >= 160)
		// past the end of the schedule: emit nothing
	.elseif (i >= 64)
		PRECALC_32_79	\s
	.elseif (i >= 32)
		PRECALC_16_31	\s
	.else
		PRECALC_00_15	\s
	.endif
.endm
// ROTATE_STATE
// Assembly-time renaming of the round state after one round, so that
// no register-to-register moves are needed. Afterwards E names the old
// D, D the old C, C the old B, B the old TB, TB the old A and A the
// old E. The second group applies the same renaming to the 64-bit
// names. Only assembler symbols change; no instructions are emitted.
.macro ROTATE_STATE
// 32-bit names. T_REG is a scratch symbol; the order of these lines matters.
.set T_REG, E
.set E, D
.set D, C
.set C, B
.set B, TB
.set TB, A
.set A, T_REG
// 64-bit names, kept in step with the 32-bit ones.
.set T_REG, RE
.set RE, RD
.set RD, RC
.set RC, RB
.set RB, RTB
.set RTB, RA
.set RA, T_REG
.endm
// RND_FUN f, r
// Expands to the round macro selected by f for round r.
//   f  one of RND_F1, RND_F2, RND_F3 (RR passes its ROUND_FUNC symbol)
//   r  round number, forwarded unchanged
// Any other value of f expands to nothing. The three tests are
// mutually exclusive, so at most one round macro is expanded.
.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.endif
	.if (\f == RND_F2)
		ROUND_F2	\r
	.endif
	.if (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm
// RR r
// Emits two consecutive rounds, r and r+1, each followed by
// ROTATE_STATE. The assembler symbols round_id (round number modulo
// 80) and ROUND_FUNC (current RND_Fx selector) carry state from one
// invocation to the next.
//
// Every ROUND_Fx macro computes the F value for the round after it,
// so the very first round of a block needs F prepared here, and the
// selector is switched one round early: after rounds 18, 38 and 58.
//
// NOTE: callers pass expressions such as j+80 without parentheses, so
// the first round_id below expands to (j+80 % 80). That still equals
// j only because 80 % 80 is 0.
.macro RR r
.set round_id, (\r % 80)
.if (round_id == 0) # Precalculate F for first round
.set ROUND_FUNC, RND_F1
// TB = (B & C) ^ (~B & D), using the value of B before it is rotated
mov B, TB
rorx $(32-30), B, B # b>>>2
andn D, TB, T1
and C, TB
xor T1, TB
.endif
RND_FUN ROUND_FUNC, \r
ROTATE_STATE
// Choose the function used from the second round of this pair onwards.
.if (round_id == 18)
.set ROUND_FUNC, RND_F2
.elseif (round_id == 38)
.set ROUND_FUNC, RND_F3
.elseif (round_id == 58)
.set ROUND_FUNC, RND_F2
.endif
// round_id is read by ROUND_F2 to detect the last round of a block.
.set round_id, ( (\r+1) % 80)
RND_FUN ROUND_FUNC, (\r+1)
ROTATE_STATE
.endm
// ROUND_F1 r
// One round that prepares an F1 value for the following round.
// On entry TB holds the F value computed by the previous round. The
// macro adds WK(r), that F value and A rotated left by 5 into E,
// leaves A rotated left by 30 in TB and the next F value in A, and
// expands one PRECALC step in the middle. ROTATE_STATE, which RR runs
// afterwards, renames TB to B and A to TB for the next round.
// Clobbers T1, TA and flags.
.macro ROUND_F1 r
add WK(\r), E
andn C, A, T1 # ~b&d
lea (RE,RTB), E # Add F from the previous round
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30),A, TB # b>>>2 for next round
PRECALC (\r) # msg scheduling for next 2 blocks
// Calculate F for the next round
// (b & c) ^ andn[b, d]
and B, A # b&c
xor T1, A # F1 = (b&c) ^ (~b&d)
lea (RE,RTA), E # E += A >>> 5
.endm
// ROUND_F2 r
// One round that prepares an F2 value (A ^ B ^ C, built by the two
// xor instructions) for the following round. On entry TB holds the F
// value computed by the previous round. The macro adds WK(r), that F
// value and A rotated left by 5 into E and expands one PRECALC step.
// The work done only for the benefit of the next round is skipped
// when round_id is 79, which is the last round of a block.
// Clobbers TA and flags.
.macro ROUND_F2 r
add WK(\r), E
lea (RE,RTB), E # Add F from the previous round
/* Calculate F for the next round */
rorx $(32-5), A, TA # T2 = A >>> 5
.if ((round_id) < 79)
rorx $(32-30), A, TB # b>>>2 for next round
.endif
PRECALC (\r) # msg scheduling for next 2 blocks
.if ((round_id) < 79)
xor B, A
.endif
add TA, E # E += A >>> 5
.if ((round_id) < 79)
xor C, A
.endif
.endm
// ROUND_F3 r
// One round that prepares an F3 value for the following round.
// On entry TB holds the F value computed by the previous round. The
// macro adds WK(r), that F value and A rotated left by 5 into E,
// leaves A rotated left by 30 in TB and the next F value in A, and
// expands one PRECALC step. Clobbers T1, TA and flags.
.macro ROUND_F3 r
add WK(\r), E
PRECALC (\r) # msg scheduling for next 2 blocks
lea (RE,RTB), E # Add F from the previous round
// T1 = A | B, combined with C below
mov B, T1
or A, T1
rorx $(32-5), A, TA # T2 = A >>> 5
rorx $(32-30), A, TB # b>>>2 for next round
// Calculate F for the next round
// (b and c) or (d and (b or c))
and C, T1
and B, A
or T1, A
add TA, E # E += A >>> 5
.endm
// ADD_IF_GE a, b, c, d
// Conditionally advances a register without branching:
//   if (b >= c) a += d        (signed comparison)
//   a  register to advance
//   b  register compared against c
//   c  immediate threshold
//   d  immediate amount to add
// Uses RTA as a temporary and modifies flags.
.macro ADD_IF_GE a, b, c, d
mov \a, RTA
add $\d, RTA
cmp $\c, \b
cmovge RTA, \a
.endm
// Performs 80 rounds of SHA-1 for multiple blocks with s/w pipelining
//
// Expects HASH_PTR, BUFFER_PTR, BUFFER_PTR2, BLOCKS_CTR and
// YMM_SHUFB_BSWAP to be initialised and %rsp to point at the scratch
// area reserved by the caller. Blocks are handled in pairs: while the
// scalar rounds of one pair read their precomputed values through
// WK_BUF, the PRECALC steps embedded in those rounds fill the other
// buffer through PRECALC_BUF for the next pair. The two buffer
// pointers are exchanged after every pair.
//
// The local labels defined here are not made unique, so the macro can
// be expanded only once per file.
.macro SHA1_PIPELINED_MAIN_BODY
REGALLOC
mov (HASH_PTR), A
mov 4(HASH_PTR), B
mov 8(HASH_PTR), C
mov 12(HASH_PTR), D
mov 16(HASH_PTR), E
// Two buffers on the stack: the first at %rsp, the second
// 2*4*80+32 bytes above it.
mov %rsp, PRECALC_BUF
lea (2*4*80+32)(%rsp), WK_BUF
// Precalc WK for first 2 blocks
// BUFFER_PTR2 moves on to the second block only when there is one.
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
.set i, 0
.rept 160
PRECALC i
.set i, i + 1
.endr
// Go to next block if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
// The buffer just filled becomes the one the rounds read from.
xchg WK_BUF, PRECALC_BUF
.balign 32
.L_loop:
// code loops through more than one block
// the loop ends once BLOCKS_CTR has reached zero
// NOTE(review): this comment used to describe a K_BASE register set
// by cmovae; neither appears anywhere in this file.
test BLOCKS_CTR, BLOCKS_CTR
jnz .L_begin
.balign 32
jmp .L_end
.balign 32
.L_begin:
// process first block
// rounds: 0,2,4,6,8
.set j, 0
.rept 5
RR j
.set j, j+2
.endr
jmp .L_loop0
.L_loop0:
// rounds
// 10,12,14,16,18
// 20,22,24,26,28
// 30,32,34,36,38
// 40,42,44,46,48
// 50,52,54,56,58
.rept 25
RR j
.set j, j+2
.endr
// Update counter
sub $1, BLOCKS_CTR
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
// rounds
// 60,62,64,66,68
// 70,72,74,76,78
.rept 10
RR j
.set j, j+2
.endr
// After 80 rounds the rotated symbol TB names the register that
// holds the second state word, hence TB rather than B below.
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
// No second block in this pair: go back to the loop test, which exits.
test BLOCKS_CTR, BLOCKS_CTR
jz .L_loop
mov TB, B
// process second block
// 0+80, 2+80, 4+80, 6+80, 8+80
// 10+80,12+80,14+80,16+80,18+80
.set j, 0
.rept 10
RR j+80
.set j, j+2
.endr
jmp .L_loop1
.L_loop1:
// rounds
// 20+80,22+80,24+80,26+80,28+80
// 30+80,32+80,34+80,36+80,38+80
.rept 10
RR j+80
.set j, j+2
.endr
jmp .L_loop2
.L_loop2:
// rounds
// 40+80,42+80,44+80,46+80,48+80
// 50+80,52+80,54+80,56+80,58+80
.rept 10
RR j+80
.set j, j+2
.endr
// update counter
sub $1, BLOCKS_CTR
// Move to the next block only if needed
ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
jmp .L_loop3
.L_loop3:
// rounds
// 60+80,62+80,64+80,66+80,68+80
// 70+80,72+80,74+80,76+80,78+80
.rept 10
RR j+80
.set j, j+2
.endr
UPDATE_HASH (HASH_PTR), A
UPDATE_HASH 4(HASH_PTR), TB
UPDATE_HASH 8(HASH_PTR), C
UPDATE_HASH 12(HASH_PTR), D
UPDATE_HASH 16(HASH_PTR), E
/* Reset state for AVX2 reg permutation */
// Move the values into the registers that REGALLOC is about to bind
// the names to again, so the next pass starts from the initial layout.
mov A, TA
mov TB, A
mov C, TB
mov E, C
mov D, B
mov TA, D
REGALLOC
xchg WK_BUF, PRECALC_BUF
jmp .L_loop
.balign 32
.L_end:
.endm
.section .rodata
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6
.balign 128
K_XMM_AR:
.long K1,K1,K1,K1
.long K1,K1,K1,K1
.long K2,K2,K2,K2
.long K2,K2,K2,K2
.long K3,K3,K3,K3
.long K3,K3,K3,K3
.long K4,K4,K4,K4
.long K4,K4,K4,K4
BSWAP_SHUFB_CTL:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
.text
// Performs Intel® AVX2 optimized SHA-1 update.
//
// This implementation is based on the previous SSSE3 release:
// Visit http://software.intel.com/en-us/articles/ and refer
// to improving-the-performance-of-the-secure-hash-algorithm-1/
//
// Updates 20-byte SHA-1 record at start of 'state', from 'input',
// for even number of 'blocks' consecutive 64-byte blocks.
//
// NOTE(review): the main loop also leaves after the first block of a
// pair when the counter reaches zero, so an odd count looks supported;
// confirm before relying on it.
//
// void sha1_transform_avx2(struct sha1_state *state,
// const uint8_t *input,
// int blocks);
//
// NOTE(review): the count is copied and tested as the full 64-bit
// %rdx, so callers presumably pass a value whose upper half is clean;
// verify against the C prototype in use.
//
// @param %rdi points to output digest
// @param %rsi points to input data
// @param %rdx is number of 64-byte blocks to process
// @see X86_HAVE(SHA)
//
// NOTE(review): the body uses rorx, andn and 256-bit integer vector
// instructions, so the feature test callers need is presumably
// AVX2 plus BMI1/BMI2 rather than SHA; confirm at the call site.
//
// Saves and restores rbx, rbp and r12-r15. Every other register used
// by the round code is caller-saved in the System V AMD64 ABI.
.ftrace1
sha1_transform_avx2:
.ftrace2
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
// Scratch area size in bytes; W_SIZE is a count of 32-bit words.
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
// Remember the unaligned stack pointer, round %rsp down to a multiple
// of 32, and keep the remembered value on the new stack so that the
// epilogue can restore it with pop %rsp.
mov %rsp,%rbx
and $~(0x20-1),%rsp
push %rbx
sub $RESERVE_STACK,%rsp
vzeroupper
/* Setup initial values */
// The arguments are copied out of %rdi, %rsi and %rdx because the
// round code reuses those registers as working state.
mov CTX,HASH_PTR
mov BUF,BUFFER_PTR
mov BUF,BUFFER_PTR2
mov CNT,BLOCKS_CTR
xmm_mov BSWAP_SHUFB_CTL(%rip),YMM_SHUFB_BSWAP
SHA1_PIPELINED_MAIN_BODY
vzeroupper
add $RESERVE_STACK,%rsp
// Back to the stack pointer saved before alignment.
pop %rsp
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
ret
.endfn sha1_transform_avx2,globl