cosmopolitan/libc/dlopen/foreign_tramp.S
Justine Tunney 957c61cbbf
Release Cosmopolitan v3.3
This change upgrades to GCC 12.3 and GNU binutils 2.42. The GNU linker
appears to have changed things so that only a single de-duplicated
string table is present in the binary, and it gets placed wherever the
wants, regardless of what the linker script says. To cope with that we
need to stop using .ident to embed licenses. As such, this change does
significant work to revamp how third party licenses are defined in the
codebase, using `.section .notice,"aR",@progbits`.

This new GCC 12.3 toolchain has support for GNU indirect functions. It
lets us support __target_clones__ for the first time. This is used for
optimizing the performance of libc string functions such as strlen and
friends so far on x86, by ensuring AVX systems favor a second codepath
that uses VEX encoding. It shaves some latency off certain operations.
It's a useful feature to have for scientific computing for the reasons
explained by the test/libcxx/openmp_test.cc example which compiles for
fifteen different microarchitectures. Thanks to the upgrades, it's now
also possible to use newer instruction sets, such as AVX512FP16, VNNI.

Cosmo now uses the %gs register on x86 by default for TLS. Doing it is
helpful for any program that links `cosmo_dlopen()`. Such programs had
to recompile their binaries at startup to change the TLS instructions.
That's not great, since it means every page in the executable needs to
be faulted. The work of rewriting TLS-related x86 opcodes, is moved to
fixupobj.com instead. This is great news for macOS x86 users, since we
previously needed to morph the binary every time for that platform but
now that's no longer necessary. The only platforms where we need fixup
of TLS x86 opcodes at runtime are now Windows, OpenBSD, and NetBSD. On
Windows we morph TLS to point deeper into the TIB, based on a TlsAlloc
assignment, and on OpenBSD/NetBSD we morph %gs back into %fs since the
kernels do not allow us to specify a value for the %gs register.

OpenBSD users are now required to use APE Loader to run Cosmo binaries
and assimilation is no longer possible. OpenBSD kernel needs to change
to allow programs to specify a value for the %gs register, or it needs
to stop marking executable pages loaded by the kernel as mimmutable().

This release fixes __constructor__, .ctor, .init_array, and lastly the
.preinit_array so they behave the exact same way as glibc.

We no longer use hex constants to define math.h symbols like M_PI.
2024-02-20 13:27:59 -08:00

237 lines
6 KiB
ArmAsm

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2024 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/macros.internal.h"

// x86-64 stack frame size: the bottom of the frame (-SIZE(%rbp), which
// equals %rsp after the prologue) receives a forwarded copy of the
// caller's stack arguments, while the area just beneath %rbp is used
// to spill registers across the helper calls
#define SIZE 0x0200

// distance from %rbp up to the caller's stack arguments, skipping the
// pushed %rbp (8 bytes) and the return address (8 bytes)
#define SKEW 0x10

//	Trampoline for calling a function that lives in a foreign module
//	(cf. cosmo_dlopen). The callee's address is passed out-of-band —
//	in %rax on x86-64 and in x8 on aarch64 — so that every ordinary
//	argument register remains available to the callee. Signals are
//	blocked for the duration of the call, and on x86-64 the foreign
//	module's TLS is installed before entering the callee and cosmo's
//	TLS restored afterwards. Up to 0x100 bytes of the caller's stack
//	arguments are copied down to the new stack pointer so the callee
//	finds them at the standard ABI location.
foreign_tramp:
#ifdef __x86_64__
push %rbp
mov %rsp,%rbp
sub $SIZE,%rsp
// save vector arguments (xmm0-xmm7 are call-clobbered and would be
// destroyed by the helper calls below)
movdqu %xmm0,-0x10(%rbp)
movdqu %xmm1,-0x20(%rbp)
movdqu %xmm2,-0x30(%rbp)
movdqu %xmm3,-0x40(%rbp)
movdqu %xmm4,-0x50(%rbp)
movdqu %xmm5,-0x60(%rbp)
movdqu %xmm6,-0x70(%rbp)
movdqu %xmm7,-0x80(%rbp)
// save register arguments (SysV integer args: rdi,rsi,rdx,rcx,r8,r9)
mov %rdi,-0x88(%rbp)
mov %rsi,-0x90(%rbp)
mov %rdx,-0x98(%rbp)
mov %rcx,-0xa0(%rbp)
mov %r8,-0xa8(%rbp)
mov %r9,-0xb0(%rbp)
// save function pointer (the callee address arrives in %rax)
mov %rax,-0xb8(%rbp)
// block signals; the previous mask is returned in %rax and kept so it
// can be handed back to __sig_unblock after the call
call __sig_block
mov %rax,-0xc0(%rbp)
// switch to foreign tls: stash the current TLS base (read from the
// %gs:0x30 self slot) for later restoration, then install the foreign
// module's TLS from __foreign+8
mov %gs:0x30,%rax
mov %rax,-0xc8(%rbp)
mov __foreign+8(%rip),%rdi
call __set_tls
// move stack arguments: copy 0x100 bytes of the caller's stack args
// from SKEW above %rbp down to -SIZE(%rbp) == %rsp, so the callee sees
// them at the standard ABI offsets; the copy goes 16 bytes at a time
// through the xmm registers, which were already saved above
movdqu SKEW+0x00(%rbp),%xmm0
movdqu SKEW+0x10(%rbp),%xmm1
movdqu SKEW+0x20(%rbp),%xmm2
movdqu SKEW+0x30(%rbp),%xmm3
movdqu SKEW+0x40(%rbp),%xmm4
movdqu SKEW+0x50(%rbp),%xmm5
movdqu SKEW+0x60(%rbp),%xmm6
movdqu SKEW+0x70(%rbp),%xmm7
movdqu %xmm0,-SIZE+0x00(%rbp)
movdqu %xmm1,-SIZE+0x10(%rbp)
movdqu %xmm2,-SIZE+0x20(%rbp)
movdqu %xmm3,-SIZE+0x30(%rbp)
movdqu %xmm4,-SIZE+0x40(%rbp)
movdqu %xmm5,-SIZE+0x50(%rbp)
movdqu %xmm6,-SIZE+0x60(%rbp)
movdqu %xmm7,-SIZE+0x70(%rbp)
movdqu SKEW+0x80(%rbp),%xmm0
movdqu SKEW+0x90(%rbp),%xmm1
movdqu SKEW+0xa0(%rbp),%xmm2
movdqu SKEW+0xb0(%rbp),%xmm3
movdqu SKEW+0xc0(%rbp),%xmm4
movdqu SKEW+0xd0(%rbp),%xmm5
movdqu SKEW+0xe0(%rbp),%xmm6
movdqu SKEW+0xf0(%rbp),%xmm7
movdqu %xmm0,-SIZE+0x80(%rbp)
movdqu %xmm1,-SIZE+0x90(%rbp)
movdqu %xmm2,-SIZE+0xa0(%rbp)
movdqu %xmm3,-SIZE+0xb0(%rbp)
movdqu %xmm4,-SIZE+0xc0(%rbp)
movdqu %xmm5,-SIZE+0xd0(%rbp)
movdqu %xmm6,-SIZE+0xe0(%rbp)
movdqu %xmm7,-SIZE+0xf0(%rbp)
// restore vector arguments
movdqu -0x10(%rbp),%xmm0
movdqu -0x20(%rbp),%xmm1
movdqu -0x30(%rbp),%xmm2
movdqu -0x40(%rbp),%xmm3
movdqu -0x50(%rbp),%xmm4
movdqu -0x60(%rbp),%xmm5
movdqu -0x70(%rbp),%xmm6
movdqu -0x80(%rbp),%xmm7
// restore register arguments
mov -0x88(%rbp),%rdi
mov -0x90(%rbp),%rsi
mov -0x98(%rbp),%rdx
mov -0xa0(%rbp),%rcx
mov -0xa8(%rbp),%r8
mov -0xb0(%rbp),%r9
// call function (%rsp == %rbp-SIZE here, which stays 16-byte aligned
// given an ABI-conformant caller, since SIZE is a multiple of 16)
mov -0xb8(%rbp),%rax
call *%rax
// save function result (SysV returns in %rax:%rdx and %xmm0:%xmm1);
// the argument spill slots are dead now, so they're reused
movdqu %xmm0,-0x10(%rbp)
movdqu %xmm1,-0x20(%rbp)
mov %rax,-0x28(%rbp)
mov %rdx,-0x30(%rbp)
// restore tls (the cosmo TLS base stashed before the call)
mov -0xc8(%rbp),%rdi
call __set_tls
// unblock signals, passing back the mask __sig_block returned
mov -0xc0(%rbp),%rdi
call __sig_unblock
// restore function result
mov -0x28(%rbp),%rax
mov -0x30(%rbp),%rdx
movdqu -0x10(%rbp),%xmm0
movdqu -0x20(%rbp),%xmm1
add $SIZE,%rsp
pop %rbp
ret
#elif defined(__aarch64__)
// NOTE: unlike the x86-64 path, no TLS switch is performed here; only
// the signal mask is saved/blocked around the foreign call
stp x29,x30,[sp,-0x100]!
mov x29,sp
// save vector arguments (q0-q7 are call-clobbered)
stp q0,q1,[sp,0x10]
stp q2,q3,[sp,0x30]
stp q4,q5,[sp,0x50]
stp q6,q7,[sp,0x70]
// save register arguments (AAPCS64 integer args: x0-x7)
stp x0,x1,[sp,0x90]
stp x2,x3,[sp,0xa0]
stp x4,x5,[sp,0xb0]
stp x6,x7,[sp,0xc0]
// save function pointer (the callee address arrives in x8)
str x8,[sp,0xd0]
// block signals; keep the returned old mask for __sig_unblock below
bl __sig_block
str x0,[sp,0xd8]
// move stack arguments: drop sp another 0x100 bytes, then copy 0x100
// bytes of the caller's stack args (now SIZE == 0x200 above sp) down
// to sp, so the callee sees them at the standard location; the copy
// goes through q0-q7, which were already saved above
sub sp,sp,#0x100
ldp q0,q1,[sp,SIZE+0x00]
ldp q2,q3,[sp,SIZE+0x20]
ldp q4,q5,[sp,SIZE+0x40]
ldp q6,q7,[sp,SIZE+0x60]
stp q0,q1,[sp,0x00]
stp q2,q3,[sp,0x20]
stp q4,q5,[sp,0x40]
stp q6,q7,[sp,0x60]
ldp q0,q1,[sp,SIZE+0x80]
ldp q2,q3,[sp,SIZE+0xa0]
ldp q4,q5,[sp,SIZE+0xc0]
ldp q6,q7,[sp,SIZE+0xe0]
stp q0,q1,[sp,0x80]
stp q2,q3,[sp,0xa0]
stp q4,q5,[sp,0xc0]
stp q6,q7,[sp,0xe0]
// restore vector arguments (offsets adjusted by 0x100 for the extra
// stack-argument area carved out above)
ldp q0,q1,[sp,0x100+0x10]
ldp q2,q3,[sp,0x100+0x30]
ldp q4,q5,[sp,0x100+0x50]
ldp q6,q7,[sp,0x100+0x70]
// restore register arguments
ldp x0,x1,[sp,0x100+0x90]
ldp x2,x3,[sp,0x100+0xa0]
ldp x4,x5,[sp,0x100+0xb0]
ldp x6,x7,[sp,0x100+0xc0]
// call function
ldr x8,[sp,0x100+0xd0]
blr x8
add sp,sp,#0x100
// save vector results (q0-q7; the argument save slots are dead now,
// so they're reused)
stp q0,q1,[sp,0x10]
stp q2,q3,[sp,0x30]
stp q4,q5,[sp,0x50]
stp q6,q7,[sp,0x70]
// save register results (x0-x7)
stp x0,x1,[sp,0x90]
stp x2,x3,[sp,0xa0]
stp x4,x5,[sp,0xb0]
stp x6,x7,[sp,0xc0]
// unblock signals, passing back the mask __sig_block returned
ldr x0,[sp,0xd8]
bl __sig_unblock
// restore vector results
ldp q0,q1,[sp,0x10]
ldp q2,q3,[sp,0x30]
ldp q4,q5,[sp,0x50]
ldp q6,q7,[sp,0x70]
// restore register results
ldp x0,x1,[sp,0x90]
ldp x2,x3,[sp,0xa0]
ldp x4,x5,[sp,0xb0]
ldp x6,x7,[sp,0xc0]
// restore frame pointer / link register and pop the whole frame
ldp x29,x30,[sp],0x100
ret
#endif // __x86_64__
.endfn foreign_tramp,globl