Add Intel intrinsics headers

Justine Tunney 2023-04-27 02:56:41 -07:00
parent 369f9740de
commit b7bf052a4b
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
121 changed files with 47114 additions and 849 deletions

@@ -153,6 +153,7 @@ include dsp/mpeg/mpeg.mk # │
include dsp/dsp.mk # │
include third_party/zlib/gz/gz.mk # │
include third_party/musl/musl.mk # │
include third_party/intel/intel.mk # │
include libc/libc.mk #─┘
include libc/sock/sock.mk #─┐
include dsp/tty/tty.mk # ├──ONLINE RUNTIME

@@ -1,7 +1,7 @@
#ifndef COSMOPOLITAN_DSP_TTY_INTERNAL_H_
#define COSMOPOLITAN_DSP_TTY_INTERNAL_H_
#include "dsp/tty/ttyrgb.h"
#include "libc/intrin/xmmintrin.internal.h"
#include "third_party/intel/xmmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_

@@ -3,9 +3,9 @@
#include "dsp/tty/ttyrgb.h"
#include "libc/assert.h"
#include "libc/intrin/bits.h"
#include "libc/intrin/xmmintrin.internal.h"
#include "libc/limits.h"
#include "libc/str/str.h"
#include "third_party/intel/xmmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_

@@ -17,7 +17,7 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "dsp/tty/quant.h"
#include "libc/intrin/xmmintrin.internal.h"
#include "third_party/intel/xmmintrin.internal.h"
struct TtyRgb rgb2ttyf2i_(__m128 rgb) {
__v4si i4;

@@ -1,133 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_AVX2INTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_AVX2INTRIN_H_
#include "libc/intrin/avxintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define _mm256_min_epi16(M256_0, M256_1) \
  ((__m256i)__builtin_ia32_pminsw256((__v16hi)(M256_0), (__v16hi)(M256_1)))
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » avx2 » simd ops
*/
#define _mm256_add_ps(M256_0, M256_1) \
((__m256)((__v8sf)(M256_0) + (__v8sf)(M256_1)))
#define _mm256_sub_ps(M256_0, M256_1) \
((__m256)((__v8sf)(M256_0) - (__v8sf)(M256_1)))
#define _mm256_mul_ps(M256_0, M256_1) \
((__m256)((__v8sf)(M256_0) * (__v8sf)(M256_1)))
#define _mm256_div_ps(M256_0, M256_1) \
((__m256)((__v8sf)(M256_0) / (__v8sf)(M256_1)))
#define _mm256_and_ps(M256_0, M256_1) \
((__m256)((__v8su)(M256_0) & (__v8su)(M256_1)))
#define _mm256_or_ps(M256_0, M256_1) \
((__m256)((__v8su)(M256_0) | (__v8su)(M256_1)))
#define _mm256_xor_ps(M256_0, M256_1) /* XORPS [u32 simd xor] */ \
((__m256)((__v8su)(M256_0) ^ (__v8su)(M256_1)))
#define _mm256_andnot_ps(M256_0, M256_1) /* ANDNPS [u32 simd nand] */ \
((__m256)(~(__v8su)(M256_0) & (__v8su)(M256_1)))
#define _mm256_rcp_ps(M256) __builtin_ia32_rcpps256((__v8sf)(M256))
#define _mm256_sqrt_ps(M256) __builtin_ia32_sqrtps256((__v8sf)(M256))
#define _mm256_rsqrt_ps(M256) __builtin_ia32_rsqrtps256((__v8sf)(M256))
#define _mm256_round_ps(M256, IMM) \
((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(M256), IMM))
#define _mm256_add_epi32(M256I_0, M256I_1) \
((__m256i)((__v8su)(M256I_0) + (__v8su)(M256I_1)))
#define _mm256_cmpgt_epi32(M256I_0, M256I_1) \
((__m256i)((__v8si)(M256I_0) > (__v8si)(M256I_1)))
#define _mm256_min_epi32(M256I_0, M256I_1) \
((__m256i)__builtin_ia32_pminsd256((__v8si)(M256I_0), (__v8si)(M256I_1)))
#define _mm256_min_epu32(M256I_0, M256I_1) \
((__m256i)__builtin_ia32_pminud256((__v8si)(M256I_0), (__v8si)(M256I_1)))
#define _mm256_max_epi32(M256I_0, M256I_1) \
((__m256i)__builtin_ia32_pmaxsd256((__v8si)(M256I_0), (__v8si)(M256I_1)))
#define _mm256_max_epu32(M256I_0, M256I_1) \
((__m256i)__builtin_ia32_pmaxud256((__v8si)(M256I_0), (__v8si)(M256I_1)))
#define _mm256_blendv_epi8(M256I_0, M256I_1, M256I_2) \
((__m256i)__builtin_ia32_pblendvb256((__v32qi)(M256I_0), (__v32qi)(M256I_1), \
(__v32qi)(M256I_2)))
#define _mm256_min_ps(M256_0, M256_1) \
((__m256)__builtin_ia32_minps256((__v8sf)(__m256)(M256_0), \
(__v8sf)(__m256)(M256_1)))
#define _mm256_max_ps(M256_0, M256_1) \
((__m256)__builtin_ia32_maxps256((__v8sf)(__m256)(M256_0), \
(__v8sf)(__m256)(M256_1)))
/* The 128-bit cmp*ps builtins don't accept __v8sf operands; AVX packed
   float compares go through __builtin_ia32_cmpps256 with a predicate. */
#define _mm256_cmpneq_ps(M256_0, M256_1)                     \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 4 /* NEQ_UQ */))
#define _mm256_cmplt_ps(M256_0, M256_1)                      \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 1 /* LT_OS */))
#define _mm256_cmpnlt_ps(M256_0, M256_1)                     \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 5 /* NLT_US */))
#define _mm256_cmple_ps(M256_0, M256_1)                      \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 2 /* LE_OS */))
#define _mm256_cmpnle_ps(M256_0, M256_1)                     \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 6 /* NLE_US */))
#define _mm256_cmpgt_ps(M256_0, M256_1)                      \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 14 /* GT_OS */))
#define _mm256_cmpngt_ps(M256_0, M256_1)                     \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 10 /* NGT_US */))
#define _mm256_cmpge_ps(M256_0, M256_1)                      \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 13 /* GE_OS */))
#define _mm256_cmpnge_ps(M256_0, M256_1)                     \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 9 /* NGE_US */))
#define _mm256_cmpord_ps(M256_0, M256_1)                     \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 7 /* ORD_Q */))
#define _mm256_cmpunord_ps(M256_0, M256_1)                   \
  ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(M256_0), \
                                   (__v8sf)(__m256)(M256_1), 3 /* UNORD_Q */))
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § avx2 » memory ops
*/
struct thatispacked PackedMayaliasIntyYmm {
__m256i Ymm;
} mayalias;
#define _mm256_set_ps(FLT_0, FLT_1, FLT_2, FLT_3, FLT_4, FLT_5, FLT_6, FLT_7) \
((__m256)(__v8sf){(float)(FLT_0), (float)(FLT_1), (float)(FLT_2), \
(float)(FLT_3), (float)(FLT_4), (float)(FLT_5), \
(float)(FLT_6), (float)(FLT_7)})
#define _mm256_set1_ps(FLT_0) \
_mm256_set_ps(FLT_0, FLT_0, FLT_0, FLT_0, FLT_0, FLT_0, FLT_0, FLT_0)
#define _mm256_setr_ps(FLT_0, FLT_1, FLT_2, FLT_3, FLT_4, FLT_5, FLT_6, FLT_7) \
_mm256_set_ps(FLT_7, FLT_6, FLT_5, FLT_4, FLT_3, FLT_2, FLT_1, FLT_0)
#define _mm256_set_epi32(INT_0, INT_1, INT_2, INT_3, INT_4, INT_5, INT_6, \
INT_7) \
((__m256i)(__v8si){(int)(INT_0), (int)(INT_1), (int)(INT_2), (int)(INT_3), \
(int)(INT_4), (int)(INT_5), (int)(INT_6), (int)(INT_7)})
#define _mm256_set1_epi32(INT_0) \
_mm256_set_epi32(INT_0, INT_0, INT_0, INT_0, INT_0, INT_0, INT_0, INT_0)
#define _mm256_setr_epi32(INT_0, INT_1, INT_2, INT_3, INT_4, INT_5, INT_6, \
INT_7) \
_mm256_set_epi32(INT_7, INT_6, INT_5, INT_4, INT_3, INT_2, INT_1, INT_0)
#define _mm256_loadu_si256(M256IP_0) \
({ \
const __m256i *Ymm = (M256IP_0); \
((struct PackedMayaliasIntyYmm *)Ymm)->Ymm; \
})
#define _mm256_storeu_si256(M256IP_0, M256I_1) \
({ \
__m256i *Ymm = (M256IP_0); \
((struct PackedMayaliasIntyYmm *)Ymm)->Ymm = M256I_1; \
})
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_AVX2INTRIN_H_ */
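
The PackedMayaliasIntyYmm wrapper above is the standard GCC/Clang idiom for unaligned, aliasing-safe vector access: going through a packed, may_alias struct licenses both the misalignment and the type pun. A minimal stand-alone sketch of the same trick (names here are illustrative, not part of the header):

struct __attribute__((__packed__, __may_alias__)) unaligned_u64 {
  unsigned long long v;
};

/* Loads a 64-bit word from a possibly misaligned pointer without UB;
   compiles to a plain MOV on x86. */
static inline unsigned long long load_u64_unaligned(const void *p) {
  return ((const struct unaligned_u64 *)p)->v;
}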

@@ -1,51 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_AVXINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_AVXINTRIN_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
typedef float __m256 _Vector_size(32) mayalias;
typedef double __m256d _Vector_size(32) mayalias;
typedef long long __m256i _Vector_size(32) mayalias;
typedef float __m256_u _Vector_size(32) forcealign(1) mayalias;
typedef double __m256d_u _Vector_size(32) forcealign(1) mayalias;
typedef long long __m256i_u _Vector_size(32) forcealign(1) mayalias;
typedef double __v4df _Vector_size(32);
typedef float __v8sf _Vector_size(32);
typedef long long __v4di _Vector_size(32);
typedef unsigned long long __v4du _Vector_size(32);
typedef int __v8si _Vector_size(32);
typedef unsigned __v8su _Vector_size(32);
typedef short __v16hi _Vector_size(32);
typedef unsigned short __v16hu _Vector_size(32);
typedef char __v32qi _Vector_size(32);
typedef unsigned char __v32qu _Vector_size(32);
#define _mm256_setzero_ps() ((__m256)(__v8sf){0})
#define _mm256_load_ps(FLOATPTR) (*(__m256 *)(FLOATPTR))
#define _mm256_loadu_ps(FLOATPTR) (*(__m256_u *)(FLOATPTR))
#define _mm256_store_ps(FLOATPTR, M256_0) \
(*(__m256 *)(FLOATPTR) = (__m256)(M256_0))
#define _mm256_storeu_ps(FLOATPTR, M256_0) \
(*(__m256_u *)(FLOATPTR) = (__m256)(M256_0))
#define _mm256_extractf128_ps(M256_0, INT_1) \
((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(M256_0), \
(int)(INT_1)))
#define _mm256_insertf128_ps(M256_0, M128_1, IMM_2) \
((__m256)__builtin_ia32_vinsertf128_ps256( \
(__v8sf)(__m256)(M256_0), (__v4sf)(__m128)(M128_1), (int)(IMM_2)))
#ifdef __llvm__
#define _mm256_castps128_ps256(M128_0) \
((__m256)__builtin_shufflevector((__v4sf)(__m128)(M128_0), \
(__v4sf)(__m128)(M128_0), 0, 1, 2, 3, -1, \
-1, -1, -1))
#else
#define _mm256_castps128_ps256(M128_0) \
((__m256)__builtin_ia32_ps256_ps((__v4sf)(__m128)(M128_0)))
#endif
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_AVXINTRIN_H_ */

@@ -1,244 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_
#include "libc/intrin/xmmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse2
*/
typedef char __v16qi _Vector_size(16);
typedef unsigned char __v16qu _Vector_size(16);
typedef signed char __v16qs _Vector_size(16);
typedef short __v8hi _Vector_size(16);
typedef unsigned short __v8hu _Vector_size(16);
typedef double __v2df _Vector_size(16);
typedef double __m128d _Vector_size(16) forcealign(16);
typedef double __m128d_u _Vector_size(16) forcealign(1);
typedef long long __v2di _Vector_size(16);
typedef long long __m128i _Vector_size(16) forcealign(16);
typedef long long __m128i_u _Vector_size(16) forcealign(1);
typedef unsigned long long __v2du _Vector_size(16);
struct thatispacked mayalias __usi128ma {
__m128i_u __v;
};
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse2 » memory ops
*/
#define _mm_loadu_si128(M128IP) ((struct __usi128ma *)(M128IP))->__v
#define _mm_storeu_si128(M128IP, M128I) \
(((struct __usi128ma *)(M128IP))->__v = (M128I))
#define _mm_set_epi8(I8_15, I8_14, I8_13, I8_12, I8_11, I8_10, I8_9, I8_8, \
I8_7, I8_6, I8_5, I8_4, I8_3, I8_2, I8_1, I8_0) \
((__m128i)(__v16qi){I8_0, I8_1, I8_2, I8_3, I8_4, I8_5, I8_6, I8_7, I8_8, \
I8_9, I8_10, I8_11, I8_12, I8_13, I8_14, I8_15})
#define _mm_set_epi16(I16_7, I16_6, I16_5, I16_4, I16_3, I16_2, I16_1, I16_0) \
((__m128i)(__v8hi){I16_0, I16_1, I16_2, I16_3, I16_4, I16_5, I16_6, I16_7})
#define _mm_set_epi32(I32_3, I32_2, I32_1, I32_0) \
((__m128i)(__v4si){I32_0, I32_1, I32_2, I32_3})
#define _mm_set_epi64x(I64_1, I64_0) ((__m128i)(__v2di){I64_0, I64_1})
#define _mm_setr_epi8(I8_15, I8_14, I8_13, I8_12, I8_11, I8_10, I8_9, I8_8, \
I8_7, I8_6, I8_5, I8_4, I8_3, I8_2, I8_1, I8_0) \
_mm_set_epi8(I8_0, I8_1, I8_2, I8_3, I8_4, I8_5, I8_6, I8_7, I8_8, I8_9, \
I8_10, I8_11, I8_12, I8_13, I8_14, I8_15)
#define _mm_setr_epi16(I16_7, I16_6, I16_5, I16_4, I16_3, I16_2, I16_1, I16_0) \
_mm_set_epi16(I16_0, I16_1, I16_2, I16_3, I16_4, I16_5, I16_6, I16_7)
#define _mm_setr_epi32(I32_3, I32_2, I32_1, I32_0) \
_mm_set_epi32(I32_0, I32_1, I32_2, I32_3)
#define _mm_setr_epi64x(I64_1, I64_0) _mm_set_epi64x(I64_0, I64_1)
#define _mm_set1_epi8(I8) \
_mm_set_epi8(I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8, I8)
#define _mm_set1_epi16(I16) \
_mm_set_epi16(I16, I16, I16, I16, I16, I16, I16, I16)
#define _mm_set1_epi32(I32) _mm_set_epi32(I32, I32, I32, I32)
#define _mm_set1_epi64x(I64) _mm_set_epi64x(I64, I64)
#define _mm_cvtsi128_si32(M128I) ((__v4si)(M128I))[0]
#define _mm_cvtsi32_si128(I32) ((__m128i)(__v4si){(I32), 0, 0, 0})
#define _mm_setzero_si128() ((__m128i)(__v2di){0LL, 0LL})
#define _mm_castsi128_ps(M128I) ((__m128)(M128I))
#define _mm_castps_si128(M128) ((__m128i)(M128))
#define _mm_load_si128(M128I) (*(M128I))
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse2 » simd ops
*/
#define _mm_and_si128(M128I_0, M128I_1) \
((__m128i)((__v2du)(M128I_0) & (__v2du)(M128I_1)))
#define _mm_or_si128(M128I_0, M128I_1) \
((__m128i)((__v2du)(M128I_0) | (__v2du)(M128I_1)))
#define _mm_xor_si128(M128I_0, M128I_1) \
((__m128i)((__v2du)(M128I_0) ^ (__v2du)(M128I_1)))
#define _mm_andnot_si128(M128I_0, M128I_1) \
((__m128i)(~(__v2du)(M128I_0) & (__v2du)(M128I_1)))
#define _mm_add_pd(M128D_0, M128D_1) \
(__m128d)((__v2df)(M128D_0) + (__v2df)(M128D_1))
#define _mm_sub_pd(M128D_0, M128D_1) \
(__m128d)((__v2df)(M128D_0) - (__v2df)(M128D_1))
#define _mm_mul_pd(M128D_0, M128D_1) \
(__m128d)((__v2df)(M128D_0) * (__v2df)(M128D_1))
#define _mm_div_pd(M128D_0, M128D_1) \
(__m128d)((__v2df)(M128D_0) / (__v2df)(M128D_1))
#define _mm_and_pd(M128D_0, M128D_1) /* bitwise ops need integer lanes */ \
  (__m128d)((__v2du)(M128D_0) & (__v2du)(M128D_1))
#define _mm_or_pd(M128D_0, M128D_1) \
  (__m128d)((__v2du)(M128D_0) | (__v2du)(M128D_1))
#define _mm_xor_pd(M128D_0, M128D_1) \
  (__m128d)((__v2du)(M128D_0) ^ (__v2du)(M128D_1))
#define _mm_andnot_pd(M128D_0, M128D_1) \
  (__m128d)(~(__v2du)(M128D_0) & (__v2du)(M128D_1))
#define _mm_sqrt_pd(M128D) __builtin_ia32_sqrtpd((__v2df)(M128D))
#define _mm_min_pd(M128D_0, M128D_1) \
__builtin_ia32_minpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_max_pd(M128D_0, M128D_1) \
__builtin_ia32_maxpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpeq_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpeqpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpneq_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpneqpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmplt_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpltpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnlt_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpnltpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmple_pd(M128D_0, M128D_1) \
__builtin_ia32_cmplepd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnle_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpnlepd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpgt_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpltpd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpngt_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpnltpd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpge_pd(M128D_0, M128D_1) \
__builtin_ia32_cmplepd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpnge_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpnlepd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpord_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpordpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpunord_pd(M128D_0, M128D_1) \
__builtin_ia32_cmpunordpd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_sad_epu8(M128I_0, M128I_1) \
__builtin_ia32_psadbw128((__v16qi)(M128I_0), (__v16qi)(M128I_1))
#define _mm_subs_epi8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psubsb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_subs_epu8(M128I_0, M128I_1) \
  ((__m128i)__builtin_ia32_psubusb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_subs_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psubsw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_subs_epu16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_psubusw128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_add_epi32(M128I_0, M128I_1) \
((__m128i)((__v4su)(M128I_0) + (__v4su)(M128I_1)))
#define _mm_sub_epi32(M128I_0, M128I_1) \
((__m128i)((__v4su)(M128I_0) - (__v4su)(M128I_1)))
#define _mm_madd_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pmaddwd128((__v8hi)(M128I_0), (__v8hi)(M128I_1)))
#define _mm_shuffle_epi32(V, IMM) \
((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(V), (int)(IMM)))
#define _mm_slli_epi32(M128I, COUNT) \
((__m128i)__builtin_ia32_pslldi128((__v4si)(M128I), (COUNT)))
#define _mm_slli_si128(M128I, IMM) \
((__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(M128I), (int)(IMM)*8))
#define _mm_srli_si128(M128I, IMM) \
((__m128i)__builtin_ia32_psrldqi128((__v2di)(__m128i)(M128I), (int)(IMM)*8))
#define _mm_cmpeq_epi8(a, b) ((__m128i)((__v16qi)(a) == (__v16qi)(b)))
#define _mm_movemask_epi8(a) __builtin_ia32_pmovmskb128((__v16qi)(a))
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse2 » scalar ops
*/
#define _mm_sqrt_sd(M128D_0, M128D_1) \
({ \
__m128d M128d2 = __builtin_ia32_sqrtsd((__v2df)(M128D_1)); \
(__m128d){M128d2[0], (M128D_0)[1]}; \
})
#define _mm_add_sd(M128D_0, M128D_1) \
({ \
(M128D_0)[0] += (M128D_1)[0]; \
(M128D_0); \
})
#define _mm_sub_sd(M128D_0, M128D_1) \
({ \
(M128D_0)[0] -= (M128D_1)[0]; \
(M128D_0); \
})
#define _mm_mul_sd(M128D_0, M128D_1) \
({ \
(M128D_0)[0] *= (M128D_1)[0]; \
(M128D_0); \
})
#define _mm_div_sd(M128D_0, M128D_1) \
({ \
(M128D_0)[0] /= (M128D_1)[0]; \
(M128D_0); \
})
#define _mm_min_sd(M128D_0, M128D_1) \
__builtin_ia32_minsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_max_sd(M128D_0, M128D_1) \
__builtin_ia32_maxsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpeq_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpeqsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpneq_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpneqsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmplt_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpltsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnlt_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpnltsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmple_sd(M128D_0, M128D_1) \
__builtin_ia32_cmplesd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpnle_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpnlesd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpgt_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpltsd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpngt_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpnltsd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpge_sd(M128D_0, M128D_1) \
__builtin_ia32_cmplesd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpnge_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpnlesd((__v2df)(M128D_1), (__v2df)(M128D_0))
#define _mm_cmpord_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpordsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_cmpunord_sd(M128D_0, M128D_1) \
__builtin_ia32_cmpunordsd((__v2df)(M128D_0), (__v2df)(M128D_1))
#define _mm_SSE2(op, A, B) \
({ \
__m128i R = A; \
asm(#op " %1, %0" : "+x"(R) : "xm"(B)); \
R; \
})
#define _mm_mul_epu32(A, B) _mm_SSE2(pmuludq, A, B)
#define _mm_add_epi64(A, B) _mm_SSE2(paddq, A, B)
#define _mm_srli_epi64(A, B) _mm_SSE2(psrlq, A, B)
#define _mm_slli_epi64(A, B) _mm_SSE2(psllq, A, B)
#define _mm_unpacklo_epi64(A, B) _mm_SSE2(punpcklqdq, A, B)
#define _mm_unpackhi_epi64(A, B) _mm_SSE2(punpckhqdq, A, B)
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse2 » miscellaneous
*/
#define _mm_pause() asm("rep nop")
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_EMMINTRIN_H_ */
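
The _mm_SSE2 helper above is worth a note: rather than relying on compiler builtins, it emits the instruction directly through inline asm, tying the destination to an XMM register ("+x") and the source to a register or memory ("xm"). A hedged stand-alone sketch of the same pattern for PADDQ (illustrative only):

/* 64-bit lane addition via inline asm, mirroring _mm_SSE2(paddq, A, B). */
static inline __m128i paddq_asm(__m128i a, __m128i b) {
  asm("paddq\t%1, %0" : "+x"(a) : "xm"(b));
  return a;
}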

@@ -1,14 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse3
*/
#define _mm_hadd_ps(M128_0, M128_1)                        \
  ((__m128)__builtin_ia32_haddps((__v4sf)(__m128)(M128_0), \
                                 (__v4sf)(__m128)(M128_1)))
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_PMMINTRIN_H_ */
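
With the operand fix above, _mm_hadd_ps pairs adjacent lanes across both inputs, so two passes reduce a vector to its total. A hedged usage sketch (assumes the SSE helpers from the companion headers):

/* Horizontal sum of four floats using two HADDPS passes (SSE3). */
static inline float hsum_ps(__m128 v) {
  v = _mm_hadd_ps(v, v); /* {a+b, c+d, a+b, c+d} */
  v = _mm_hadd_ps(v, v); /* {a+b+c+d, ...} */
  return _mm_cvtss_f32(v);
}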

@@ -1,37 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_SHAINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_SHAINTRIN_H_
#include "libc/intrin/emmintrin.internal.h"
#include "libc/intrin/xmmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
#define _mm_sha1rnds4_epu32(M128I_0, M128I_1, MEM) \
__builtin_ia32_sha1rnds4((__v4si)(__m128i)(M128I_0), \
(__v4si)(__m128i)(M128I_1), (MEM))
#define _mm_sha1nexte_epu32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_sha1nexte((__v4si)(__m128i)(M128I_0), \
(__v4si)(__m128i)(M128I_1)))
#define _mm_sha1msg1_epu32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_sha1msg1((__v4si)(__m128i)(M128I_0), \
(__v4si)(__m128i)(M128I_1)))
#define _mm_sha1msg2_epu32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_sha1msg2((__v4si)(__m128i)(M128I_0), \
(__v4si)(__m128i)(M128I_1)))
#define _mm_sha256rnds2_epu32(M128I_0, M128I_1, M128I_2) \
((__m128i)__builtin_ia32_sha256rnds2((__v4si)(__m128i)(M128I_0), \
(__v4si)(__m128i)(M128I_1), \
(__v4si)(__m128i)(M128I_2)))
#define _mm_sha256msg1_epu32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_sha256msg1((__v4si)(__m128i)(M128I_0), \
(__v4si)(__m128i)(M128I_1)))
#define _mm_sha256msg2_epu32(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_sha256msg2((__v4si)(__m128i)(M128I_0), \
(__v4si)(__m128i)(M128I_1)))
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_SHAINTRIN_H_ */

@@ -1,31 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_SMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_SMMINTRIN_H_
/**
* @fileoverview SSE4 intrinsics.
*/
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CUR_DIRECTION 4
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NO_EXC 8
#define _MM_FROUND_RAISE_EXC 0
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TO_NEAREST_INT 0
#define _MM_FROUND_TO_NEG_INF 1
#define _MM_FROUND_TO_POS_INF 2
#define _MM_FROUND_TO_ZERO 3
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#if !(__ASSEMBLER__ + __LINKER__ + 0)
#define _mm_extract_epi32(M128I, I32) \
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(M128I), (int)(I32)))
#define _mm_minpos_epu16(M128I) \
  ((__m128i)__builtin_ia32_phminposuw128((__v8hi)(__m128i)(M128I)))
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_SMMINTRIN_H_ */
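
PHMINPOSUW is the one SSE4.1 horizontal reduction: it returns the minimum u16 lane in bits 0..15 of the result and that lane's index in bits 16..18. A hedged usage sketch built on the two macros above:

/* Finds the smallest of eight u16 lanes and which lane held it. */
static inline void min_u16_lane(__m128i v, unsigned *val, unsigned *idx) {
  unsigned r = (unsigned)_mm_extract_epi32(_mm_minpos_epu16(v), 0);
  *val = r & 0xffff;      /* bits 0..15: minimum value */
  *idx = (r >> 16) & 0x7; /* bits 16..18: lane index */
}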

@@ -1,17 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_TMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_TMMINTRIN_H_
#include "libc/intrin/emmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » ssse3
*/
#define _mm_maddubs_epi16(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pmaddubsw128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#define _mm_shuffle_epi8(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_pshufb128((__v16qi)(M128I_0), (__v16qi)(M128I_1)))
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_TMMINTRIN_H_ */

@@ -1,29 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_WMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_WMMINTRIN_H_
#include "libc/intrin/emmintrin.internal.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
#define _mm_clmulepi64_si128(X, Y, IMM) \
((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
(__v2di)(__m128i)(Y), (char)(IMM)))
#define _mm_aesenc_si128(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_aesenc128((__v2di)(M128I_0), (__v2di)(M128I_1)))
#define _mm_aesenclast_si128(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_aesenclast128((__v2di)(M128I_0), (__v2di)(M128I_1)))
#define _mm_aesdec_si128(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_aesdec128((__v2di)(M128I_0), (__v2di)(M128I_1)))
#define _mm_aesdeclast_si128(M128I_0, M128I_1) \
((__m128i)__builtin_ia32_aesdeclast128((__v2di)(M128I_0), (__v2di)(M128I_1)))
#define _mm_aesimc_si128(M128I) \
((__m128i)__builtin_ia32_aesimc128((__v2di)(M128I)))
#define _mm_aeskeygenassist_si128(X, Y) \
((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(X), (int)(Y)))
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_WMMINTRIN_H_ */
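
For orientation, a hedged sketch of how the AES macros chain together to encrypt one 16-byte block, assuming rk[0..10] already holds an expanded AES-128 key schedule (key expansion itself would use _mm_aeskeygenassist_si128):

/* AES-128 block encryption: initial whitening, 9 full rounds, final round. */
static inline __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11]) {
  block = _mm_xor_si128(block, rk[0]);
  for (int i = 1; i < 10; ++i) block = _mm_aesenc_si128(block, rk[i]);
  return _mm_aesenclast_si128(block, rk[10]);
}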

@@ -1,243 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_BITS_XMMINTRIN_H_
#define COSMOPOLITAN_LIBC_BITS_XMMINTRIN_H_
#include "libc/dce.h"
#include "libc/intrin/emmintrin.internal.h"
#define _MM_EXCEPT_MASK 0x003f
#define _MM_EXCEPT_INVALID 0x0001
#define _MM_EXCEPT_DENORM 0x0002
#define _MM_EXCEPT_DIV_ZERO 0x0004
#define _MM_EXCEPT_OVERFLOW 0x0008
#define _MM_EXCEPT_UNDERFLOW 0x0010
#define _MM_EXCEPT_INEXACT 0x0020
#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_INVALID 0x0080
#define _MM_MASK_DENORM 0x0100
#define _MM_MASK_DIV_ZERO 0x0200
#define _MM_MASK_OVERFLOW 0x0400
#define _MM_MASK_UNDERFLOW 0x0800
#define _MM_MASK_INEXACT 0x1000
#define _MM_ROUND_MASK 0x6000
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
#define _MM_SHUFFLE(A, B, C, D) (((A) << 6) | ((B) << 4) | ((C) << 2) | (D))
#if !(__ASSEMBLER__ + __LINKER__ + 0)
typedef int __v4si _Vector_size(16);
typedef unsigned int __v4su _Vector_size(16);
typedef float __v4sf _Vector_size(16);
typedef float __m128 _Vector_size(16) forcealign(16) mayalias;
typedef float __m128_u _Vector_size(16) forcealign(1) mayalias;
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse » simd ops
*/
#define _mm_add_ps(M128_0, M128_1) \
((__m128)((__v4sf)(M128_0) + (__v4sf)(M128_1)))
#define _mm_sub_ps(M128_0, M128_1) \
((__m128)((__v4sf)(M128_0) - (__v4sf)(M128_1)))
#define _mm_mul_ps(M128_0, M128_1) \
((__m128)((__v4sf)(M128_0) * (__v4sf)(M128_1)))
#define _mm_div_ps(M128_0, M128_1) \
((__m128)((__v4sf)(M128_0) / (__v4sf)(M128_1)))
#define _mm_and_ps(M128_0, M128_1) \
((__m128)((__v4su)(M128_0) & (__v4su)(M128_1)))
#define _mm_or_ps(M128_0, M128_1) \
((__m128)((__v4su)(M128_0) | (__v4su)(M128_1)))
#define _mm_xor_ps(M128_0, M128_1) /* XORPS [u32 simd xor] */ \
((__m128)((__v4su)(M128_0) ^ (__v4su)(M128_1)))
#define _mm_andnot_ps(M128_0, M128_1) /* ANDNPS [u32 simd nand] */ \
((__m128)(~(__v4su)(M128_0) & (__v4su)(M128_1)))
#define _mm_rcp_ps(M128) __builtin_ia32_rcpps((__v4sf)(M128))
#define _mm_sqrt_ps(M128) __builtin_ia32_sqrtps((__v4sf)(M128))
#define _mm_rsqrt_ps(M128) __builtin_ia32_rsqrtps((__v4sf)(M128))
#define _mm_min_ps(M128_0, M128_1) \
__builtin_ia32_minps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_max_ps(M128_0, M128_1) \
__builtin_ia32_maxps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_min_ss(M128_0, M128_1) \
__builtin_ia32_minss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_max_ss(M128_0, M128_1) \
__builtin_ia32_maxss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpeq_ps(M128_0, M128_1) \
__builtin_ia32_cmpeqps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpneq_ps(M128_0, M128_1) \
__builtin_ia32_cmpneqps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmplt_ps(M128_0, M128_1) \
__builtin_ia32_cmpltps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpnlt_ps(M128_0, M128_1) \
__builtin_ia32_cmpnltps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmple_ps(M128_0, M128_1) \
__builtin_ia32_cmpleps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpnle_ps(M128_0, M128_1) \
__builtin_ia32_cmpnleps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpgt_ps(M128_0, M128_1) \
__builtin_ia32_cmpltps((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpngt_ps(M128_0, M128_1) \
__builtin_ia32_cmpnltps((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpge_ps(M128_0, M128_1) \
__builtin_ia32_cmpleps((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpnge_ps(M128_0, M128_1) \
__builtin_ia32_cmpnleps((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpord_ps(M128_0, M128_1) \
__builtin_ia32_cmpordps((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpunord_ps(M128_0, M128_1) \
__builtin_ia32_cmpunordps((__v4sf)(M128_0), (__v4sf)(M128_1))
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse » scalar ops
*/
#define _mm_add_ss(m128_0, m128_1) \
({ \
__m128 a = m128_0; \
__m128 b = m128_1; \
a[0] += b[0]; \
a; \
})
#define _mm_sub_ss(m128_0, m128_1) \
({ \
__m128 a = m128_0; \
__m128 b = m128_1; \
a[0] -= b[0]; \
a; \
})
#define _mm_mul_ss(m128_0, m128_1) \
({ \
__m128 a = m128_0; \
__m128 b = m128_1; \
a[0] *= b[0]; \
a; \
})
#define _mm_div_ss(m128_0, m128_1) \
({ \
__m128 a = m128_0; \
__m128 b = m128_1; \
a[0] /= b[0]; \
a; \
})
#define _mm_rcp_ss(M128) __builtin_ia32_rcpss((__v4sf)(M128)) /*~1/x*/
#define _mm_sqrt_ss(M128) __builtin_ia32_sqrtss((__v4sf)(M128)) /*sqrt𝑥*/
#define _mm_rsqrt_ss(M128) __builtin_ia32_rsqrtss((__v4sf)(M128)) /*~1/sqrt𝑥*/
#define _mm_min_ss(M128_0, M128_1) \
__builtin_ia32_minss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_max_ss(M128_0, M128_1) \
__builtin_ia32_maxss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpeq_ss(M128_0, M128_1) \
__builtin_ia32_cmpeqss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpneq_ss(M128_0, M128_1) \
__builtin_ia32_cmpneqss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmplt_ss(M128_0, M128_1) \
__builtin_ia32_cmpltss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpnlt_ss(M128_0, M128_1) \
__builtin_ia32_cmpnltss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmple_ss(M128_0, M128_1) \
__builtin_ia32_cmpless((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpnle_ss(M128_0, M128_1) \
__builtin_ia32_cmpnless((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpgt_ss(M128_0, M128_1) \
__builtin_ia32_cmpltss((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpngt_ss(M128_0, M128_1) \
__builtin_ia32_cmpnltss((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpge_ss(M128_0, M128_1) \
__builtin_ia32_cmpless((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpnge_ss(M128_0, M128_1) \
__builtin_ia32_cmpnless((__v4sf)(M128_1), (__v4sf)(M128_0))
#define _mm_cmpord_ss(M128_0, M128_1) \
__builtin_ia32_cmpordss((__v4sf)(M128_0), (__v4sf)(M128_1))
#define _mm_cmpunord_ss(M128_0, M128_1) \
__builtin_ia32_cmpunordss((__v4sf)(M128_0), (__v4sf)(M128_1))
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse » memory ops
*/
#define _mm_set1_ps(FLT) ((__m128)(__v4sf){FLT, FLT, FLT, FLT})
#define _mm_setzero_ps() ((__m128)(__v4sf){0})
#define _mm_cvtss_f32(M128_0) (((__v4sf)(M128_0))[0])
#define _mm_load_ps(FLOATPTR) (*(__m128 *)(FLOATPTR))
#define _mm_loadu_ps(FLOATPTR) (*(__m128_u *)(FLOATPTR))
#define _mm_set_ps(WHO, DESIGNED, THIS, SHEESH) \
((__m128)(__v4sf){SHEESH, THIS, DESIGNED, WHO})
#define _mm_set_ss(FLOAT) ((__m128)(__v4sf){FLOAT, 0, 0, 0})
#define _mm_load_ss(FLOATPTR) _mm_set_ss(*(FLOATPTR))
#define _mm_store_ss(FLOATPTR, M128_0) ((FLOATPTR)[0] = ((__v4sf)(M128_0))[0])
#define _mm_store_ps(FLOATPTR, M128_0) (*(__m128 *)(FLOATPTR) = (M128_0))
#define _mm_storeu_ps(FLOATPTR, M128_0) (*(__m128_u *)(FLOATPTR) = (M128_0))
#define _mm_shuffle_ps(M128_0, M128_1, MASK) \
((__m128)__builtin_ia32_shufps((__v4sf)(M128_0), (__v4sf)(M128_1), (MASK)))
#ifdef __llvm__
#define _mm_movehl_ps(M128_0, M128_1) \
((__m128)__builtin_shufflevector((__v4sf)(__m128)(M128_0), \
(__v4sf)(__m128)(M128_1), 6, 7, 2, 3))
/* intrinsics unstable & constantly breaking, consider ansi c or asm. */
/* each version of llvm has a different incompatible impl for this one */
#else
#define _mm_movehl_ps(M128_0, M128_1) \
((__m128)__builtin_ia32_movhlps((__v4sf)(__m128)(M128_0), \
(__v4sf)(__m128)(M128_1)))
#define _mm_storel_pi(M64PTR, M128_0) \
__builtin_ia32_storelps((__v2sf *)(M64PTR), (__v4sf)(M128_0))
#endif
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse » cast ops
*/
#define _mm_cvtps_epi32(M128_0) \
((__m128i)__builtin_ia32_cvtps2dq((__v4sf)(M128_0)))
#ifdef __llvm__
#define _mm_cvtepi32_ps(M128I_0) \
((__m128) __builtin_convertvector((__v4si)(__m128i)(M128I_0), __v4sf))
#else
#define _mm_cvtepi32_ps(M128I_0) \
((__m128)__builtin_ia32_cvtdq2ps((__v4si)(M128I_0)))
#endif
/*───────────────────────────────────────────────────────────────────────────│─╗
cosmopolitan § it's a trap! » sse » misc
*/
#define _mm_getcsr() (__builtin_ia32_stmxcsr())
#define _mm_setcsr(U32CONF) (__builtin_ia32_ldmxcsr(U32CONF))
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
#define _MM_SET_ROUNDING_MODE(MODE) \
(_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (MODE)))
#define XMM_DESTROY(VAR) \
do { \
if (!IsTrustworthy()) { \
asm volatile("xorps\t%1,%0" : "=x"(VAR) : "0"(VAR)); \
} \
} while (0)
/*
** Ternary:
**
** Integer: _mm_or_si128(_mm_and_si128(a, cond), _mm_andnot_si128(cond, b))
** 32-bit float: _mm_or_ps(_mm_and_ps(a, cond), _mm_andnot_ps(cond, b))
** 64-bit float: _mm_or_pd(_mm_and_pd(a, cond), _mm_andnot_pd(cond, b))
** Integer (SSE4.1+): _mm_blendv_epi8(a, b, cond)
** 32-bit float (SSE4.1+): _mm_blendv_ps(a, b, cond)
** 64-bit float (SSE4.1+): _mm_blendv_pd(a, b, cond)
*/
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_LIBC_BITS_XMMINTRIN_H_ */
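
The ternary recipe in the closing comment composes with the compare macros above: a compare yields all-ones lanes where true, and AND/ANDNOT/OR then select per lane without branching. A hedged example clamping negatives to zero:

/* Per-lane max(x, 0) via compare + select, per the recipe above. */
static inline __m128 relu_ps(__m128 x) {
  __m128 zero = _mm_setzero_ps();
  __m128 cond = _mm_cmpgt_ps(x, zero); /* all-ones where x > 0 */
  return _mm_or_ps(_mm_and_ps(x, cond), _mm_andnot_ps(cond, zero));
}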

libc/isystem/ammintrin.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_
#include "third_party/intel/ammintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AMMINTRIN_INTERNAL_H_ */

@@ -1,4 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AVX2INTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_AVX2INTRIN_H_
#include "libc/intrin/avx2intrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AVX2INTRIN_H_ */

@@ -1,4 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_AVXINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_AVXINTRIN_H_
#include "libc/intrin/avxintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_AVXINTRIN_H_ */

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_
#include "third_party/intel/clzerointrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CLZEROINTRIN_INTERNAL_H_ */

libc/isystem/cpuid.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_CPUID_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_CPUID_INTERNAL_H_
#include "third_party/intel/cpuid.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_CPUID_INTERNAL_H_ */

@@ -1,4 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_H_
#include "libc/intrin/emmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_H_ */
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_
#include "third_party/intel/emmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_EMMINTRIN_INTERNAL_H_ */

libc/isystem/immintrin.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_
#include "third_party/intel/immintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_IMMINTRIN_INTERNAL_H_ */

libc/isystem/mm3dnow.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM3DNOW_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_MM3DNOW_INTERNAL_H_
#include "third_party/intel/mm3dnow.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM3DNOW_INTERNAL_H_ */

libc/isystem/mm_malloc.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_
#include "third_party/intel/mm_malloc.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MM_MALLOC_INTERNAL_H_ */

libc/isystem/mmintrin.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_
#include "third_party/intel/mmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MMINTRIN_INTERNAL_H_ */

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_
#include "third_party/intel/mwaitxintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_MWAITXINTRIN_INTERNAL_H_ */

libc/isystem/nmmintrin.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_
#include "third_party/intel/nmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_NMMINTRIN_INTERNAL_H_ */

@@ -1,4 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_H_
#include "libc/intrin/pmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_H_ */
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_
#include "third_party/intel/pmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_PMMINTRIN_INTERNAL_H_ */

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_
#include "third_party/intel/popcntintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_POPCNTINTRIN_INTERNAL_H_ */

libc/isystem/sgxintrin.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_
#include "third_party/intel/sgxintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SGXINTRIN_INTERNAL_H_ */

@@ -1,4 +0,0 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SHAINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_SHAINTRIN_H_
#include "libc/intrin/shaintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SHAINTRIN_H_ */

@@ -1,4 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_H_
#include "libc/intrin/smmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_H_ */
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_
#include "third_party/intel/smmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_SMMINTRIN_INTERNAL_H_ */

@@ -1,4 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_H_
#include "libc/intrin/tmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_H_ */
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_
#include "third_party/intel/tmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_TMMINTRIN_INTERNAL_H_ */

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_VAESINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_VAESINTRIN_INTERNAL_H_
#include "third_party/intel/vaesintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_VAESINTRIN_INTERNAL_H_ */

@@ -1,4 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_H_
#include "libc/intrin/wmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_H_ */
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_
#include "third_party/intel/wmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_WMMINTRIN_INTERNAL_H_ */

libc/isystem/x86intrin.h Normal file (+4)

@@ -0,0 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_
#include "third_party/intel/x86intrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_X86INTRIN_INTERNAL_H_ */

@@ -1,4 +1,4 @@
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_H_
#include "libc/intrin/xmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_H_ */
#ifndef COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
#define COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_
#include "third_party/intel/xmmintrin.internal.h"
#endif /* COSMOPOLITAN_LIBC_ISYSTEM_XMMINTRIN_INTERNAL_H_ */

third_party/intel/adxintrin.internal.h vendored Normal file (+52)

@@ -0,0 +1,52 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _ADXINTRIN_H_INCLUDED
#define _ADXINTRIN_H_INCLUDED
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_subborrow_u32(unsigned char __CF, unsigned int __X, unsigned int __Y,
unsigned int *__P) {
return __builtin_ia32_sbb_u32(__CF, __X, __Y, __P);
}
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarry_u32(unsigned char __CF, unsigned int __X, unsigned int __Y,
unsigned int *__P) {
return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P);
}
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarryx_u32(unsigned char __CF, unsigned int __X, unsigned int __Y,
unsigned int *__P) {
return __builtin_ia32_addcarryx_u32(__CF, __X, __Y, __P);
}
#ifdef __x86_64__
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_subborrow_u64(unsigned char __CF, unsigned long long __X,
unsigned long long __Y, unsigned long long *__P) {
return __builtin_ia32_sbb_u64(__CF, __X, __Y, __P);
}
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarry_u64(unsigned char __CF, unsigned long long __X,
unsigned long long __Y, unsigned long long *__P) {
return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P);
}
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_addcarryx_u64(unsigned char __CF, unsigned long long __X,
unsigned long long __Y, unsigned long long *__P) {
return __builtin_ia32_addcarryx_u64(__CF, __X, __Y, __P);
}
#endif
#endif /* _ADXINTRIN_H_INCLUDED */
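
These intrinsics exist to chain flag-carrying adds; a hedged sketch of 256-bit addition over four 64-bit limbs (little-endian limb order, illustrative names):

/* r = a + b across 4 limbs; returns the final carry-out. */
static inline unsigned char add256(unsigned long long r[4],
                                   const unsigned long long a[4],
                                   const unsigned long long b[4]) {
  unsigned char c = 0;
  for (int i = 0; i < 4; ++i) c = _addcarry_u64(c, a[i], b[i], &r[i]);
  return c;
}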

third_party/intel/ammintrin.internal.h vendored Normal file (+66)

@@ -0,0 +1,66 @@
#ifndef _AMMINTRIN_H_INCLUDED
#define _AMMINTRIN_H_INCLUDED
#include "third_party/intel/pmmintrin.internal.h"
#ifndef __SSE4A__
#pragma GCC push_options
#pragma GCC target("sse4a")
#define __DISABLE_SSE4A__
#endif /* __SSE4A__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_sd(double* __P, __m128d __Y) {
__builtin_ia32_movntsd(__P, (__v2df)__Y);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_ss(float* __P, __m128 __Y) {
__builtin_ia32_movntss(__P, (__v4sf)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_si64(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_extrq((__v2di)__X, (__v16qi)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_extracti_si64(__m128i __X, unsigned const int __I, unsigned const int __L) {
return (__m128i)__builtin_ia32_extrqi((__v2di)__X, __I, __L);
}
#else
#define _mm_extracti_si64(X, I, L) \
((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(X), (unsigned int)(I), \
(unsigned int)(L)))
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_si64(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_insertq((__v2di)__X, (__v2di)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I,
unsigned const int __L) {
return (__m128i)__builtin_ia32_insertqi((__v2di)__X, (__v2di)__Y, __I, __L);
}
#else
#define _mm_inserti_si64(X, Y, I, L) \
((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(X), \
(__v2di)(__m128i)(Y), (unsigned int)(I), \
(unsigned int)(L)))
#endif
#ifdef __DISABLE_SSE4A__
#undef __DISABLE_SSE4A__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4A__ */
#endif /* _AMMINTRIN_H_INCLUDED */

third_party/intel/avx2intrin.internal.h vendored Normal file (+1492)

File diff suppressed because it is too large.

@@ -0,0 +1,128 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx5124fmapsintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _AVX5124FMAPSINTRIN_H_INCLUDED
#define _AVX5124FMAPSINTRIN_H_INCLUDED
#ifndef __AVX5124FMAPS__
#pragma GCC push_options
#pragma GCC target("avx5124fmaps")
#define __DISABLE_AVX5124FMAPS__
#endif /* __AVX5124FMAPS__ */
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D, __m512 __E,
__m128 *__F) {
return (__m512)__builtin_ia32_4fmaddps((__v16sf)__B, (__v16sf)__C,
(__v16sf)__D, (__v16sf)__E,
(__v16sf)__A, (const __v4sf *)__F);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C,
__m512 __D, __m512 __E, __m128 *__F) {
return (__m512)__builtin_ia32_4fmaddps_mask(
(__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A,
(const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C,
__m512 __D, __m512 __E, __m128 *__F) {
return (__m512)__builtin_ia32_4fmaddps_mask(
(__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A,
(const __v4sf *)__F, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_4fmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, __m128 __E,
__m128 *__F) {
return (__m128)__builtin_ia32_4fmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D,
(__v4sf)__E, (__v4sf)__A,
(const __v4sf *)__F);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_4fmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
__m128 __D, __m128 __E, __m128 *__F) {
return (__m128)__builtin_ia32_4fmaddss_mask(
(__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A,
(const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_4fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
__m128 __D, __m128 __E, __m128 *__F) {
return (__m128)__builtin_ia32_4fmaddss_mask(
(__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A,
(const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __m512 __D,
__m512 __E, __m128 *__F) {
return (__m512)__builtin_ia32_4fnmaddps((__v16sf)__B, (__v16sf)__C,
(__v16sf)__D, (__v16sf)__E,
(__v16sf)__A, (const __v4sf *)__F);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C,
__m512 __D, __m512 __E, __m128 *__F) {
return (__m512)__builtin_ia32_4fnmaddps_mask(
(__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A,
(const __v4sf *)__F, (__v16sf)__A, (__mmask16)__U);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C,
__m512 __D, __m512 __E, __m128 *__F) {
return (__m512)__builtin_ia32_4fnmaddps_mask(
(__v16sf)__B, (__v16sf)__C, (__v16sf)__D, (__v16sf)__E, (__v16sf)__A,
(const __v4sf *)__F, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_4fnmadd_ss(__m128 __A, __m128 __B, __m128 __C, __m128 __D, __m128 __E,
__m128 *__F) {
return (__m128)__builtin_ia32_4fnmaddss((__v4sf)__B, (__v4sf)__C, (__v4sf)__D,
(__v4sf)__E, (__v4sf)__A,
(const __v4sf *)__F);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_4fnmadd_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
__m128 __D, __m128 __E, __m128 *__F) {
return (__m128)__builtin_ia32_4fnmaddss_mask(
(__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A,
(const __v4sf *)__F, (__v4sf)__A, (__mmask8)__U);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_4fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
__m128 __D, __m128 __E, __m128 *__F) {
return (__m128)__builtin_ia32_4fnmaddss_mask(
(__v4sf)__B, (__v4sf)__C, (__v4sf)__D, (__v4sf)__E, (__v4sf)__A,
(const __v4sf *)__F, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}
#ifdef __DISABLE_AVX5124FMAPS__
#undef __DISABLE_AVX5124FMAPS__
#pragma GCC pop_options
#endif /* __DISABLE_AVX5124FMAPS__ */
#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */

@@ -0,0 +1,78 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx5124vnniwintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _AVX5124VNNIWINTRIN_H_INCLUDED
#define _AVX5124VNNIWINTRIN_H_INCLUDED
#ifndef __AVX5124VNNIW__
#pragma GCC push_options
#pragma GCC target("avx5124vnniw")
#define __DISABLE_AVX5124VNNIW__
#endif /* __AVX5124VNNIW__ */
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C, __m512i __D,
__m512i __E, __m128i *__F) {
return (__m512i)__builtin_ia32_vp4dpwssd((__v16si)__B, (__v16si)__C,
(__v16si)__D, (__v16si)__E,
(__v16si)__A, (const __v4si *)__F);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4dpwssd_epi32(__m512i __A, __mmask16 __U, __m512i __B,
__m512i __C, __m512i __D, __m512i __E,
__m128i *__F) {
return (__m512i)__builtin_ia32_vp4dpwssd_mask(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
(const __v4si *)__F, (__v16si)__A, (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4dpwssd_epi32(__mmask16 __U, __m512i __A, __m512i __B,
__m512i __C, __m512i __D, __m512i __E,
__m128i *__F) {
return (__m512i)__builtin_ia32_vp4dpwssd_mask(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
(const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_4dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C, __m512i __D,
__m512i __E, __m128i *__F) {
return (__m512i)__builtin_ia32_vp4dpwssds((__v16si)__B, (__v16si)__C,
(__v16si)__D, (__v16si)__E,
(__v16si)__A, (const __v4si *)__F);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_4dpwssds_epi32(__m512i __A, __mmask16 __U, __m512i __B,
__m512i __C, __m512i __D, __m512i __E,
__m128i *__F) {
return (__m512i)__builtin_ia32_vp4dpwssds_mask(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
(const __v4si *)__F, (__v16si)__A, (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_4dpwssds_epi32(__mmask16 __U, __m512i __A, __m512i __B,
__m512i __C, __m512i __D, __m512i __E,
__m128i *__F) {
return (__m512i)__builtin_ia32_vp4dpwssds_mask(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__v16si)__E, (__v16si)__A,
(const __v4si *)__F, (__v16si)_mm512_setzero_ps(), (__mmask16)__U);
}
#ifdef __DISABLE_AVX5124VNNIW__
#undef __DISABLE_AVX5124VNNIW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX5124VNNIW__ */
#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */

@@ -0,0 +1,213 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512bitalgintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _AVX512BITALGINTRIN_H_INCLUDED
#define _AVX512BITALGINTRIN_H_INCLUDED
#ifndef __AVX512BITALG__
#pragma GCC push_options
#pragma GCC target("avx512bitalg")
#define __DISABLE_AVX512BITALG__
#endif /* __AVX512BITALG__ */
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi8(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcountb_v64qi((__v64qi)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi16(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcountw_v32hi((__v32hi)__A);
}
#ifdef __DISABLE_AVX512BITALG__
#undef __DISABLE_AVX512BITALG__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALG__ */
#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512bitalg,avx512bw")
#define __DISABLE_AVX512BITALGBW__
#endif /* __AVX512VLBW__ */
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) {
return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask(
(__v64qi)__A, (__v64qi)__B, (__mmask64)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpopcountb_v64qi_mask(
(__v64qi)__A, (__v64qi)_mm512_setzero_si512(), (__mmask64)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) {
return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask(
(__v32hi)__A, (__v32hi)__B, (__mmask32)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpopcountw_v32hi_mask(
(__v32hi)__A, (__v32hi)_mm512_setzero_si512(), (__mmask32)__U);
}
extern __inline __mmask64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) {
return (__mmask64)__builtin_ia32_vpshufbitqmb512_mask(
(__v64qi)__A, (__v64qi)__B, (__mmask64)-1);
}
extern __inline __mmask64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_bitshuffle_epi64_mask(__mmask64 __M, __m512i __A, __m512i __B) {
return (__mmask64)__builtin_ia32_vpshufbitqmb512_mask(
(__v64qi)__A, (__v64qi)__B, (__mmask64)__M);
}
#ifdef __DISABLE_AVX512BITALGBW__
#undef __DISABLE_AVX512BITALGBW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALGBW__ */
#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || \
!defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512bitalg,avx512vl,avx512bw")
#define __DISABLE_AVX512BITALGVLBW__
#endif /* __AVX512VLBW__ */
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) {
return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask(
(__v32qi)__A, (__v32qi)__B, (__mmask32)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __A) {
return (__m256i)__builtin_ia32_vpopcountb_v32qi_mask(
(__v32qi)__A, (__v32qi)_mm256_setzero_si256(), (__mmask32)__U);
}
extern __inline __mmask32
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) {
return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask(
(__v32qi)__A, (__v32qi)__B, (__mmask32)-1);
}
extern __inline __mmask32
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_bitshuffle_epi64_mask(__mmask32 __M, __m256i __A, __m256i __B) {
return (__mmask32)__builtin_ia32_vpshufbitqmb256_mask(
(__v32qi)__A, (__v32qi)__B, (__mmask32)__M);
}
#ifdef __DISABLE_AVX512BITALGVLBW__
#undef __DISABLE_AVX512BITALGVLBW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALGVLBW__ */
#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
#pragma GCC push_options
#pragma GCC target("avx512bitalg,avx512vl")
#define __DISABLE_AVX512BITALGVL__
#endif /* __AVX512VLBW__ */
extern __inline __mmask16
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) {
return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask(
(__v16qi)__A, (__v16qi)__B, (__mmask16)-1);
}
extern __inline __mmask16
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_bitshuffle_epi64_mask(__mmask16 __M, __m128i __A, __m128i __B) {
return (__mmask16)__builtin_ia32_vpshufbitqmb128_mask(
(__v16qi)__A, (__v16qi)__B, (__mmask16)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_popcnt_epi8(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcountb_v32qi((__v32qi)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_popcnt_epi16(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcountw_v16hi((__v16hi)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_epi8(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcountb_v16qi((__v16qi)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_epi16(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcountw_v8hi((__v8hi)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) {
return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask(
(__v16hi)__A, (__v16hi)__B, (__mmask16)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __A) {
return (__m256i)__builtin_ia32_vpopcountw_v16hi_mask(
(__v16hi)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) {
return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask(
(__v16qi)__A, (__v16qi)__B, (__mmask16)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __A) {
return (__m128i)__builtin_ia32_vpopcountb_v16qi_mask(
(__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) {
return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask((__v8hi)__A, (__v8hi)__B,
(__mmask8)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_vpopcountw_v8hi_mask(
(__v8hi)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}
#ifdef __DISABLE_AVX512BITALGVL__
#undef __DISABLE_AVX512BITALGVL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512BITALGVL__ */
#endif /* _AVX512BITALGINTRIN_H_INCLUDED */

File diff suppressed because it is too large

third_party/intel/avx512cdintrin.internal.h vendored Normal file

@@ -0,0 +1,124 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512CDINTRIN_H_INCLUDED
#define _AVX512CDINTRIN_H_INCLUDED
#ifndef __AVX512CD__
#pragma GCC push_options
#pragma GCC target("avx512cd")
#define __DISABLE_AVX512CD__
#endif /* __AVX512CD__ */
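/*
 * AVX512CD conflict detection: each element of the result is a bitmask
 * flagging which lower-indexed source elements equal it, which is what
 * makes loops with indirect stores vectorizable. As elsewhere in these
 * headers, _mask_ variants merge results into __W under write-mask __U
 * and _maskz_ variants zero the lanes whose mask bit is clear.
 */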
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__));
typedef double __m512d __attribute__((__vector_size__(64), __may_alias__));
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_conflict_epi32(__m512i __A) {
return (__m512i)__builtin_ia32_vpconflictsi_512_mask(
(__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_conflict_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpconflictsi_512_mask(
(__v16si)__A, (__v16si)__W, (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpconflictsi_512_mask(
(__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_conflict_epi64(__m512i __A) {
return (__m512i)__builtin_ia32_vpconflictdi_512_mask(
(__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_conflict_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpconflictdi_512_mask((__v8di)__A, (__v8di)__W,
(__mmask8)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_conflict_epi64(__mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpconflictdi_512_mask(
(__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_lzcnt_epi64(__m512i __A) {
return (__m512i)__builtin_ia32_vplzcntq_512_mask(
(__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vplzcntq_512_mask((__v8di)__A, (__v8di)__W,
(__mmask8)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vplzcntq_512_mask(
(__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_lzcnt_epi32(__m512i __A) {
return (__m512i)__builtin_ia32_vplzcntd_512_mask(
(__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vplzcntd_512_mask((__v16si)__A, (__v16si)__W,
(__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vplzcntd_512_mask(
(__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastmb_epi64(__mmask8 __A) {
return (__m512i)__builtin_ia32_broadcastmb512(__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_broadcastmw_epi32(__mmask16 __A) {
return (__m512i)__builtin_ia32_broadcastmw512(__A);
}
#ifdef __DISABLE_AVX512CD__
#undef __DISABLE_AVX512CD__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512CD__ */
#endif /* _AVX512CDINTRIN_H_INCLUDED */

File diff suppressed because it is too large

third_party/intel/avx512erintrin.internal.h vendored Normal file

@@ -0,0 +1,314 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512ERINTRIN_H_INCLUDED
#define _AVX512ERINTRIN_H_INCLUDED
#ifndef __AVX512ER__
#pragma GCC push_options
#pragma GCC target("avx512er")
#define __DISABLE_AVX512ER__
#endif /* __AVX512ER__ */
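/*
 * AVX512ER (Xeon Phi) approximation instructions: exp2a23 computes 2^x
 * to about 23 bits of accuracy, while rcp28 and rsqrt28 compute the
 * reciprocal and reciprocal square root with relative error bounded by
 * 2^-28. The __R argument selects rounding/SAE behavior.
 */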
typedef double __v8df __attribute__((__vector_size__(64)));
typedef float __v16sf __attribute__((__vector_size__(64)));
typedef float __m512 __attribute__((__vector_size__(64), __may_alias__));
typedef double __m512d __attribute__((__vector_size__(64), __may_alias__));
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
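/*
 * The rounding argument must reduce to a compile-time constant, so the
 * inline-function forms exist only under __OPTIMIZE__; without
 * optimization the macro forms in the #else branch are used instead.
 */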
#ifdef __OPTIMIZE__
extern __inline __m512d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_exp2a23_round_pd(__m512d __A, int __R) {
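  /* __W is deliberately left uninitialized in this and the other
     unmasked wrappers below: the all-ones mask means the pass-through
     operand is never consumed. */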
__m512d __W;
return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W,
(__mmask8)-1, __R);
}
extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_exp2a23_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) {
return (__m512d)__builtin_ia32_exp2pd_mask((__v8df)__A, (__v8df)__W,
(__mmask8)__U, __R);
}
extern __inline __m512d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_exp2a23_round_pd(__mmask8 __U, __m512d __A, int __R) {
return (__m512d)__builtin_ia32_exp2pd_mask(
(__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_exp2a23_round_ps(__m512 __A, int __R) {
__m512 __W;
return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W,
(__mmask16)-1, __R);
}
extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_exp2a23_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) {
return (__m512)__builtin_ia32_exp2ps_mask((__v16sf)__A, (__v16sf)__W,
(__mmask16)__U, __R);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_exp2a23_round_ps(__mmask16 __U, __m512 __A, int __R) {
return (__m512)__builtin_ia32_exp2ps_mask(
(__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R);
}
extern __inline __m512d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp28_round_pd(__m512d __A, int __R) {
__m512d __W;
return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W,
(__mmask8)-1, __R);
}
extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_rcp28_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) {
return (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)__A, (__v8df)__W,
(__mmask8)__U, __R);
}
extern __inline __m512d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rcp28_round_pd(__mmask8 __U, __m512d __A, int __R) {
return (__m512d)__builtin_ia32_rcp28pd_mask(
(__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rcp28_round_ps(__m512 __A, int __R) {
__m512 __W;
return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W,
(__mmask16)-1, __R);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_rcp28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) {
return (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)__A, (__v16sf)__W,
(__mmask16)__U, __R);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rcp28_round_ps(__mmask16 __U, __m512 __A, int __R) {
return (__m512)__builtin_ia32_rcp28ps_mask(
(__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp28_round_sd(__m128d __A, __m128d __B, int __R) {
return (__m128d)__builtin_ia32_rcp28sd_round((__v2df)__B, (__v2df)__A, __R);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rcp28_round_ss(__m128 __A, __m128 __B, int __R) {
return (__m128)__builtin_ia32_rcp28ss_round((__v4sf)__B, (__v4sf)__A, __R);
}
extern __inline __m512d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt28_round_pd(__m512d __A, int __R) {
__m512d __W;
return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W,
(__mmask8)-1, __R);
}
extern __inline __m512d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_rsqrt28_round_pd(__m512d __W, __mmask8 __U, __m512d __A, int __R) {
return (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)__A, (__v8df)__W,
(__mmask8)__U, __R);
}
extern __inline __m512d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rsqrt28_round_pd(__mmask8 __U, __m512d __A, int __R) {
return (__m512d)__builtin_ia32_rsqrt28pd_mask(
(__v8df)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, __R);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_rsqrt28_round_ps(__m512 __A, int __R) {
__m512 __W;
return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W,
(__mmask16)-1, __R);
}
extern __inline __m512 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_rsqrt28_round_ps(__m512 __W, __mmask16 __U, __m512 __A, int __R) {
return (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)__A, (__v16sf)__W,
(__mmask16)__U, __R);
}
extern __inline __m512
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_rsqrt28_round_ps(__mmask16 __U, __m512 __A, int __R) {
return (__m512)__builtin_ia32_rsqrt28ps_mask(
(__v16sf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, __R);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt28_round_sd(__m128d __A, __m128d __B, int __R) {
return (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)__B, (__v2df)__A, __R);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_rsqrt28_round_ss(__m128 __A, __m128 __B, int __R) {
return (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)__B, (__v4sf)__A, __R);
}
#else
#define _mm512_exp2a23_round_pd(A, C) \
__builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \
__builtin_ia32_exp2pd_mask(A, W, U, C)
#define _mm512_maskz_exp2a23_round_pd(U, A, C) \
__builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
#define _mm512_exp2a23_round_ps(A, C) \
__builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \
__builtin_ia32_exp2ps_mask(A, W, U, C)
#define _mm512_maskz_exp2a23_round_ps(U, A, C) \
__builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
#define _mm512_rcp28_round_pd(A, C) \
__builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
#define _mm512_mask_rcp28_round_pd(W, U, A, C) \
__builtin_ia32_rcp28pd_mask(A, W, U, C)
#define _mm512_maskz_rcp28_round_pd(U, A, C) \
__builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
#define _mm512_rcp28_round_ps(A, C) \
__builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
#define _mm512_mask_rcp28_round_ps(W, U, A, C) \
__builtin_ia32_rcp28ps_mask(A, W, U, C)
#define _mm512_maskz_rcp28_round_ps(U, A, C) \
__builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
#define _mm512_rsqrt28_round_pd(A, C) \
__builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \
__builtin_ia32_rsqrt28pd_mask(A, W, U, C)
#define _mm512_maskz_rsqrt28_round_pd(U, A, C) \
__builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
#define _mm512_rsqrt28_round_ps(A, C) \
__builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \
__builtin_ia32_rsqrt28ps_mask(A, W, U, C)
#define _mm512_maskz_rsqrt28_round_ps(U, A, C) \
__builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
#define _mm_rcp28_round_sd(A, B, R) __builtin_ia32_rcp28sd_round(A, B, R)
#define _mm_rcp28_round_ss(A, B, R) __builtin_ia32_rcp28ss_round(A, B, R)
#define _mm_rsqrt28_round_sd(A, B, R) __builtin_ia32_rsqrt28sd_round(A, B, R)
#define _mm_rsqrt28_round_ss(A, B, R) __builtin_ia32_rsqrt28ss_round(A, B, R)
#endif
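/*
 * Convenience forms without an explicit rounding argument: they pass
 * _MM_FROUND_CUR_DIRECTION, i.e. round according to the current MXCSR
 * rounding mode.
 */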
#define _mm512_exp2a23_pd(A) \
_mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_pd(W, U, A) \
_mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_pd(U, A) \
_mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_exp2a23_ps(A) \
_mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_exp2a23_ps(W, U, A) \
_mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_exp2a23_ps(U, A) \
_mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_pd(A) _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_pd(W, U, A) \
_mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_pd(U, A) \
_mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_rcp28_ps(A) _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rcp28_ps(W, U, A) \
_mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rcp28_ps(U, A) \
_mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_pd(A) \
_mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_pd(W, U, A) \
_mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_pd(U, A) \
_mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_rsqrt28_ps(A) \
_mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_mask_rsqrt28_ps(W, U, A) \
_mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm512_maskz_rsqrt28_ps(U, A) \
_mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_sd(A, B) \
__builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
#define _mm_rcp28_ss(A, B) \
__builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_sd(A, B) \
__builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
#define _mm_rsqrt28_ss(A, B) \
__builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
#ifdef __DISABLE_AVX512ER__
#undef __DISABLE_AVX512ER__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512ER__ */
#endif /* _AVX512ERINTRIN_H_INCLUDED */

12519 third_party/intel/avx512fintrin.internal.h vendored Normal file
File diff suppressed because it is too large

third_party/intel/avx512ifmaintrin.internal.h vendored Normal file

@@ -0,0 +1,65 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512IFMAINTRIN_H_INCLUDED
#define _AVX512IFMAINTRIN_H_INCLUDED
#ifndef __AVX512IFMA__
#pragma GCC push_options
#pragma GCC target("avx512ifma")
#define __DISABLE_AVX512IFMA__
#endif /* __AVX512IFMA__ */
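/*
 * AVX512IFMA 52-bit integer fused multiply-add: the low 52 bits of
 * each 64-bit lane of __Y and __Z are multiplied, and the low
 * (madd52lo) or high (madd52hi) 52 bits of the 104-bit product are
 * added to the 64-bit accumulator lanes of __X.
 */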
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
return (__m512i)__builtin_ia32_vpmadd52luq512_mask((__v8di)__X, (__v8di)__Y,
(__v8di)__Z, (__mmask8)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
return (__m512i)__builtin_ia32_vpmadd52huq512_mask((__v8di)__X, (__v8di)__Y,
(__v8di)__Z, (__mmask8)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X,
__m512i __Y) {
return (__m512i)__builtin_ia32_vpmadd52luq512_mask(
(__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X,
__m512i __Y) {
return (__m512i)__builtin_ia32_vpmadd52huq512_mask(
(__v8di)__W, (__v8di)__X, (__v8di)__Y, (__mmask8)__M);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y,
__m512i __Z) {
return (__m512i)__builtin_ia32_vpmadd52luq512_maskz(
(__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y,
__m512i __Z) {
return (__m512i)__builtin_ia32_vpmadd52huq512_maskz(
(__v8di)__X, (__v8di)__Y, (__v8di)__Z, (__mmask8)__M);
}
#ifdef __DISABLE_AVX512IFMA__
#undef __DISABLE_AVX512IFMA__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512IFMA__ */
#endif /* _AVX512IFMAINTRIN_H_INCLUDED */

third_party/intel/avx512ifmavlintrin.internal.h vendored Normal file

@@ -0,0 +1,108 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512IFMAVLINTRIN_H_INCLUDED
#define _AVX512IFMAVLINTRIN_H_INCLUDED
#if !defined(__AVX512VL__) || !defined(__AVX512IFMA__)
#pragma GCC push_options
#pragma GCC target("avx512ifma,avx512vl")
#define __DISABLE_AVX512IFMAVL__
#endif /* __AVX512IFMAVL__ */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52luq128_mask((__v2di)__X, (__v2di)__Y,
(__v2di)__Z, (__mmask8)-1);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52huq128_mask((__v2di)__X, (__v2di)__Y,
(__v2di)__Z, (__mmask8)-1);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52luq256_mask((__v4di)__X, (__v4di)__Y,
(__v4di)__Z, (__mmask8)-1);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52huq256_mask((__v4di)__X, (__v4di)__Y,
(__v4di)__Z, (__mmask8)-1);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_vpmadd52luq128_mask(
(__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_vpmadd52huq128_mask(
(__v2di)__W, (__v2di)__X, (__v2di)__Y, (__mmask8)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y) {
return (__m256i)__builtin_ia32_vpmadd52luq256_mask(
(__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X,
__m256i __Y) {
return (__m256i)__builtin_ia32_vpmadd52huq256_mask(
(__v4di)__W, (__v4di)__X, (__v4di)__Y, (__mmask8)__M);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52luq128_maskz(
(__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
return (__m128i)__builtin_ia32_vpmadd52huq128_maskz(
(__v2di)__X, (__v2di)__Y, (__v2di)__Z, (__mmask8)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y,
__m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52luq256_maskz(
(__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y,
__m256i __Z) {
return (__m256i)__builtin_ia32_vpmadd52huq256_maskz(
(__v4di)__X, (__v4di)__Y, (__v4di)__Z, (__mmask8)__M);
}
#ifdef __DISABLE_AVX512IFMAVL__
#undef __DISABLE_AVX512IFMAVL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512IFMAVL__ */
#endif /* _AVX512IFMAVLINTRIN_H_INCLUDED */

third_party/intel/avx512pfintrin.internal.h vendored Normal file

@@ -0,0 +1,221 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512PFINTRIN_H_INCLUDED
#define _AVX512PFINTRIN_H_INCLUDED
#ifndef __AVX512PF__
#pragma GCC push_options
#pragma GCC target("avx512pf")
#define __DISABLE_AVX512PF__
#endif /* __AVX512PF__ */
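/*
 * AVX512PF prefetch instructions for gather/scatter access patterns:
 * each active index is scaled by __scale (1, 2, 4, or 8), added to
 * __addr, and the resulting cache line is prefetched per __hint
 * (_MM_HINT_T0 or _MM_HINT_T1). Scale and hint must be compile-time
 * constants, hence the __OPTIMIZE__ gating below.
 */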
typedef long long __v8di __attribute__((__vector_size__(64)));
typedef int __v16si __attribute__((__vector_size__(64)));
typedef long long __m512i __attribute__((__vector_size__(64), __may_alias__));
typedef unsigned char __mmask8;
typedef unsigned short __mmask16;
#ifdef __OPTIMIZE__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32gather_pd(__m256i __index, void const *__addr,
int __scale, int __hint) {
__builtin_ia32_gatherpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale,
__hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32gather_ps(__m512i __index, void const *__addr,
int __scale, int __hint) {
__builtin_ia32_gatherpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr,
__scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32gather_pd(__m256i __index, __mmask8 __mask,
void const *__addr, int __scale,
int __hint) {
__builtin_ia32_gatherpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32gather_ps(__m512i __index, __mmask16 __mask,
void const *__addr, int __scale,
int __hint) {
__builtin_ia32_gatherpfdps(__mask, (__v16si)__index, __addr, __scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64gather_pd(__m512i __index, void const *__addr,
int __scale, int __hint) {
__builtin_ia32_gatherpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale,
__hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64gather_ps(__m512i __index, void const *__addr,
int __scale, int __hint) {
__builtin_ia32_gatherpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale,
__hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64gather_pd(__m512i __index, __mmask8 __mask,
void const *__addr, int __scale,
int __hint) {
__builtin_ia32_gatherpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64gather_ps(__m512i __index, __mmask8 __mask,
void const *__addr, int __scale,
int __hint) {
__builtin_ia32_gatherpfqps(__mask, (__v8di)__index, __addr, __scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32scatter_pd(void *__addr, __m256i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfdpd((__mmask8)0xFF, (__v8si)__index, __addr, __scale,
__hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i32scatter_ps(void *__addr, __m512i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfdps((__mmask16)0xFFFF, (__v16si)__index, __addr,
__scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32scatter_pd(void *__addr, __mmask8 __mask,
__m256i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfdpd(__mask, (__v8si)__index, __addr, __scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i32scatter_ps(void *__addr, __mmask16 __mask,
__m512i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfdps(__mask, (__v16si)__index, __addr, __scale,
__hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64scatter_pd(void *__addr, __m512i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfqpd((__mmask8)0xFF, (__v8di)__index, __addr, __scale,
__hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_prefetch_i64scatter_ps(void *__addr, __m512i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfqps((__mmask8)0xFF, (__v8di)__index, __addr, __scale,
__hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64scatter_pd(void *__addr, __mmask8 __mask,
__m512i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfqpd(__mask, (__v8di)__index, __addr, __scale, __hint);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_prefetch_i64scatter_ps(void *__addr, __mmask8 __mask,
__m512i __index, int __scale,
int __hint) {
__builtin_ia32_scatterpfqps(__mask, (__v8di)__index, __addr, __scale, __hint);
}
#else
#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT) \
__builtin_ia32_gatherpfdpd((__mmask8)0xFF, (__v8si)(__m256i)INDEX, \
(void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT) \
__builtin_ia32_gatherpfdps((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, \
(void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \
__builtin_ia32_gatherpfdpd((__mmask8)MASK, (__v8si)(__m256i)INDEX, \
(void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \
__builtin_ia32_gatherpfdps((__mmask16)MASK, (__v16si)(__m512i)INDEX, \
(void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT) \
  __builtin_ia32_gatherpfqpd((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \
                             (void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT) \
  __builtin_ia32_gatherpfqps((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \
                             (void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \
  __builtin_ia32_gatherpfqpd((__mmask8)MASK, (__v8di)(__m512i)INDEX, \
                             (void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \
  __builtin_ia32_gatherpfqps((__mmask8)MASK, (__v8di)(__m512i)INDEX, \
                             (void const *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfdpd((__mmask8)0xFF, (__v8si)(__m256i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfdps((__mmask16)0xFFFF, (__v16si)(__m512i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfdpd((__mmask8)MASK, (__v8si)(__m256i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfdps((__mmask16)MASK, (__v16si)(__m512i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfqpd((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfqps((__mmask8)0xFF, (__v8di)(__m512i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfqpd((__mmask8)MASK, (__v8di)(__m512i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \
__builtin_ia32_scatterpfqps((__mmask8)MASK, (__v8di)(__m512i)INDEX, \
(void *)ADDR, (int)SCALE, (int)HINT)
#endif
#ifdef __DISABLE_AVX512PF__
#undef __DISABLE_AVX512PF__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512PF__ */
#endif /* _AVX512PFINTRIN_H_INCLUDED */

third_party/intel/avx512vbmi2intrin.internal.h vendored Normal file

@@ -0,0 +1,455 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VBMI2INTRIN_H_INCLUDED
#define __AVX512VBMI2INTRIN_H_INCLUDED
#if !defined(__AVX512VBMI2__)
#pragma GCC push_options
#pragma GCC target("avx512vbmi2")
#define __DISABLE_AVX512VBMI2__
#endif /* __AVX512VBMI2__ */
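/*
 * AVX512VBMI2 funnel shifts: shldi/shrdi concatenate corresponding
 * elements of the two sources into a double-width value, shift it
 * left/right by the immediate count, and keep one half. The count
 * must be an immediate, so the inline forms are gated on __OPTIMIZE__.
 */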
#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdi_epi16(__m512i __A, __m512i __B, int __C) {
return (__m512i)__builtin_ia32_vpshrd_v32hi((__v32hi)__A, (__v32hi)__B, __C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdi_epi32(__m512i __A, __m512i __B, int __C) {
return (__m512i)__builtin_ia32_vpshrd_v16si((__v16si)__A, (__v16si)__B, __C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdi_epi32(__m512i __A, __mmask16 __B, __m512i __C,
__m512i __D, int __E) {
return (__m512i)__builtin_ia32_vpshrd_v16si_mask(
(__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdi_epi32(__mmask16 __A, __m512i __B, __m512i __C, int __D) {
return (__m512i)__builtin_ia32_vpshrd_v16si_mask(
(__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(),
(__mmask16)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdi_epi64(__m512i __A, __m512i __B, int __C) {
return (__m512i)__builtin_ia32_vpshrd_v8di((__v8di)__A, (__v8di)__B, __C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdi_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
int __E) {
return (__m512i)__builtin_ia32_vpshrd_v8di_mask((__v8di)__C, (__v8di)__D, __E,
(__v8di)__A, (__mmask8)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdi_epi64(__mmask8 __A, __m512i __B, __m512i __C, int __D) {
return (__m512i)__builtin_ia32_vpshrd_v8di_mask(
(__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(),
(__mmask8)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldi_epi16(__m512i __A, __m512i __B, int __C) {
return (__m512i)__builtin_ia32_vpshld_v32hi((__v32hi)__A, (__v32hi)__B, __C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldi_epi32(__m512i __A, __m512i __B, int __C) {
return (__m512i)__builtin_ia32_vpshld_v16si((__v16si)__A, (__v16si)__B, __C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldi_epi32(__m512i __A, __mmask16 __B, __m512i __C,
__m512i __D, int __E) {
return (__m512i)__builtin_ia32_vpshld_v16si_mask(
(__v16si)__C, (__v16si)__D, __E, (__v16si)__A, (__mmask16)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldi_epi32(__mmask16 __A, __m512i __B, __m512i __C, int __D) {
return (__m512i)__builtin_ia32_vpshld_v16si_mask(
(__v16si)__B, (__v16si)__C, __D, (__v16si)_mm512_setzero_si512(),
(__mmask16)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldi_epi64(__m512i __A, __m512i __B, int __C) {
return (__m512i)__builtin_ia32_vpshld_v8di((__v8di)__A, (__v8di)__B, __C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldi_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
int __E) {
return (__m512i)__builtin_ia32_vpshld_v8di_mask((__v8di)__C, (__v8di)__D, __E,
(__v8di)__A, (__mmask8)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldi_epi64(__mmask8 __A, __m512i __B, __m512i __C, int __D) {
return (__m512i)__builtin_ia32_vpshld_v8di_mask(
(__v8di)__B, (__v8di)__C, __D, (__v8di)_mm512_setzero_si512(),
(__mmask8)__A);
}
#else
#define _mm512_shrdi_epi16(A, B, C) \
  ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), \
    (__v32hi)(__m512i)(B), (int)(C)))
#define _mm512_shrdi_epi32(A, B, C) \
  ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), \
    (__v16si)(__m512i)(B), (int)(C)))
#define _mm512_mask_shrdi_epi32(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), \
    (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A), (__mmask16)(B)))
#define _mm512_maskz_shrdi_epi32(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), \
    (__v16si)(__m512i)(C), (int)(D), \
    (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)))
#define _mm512_shrdi_epi64(A, B, C) \
  ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), \
    (__v8di)(__m512i)(B), (int)(C)))
#define _mm512_mask_shrdi_epi64(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), \
    (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A), (__mmask8)(B)))
#define _mm512_maskz_shrdi_epi64(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), \
    (__v8di)(__m512i)(C), (int)(D), \
    (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)))
#define _mm512_shldi_epi16(A, B, C) \
  ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), \
    (__v32hi)(__m512i)(B), (int)(C)))
#define _mm512_shldi_epi32(A, B, C) \
  ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), \
    (__v16si)(__m512i)(B), (int)(C)))
#define _mm512_mask_shldi_epi32(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), \
    (__v16si)(__m512i)(D), (int)(E), (__v16si)(__m512i)(A), (__mmask16)(B)))
#define _mm512_maskz_shldi_epi32(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), \
    (__v16si)(__m512i)(C), (int)(D), \
    (__v16si)(__m512i)_mm512_setzero_si512 (), (__mmask16)(A)))
#define _mm512_shldi_epi64(A, B, C) \
  ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), \
    (__v8di)(__m512i)(B), (int)(C)))
#define _mm512_mask_shldi_epi64(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), \
    (__v8di)(__m512i)(D), (int)(E), (__v8di)(__m512i)(A), (__mmask8)(B)))
#define _mm512_maskz_shldi_epi64(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), \
    (__v8di)(__m512i)(C), (int)(D), \
    (__v8di)(__m512i)_mm512_setzero_si512 (), (__mmask8)(A)))
#endif
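/*
 * Variable-count funnel shifts: the same double-width concatenation as
 * above, but each element is shifted by the count held in the matching
 * element of __C.
 */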
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpshrdv_v32hi((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpshrdv_v16si((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshrdv_v16si_mask(
(__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_maskz_shrdv_epi32(__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpshrdv_v8di((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshrdv_v8di_mask((__v8di)__A, (__v8di)__C,
(__v8di)__D, (__mmask8)__B);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_maskz_shrdv_epi64(__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz((__v8di)__B, (__v8di)__C,
(__v8di)__D, (__mmask8)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpshldv_v32hi((__v32hi)__A, (__v32hi)__B,
(__v32hi)__C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpshldv_v16si((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshldv_v16si_mask(
(__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_maskz_shldv_epi32(__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshldv_v16si_maskz(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpshldv_v8di((__v8di)__A, (__v8di)__B,
(__v8di)__C);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshldv_v8di_mask((__v8di)__A, (__v8di)__C,
(__v8di)__D, (__mmask8)__B);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_maskz_shldv_epi64(__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshldv_v8di_maskz((__v8di)__B, (__v8di)__C,
(__v8di)__D, (__mmask8)__A);
}
#ifdef __DISABLE_AVX512VBMI2__
#undef __DISABLE_AVX512VBMI2__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMI2__ */
#if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512vbmi2,avx512bw")
#define __DISABLE_AVX512VBMI2BW__
#endif /* __AVX512VBMI2BW__ */
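/*
 * Byte/word compress and expand: compress packs the elements selected
 * by the mask contiguously into the low end of the destination (or to
 * memory for compressstoreu); expand performs the inverse, unpacking a
 * contiguous source into the masked positions.
 */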
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_epi8(__m512i __A, __mmask64 __B, __m512i __C) {
return (__m512i)__builtin_ia32_compressqi512_mask((__v64qi)__C, (__v64qi)__A,
(__mmask64)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_epi8(__mmask64 __A, __m512i __B) {
return (__m512i)__builtin_ia32_compressqi512_mask(
(__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_epi8(void *__A, __mmask64 __B, __m512i __C) {
__builtin_ia32_compressstoreuqi512_mask((__v64qi *)__A, (__v64qi)__C,
(__mmask64)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compress_epi16(__m512i __A, __mmask32 __B, __m512i __C) {
return (__m512i)__builtin_ia32_compresshi512_mask((__v32hi)__C, (__v32hi)__A,
(__mmask32)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_compress_epi16(__mmask32 __A, __m512i __B) {
return (__m512i)__builtin_ia32_compresshi512_mask(
(__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_compressstoreu_epi16(void *__A, __mmask32 __B, __m512i __C) {
__builtin_ia32_compressstoreuhi512_mask((__v32hi *)__A, (__v32hi)__C,
(__mmask32)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_epi8(__m512i __A, __mmask64 __B, __m512i __C) {
return (__m512i)__builtin_ia32_expandqi512_mask((__v64qi)__C, (__v64qi)__A,
(__mmask64)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_epi8(__mmask64 __A, __m512i __B) {
return (__m512i)__builtin_ia32_expandqi512_maskz(
(__v64qi)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_epi8(__m512i __A, __mmask64 __B, const void *__C) {
return (__m512i)__builtin_ia32_expandloadqi512_mask(
(const __v64qi *)__C, (__v64qi)__A, (__mmask64)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_epi8(__mmask64 __A, const void *__B) {
return (__m512i)__builtin_ia32_expandloadqi512_maskz(
(const __v64qi *)__B, (__v64qi)_mm512_setzero_si512(), (__mmask64)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expand_epi16(__m512i __A, __mmask32 __B, __m512i __C) {
return (__m512i)__builtin_ia32_expandhi512_mask((__v32hi)__C, (__v32hi)__A,
(__mmask32)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expand_epi16(__mmask32 __A, __m512i __B) {
return (__m512i)__builtin_ia32_expandhi512_maskz(
(__v32hi)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_expandloadu_epi16(__m512i __A, __mmask32 __B, const void *__C) {
return (__m512i)__builtin_ia32_expandloadhi512_mask(
(const __v32hi *)__C, (__v32hi)__A, (__mmask32)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_expandloadu_epi16(__mmask32 __A, const void *__B) {
return (__m512i)__builtin_ia32_expandloadhi512_maskz(
(const __v32hi *)__B, (__v32hi)_mm512_setzero_si512(), (__mmask32)__A);
}
#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shrdi_epi16(__m512i __A, __mmask32 __B, __m512i __C,
__m512i __D, int __E) {
return (__m512i)__builtin_ia32_vpshrd_v32hi_mask(
(__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shrdi_epi16(__mmask32 __A, __m512i __B, __m512i __C, int __D) {
return (__m512i)__builtin_ia32_vpshrd_v32hi_mask(
(__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(),
(__mmask32)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_shldi_epi16(__m512i __A, __mmask32 __B, __m512i __C,
__m512i __D, int __E) {
return (__m512i)__builtin_ia32_vpshld_v32hi_mask(
(__v32hi)__C, (__v32hi)__D, __E, (__v32hi)__A, (__mmask32)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_shldi_epi16(__mmask32 __A, __m512i __B, __m512i __C, int __D) {
return (__m512i)__builtin_ia32_vpshld_v32hi_mask(
(__v32hi)__B, (__v32hi)__C, __D, (__v32hi)_mm512_setzero_si512(),
(__mmask32)__A);
}
#else
#define _mm512_mask_shrdi_epi16(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), \
    (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A), (__mmask32)(B)))
#define _mm512_maskz_shrdi_epi16(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), \
    (__v32hi)(__m512i)(C), (int)(D), \
    (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)))
#define _mm512_mask_shldi_epi16(A, B, C, D, E) \
  ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), \
    (__v32hi)(__m512i)(D), (int)(E), (__v32hi)(__m512i)(A), (__mmask32)(B)))
#define _mm512_maskz_shldi_epi16(A, B, C, D) \
  ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), \
    (__v32hi)(__m512i)(C), (int)(D), \
    (__v32hi)(__m512i)_mm512_setzero_si512 (), (__mmask32)(A)))
#endif
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask(
(__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_maskz_shrdv_epi16(__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz(
(__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshldv_v32hi_mask(
(__v32hi)__A, (__v32hi)__C, (__v32hi)__D, (__mmask32)__B);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_maskz_shldv_epi16(__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz(
(__v32hi)__B, (__v32hi)__C, (__v32hi)__D, (__mmask32)__A);
}
#ifdef __DISABLE_AVX512VBMI2BW__
#undef __DISABLE_AVX512VBMI2BW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMI2BW__ */
#endif /* __AVX512VBMI2INTRIN_H_INCLUDED */

third_party/intel/avx512vbmi2vlintrin.internal.h vendored Normal file

@@ -0,0 +1,866 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512vbmi2vlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED
#define _AVX512VBMI2VLINTRIN_H_INCLUDED
#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__)
#pragma GCC push_options
#pragma GCC target("avx512vbmi2,avx512vl")
#define __DISABLE_AVX512VBMI2VL__
#endif /* __AVX512VBMI2VL__ */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compress_epi8(__m128i __A, __mmask16 __B, __m128i __C) {
return (__m128i)__builtin_ia32_compressqi128_mask((__v16qi)__C, (__v16qi)__A,
(__mmask16)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_compress_epi8(__mmask16 __A, __m128i __B) {
return (__m128i)__builtin_ia32_compressqi128_mask(
(__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compressstoreu_epi16(void *__A, __mmask16 __B, __m256i __C) {
__builtin_ia32_compressstoreuhi256_mask((__v16hi *)__A, (__v16hi)__C,
(__mmask16)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compress_epi16(__m128i __A, __mmask8 __B, __m128i __C) {
return (__m128i)__builtin_ia32_compresshi128_mask((__v8hi)__C, (__v8hi)__A,
(__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_compress_epi16(__mmask8 __A, __m128i __B) {
return (__m128i)__builtin_ia32_compresshi128_mask(
(__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compress_epi16(__m256i __A, __mmask16 __B, __m256i __C) {
return (__m256i)__builtin_ia32_compresshi256_mask((__v16hi)__C, (__v16hi)__A,
(__mmask16)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_compress_epi16(__mmask16 __A, __m256i __B) {
return (__m256i)__builtin_ia32_compresshi256_mask(
(__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compressstoreu_epi8(void *__A, __mmask16 __B, __m128i __C) {
__builtin_ia32_compressstoreuqi128_mask((__v16qi *)__A, (__v16qi)__C,
(__mmask16)__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_compressstoreu_epi16(void *__A, __mmask8 __B, __m128i __C) {
__builtin_ia32_compressstoreuhi128_mask((__v8hi *)__A, (__v8hi)__C,
(__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expand_epi8(__m128i __A, __mmask16 __B, __m128i __C) {
return (__m128i)__builtin_ia32_expandqi128_mask((__v16qi)__C, (__v16qi)__A,
(__mmask16)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expand_epi8(__mmask16 __A, __m128i __B) {
return (__m128i)__builtin_ia32_expandqi128_maskz(
(__v16qi)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expandloadu_epi8(__m128i __A, __mmask16 __B, const void *__C) {
return (__m128i)__builtin_ia32_expandloadqi128_mask(
(const __v16qi *)__C, (__v16qi)__A, (__mmask16)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expandloadu_epi8(__mmask16 __A, const void *__B) {
return (__m128i)__builtin_ia32_expandloadqi128_maskz(
(const __v16qi *)__B, (__v16qi)_mm_setzero_si128(), (__mmask16)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expand_epi16(__m128i __A, __mmask8 __B, __m128i __C) {
return (__m128i)__builtin_ia32_expandhi128_mask((__v8hi)__C, (__v8hi)__A,
(__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expand_epi16(__mmask8 __A, __m128i __B) {
return (__m128i)__builtin_ia32_expandhi128_maskz(
(__v8hi)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_expandloadu_epi16(__m128i __A, __mmask8 __B, const void *__C) {
return (__m128i)__builtin_ia32_expandloadhi128_mask(
(const __v8hi *)__C, (__v8hi)__A, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_expandloadu_epi16(__mmask8 __A, const void *__B) {
return (__m128i)__builtin_ia32_expandloadhi128_maskz(
(const __v8hi *)__B, (__v8hi)_mm_setzero_si128(), (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expand_epi16(__m256i __A, __mmask16 __B, __m256i __C) {
return (__m256i)__builtin_ia32_expandhi256_mask((__v16hi)__C, (__v16hi)__A,
(__mmask16)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expand_epi16(__mmask16 __A, __m256i __B) {
return (__m256i)__builtin_ia32_expandhi256_maskz(
(__v16hi)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expandloadu_epi16(__m256i __A, __mmask16 __B, const void *__C) {
return (__m256i)__builtin_ia32_expandloadhi256_mask(
(const __v16hi *)__C, (__v16hi)__A, (__mmask16)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expandloadu_epi16(__mmask16 __A, const void *__B) {
return (__m256i)__builtin_ia32_expandloadhi256_maskz(
(const __v16hi *)__B, (__v16hi)_mm256_setzero_si256(), (__mmask16)__A);
}
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdi_epi16(__m256i __A, __m256i __B, int __C) {
return (__m256i)__builtin_ia32_vpshrd_v16hi((__v16hi)__A, (__v16hi)__B, __C);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdi_epi16(__m256i __A, __mmask16 __B, __m256i __C,
__m256i __D, int __E) {
return (__m256i)__builtin_ia32_vpshrd_v16hi_mask(
(__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdi_epi16(__mmask16 __A, __m256i __B, __m256i __C, int __D) {
return (__m256i)__builtin_ia32_vpshrd_v16hi_mask(
(__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(),
(__mmask16)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdi_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
int __E) {
return (__m256i)__builtin_ia32_vpshrd_v8si_mask((__v8si)__C, (__v8si)__D, __E,
(__v8si)__A, (__mmask8)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdi_epi32(__mmask8 __A, __m256i __B, __m256i __C, int __D) {
return (__m256i)__builtin_ia32_vpshrd_v8si_mask(
(__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(),
(__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdi_epi32(__m256i __A, __m256i __B, int __C) {
return (__m256i)__builtin_ia32_vpshrd_v8si((__v8si)__A, (__v8si)__B, __C);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shrdi_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
int __E) {
return (__m256i)__builtin_ia32_vpshrd_v4di_mask((__v4di)__C, (__v4di)__D, __E,
(__v4di)__A, (__mmask8)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shrdi_epi64(__mmask8 __A, __m256i __B, __m256i __C, int __D) {
return (__m256i)__builtin_ia32_vpshrd_v4di_mask(
(__v4di)__B, (__v4di)__C, __D, (__v4di)_mm256_setzero_si256(),
(__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdi_epi64(__m256i __A, __m256i __B, int __C) {
return (__m256i)__builtin_ia32_vpshrd_v4di((__v4di)__A, (__v4di)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdi_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
int __E) {
return (__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E,
(__v8hi)__A, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdi_epi16(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
return (__m128i)__builtin_ia32_vpshrd_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D,
(__v8hi)_mm_setzero_si128(),
(__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdi_epi16(__m128i __A, __m128i __B, int __C) {
return (__m128i)__builtin_ia32_vpshrd_v8hi((__v8hi)__A, (__v8hi)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdi_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
int __E) {
return (__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__C, (__v4si)__D, __E,
(__v4si)__A, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdi_epi32(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
return (__m128i)__builtin_ia32_vpshrd_v4si_mask((__v4si)__B, (__v4si)__C, __D,
(__v4si)_mm_setzero_si128(),
(__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdi_epi32(__m128i __A, __m128i __B, int __C) {
return (__m128i)__builtin_ia32_vpshrd_v4si((__v4si)__A, (__v4si)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdi_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
int __E) {
return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__C, (__v2di)__D, __E,
(__v2di)__A, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdi_epi64(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
return (__m128i)__builtin_ia32_vpshrd_v2di_mask((__v2di)__B, (__v2di)__C, __D,
(__v2di)_mm_setzero_si128(),
(__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdi_epi64(__m128i __A, __m128i __B, int __C) {
return (__m128i)__builtin_ia32_vpshrd_v2di((__v2di)__A, (__v2di)__B, __C);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldi_epi16(__m256i __A, __m256i __B, int __C) {
return (__m256i)__builtin_ia32_vpshld_v16hi((__v16hi)__A, (__v16hi)__B, __C);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldi_epi16(__m256i __A, __mmask16 __B, __m256i __C,
__m256i __D, int __E) {
return (__m256i)__builtin_ia32_vpshld_v16hi_mask(
(__v16hi)__C, (__v16hi)__D, __E, (__v16hi)__A, (__mmask16)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldi_epi16(__mmask16 __A, __m256i __B, __m256i __C, int __D) {
return (__m256i)__builtin_ia32_vpshld_v16hi_mask(
(__v16hi)__B, (__v16hi)__C, __D, (__v16hi)_mm256_setzero_si256(),
(__mmask16)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldi_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
int __E) {
return (__m256i)__builtin_ia32_vpshld_v8si_mask((__v8si)__C, (__v8si)__D, __E,
(__v8si)__A, (__mmask8)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldi_epi32(__mmask8 __A, __m256i __B, __m256i __C, int __D) {
return (__m256i)__builtin_ia32_vpshld_v8si_mask(
(__v8si)__B, (__v8si)__C, __D, (__v8si)_mm256_setzero_si256(),
(__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldi_epi32(__m256i __A, __m256i __B, int __C) {
return (__m256i)__builtin_ia32_vpshld_v8si((__v8si)__A, (__v8si)__B, __C);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_shldi_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
int __E) {
return (__m256i)__builtin_ia32_vpshld_v4di_mask((__v4di)__C, (__v4di)__D, __E,
(__v4di)__A, (__mmask8)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_shldi_epi64(__mmask8 __A, __m256i __B, __m256i __C, int __D) {
return (__m256i)__builtin_ia32_vpshld_v4di_mask(
(__v4di)__B, (__v4di)__C, __D, (__v4di)_mm256_setzero_si256(),
(__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldi_epi64(__m256i __A, __m256i __B, int __C) {
return (__m256i)__builtin_ia32_vpshld_v4di((__v4di)__A, (__v4di)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldi_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
int __E) {
return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__C, (__v8hi)__D, __E,
(__v8hi)__A, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldi_epi16(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
return (__m128i)__builtin_ia32_vpshld_v8hi_mask((__v8hi)__B, (__v8hi)__C, __D,
(__v8hi)_mm_setzero_si128(),
(__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldi_epi16(__m128i __A, __m128i __B, int __C) {
return (__m128i)__builtin_ia32_vpshld_v8hi((__v8hi)__A, (__v8hi)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldi_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
int __E) {
return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__C, (__v4si)__D, __E,
(__v4si)__A, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldi_epi32(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
return (__m128i)__builtin_ia32_vpshld_v4si_mask((__v4si)__B, (__v4si)__C, __D,
(__v4si)_mm_setzero_si128(),
(__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldi_epi32(__m128i __A, __m128i __B, int __C) {
return (__m128i)__builtin_ia32_vpshld_v4si((__v4si)__A, (__v4si)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldi_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
int __E) {
return (__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__C, (__v2di)__D, __E,
(__v2di)__A, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldi_epi64(__mmask8 __A, __m128i __B, __m128i __C, int __D) {
return (__m128i)__builtin_ia32_vpshld_v2di_mask((__v2di)__B, (__v2di)__C, __D,
(__v2di)_mm_setzero_si128(),
(__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldi_epi64(__m128i __A, __m128i __B, int __C) {
return (__m128i)__builtin_ia32_vpshld_v2di((__v2di)__A, (__v2di)__B, __C);
}
#else
#define _mm256_shrdi_epi16(A, B, C) \
  ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), \
    (__v16hi)(__m256i)(B),(int)(C)))
#define _mm256_mask_shrdi_epi16(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), \
    (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)))
#define _mm256_maskz_shrdi_epi16(A, B, C, D) \
  ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), \
    (__v16hi)(__m256i)(C),(int)(D), \
    (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)))
#define _mm256_shrdi_epi32(A, B, C) \
  ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), \
    (__v8si)(__m256i)(B),(int)(C)))
#define _mm256_mask_shrdi_epi32(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), \
    (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)))
#define _mm256_maskz_shrdi_epi32(A, B, C, D) \
  ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), \
    (__v8si)(__m256i)(C),(int)(D), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
#define _mm256_shrdi_epi64(A, B, C) \
  ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), \
    (__v4di)(__m256i)(B),(int)(C)))
#define _mm256_mask_shrdi_epi64(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), \
    (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)))
#define _mm256_maskz_shrdi_epi64(A, B, C, D) \
  ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), \
    (__v4di)(__m256i)(C),(int)(D), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
#define _mm_shrdi_epi16(A, B, C) \
  ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), \
    (__v8hi)(__m128i)(B),(int)(C)))
#define _mm_mask_shrdi_epi16(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), \
    (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)))
#define _mm_maskz_shrdi_epi16(A, B, C, D) \
  ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), \
    (__v8hi)(__m128i)(C),(int)(D), \
    (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
#define _mm_shrdi_epi32(A, B, C) \
  ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), \
    (__v4si)(__m128i)(B),(int)(C)))
#define _mm_mask_shrdi_epi32(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), \
    (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)))
#define _mm_maskz_shrdi_epi32(A, B, C, D) \
  ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), \
    (__v4si)(__m128i)(C),(int)(D), \
    (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
#define _mm_shrdi_epi64(A, B, C) \
  ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), \
    (__v2di)(__m128i)(B),(int)(C)))
#define _mm_mask_shrdi_epi64(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), \
    (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)))
#define _mm_maskz_shrdi_epi64(A, B, C, D) \
  ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), \
    (__v2di)(__m128i)(C),(int)(D), \
    (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
#define _mm256_shldi_epi16(A, B, C) \
  ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), \
    (__v16hi)(__m256i)(B),(int)(C)))
#define _mm256_mask_shldi_epi16(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), \
    (__v16hi)(__m256i)(D), (int)(E), (__v16hi)(__m256i)(A),(__mmask16)(B)))
#define _mm256_maskz_shldi_epi16(A, B, C, D) \
  ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), \
    (__v16hi)(__m256i)(C),(int)(D), \
    (__v16hi)(__m256i)_mm256_setzero_si256 (), (__mmask16)(A)))
#define _mm256_shldi_epi32(A, B, C) \
  ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), \
    (__v8si)(__m256i)(B),(int)(C)))
#define _mm256_mask_shldi_epi32(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), \
    (__v8si)(__m256i)(D), (int)(E), (__v8si)(__m256i)(A),(__mmask8)(B)))
#define _mm256_maskz_shldi_epi32(A, B, C, D) \
  ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), \
    (__v8si)(__m256i)(C),(int)(D), \
    (__v8si)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
#define _mm256_shldi_epi64(A, B, C) \
  ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), \
    (__v4di)(__m256i)(B),(int)(C)))
#define _mm256_mask_shldi_epi64(A, B, C, D, E) \
  ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), \
    (__v4di)(__m256i)(D), (int)(E), (__v4di)(__m256i)(A),(__mmask8)(B)))
#define _mm256_maskz_shldi_epi64(A, B, C, D) \
  ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), \
    (__v4di)(__m256i)(C),(int)(D), \
    (__v4di)(__m256i)_mm256_setzero_si256 (), (__mmask8)(A)))
#define _mm_shldi_epi16(A, B, C) \
  ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), \
    (__v8hi)(__m128i)(B),(int)(C)))
#define _mm_mask_shldi_epi16(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), \
    (__v8hi)(__m128i)(D), (int)(E), (__v8hi)(__m128i)(A),(__mmask8)(B)))
#define _mm_maskz_shldi_epi16(A, B, C, D) \
  ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), \
    (__v8hi)(__m128i)(C),(int)(D), \
    (__v8hi)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
#define _mm_shldi_epi32(A, B, C) \
  ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), \
    (__v4si)(__m128i)(B),(int)(C)))
#define _mm_mask_shldi_epi32(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \
    (__v4si)(__m128i)(D), (int)(E), (__v4si)(__m128i)(A),(__mmask8)(B)))
#define _mm_maskz_shldi_epi32(A, B, C, D) \
  ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), \
    (__v4si)(__m128i)(C),(int)(D), \
    (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
#define _mm_shldi_epi64(A, B, C) \
  ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), \
    (__v2di)(__m128i)(B),(int)(C)))
#define _mm_mask_shldi_epi64(A, B, C, D, E) \
  ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), \
    (__v2di)(__m128i)(D), (int)(E), (__v2di)(__m128i)(A),(__mmask8)(B)))
#define _mm_maskz_shldi_epi64(A, B, C, D) \
  ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), \
    (__v2di)(__m128i)(C),(int)(D), \
    (__v2di)(__m128i)_mm_setzero_si128 (), (__mmask8)(A)))
#endif
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpshrdv_v16hi((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask(
(__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_shrdv_epi16(__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz(
(__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpshrdv_v8si((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshrdv_v8si_mask((__v8si)__A, (__v8si)__C,
(__v8si)__D, (__mmask8)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_shrdv_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz((__v8si)__B, (__v8si)__C,
(__v8si)__D, (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpshrdv_v4di((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshrdv_v4di_mask((__v4di)__A, (__v4di)__C,
(__v4di)__D, (__mmask8)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_shrdv_epi64(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz((__v4di)__B, (__v4di)__C,
(__v4di)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpshrdv_v8hi((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask((__v8hi)__A, (__v8hi)__C,
(__v8hi)__D, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdv_epi16(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz((__v8hi)__B, (__v8hi)__C,
(__v8hi)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpshrdv_v4si((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshrdv_v4si_mask((__v4si)__A, (__v4si)__C,
(__v4si)__D, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdv_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz((__v4si)__B, (__v4si)__C,
(__v4si)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpshrdv_v2di((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshrdv_v2di_mask((__v2di)__A, (__v2di)__C,
(__v2di)__D, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shrdv_epi64(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz((__v2di)__B, (__v2di)__C,
(__v2di)__D, (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpshldv_v16hi((__v16hi)__A, (__v16hi)__B,
(__v16hi)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshldv_v16hi_mask(
(__v16hi)__A, (__v16hi)__C, (__v16hi)__D, (__mmask16)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_shldv_epi16(__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz(
(__v16hi)__B, (__v16hi)__C, (__v16hi)__D, (__mmask16)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpshldv_v8si((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshldv_v8si_mask((__v8si)__A, (__v8si)__C,
(__v8si)__D, (__mmask8)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_shldv_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshldv_v8si_maskz((__v8si)__B, (__v8si)__C,
(__v8si)__D, (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpshldv_v4di((__v4di)__A, (__v4di)__B,
(__v4di)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshldv_v4di_mask((__v4di)__A, (__v4di)__C,
(__v4di)__D, (__mmask8)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_shldv_epi64(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpshldv_v4di_maskz((__v4di)__B, (__v4di)__C,
(__v4di)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpshldv_v8hi((__v8hi)__A, (__v8hi)__B,
(__v8hi)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldv_epi16(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshldv_v8hi_mask((__v8hi)__A, (__v8hi)__C,
(__v8hi)__D, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldv_epi16(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz((__v8hi)__B, (__v8hi)__C,
(__v8hi)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpshldv_v4si((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldv_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshldv_v4si_mask((__v4si)__A, (__v4si)__C,
(__v4si)__D, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldv_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshldv_v4si_maskz((__v4si)__B, (__v4si)__C,
(__v4si)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpshldv_v2di((__v2di)__A, (__v2di)__B,
(__v2di)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_shldv_epi64(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshldv_v2di_mask((__v2di)__A, (__v2di)__C,
(__v2di)__D, (__mmask8)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_shldv_epi64(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpshldv_v2di_maskz((__v2di)__B, (__v2di)__C,
(__v2di)__D, (__mmask8)__A);
}
#ifdef __DISABLE_AVX512VBMI2VL__
#undef __DISABLE_AVX512VBMI2VL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMI2VL__ */
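/* Illustrative sketch (not part of the vendored header): the immediate
   funnel shifts defined above. Each 16-bit lane of _mm_shldi_epi16
   computes (a << n) | (b >> (16 - n)). Assumes AVX512VL+AVX512VBMI2
   hardware (or Intel SDE) and e.g. gcc -mavx512vl -mavx512vbmi2:

     #include <immintrin.h>
     #include <stdio.h>

     int main(void) {
       __m128i a = _mm_set1_epi16(0x00FF);
       __m128i b = _mm_set1_epi16((short)0xF000);
       unsigned short out[8];
       // (0x00FF << 4) | (0xF000 >> 12) in every lane
       _mm_storeu_si128((__m128i *)out, _mm_shldi_epi16(a, b, 4));
       printf("0x%04X\n", out[0]);  // expected: 0x0FFF
       return 0;
     }
*/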
#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \
!defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("avx512vbmi2,avx512vl,avx512bw")
#define __DISABLE_AVX512VBMI2VLBW__
#endif /* __AVX512VBMI2VLBW__ */
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compress_epi8(__m256i __A, __mmask32 __B, __m256i __C) {
return (__m256i)__builtin_ia32_compressqi256_mask((__v32qi)__C, (__v32qi)__A,
(__mmask32)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_compress_epi8(__mmask32 __A, __m256i __B) {
return (__m256i)__builtin_ia32_compressqi256_mask(
(__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_compressstoreu_epi8(void *__A, __mmask32 __B, __m256i __C) {
__builtin_ia32_compressstoreuqi256_mask((__v32qi *)__A, (__v32qi)__C,
(__mmask32)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expand_epi8(__m256i __A, __mmask32 __B, __m256i __C) {
return (__m256i)__builtin_ia32_expandqi256_mask((__v32qi)__C, (__v32qi)__A,
(__mmask32)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expand_epi8(__mmask32 __A, __m256i __B) {
return (__m256i)__builtin_ia32_expandqi256_maskz(
(__v32qi)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_expandloadu_epi8(__m256i __A, __mmask32 __B, const void *__C) {
return (__m256i)__builtin_ia32_expandloadqi256_mask(
(const __v32qi *)__C, (__v32qi)__A, (__mmask32)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_expandloadu_epi8(__mmask32 __A, const void *__B) {
return (__m256i)__builtin_ia32_expandloadqi256_maskz(
(const __v32qi *)__B, (__v32qi)_mm256_setzero_si256(), (__mmask32)__A);
}
#ifdef __DISABLE_AVX512VBMI2VLBW__
#undef __DISABLE_AVX512VBMI2VLBW__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMI2VLBW__ */
#endif /* _AVX512VBMI2VLINTRIN_H_INCLUDED */
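
Illustrative usage (not part of the commit): the byte compress intrinsics
above gather the lanes whose mask bit is set into the low end of the
vector. A minimal sketch, assuming a hosted <immintrin.h>, hardware (or
Intel SDE) with AVX512VL+AVX512VBMI2+AVX512BW, and e.g.
gcc -mavx512vl -mavx512vbmi2 -mavx512bw:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char in[32], out[32];
  for (int i = 0; i < 32; ++i) in[i] = (unsigned char)i;
  __m256i v = _mm256_loadu_si256((const __m256i *)in);
  /* keep only even-indexed bytes: mask bits 0, 2, 4, ... */
  __m256i r = _mm256_maskz_compress_epi8(0x55555555u, v);
  _mm256_storeu_si256((__m256i *)out, r);
  for (int i = 0; i < 16; ++i) printf("%d ", out[i]); /* 0 2 4 ... 30 */
  printf("\n");
  return 0;
}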

View file

@ -0,0 +1,107 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VBMIINTRIN_H_INCLUDED
#define _AVX512VBMIINTRIN_H_INCLUDED
#ifndef __AVX512VBMI__
#pragma GCC push_options
#pragma GCC target("avx512vbmi")
#define __DISABLE_AVX512VBMI__
#endif /* __AVX512VBMI__ */
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
__m512i __Y) {
return (__m512i)__builtin_ia32_vpmultishiftqb512_mask(
(__v64qi)__X, (__v64qi)__Y, (__v64qi)__W, (__mmask64)__M);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_vpmultishiftqb512_mask(
(__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_setzero_si512(),
(__mmask64)__M);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) {
return (__m512i)__builtin_ia32_vpmultishiftqb512_mask(
(__v64qi)__X, (__v64qi)__Y, (__v64qi)_mm512_undefined_epi32(),
(__mmask64)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_permvarqi512_mask(
(__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_undefined_epi32(),
(__mmask64)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_permvarqi512_mask(
(__v64qi)__B, (__v64qi)__A, (__v64qi)_mm512_setzero_si512(),
(__mmask64)__M);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A,
__m512i __B) {
return (__m512i)__builtin_ia32_permvarqi512_mask(
(__v64qi)__B, (__v64qi)__A, (__v64qi)__W, (__mmask64)__M);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
  return (__m512i)__builtin_ia32_vpermt2varqi512_mask(
      (__v64qi)__I /* idx */, (__v64qi)__A, (__v64qi)__B, (__mmask64)-1);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
__m512i __B) {
  return (__m512i)__builtin_ia32_vpermt2varqi512_mask(
      (__v64qi)__I /* idx */, (__v64qi)__A, (__v64qi)__B, (__mmask64)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
__m512i __B) {
  return (__m512i)__builtin_ia32_vpermi2varqi512_mask(
      (__v64qi)__A, (__v64qi)__I /* idx */, (__v64qi)__B, (__mmask64)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
__m512i __B) {
  return (__m512i)__builtin_ia32_vpermt2varqi512_maskz(
      (__v64qi)__I /* idx */, (__v64qi)__A, (__v64qi)__B, (__mmask64)__U);
}
#ifdef __DISABLE_AVX512VBMI__
#undef __DISABLE_AVX512VBMI__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMI__ */
#endif /* _AVX512VBMIINTRIN_H_INCLUDED */
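
Illustrative usage (not part of the commit): _mm512_permutexvar_epi8 is a
full 64-byte shuffle, dst[i] = src[idx[i] & 63]. A sketch that reverses a
64-byte block, assuming AVX-512 VBMI hardware (or Intel SDE) and e.g.
gcc -mavx512vbmi:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char src[64], idx[64], out[64];
  for (int i = 0; i < 64; ++i) {
    src[i] = (unsigned char)i;
    idx[i] = (unsigned char)(63 - i); /* reversal permutation */
  }
  __m512i r = _mm512_permutexvar_epi8(_mm512_loadu_si512(idx),
                                      _mm512_loadu_si512(src));
  _mm512_storeu_si512(out, r);
  printf("%d %d\n", out[0], out[63]); /* expected: 63 0 */
  return 0;
}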

View file

@ -0,0 +1,194 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VBMIVLINTRIN_H_INCLUDED
#define _AVX512VBMIVLINTRIN_H_INCLUDED
#if !defined(__AVX512VL__) || !defined(__AVX512VBMI__)
#pragma GCC push_options
#pragma GCC target("avx512vbmi,avx512vl")
#define __DISABLE_AVX512VBMIVL__
#endif /* __AVX512VBMIVL__ */
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
__m256i __Y) {
return (__m256i)__builtin_ia32_vpmultishiftqb256_mask(
(__v32qi)__X, (__v32qi)__Y, (__v32qi)__W, (__mmask32)__M);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_vpmultishiftqb256_mask(
(__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_setzero_si256(),
(__mmask32)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) {
return (__m256i)__builtin_ia32_vpmultishiftqb256_mask(
(__v32qi)__X, (__v32qi)__Y, (__v32qi)_mm256_undefined_si256(),
(__mmask32)-1);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
__m128i __Y) {
return (__m128i)__builtin_ia32_vpmultishiftqb128_mask(
(__v16qi)__X, (__v16qi)__Y, (__v16qi)__W, (__mmask16)__M);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_vpmultishiftqb128_mask(
(__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_setzero_si128(), (__mmask16)__M);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_vpmultishiftqb128_mask(
(__v16qi)__X, (__v16qi)__Y, (__v16qi)_mm_undefined_si128(),
(__mmask16)-1);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_permvarqi256_mask(
(__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_undefined_si256(),
(__mmask32)-1);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_permvarqi256_mask(
(__v32qi)__B, (__v32qi)__A, (__v32qi)_mm256_setzero_si256(),
(__mmask32)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A,
__m256i __B) {
return (__m256i)__builtin_ia32_permvarqi256_mask(
(__v32qi)__B, (__v32qi)__A, (__v32qi)__W, (__mmask32)__M);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_permvarqi128_mask(
(__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_undefined_si128(),
(__mmask16)-1);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_permvarqi128_mask(
(__v16qi)__B, (__v16qi)__A, (__v16qi)_mm_setzero_si128(), (__mmask16)__M);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A,
__m128i __B) {
return (__m128i)__builtin_ia32_permvarqi128_mask(
(__v16qi)__B, (__v16qi)__A, (__v16qi)__W, (__mmask16)__M);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
  return (__m256i)__builtin_ia32_vpermt2varqi256_mask(
      (__v32qi)__I /* idx */, (__v32qi)__A, (__v32qi)__B, (__mmask32)-1);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
__m256i __B) {
  return (__m256i)__builtin_ia32_vpermt2varqi256_mask(
      (__v32qi)__I /* idx */, (__v32qi)__A, (__v32qi)__B, (__mmask32)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
__m256i __B) {
  return (__m256i)__builtin_ia32_vpermi2varqi256_mask(
      (__v32qi)__A, (__v32qi)__I /* idx */, (__v32qi)__B, (__mmask32)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
__m256i __B) {
  return (__m256i)__builtin_ia32_vpermt2varqi256_maskz(
      (__v32qi)__I /* idx */, (__v32qi)__A, (__v32qi)__B, (__mmask32)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
  return (__m128i)__builtin_ia32_vpermt2varqi128_mask(
      (__v16qi)__I /* idx */, (__v16qi)__A, (__v16qi)__B, (__mmask16)-1);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
__m128i __B) {
  return (__m128i)__builtin_ia32_vpermt2varqi128_mask(
      (__v16qi)__I /* idx */, (__v16qi)__A, (__v16qi)__B, (__mmask16)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
__m128i __B) {
  return (__m128i)__builtin_ia32_vpermi2varqi128_mask(
      (__v16qi)__A, (__v16qi)__I /* idx */, (__v16qi)__B, (__mmask16)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
__m128i __B) {
  return (__m128i)__builtin_ia32_vpermt2varqi128_maskz(
      (__v16qi)__I /* idx */, (__v16qi)__A, (__v16qi)__B, (__mmask16)__U);
}
#ifdef __DISABLE_AVX512VBMIVL__
#undef __DISABLE_AVX512VBMIVL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VBMIVL__ */
#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */
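
Illustrative usage (not part of the commit): permutex2var treats two
registers as one 64-entry byte table; bit 5 of each index selects the
second table. A sketch assuming AVX512VBMI+AVX512VL hardware (or Intel
SDE) and e.g. gcc -mavx512vbmi -mavx512vl:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m256i lo = _mm256_set1_epi8('a'); /* table entries  0..31 */
  __m256i hi = _mm256_set1_epi8('A'); /* table entries 32..63 */
  unsigned char idx[32], out[32];
  for (int i = 0; i < 32; ++i)
    idx[i] = (unsigned char)(i % 2 ? 32 + i : i); /* alternate tables */
  __m256i r = _mm256_permutex2var_epi8(
      lo, _mm256_loadu_si256((const __m256i *)idx), hi);
  _mm256_storeu_si256((__m256i *)out, r);
  printf("%.8s\n", (char *)out); /* expected: aAaAaAaA */
  return 0;
}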

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,109 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVX512VNNIINTRIN_H_INCLUDED
#define __AVX512VNNIINTRIN_H_INCLUDED
#if !defined(__AVX512VNNI__)
#pragma GCC push_options
#pragma GCC target("avx512vnni")
#define __DISABLE_AVX512VNNI__
#endif /* __AVX512VNNI__ */
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_dpbusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpdpbusd_v16si((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_dpbusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask(
(__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_dpbusd_epi32(__mmask16 __A, __m512i __B, __m512i __C,
__m512i __D) {
return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_dpbusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpdpbusds_v16si((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_dpbusds_epi32(__m512i __A, __mmask16 __B, __m512i __C,
__m512i __D) {
return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask(
(__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_dpbusds_epi32(__mmask16 __A, __m512i __B, __m512i __C,
__m512i __D) {
return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_dpwssd_epi32(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpdpwssd_v16si((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
extern __inline __m512i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm512_mask_dpwssd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask(
(__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_dpwssd_epi32(__mmask16 __A, __m512i __B, __m512i __C,
__m512i __D) {
return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_dpwssds_epi32(__m512i __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vpdpwssds_v16si((__v16si)__A, (__v16si)__B,
(__v16si)__C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_dpwssds_epi32(__m512i __A, __mmask16 __B, __m512i __C,
__m512i __D) {
return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask(
(__v16si)__A, (__v16si)__C, (__v16si)__D, (__mmask16)__B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_dpwssds_epi32(__mmask16 __A, __m512i __B, __m512i __C,
__m512i __D) {
return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz(
(__v16si)__B, (__v16si)__C, (__v16si)__D, (__mmask16)__A);
}
#ifdef __DISABLE_AVX512VNNI__
#undef __DISABLE_AVX512VNNI__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VNNI__ */
#endif /* __AVX512VNNIINTRIN_H_INCLUDED */
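
Illustrative usage (not part of the commit): dpbusd multiplies unsigned
bytes of the second operand with signed bytes of the third, sums each
group of four products, and accumulates into the 32-bit lanes of the
first. A sketch assuming AVX512_VNNI hardware (or Intel SDE) and e.g.
gcc -mavx512vnni:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i acc = _mm512_set1_epi32(1000);
  __m512i a = _mm512_set1_epi8(2);  /* treated as unsigned bytes */
  __m512i b = _mm512_set1_epi8(-3); /* treated as signed bytes   */
  int out[16];
  _mm512_storeu_si512(out, _mm512_dpbusd_epi32(acc, a, b));
  printf("%d\n", out[0]); /* expected: 1000 + 4*(2*-3) = 976 */
  return 0;
}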

View file

@ -0,0 +1,188 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512vnnivlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VNNIVLINTRIN_H_INCLUDED
#define _AVX512VNNIVLINTRIN_H_INCLUDED
#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__)
#pragma GCC push_options
#pragma GCC target("avx512vnni,avx512vl")
#define __DISABLE_AVX512VNNIVL__
#endif /* __AVX512VNNIVL__ */
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpbusd_epi32(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpdpbusd_v8si((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_dpbusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask((__v8si)__A, (__v8si)__C,
(__v8si)__D, (__mmask8)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_dpbusd_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz(
(__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpbusd_epi32(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpdpbusd_v4si((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpbusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask((__v4si)__A, (__v4si)__C,
(__v4si)__D, (__mmask8)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_maskz_dpbusd_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz(
(__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpbusds_epi32(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpdpbusds_v8si((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_dpbusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask(
(__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpbusds_epi32(__mmask8 __A, __m256i __B, __m256i __C,
__m256i __D) {
return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz(
(__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpbusds_epi32(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpdpbusds_v4si((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mask_dpbusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask(
(__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_maskz_dpbusds_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz(
(__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpwssd_epi32(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpdpwssd_v8si((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_dpwssd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask((__v8si)__A, (__v8si)__C,
(__v8si)__D, (__mmask8)__B);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_maskz_dpwssd_epi32(__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz(
(__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpwssd_epi32(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpdpwssd_v4si((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_dpwssd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask((__v4si)__A, (__v4si)__C,
(__v4si)__D, (__mmask8)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_maskz_dpwssd_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz(
(__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_dpwssds_epi32(__m256i __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vpdpwssds_v8si((__v8si)__A, (__v8si)__B,
(__v8si)__C);
}
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm256_mask_dpwssds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask(
(__v8si)__A, (__v8si)__C, (__v8si)__D, (__mmask8)__B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_dpwssds_epi32(__mmask8 __A, __m256i __B, __m256i __C,
__m256i __D) {
return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz(
(__v8si)__B, (__v8si)__C, (__v8si)__D, (__mmask8)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dpwssds_epi32(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vpdpwssds_v4si((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mask_dpwssds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask(
(__v4si)__A, (__v4si)__C, (__v4si)__D, (__mmask8)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_maskz_dpwssds_epi32(__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz(
(__v4si)__B, (__v4si)__C, (__v4si)__D, (__mmask8)__A);
}
#ifdef __DISABLE_AVX512VNNIVL__
#undef __DISABLE_AVX512VNNIVL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VNNIVL__ */
#endif /* _AVX512VNNIVLINTRIN_H_INCLUDED */
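
Illustrative usage (not part of the commit): the 128-bit word dot product,
checked against a scalar reference. Assumes AVX512_VNNI+AVX512VL hardware
(or Intel SDE) and e.g. gcc -mavx512vnni -mavx512vl:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  short a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  short b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
  __m128i r = _mm_dpwssd_epi32(_mm_setzero_si128(),
                               _mm_loadu_si128((const __m128i *)a),
                               _mm_loadu_si128((const __m128i *)b));
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 4; ++i) /* each lane sums two word products */
    printf("%d %d\n", out[i],
           a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1]);
  return 0;
}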

View file

@ -0,0 +1,60 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512vpopcntdqintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED
#define _AVX512VPOPCNTDQINTRIN_H_INCLUDED
#ifndef __AVX512VPOPCNTDQ__
#pragma GCC push_options
#pragma GCC target("avx512vpopcntdq")
#define __DISABLE_AVX512VPOPCNTDQ__
#endif /* __AVX512VPOPCNTDQ__ */
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi32(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcountd_v16si((__v16si)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi32(__m512i __A, __mmask16 __U, __m512i __B) {
return (__m512i)__builtin_ia32_vpopcountd_v16si_mask(
(__v16si)__A, (__v16si)__B, (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpopcountd_v16si_mask(
(__v16si)__A, (__v16si)_mm512_setzero_si512(), (__mmask16)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_popcnt_epi64(__m512i __A) {
return (__m512i)__builtin_ia32_vpopcountq_v8di((__v8di)__A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_popcnt_epi64(__m512i __A, __mmask8 __U, __m512i __B) {
return (__m512i)__builtin_ia32_vpopcountq_v8di_mask((__v8di)__A, (__v8di)__B,
(__mmask8)__U);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
return (__m512i)__builtin_ia32_vpopcountq_v8di_mask(
(__v8di)__A, (__v8di)_mm512_setzero_si512(), (__mmask8)__U);
}
#ifdef __DISABLE_AVX512VPOPCNTDQ__
#undef __DISABLE_AVX512VPOPCNTDQ__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VPOPCNTDQ__ */
#endif /* _AVX512VPOPCNTDQINTRIN_H_INCLUDED */
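
Illustrative usage (not part of the commit): per-lane 64-bit population
count. A sketch assuming AVX512_VPOPCNTDQ hardware (or Intel SDE) and
e.g. gcc -mavx512vpopcntdq:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i v = _mm512_set1_epi64(0xF0F0F0F0F0F0F0F0ull);
  long long out[8];
  _mm512_storeu_si512(out, _mm512_popcnt_epi64(v));
  printf("%lld\n", out[0]); /* expected: 32 */
  return 0;
}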

View file

@ -0,0 +1,100 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error \
"Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
#define _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__)
#pragma GCC push_options
#pragma GCC target("avx512vpopcntdq,avx512vl")
#define __DISABLE_AVX512VPOPCNTDQVL__
#endif /* __AVX512VPOPCNTDQVL__ */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_epi32(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcountd_v4si((__v4si)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_popcnt_epi32(__m128i __A, __mmask16 __U, __m128i __B) {
return (__m128i)__builtin_ia32_vpopcountd_v4si_mask((__v4si)__A, (__v4si)__B,
(__mmask16)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_popcnt_epi32(__mmask16 __U, __m128i __A) {
return (__m128i)__builtin_ia32_vpopcountd_v4si_mask(
(__v4si)__A, (__v4si)_mm_setzero_si128(), (__mmask16)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_popcnt_epi32(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcountd_v8si((__v8si)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_popcnt_epi32(__m256i __A, __mmask16 __U, __m256i __B) {
return (__m256i)__builtin_ia32_vpopcountd_v8si_mask((__v8si)__A, (__v8si)__B,
(__mmask16)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_popcnt_epi32(__mmask16 __U, __m256i __A) {
return (__m256i)__builtin_ia32_vpopcountd_v8si_mask(
(__v8si)__A, (__v8si)_mm256_setzero_si256(), (__mmask16)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_epi64(__m128i __A) {
return (__m128i)__builtin_ia32_vpopcountq_v2di((__v2di)__A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_popcnt_epi64(__m128i __A, __mmask8 __U, __m128i __B) {
return (__m128i)__builtin_ia32_vpopcountq_v2di_mask((__v2di)__A, (__v2di)__B,
(__mmask8)__U);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
return (__m128i)__builtin_ia32_vpopcountq_v2di_mask(
(__v2di)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_popcnt_epi64(__m256i __A) {
return (__m256i)__builtin_ia32_vpopcountq_v4di((__v4di)__A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_popcnt_epi64(__m256i __A, __mmask8 __U, __m256i __B) {
return (__m256i)__builtin_ia32_vpopcountq_v4di_mask((__v4di)__A, (__v4di)__B,
(__mmask8)__U);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
return (__m256i)__builtin_ia32_vpopcountq_v4di_mask(
(__v4di)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
#ifdef __DISABLE_AVX512VPOPCNTDQVL__
#undef __DISABLE_AVX512VPOPCNTDQVL__
#pragma GCC pop_options
#endif /* __DISABLE_AVX512VPOPCNTDQVL__ */
#endif /* _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED */
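
Illustrative usage (not part of the commit): the 128-bit lane popcount
checked against the compiler builtin. Assumes AVX512_VPOPCNTDQ+AVX512VL
hardware (or Intel SDE) and e.g. gcc -mavx512vpopcntdq -mavx512vl:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned in[4] = {0, 1, 0xFF, 0xFFFFFFFFu};
  unsigned out[4];
  __m128i r = _mm_popcnt_epi32(_mm_loadu_si128((const __m128i *)in));
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 4; ++i) /* expected: 0 1 8 32, matching builtin */
    printf("%u %d\n", out[i], __builtin_popcount(in[i]));
  return 0;
}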

1374
third_party/intel/avxintrin.internal.h vendored Normal file

File diff suppressed because it is too large

78
third_party/intel/bmi2intrin.internal.h vendored Normal file
View file

@ -0,0 +1,78 @@
#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _BMI2INTRIN_H_INCLUDED
#define _BMI2INTRIN_H_INCLUDED
#ifndef __BMI2__
#pragma GCC push_options
#pragma GCC target("bmi2")
#define __DISABLE_BMI2__
#endif /* __BMI2__ */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u32(unsigned int __X, unsigned int __Y) {
return __builtin_ia32_bzhi_si(__X, __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u32(unsigned int __X, unsigned int __Y) {
return __builtin_ia32_pdep_si(__X, __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u32(unsigned int __X, unsigned int __Y) {
return __builtin_ia32_pext_si(__X, __Y);
}
#ifdef __x86_64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bzhi_u64(unsigned long long __X, unsigned long long __Y) {
return __builtin_ia32_bzhi_di(__X, __Y);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pdep_u64(unsigned long long __X, unsigned long long __Y) {
return __builtin_ia32_pdep_di(__X, __Y);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pext_u64(unsigned long long __X, unsigned long long __Y) {
return __builtin_ia32_pext_di(__X, __Y);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u64(unsigned long long __X, unsigned long long __Y,
unsigned long long *__P) {
unsigned __int128 __res = (unsigned __int128)__X * __Y;
*__P = (unsigned long long)(__res >> 64);
return (unsigned long long)__res;
}
#else /* !__x86_64__ */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
unsigned long long __res = (unsigned long long)__X * __Y;
*__P = (unsigned int)(__res >> 32);
return (unsigned int)__res;
}
#endif /* !__x86_64__ */
#ifdef __DISABLE_BMI2__
#undef __DISABLE_BMI2__
#pragma GCC pop_options
#endif /* __DISABLE_BMI2__ */
#endif /* _BMI2INTRIN_H_INCLUDED */
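
Illustrative usage (not part of the commit): _pext_u32 gathers the bits
selected by a mask into the low end, and _pdep_u32 scatters them back, so
the round trip reproduces x & mask. Assumes BMI2 hardware (note that
pre-Zen3 AMD parts microcode these slowly) and e.g. gcc -mbmi2:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned x = 0xABCD1234u;
  unsigned mask = 0x0F0F0F0Fu; /* the low nibble of every byte */
  unsigned packed = _pext_u32(x, mask);
  unsigned back = _pdep_u32(packed, mask);
  printf("0x%08X 0x%08X\n", packed, back);
  /* expected: 0x0000BD24 0x0B0D0204, and back == (x & mask) */
  return 0;
}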

160
third_party/intel/bmiintrin.internal.h vendored Normal file
View file

@ -0,0 +1,160 @@
#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _BMIINTRIN_H_INCLUDED
#define _BMIINTRIN_H_INCLUDED
#ifndef __BMI__
#pragma GCC push_options
#pragma GCC target("bmi")
#define __DISABLE_BMI__
#endif /* __BMI__ */
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u16(unsigned short __X) {
return __builtin_ia32_tzcnt_u16(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u32(unsigned int __X, unsigned int __Y) {
return ~__X & __Y;
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u32(unsigned int __X, unsigned int __Y) {
return __builtin_ia32_bextr_u32(__X, __Y);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) {
return __builtin_ia32_bextr_u32(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u32(unsigned int __X) {
return __X & -__X;
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u32(unsigned int __X) {
return __blsi_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u32(unsigned int __X) {
return __X ^ (__X - 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u32(unsigned int __X) {
return __blsmsk_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u32(unsigned int __X) {
return __X & (__X - 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u32(unsigned int __X) {
return __blsr_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u32(unsigned int __X) {
return __builtin_ia32_tzcnt_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u32(unsigned int __X) {
return __builtin_ia32_tzcnt_u32(__X);
}
#ifdef __x86_64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__andn_u64(unsigned long long __X, unsigned long long __Y) {
return ~__X & __Y;
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextr_u64(unsigned long long __X, unsigned long long __Y) {
return __builtin_ia32_bextr_u64(__X, __Y);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) {
return __builtin_ia32_bextr_u64(__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsi_u64(unsigned long long __X) {
return __X & -__X;
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsi_u64(unsigned long long __X) {
return __blsi_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsmsk_u64(unsigned long long __X) {
return __X ^ (__X - 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsmsk_u64(unsigned long long __X) {
return __blsmsk_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsr_u64(unsigned long long __X) {
return __X & (__X - 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_blsr_u64(unsigned long long __X) {
return __blsr_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzcnt_u64(unsigned long long __X) {
return __builtin_ia32_tzcnt_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_tzcnt_u64(unsigned long long __X) {
return __builtin_ia32_tzcnt_u64(__X);
}
#endif /* __x86_64__ */
#ifdef __DISABLE_BMI__
#undef __DISABLE_BMI__
#pragma GCC pop_options
#endif /* __DISABLE_BMI__ */
#endif /* _BMIINTRIN_H_INCLUDED */
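Sketch of the canonical set-bit iteration idiom the BMI wrappers above enable (illustrative only, not from the vendored sources; assumes -mbmi):

#include "third_party/intel/immintrin.internal.h"

void visit_set_bits(unsigned int mask, void (*visit)(unsigned int)) {
  while (mask) {
    visit(__tzcnt_u32(mask)); /* index of the lowest set bit */
    mask = __blsr_u32(mask);  /* clear that bit and continue */
  }
}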

95
third_party/intel/cetintrin.internal.h vendored Normal file

@@ -0,0 +1,95 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <cetintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _CETINTRIN_H_INCLUDED
#define _CETINTRIN_H_INCLUDED
#ifndef __SHSTK__
#pragma GCC push_options
#pragma GCC target("shstk")
#define __DISABLE_SHSTK__
#endif /* __SHSTK__ */
#ifdef __x86_64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_get_ssp(void) {
return __builtin_ia32_rdsspq();
}
#else
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_get_ssp(void) {
return __builtin_ia32_rdsspd();
}
#endif
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_inc_ssp(unsigned int __B) {
#ifdef __x86_64__
__builtin_ia32_incsspq((unsigned long long)__B);
#else
__builtin_ia32_incsspd(__B);
#endif
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_saveprevssp(void) {
__builtin_ia32_saveprevssp();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rstorssp(void *__B) {
__builtin_ia32_rstorssp(__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrssd(unsigned int __B, void *__C) {
__builtin_ia32_wrssd(__B, __C);
}
#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrssq(unsigned long long __B, void *__C) {
__builtin_ia32_wrssq(__B, __C);
}
#endif
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrussd(unsigned int __B, void *__C) {
__builtin_ia32_wrussd(__B, __C);
}
#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrussq(unsigned long long __B, void *__C) {
__builtin_ia32_wrussq(__B, __C);
}
#endif
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_setssbsy(void) {
__builtin_ia32_setssbsy();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_clrssbsy(void *__B) {
__builtin_ia32_clrssbsy(__B);
}
#ifdef __DISABLE_SHSTK__
#undef __DISABLE_SHSTK__
#pragma GCC pop_options
#endif /* __DISABLE_SHSTK__ */
#endif /* _CETINTRIN_H_INCLUDED */
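Illustrative sketch of reading the shadow-stack pointer with the intrinsics above (not part of the header; assumes -mshstk; where CET shadow stacks are inactive, RDSSP is a no-op and the zero-initialized destination is returned):

#include "third_party/intel/immintrin.internal.h"

unsigned long long current_ssp(void) {
  return _get_ssp(); /* 0 unless a shadow stack is active */
}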

23
third_party/intel/cldemoteintrin.internal.h vendored Normal file

@@ -0,0 +1,23 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <cldemoteintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _CLDEMOTE_H_INCLUDED
#define _CLDEMOTE_H_INCLUDED
#ifndef __CLDEMOTE__
#pragma GCC push_options
#pragma GCC target("cldemote")
#define __DISABLE_CLDEMOTE__
#endif /* __CLDEMOTE__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cldemote(void *__A) {
__builtin_ia32_cldemote(__A);
}
#ifdef __DISABLE_CLDEMOTE__
#undef __DISABLE_CLDEMOTE__
#pragma GCC pop_options
#endif /* __DISABLE_CLDEMOTE__ */
#endif /* _CLDEMOTE_H_INCLUDED */

25
third_party/intel/clflushoptintrin.internal.h vendored Normal file

@@ -0,0 +1,25 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _CLFLUSHOPTINTRIN_H_INCLUDED
#define _CLFLUSHOPTINTRIN_H_INCLUDED
#ifndef __CLFLUSHOPT__
#pragma GCC push_options
#pragma GCC target("clflushopt")
#define __DISABLE_CLFLUSHOPT__
#endif /* __CLFLUSHOPT__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clflushopt(void *__A) {
__builtin_ia32_clflushopt(__A);
}
#ifdef __DISABLE_CLFLUSHOPT__
#undef __DISABLE_CLFLUSHOPT__
#pragma GCC pop_options
#endif /* __DISABLE_CLFLUSHOPT__ */
#endif /* _CLFLUSHOPTINTRIN_H_INCLUDED */

25
third_party/intel/clwbintrin.internal.h vendored Normal file

@@ -0,0 +1,25 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _CLWBINTRIN_H_INCLUDED
#define _CLWBINTRIN_H_INCLUDED
#ifndef __CLWB__
#pragma GCC push_options
#pragma GCC target("clwb")
#define __DISABLE_CLWB__
#endif /* __CLWB__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clwb(void *__A) {
__builtin_ia32_clwb(__A);
}
#ifdef __DISABLE_CLWB__
#undef __DISABLE_CLWB__
#pragma GCC pop_options
#endif /* __DISABLE_CLWB__ */
#endif /* _CLWBINTRIN_H_INCLUDED */
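A hedged sketch of the usual persistence idiom built from _mm_clwb (or _mm_clflushopt above) plus a store fence, assuming -mclwb and 64-byte cache lines (illustrative, not from the vendored sources):

#include "third_party/intel/immintrin.internal.h"

void flush_range(void *p, unsigned long n) {
  unsigned long i;
  for (i = 0; i < n; i += 64) /* one write-back per cache line */
    _mm_clwb((char *)p + i);
  _mm_sfence(); /* order the write-backs before later stores */
}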

21
third_party/intel/clzerointrin.internal.h vendored Normal file

@@ -0,0 +1,21 @@
#ifndef _CLZEROINTRIN_H_INCLUDED
#define _CLZEROINTRIN_H_INCLUDED
#ifndef __CLZERO__
#pragma GCC push_options
#pragma GCC target("clzero")
#define __DISABLE_CLZERO__
#endif /* __CLZERO__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_clzero(void* __I) {
__builtin_ia32_clzero(__I);
}
#ifdef __DISABLE_CLZERO__
#undef __DISABLE_CLZERO__
#pragma GCC pop_options
#endif /* __DISABLE_CLZERO__ */
#endif /* _CLZEROINTRIN_H_INCLUDED */

235
third_party/intel/cpuid.internal.h vendored Normal file

@@ -0,0 +1,235 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_
#define COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
#define bit_SSE3 (1 << 0)
#define bit_PCLMUL (1 << 1)
#define bit_LZCNT (1 << 5)
#define bit_SSSE3 (1 << 9)
#define bit_FMA (1 << 12)
#define bit_CMPXCHG16B (1 << 13)
#define bit_SSE4_1 (1 << 19)
#define bit_SSE4_2 (1 << 20)
#define bit_MOVBE (1 << 22)
#define bit_POPCNT (1 << 23)
#define bit_AES (1 << 25)
#define bit_XSAVE (1 << 26)
#define bit_OSXSAVE (1 << 27)
#define bit_AVX (1 << 28)
#define bit_F16C (1 << 29)
#define bit_RDRND (1 << 30)
#define bit_CMPXCHG8B (1 << 8)
#define bit_CMOV (1 << 15)
#define bit_MMX (1 << 23)
#define bit_FXSAVE (1 << 24)
#define bit_SSE (1 << 25)
#define bit_SSE2 (1 << 26)
#define bit_LAHF_LM (1 << 0)
#define bit_ABM (1 << 5)
#define bit_SSE4a (1 << 6)
#define bit_PRFCHW (1 << 8)
#define bit_XOP (1 << 11)
#define bit_LWP (1 << 15)
#define bit_FMA4 (1 << 16)
#define bit_TBM (1 << 21)
#define bit_MWAITX (1 << 29)
#define bit_MMXEXT (1 << 22)
#define bit_LM (1 << 29)
#define bit_3DNOWP (1 << 30)
#define bit_3DNOW (1u << 31)
#define bit_CLZERO (1 << 0)
#define bit_WBNOINVD (1 << 9)
#define bit_FSGSBASE (1 << 0)
#define bit_SGX (1 << 2)
#define bit_BMI (1 << 3)
#define bit_HLE (1 << 4)
#define bit_AVX2 (1 << 5)
#define bit_BMI2 (1 << 8)
#define bit_RTM (1 << 11)
#define bit_MPX (1 << 14)
#define bit_AVX512F (1 << 16)
#define bit_AVX512DQ (1 << 17)
#define bit_RDSEED (1 << 18)
#define bit_ADX (1 << 19)
#define bit_AVX512IFMA (1 << 21)
#define bit_CLFLUSHOPT (1 << 23)
#define bit_CLWB (1 << 24)
#define bit_AVX512PF (1 << 26)
#define bit_AVX512ER (1 << 27)
#define bit_AVX512CD (1 << 28)
#define bit_SHA (1 << 29)
#define bit_AVX512BW (1 << 30)
#define bit_AVX512VL (1u << 31)
#define bit_PREFETCHWT1 (1 << 0)
#define bit_AVX512VBMI (1 << 1)
#define bit_PKU (1 << 3)
#define bit_OSPKE (1 << 4)
#define bit_WAITPKG (1 << 5)
#define bit_AVX512VBMI2 (1 << 6)
#define bit_SHSTK (1 << 7)
#define bit_GFNI (1 << 8)
#define bit_VAES (1 << 9)
#define bit_AVX512VNNI (1 << 11)
#define bit_VPCLMULQDQ (1 << 10)
#define bit_AVX512BITALG (1 << 12)
#define bit_AVX512VPOPCNTDQ (1 << 14)
#define bit_RDPID (1 << 22)
#define bit_MOVDIRI (1 << 27)
#define bit_MOVDIR64B (1 << 28)
#define bit_CLDEMOTE (1 << 25)
#define bit_AVX5124VNNIW (1 << 2)
#define bit_AVX5124FMAPS (1 << 3)
#define bit_IBT (1 << 20)
#define bit_PCONFIG (1 << 18)
#define bit_BNDREGS (1 << 3)
#define bit_BNDCSR (1 << 4)
#define bit_XSAVEOPT (1 << 0)
#define bit_XSAVEC (1 << 1)
#define bit_XSAVES (1 << 3)
#define bit_PTWRITE (1 << 4)
#define signature_AMD_ebx 0x68747541
#define signature_AMD_ecx 0x444d4163
#define signature_AMD_edx 0x69746e65
#define signature_CENTAUR_ebx 0x746e6543
#define signature_CENTAUR_ecx 0x736c7561
#define signature_CENTAUR_edx 0x48727561
#define signature_CYRIX_ebx 0x69727943
#define signature_CYRIX_ecx 0x64616574
#define signature_CYRIX_edx 0x736e4978
#define signature_INTEL_ebx 0x756e6547
#define signature_INTEL_ecx 0x6c65746e
#define signature_INTEL_edx 0x49656e69
#define signature_TM1_ebx 0x6e617254
#define signature_TM1_ecx 0x55504361
#define signature_TM1_edx 0x74656d73
#define signature_TM2_ebx 0x756e6547
#define signature_TM2_ecx 0x3638784d
#define signature_TM2_edx 0x54656e69
#define signature_NSC_ebx 0x646f6547
#define signature_NSC_ecx 0x43534e20
#define signature_NSC_edx 0x79622065
#define signature_NEXGEN_ebx 0x4778654e
#define signature_NEXGEN_ecx 0x6e657669
#define signature_NEXGEN_edx 0x72446e65
#define signature_RISE_ebx 0x65736952
#define signature_RISE_ecx 0x65736952
#define signature_RISE_edx 0x65736952
#define signature_SIS_ebx 0x20536953
#define signature_SIS_ecx 0x20536953
#define signature_SIS_edx 0x20536953
#define signature_UMC_ebx 0x20434d55
#define signature_UMC_ecx 0x20434d55
#define signature_UMC_edx 0x20434d55
#define signature_VIA_ebx 0x20414956
#define signature_VIA_ecx 0x20414956
#define signature_VIA_edx 0x20414956
#define signature_VORTEX_ebx 0x74726f56
#define signature_VORTEX_ecx 0x436f5320
#define signature_VORTEX_edx 0x36387865
#ifndef __x86_64__
#define __cpuid(level, a, b, c, d) \
do { \
if (__builtin_constant_p(level) && (level) != 1) \
__asm__("cpuid\n\t" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(level)); \
else \
__asm__("cpuid\n\t" \
: "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
: "0"(level), "1"(0), "2"(0)); \
} while (0)
#else
#define __cpuid(level, a, b, c, d) \
__asm__("cpuid\n\t" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "0"(level))
#endif
#define __cpuid_count(level, count, a, b, c, d) \
__asm__("cpuid\n\t" \
: "=a"(a), "=b"(b), "=c"(c), "=d"(d) \
: "0"(level), "2"(count))
static __inline unsigned int __get_cpuid_max(unsigned int __ext,
unsigned int *__sig) {
unsigned int __eax, __ebx, __ecx, __edx;
#ifndef __x86_64__
#if __GNUC__ >= 3
__asm__("pushf{l|d}\n\t"
"pushf{l|d}\n\t"
"pop{l}\t%0\n\t"
"mov{l}\t{%0, %1|%1, %0}\n\t"
"xor{l}\t{%2, %0|%0, %2}\n\t"
"push{l}\t%0\n\t"
"popf{l|d}\n\t"
"pushf{l|d}\n\t"
"pop{l}\t%0\n\t"
"popf{l|d}\n\t"
: "=&r"(__eax), "=&r"(__ebx)
: "i"(0x00200000));
#else
__asm__("pushfl\n\t"
"pushfl\n\t"
"popl\t%0\n\t"
"movl\t%0, %1\n\t"
"xorl\t%2, %0\n\t"
"pushl\t%0\n\t"
"popfl\n\t"
"pushfl\n\t"
"popl\t%0\n\t"
"popfl\n\t"
: "=&r"(__eax), "=&r"(__ebx)
: "i"(0x00200000));
#endif
if (!((__eax ^ __ebx) & 0x00200000)) return 0;
#endif
__cpuid(__ext, __eax, __ebx, __ecx, __edx);
if (__sig) *__sig = __ebx;
return __eax;
}
static __inline int __get_cpuid(unsigned int __leaf, unsigned int *__eax,
unsigned int *__ebx, unsigned int *__ecx,
unsigned int *__edx) {
unsigned int __ext = __leaf & 0x80000000;
unsigned int __maxlevel = __get_cpuid_max(__ext, 0);
if (__maxlevel == 0 || __maxlevel < __leaf) return 0;
__cpuid(__leaf, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}
static __inline int __get_cpuid_count(unsigned int __leaf,
unsigned int __subleaf,
unsigned int *__eax, unsigned int *__ebx,
unsigned int *__ecx,
unsigned int *__edx) {
unsigned int __ext = __leaf & 0x80000000;
unsigned int __maxlevel = __get_cpuid_max(__ext, 0);
if (__maxlevel == 0 || __maxlevel < __leaf) return 0;
__cpuid_count(__leaf, __subleaf, *__eax, *__ebx, *__ecx, *__edx);
return 1;
}
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_INTEL_CPUID_INTERNAL_H_ */
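Illustrative use of the __get_cpuid_count helper above to test a leaf-7 feature bit (a sketch, not part of the vendored header; this header has no #error guard, so it may be included directly):

#include "third_party/intel/cpuid.internal.h"

int have_avx2(void) {
  unsigned int a, b, c, d;
  if (!__get_cpuid_count(7, 0, &a, &b, &c, &d)) return 0;
  return !!(b & bit_AVX2); /* leaf 7, subleaf 0, EBX bit 5 */
}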

1497
third_party/intel/emmintrin.internal.h vendored Normal file

File diff suppressed because it is too large

75
third_party/intel/f16cintrin.internal.h vendored Normal file

@@ -0,0 +1,75 @@
#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
#error \
"Never use <f16intrin.h> directly; include <x86intrin.h> or <immintrin.h> instead."
#endif
#ifndef _F16CINTRIN_H_INCLUDED
#define _F16CINTRIN_H_INCLUDED
#ifndef __F16C__
#pragma GCC push_options
#pragma GCC target("f16c")
#define __DISABLE_F16C__
#endif /* __F16C__ */
extern __inline float
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtsh_ss(unsigned short __S) {
__v8hi __H = __extension__(__v8hi){(short)__S, 0, 0, 0, 0, 0, 0, 0};
__v4sf __A = __builtin_ia32_vcvtph2ps(__H);
return __builtin_ia32_vec_ext_v4sf(__A, 0);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtph_ps(__m128i __A) {
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtph_ps(__m128i __A) {
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A);
}
#ifdef __OPTIMIZE__
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_cvtss_sh(float __F, const int __I) {
__v4sf __A = __extension__(__v4sf){__F, 0, 0, 0};
__v8hi __H = __builtin_ia32_vcvtps2ph(__A, __I);
return (unsigned short)__builtin_ia32_vec_ext_v8hi(__H, 0);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtps_ph(__m128 __A, const int __I) {
return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cvtps_ph(__m256 __A, const int __I) {
return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I);
}
#else
#define _cvtss_sh(__F, __I) \
(__extension__({ \
__v4sf __A = __extension__(__v4sf){__F, 0, 0, 0}; \
__v8hi __H = __builtin_ia32_vcvtps2ph(__A, __I); \
(unsigned short)__builtin_ia32_vec_ext_v8hi(__H, 0); \
}))
#define _mm_cvtps_ph(A, I) \
((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)A, (int)(I)))
#define _mm256_cvtps_ph(A, I) \
((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)A, (int)(I)))
#endif /* __OPTIMIZE__ */
#ifdef __DISABLE_F16C__
#undef __DISABLE_F16C__
#pragma GCC pop_options
#endif /* __DISABLE_F16C__ */
#endif /* _F16CINTRIN_H_INCLUDED */
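A small round-trip sketch for the F16C conversions above (illustrative; assumes -mf16c; imm8 0 selects round-to-nearest-even):

#include "third_party/intel/immintrin.internal.h"

float through_half(float f) {
  unsigned short h = _cvtss_sh(f, 0); /* narrow to IEEE binary16 */
  return _cvtsh_ss(h);                /* widen back to float */
}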

248
third_party/intel/fma4intrin.internal.h vendored Normal file

@@ -0,0 +1,248 @@
#ifndef _X86INTRIN_H_INCLUDED
#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _FMA4INTRIN_H_INCLUDED
#define _FMA4INTRIN_H_INCLUDED
#include "third_party/intel/ammintrin.internal.h"
#ifndef __FMA4__
#pragma GCC push_options
#pragma GCC target("fma4")
#define __DISABLE_FMA4__
#endif /* __FMA4__ */
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
/* 256b Floating point multiply/add type instructions. */
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
#ifdef __DISABLE_FMA4__
#undef __DISABLE_FMA4__
#pragma GCC pop_options
#endif /* __DISABLE_FMA4__ */
#endif /* _FMA4INTRIN_H_INCLUDED */

241
third_party/intel/fmaintrin.internal.h vendored Normal file

@@ -0,0 +1,241 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _FMAINTRIN_H_INCLUDED
#define _FMAINTRIN_H_INCLUDED
#ifndef __FMA__
#pragma GCC push_options
#pragma GCC target("fma")
#define __DISABLE_FMA__
#endif /* __FMA__ */
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmsubsd3((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmsubss3((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfnmaddsd3((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfnmaddss3((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfnmsubsd3((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfnmsubss3((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
(__v2df)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
(__v4df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
(__v4sf)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
(__v8sf)__C);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) {
return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B,
-(__v2df)__C);
}
extern __inline __m256d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) {
return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B,
-(__v4df)__C);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) {
return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B,
-(__v4sf)__C);
}
extern __inline __m256
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) {
return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B,
-(__v8sf)__C);
}
#ifdef __DISABLE_FMA__
#undef __DISABLE_FMA__
#pragma GCC pop_options
#endif /* __DISABLE_FMA__ */
#endif /* _FMAINTRIN_H_INCLUDED */
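Sketch of the basic fused multiply-add use case (illustrative; assumes -mfma): the product and sum happen with a single rounding, unlike a separate _mm_mul_ps/_mm_add_ps pair:

#include "third_party/intel/immintrin.internal.h"

__m128 axpy4(__m128 a, __m128 x, __m128 y) {
  return _mm_fmadd_ps(a, x, y); /* a*x + y, fused, one rounding */
}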

45
third_party/intel/fxsrintrin.internal.h vendored Normal file

@@ -0,0 +1,45 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _FXSRINTRIN_H_INCLUDED
#define _FXSRINTRIN_H_INCLUDED
#ifndef __FXSR__
#pragma GCC push_options
#pragma GCC target("fxsr")
#define __DISABLE_FXSR__
#endif /* __FXSR__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_fxsave(void *__P) {
__builtin_ia32_fxsave(__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_fxrstor(void *__P) {
__builtin_ia32_fxrstor(__P);
}
#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_fxsave64(void *__P) {
__builtin_ia32_fxsave64(__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_fxrstor64(void *__P) {
__builtin_ia32_fxrstor64(__P);
}
#endif
#ifdef __DISABLE_FXSR__
#undef __DISABLE_FXSR__
#pragma GCC pop_options
#endif /* __DISABLE_FXSR__ */
#endif /* _FXSRINTRIN_H_INCLUDED */
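Illustrative save/restore of the legacy x87/MMX/SSE state with the wrappers above (a sketch; assumes -mfxsr; the FXSAVE area must be 512 bytes and 16-byte aligned):

#include "third_party/intel/immintrin.internal.h"

void fpu_roundtrip(void) {
  static unsigned char area[512] __attribute__((aligned(16)));
  _fxsave(area);  /* snapshot x87/MMX/SSE state */
  _fxrstor(area); /* restore it unchanged */
}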

344
third_party/intel/gfniintrin.internal.h vendored Normal file

@@ -0,0 +1,344 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _GFNIINTRIN_H_INCLUDED
#define _GFNIINTRIN_H_INCLUDED
#if !defined(__GFNI__) || !defined(__SSE2__)
#pragma GCC push_options
#pragma GCC target("gfni,sse2")
#define __DISABLE_GFNI__
#endif /* __GFNI__ */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi((__v16qi)__A, (__v16qi)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_gf2p8affineinv_epi64_epi8(__m128i __A, __m128i __B, const int __C) {
return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)__A,
(__v16qi)__B, __C);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_gf2p8affine_epi64_epi8(__m128i __A, __m128i __B, const int __C) {
return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)__A,
(__v16qi)__B, __C);
}
#else
#define _mm_gf2p8affineinv_epi64_epi8(A, B, C) \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi( \
(__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C)))
#define _mm_gf2p8affine_epi64_epi8(A, B, C) \
((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi( \
(__v16qi)(__m128i)(A), (__v16qi)(__m128i)(B), (int)(C)))
#endif
#ifdef __DISABLE_GFNI__
#undef __DISABLE_GFNI__
#pragma GCC pop_options
#endif /* __DISABLE_GFNI__ */
#if !defined(__GFNI__) || !defined(__AVX__)
#pragma GCC push_options
#pragma GCC target("gfni,avx")
#define __DISABLE_GFNIAVX__
#endif /* __GFNIAVX__ */
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi((__v32qi)__A, (__v32qi)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_gf2p8affineinv_epi64_epi8(__m256i __A, __m256i __B, const int __C) {
return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)__A,
(__v32qi)__B, __C);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_gf2p8affine_epi64_epi8(__m256i __A, __m256i __B, const int __C) {
return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)__A,
(__v32qi)__B, __C);
}
#else
#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C) \
((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi( \
(__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C)))
#define _mm256_gf2p8affine_epi64_epi8(A, B, C) \
((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi( \
(__v32qi)(__m256i)(A), (__v32qi)(__m256i)(B), (int)(C)))
#endif
#ifdef __DISABLE_GFNIAVX__
#undef __DISABLE_GFNIAVX__
#pragma GCC pop_options
#endif /* __GFNIAVX__ */
#if !defined(__GFNI__) || !defined(__AVX512VL__)
#pragma GCC push_options
#pragma GCC target("gfni,avx512vl")
#define __DISABLE_GFNIAVX512VL__
#endif /* __GFNIAVX512VL__ */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_mask_gf2p8mul_epi8(__m128i __A, __mmask16 __B, __m128i __C, __m128i __D) {
return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask(
(__v16qi)__C, (__v16qi)__D, (__v16qi)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_gf2p8mul_epi8(__mmask16 __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_vgf2p8mulb_v16qi_mask(
(__v16qi)__B, (__v16qi)__C, (__v16qi)_mm_setzero_si128(), __A);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_gf2p8affineinv_epi64_epi8(__m128i __A, __mmask16 __B, __m128i __C,
__m128i __D, const int __E) {
return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask(
(__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_gf2p8affineinv_epi64_epi8(__mmask16 __A, __m128i __B, __m128i __C,
const int __D) {
return (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask(
(__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_gf2p8affine_epi64_epi8(__m128i __A, __mmask16 __B, __m128i __C,
__m128i __D, const int __E) {
return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask(
(__v16qi)__C, (__v16qi)__D, __E, (__v16qi)__A, __B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskz_gf2p8affine_epi64_epi8(__mmask16 __A, __m128i __B, __m128i __C,
const int __D) {
return (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask(
(__v16qi)__B, (__v16qi)__C, __D, (__v16qi)_mm_setzero_si128(), __A);
}
#else
#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \
(__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), \
(__v16qi)(__m128i)(A), (__mmask16)(B)))
#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \
(__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), \
(__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)(A)))
#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( \
(__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), (int)(E), \
(__v16qi)(__m128i)(A), (__mmask16)(B)))
#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \
((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi_mask( \
(__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), (int)(D), \
(__v16qi)(__m128i)_mm_setzero_si128(), (__mmask16)(A)))
#endif
#ifdef __DISABLE_GFNIAVX512VL__
#undef __DISABLE_GFNIAVX512VL__
#pragma GCC pop_options
#endif /* __GFNIAVX512VL__ */
#if !defined(__GFNI__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("gfni,avx512vl,avx512bw")
#define __DISABLE_GFNIAVX512VLBW__
#endif /* __GFNIAVX512VLBW__ */
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_gf2p8mul_epi8(__m256i __A, __mmask32 __B, __m256i __C,
__m256i __D) {
return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask(
(__v32qi)__C, (__v32qi)__D, (__v32qi)__A, __B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_gf2p8mul_epi8(__mmask32 __A, __m256i __B, __m256i __C) {
return (__m256i)__builtin_ia32_vgf2p8mulb_v32qi_mask(
(__v32qi)__B, (__v32qi)__C, (__v32qi)_mm256_setzero_si256(), __A);
}
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_gf2p8affineinv_epi64_epi8(__m256i __A, __mmask32 __B,
__m256i __C, __m256i __D,
const int __E) {
return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask(
(__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_gf2p8affineinv_epi64_epi8(__mmask32 __A, __m256i __B,
__m256i __C, const int __D) {
return (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask(
(__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_gf2p8affine_epi64_epi8(__m256i __A, __mmask32 __B, __m256i __C,
__m256i __D, const int __E) {
return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask(
(__v32qi)__C, (__v32qi)__D, __E, (__v32qi)__A, __B);
}
extern __inline __m256i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskz_gf2p8affine_epi64_epi8(__mmask32 __A, __m256i __B, __m256i __C,
const int __D) {
return (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask(
(__v32qi)__B, (__v32qi)__C, __D, (__v32qi)_mm256_setzero_si256(), __A);
}
#else
#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \
((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \
(__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), \
(__v32qi)(__m256i)(A), (__mmask32)(B)))
#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \
(__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), \
(__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)(A)))
#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( \
(__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), \
(__v32qi)(__m256i)(A), (__mmask32)(B)))
#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \
((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi_mask( \
(__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), \
(__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)(A)))
#endif
#ifdef __DISABLE_GFNIAVX512VLBW__
#undef __DISABLE_GFNIAVX512VLBW__
#pragma GCC pop_options
#endif /* __GFNIAVX512VLBW__ */
#if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__)
#pragma GCC push_options
#pragma GCC target("gfni,avx512f,avx512bw")
#define __DISABLE_GFNIAVX512FBW__
#endif /* __GFNIAVX512FBW__ */
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_gf2p8mul_epi8(__m512i __A, __mmask64 __B, __m512i __C,
__m512i __D) {
return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask(
(__v64qi)__C, (__v64qi)__D, (__v64qi)__A, __B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_gf2p8mul_epi8(__mmask64 __A, __m512i __B, __m512i __C) {
return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi_mask(
(__v64qi)__B, (__v64qi)__C, (__v64qi)_mm512_setzero_si512(), __A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_vgf2p8mulb_v64qi((__v64qi)__A, (__v64qi)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_gf2p8affineinv_epi64_epi8(__m512i __A, __mmask64 __B,
__m512i __C, __m512i __D,
const int __E) {
return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask(
(__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_gf2p8affineinv_epi64_epi8(__mmask64 __A, __m512i __B,
__m512i __C, const int __D) {
return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask(
(__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_gf2p8affineinv_epi64_epi8(__m512i __A, __m512i __B, const int __C) {
return (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)__A,
(__v64qi)__B, __C);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mask_gf2p8affine_epi64_epi8(__m512i __A, __mmask64 __B, __m512i __C,
__m512i __D, const int __E) {
return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask(
(__v64qi)__C, (__v64qi)__D, __E, (__v64qi)__A, __B);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_maskz_gf2p8affine_epi64_epi8(__mmask64 __A, __m512i __B, __m512i __C,
const int __D) {
return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask(
(__v64qi)__B, (__v64qi)__C, __D, (__v64qi)_mm512_setzero_si512(), __A);
}
extern __inline __m512i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm512_gf2p8affine_epi64_epi8(__m512i __A, __m512i __B, const int __C) {
return (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)__A,
(__v64qi)__B, __C);
}
#else
#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \
((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \
(__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), \
(__v64qi)(__m512i)(A), (__mmask64)(B)))
#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \
(__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), \
(__v64qi)(__m512i)_mm512_setzero_si512(), (__mmask64)(A)))
#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C) \
((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi( \
(__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( \
(__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), \
(__v64qi)(__m512i)(A), (__mmask64)(B)))
#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \
((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi_mask( \
(__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), \
(__v64qi)(__m512i)_mm512_setzero_si512(), (__mmask64)(A)))
#define _mm512_gf2p8affine_epi64_epi8(A, B, C) \
((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi( \
(__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
#endif
#ifdef __DISABLE_GFNIAVX512FBW__
#undef __DISABLE_GFNIAVX512FBW__
#pragma GCC pop_options
#endif /* __GFNIAVX512FBW__ */
#endif /* _GFNIINTRIN_H_INCLUDED */
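A one-line sketch of the unmasked GF(2^8) multiply above, which works lane-wise on bytes using the AES polynomial x^8 + x^4 + x^3 + x + 1 (illustrative; assumes -mgfni -msse2):

#include "third_party/intel/immintrin.internal.h"

__m128i gf_mul16(__m128i a, __m128i b) {
  return _mm_gf2p8mul_epi8(a, b); /* 16 independent GF(2^8) products */
}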

239
third_party/intel/ia32intrin.internal.h vendored Normal file

@@ -0,0 +1,239 @@
#ifndef _X86INTRIN_H_INCLUDED
#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
#endif
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsfd(int __X) {
return __builtin_ctz(__X);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsrd(int __X) {
return __builtin_ia32_bsrsi(__X);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bswapd(int __X) {
return __builtin_bswap32(__X);
}
#ifndef __iamcu__
#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32b(unsigned int __C, unsigned char __V) {
return __builtin_ia32_crc32qi(__C, __V);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32w(unsigned int __C, unsigned short __V) {
return __builtin_ia32_crc32hi(__C, __V);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32d(unsigned int __C, unsigned int __V) {
return __builtin_ia32_crc32si(__C, __V);
}
#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */
#endif /* __iamcu__ */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__popcntd(unsigned int __X) {
return __builtin_popcount(__X);
}
#ifndef __iamcu__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdpmc(int __S) {
return __builtin_ia32_rdpmc(__S);
}
#endif /* __iamcu__ */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdtsc(void) {
return __builtin_ia32_rdtsc();
}
#ifndef __iamcu__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rdtscp(unsigned int *__A) {
return __builtin_ia32_rdtscp(__A);
}
#endif /* __iamcu__ */
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rolb(unsigned char __X, int __C) {
return __builtin_ia32_rolqi(__X, __C);
}
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rolw(unsigned short __X, int __C) {
return __builtin_ia32_rolhi(__X, __C);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rold(unsigned int __X, int __C) {
__C &= 31;
return (__X << __C) | (__X >> (-__C & 31));
}
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rorb(unsigned char __X, int __C) {
return __builtin_ia32_rorqi(__X, __C);
}
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rorw(unsigned short __X, int __C) {
return __builtin_ia32_rorhi(__X, __C);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rord(unsigned int __X, int __C) {
__C &= 31;
return (__X >> __C) | (__X << (-__C & 31));
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__pause(void) {
__builtin_ia32_pause();
}
#ifdef __x86_64__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsfq(long long __X) {
return __builtin_ctzll(__X);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bsrq(long long __X) {
return __builtin_ia32_bsrdi(__X);
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bswapq(long long __X) {
return __builtin_bswap64(__X);
}
#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__crc32q(unsigned long long __C, unsigned long long __V) {
return __builtin_ia32_crc32di(__C, __V);
}
#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__popcntq(unsigned long long __X) {
return __builtin_popcountll(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rolq(unsigned long long __X, int __C) {
__C &= 63;
return (__X << __C) | (__X >> (-__C & 63));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__rorq(unsigned long long __X, int __C) {
__C &= 63;
return (__X >> __C) | (__X << (-__C & 63));
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__readeflags(void) {
return __builtin_ia32_readeflags_u64();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__writeeflags(unsigned long long __X) {
__builtin_ia32_writeeflags_u64(__X);
}
#define _bswap64(a) __bswapq(a)
#define _popcnt64(a) __popcntq(a)
#else
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__readeflags(void) {
return __builtin_ia32_readeflags_u32();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__writeeflags(unsigned int __X) {
__builtin_ia32_writeeflags_u32(__X);
}
#endif
#ifdef __LP64__
#define _lrotl(a, b) __rolq((a), (b))
#define _lrotr(a, b) __rorq((a), (b))
#else
#define _lrotl(a, b) __rold((a), (b))
#define _lrotr(a, b) __rord((a), (b))
#endif
#define _bit_scan_forward(a) __bsfd(a)
#define _bit_scan_reverse(a) __bsrd(a)
#define _bswap(a) __bswapd(a)
#define _popcnt32(a) __popcntd(a)
#ifndef __iamcu__
#define _rdpmc(a) __rdpmc(a)
#define _rdtscp(a) __rdtscp(a)
#endif /* __iamcu__ */
#define _rdtsc() __rdtsc()
#define _rotwl(a, b) __rolw((a), (b))
#define _rotwr(a, b) __rorw((a), (b))
#define _rotl(a, b) __rold((a), (b))
#define _rotr(a, b) __rord((a), (b))
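Illustrative TSC timing with __rdtsc from the header above. The include path of the x86intrin wrapper is an assumption here (ia32intrin refuses direct inclusion), and raw TSC deltas are only a rough cycle count on modern CPUs:

#include "third_party/intel/x86intrin.internal.h" /* assumed wrapper path */

unsigned long long cycles(void (*fn)(void)) {
  unsigned long long t0 = __rdtsc();
  fn();
  return __rdtsc() - t0; /* unserialized; treat as an estimate */
}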

209
third_party/intel/immintrin.internal.h vendored Normal file

@@ -0,0 +1,209 @@
#ifndef _IMMINTRIN_H_INCLUDED
#define _IMMINTRIN_H_INCLUDED
/* clang-format off */
#include "third_party/intel/mmintrin.internal.h"
#include "third_party/intel/xmmintrin.internal.h"
#include "third_party/intel/emmintrin.internal.h"
#include "third_party/intel/pmmintrin.internal.h"
#include "third_party/intel/tmmintrin.internal.h"
#include "third_party/intel/smmintrin.internal.h"
#include "third_party/intel/wmmintrin.internal.h"
#include "third_party/intel/fxsrintrin.internal.h"
#include "third_party/intel/xsaveintrin.internal.h"
#include "third_party/intel/xsaveoptintrin.internal.h"
#include "third_party/intel/xsavesintrin.internal.h"
#include "third_party/intel/xsavecintrin.internal.h"
#include "third_party/intel/avxintrin.internal.h"
#include "third_party/intel/avx2intrin.internal.h"
#include "third_party/intel/avx512fintrin.internal.h"
#include "third_party/intel/avx512erintrin.internal.h"
#include "third_party/intel/avx512pfintrin.internal.h"
#include "third_party/intel/avx512cdintrin.internal.h"
#include "third_party/intel/avx512vlintrin.internal.h"
#include "third_party/intel/avx512bwintrin.internal.h"
#include "third_party/intel/avx512dqintrin.internal.h"
#include "third_party/intel/avx512vlbwintrin.internal.h"
#include "third_party/intel/avx512vldqintrin.internal.h"
#include "third_party/intel/avx512ifmaintrin.internal.h"
#include "third_party/intel/avx512ifmavlintrin.internal.h"
#include "third_party/intel/avx512vbmiintrin.internal.h"
#include "third_party/intel/avx512vbmivlintrin.internal.h"
#include "third_party/intel/avx5124fmapsintrin.internal.h"
#include "third_party/intel/avx5124vnniwintrin.internal.h"
#include "third_party/intel/avx512vpopcntdqintrin.internal.h"
#include "third_party/intel/avx512vbmi2intrin.internal.h"
#include "third_party/intel/avx512vbmi2vlintrin.internal.h"
#include "third_party/intel/avx512vnniintrin.internal.h"
#include "third_party/intel/avx512vnnivlintrin.internal.h"
#include "third_party/intel/avx512vpopcntdqvlintrin.internal.h"
#include "third_party/intel/avx512bitalgintrin.internal.h"
#include "third_party/intel/shaintrin.internal.h"
#include "third_party/intel/lzcntintrin.internal.h"
#include "third_party/intel/bmiintrin.internal.h"
#include "third_party/intel/bmi2intrin.internal.h"
#include "third_party/intel/fmaintrin.internal.h"
#include "third_party/intel/f16cintrin.internal.h"
#include "third_party/intel/rtmintrin.internal.h"
#include "third_party/intel/xtestintrin.internal.h"
#include "third_party/intel/cetintrin.internal.h"
#include "third_party/intel/gfniintrin.internal.h"
#include "third_party/intel/vaesintrin.internal.h"
#include "third_party/intel/vpclmulqdqintrin.internal.h"
#include "third_party/intel/movdirintrin.internal.h"
#include "third_party/intel/sgxintrin.internal.h"
#include "third_party/intel/pconfigintrin.internal.h"
#include "third_party/intel/waitpkgintrin.internal.h"
#include "third_party/intel/cldemoteintrin.internal.h"
#include "third_party/intel/rdseedintrin.internal.h"
#include "third_party/intel/prfchwintrin.internal.h"
#include "third_party/intel/adxintrin.internal.h"
#include "third_party/intel/clwbintrin.internal.h"
#include "third_party/intel/clflushoptintrin.internal.h"
#include "third_party/intel/wbnoinvdintrin.internal.h"
#include "third_party/intel/pkuintrin.internal.h"
/* clang-format on */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wbinvd(void) {
__builtin_ia32_wbinvd();
}
#ifndef __RDRND__
#pragma GCC push_options
#pragma GCC target("rdrnd")
#define __DISABLE_RDRND__
#endif /* __RDRND__ */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdrand16_step(unsigned short *__P) {
return __builtin_ia32_rdrand16_step(__P);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdrand32_step(unsigned int *__P) {
return __builtin_ia32_rdrand32_step(__P);
}
#ifdef __DISABLE_RDRND__
#undef __DISABLE_RDRND__
#pragma GCC pop_options
#endif /* __DISABLE_RDRND__ */
#ifndef __RDPID__
#pragma GCC push_options
#pragma GCC target("rdpid")
#define __DISABLE_RDPID__
#endif /* __RDPID__ */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdpid_u32(void) {
return __builtin_ia32_rdpid();
}
#ifdef __DISABLE_RDPID__
#undef __DISABLE_RDPID__
#pragma GCC pop_options
#endif /* __DISABLE_RDPID__ */
#ifdef __x86_64__
#ifndef __FSGSBASE__
#pragma GCC push_options
#pragma GCC target("fsgsbase")
#define __DISABLE_FSGSBASE__
#endif /* __FSGSBASE__ */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_readfsbase_u32(void) {
return __builtin_ia32_rdfsbase32();
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_readfsbase_u64(void) {
return __builtin_ia32_rdfsbase64();
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_readgsbase_u32(void) {
return __builtin_ia32_rdgsbase32();
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_readgsbase_u64(void) {
return __builtin_ia32_rdgsbase64();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_writefsbase_u32(unsigned int __B) {
__builtin_ia32_wrfsbase32(__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_writefsbase_u64(unsigned long long __B) {
__builtin_ia32_wrfsbase64(__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_writegsbase_u32(unsigned int __B) {
__builtin_ia32_wrgsbase32(__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_writegsbase_u64(unsigned long long __B) {
__builtin_ia32_wrgsbase64(__B);
}
#ifdef __DISABLE_FSGSBASE__
#undef __DISABLE_FSGSBASE__
#pragma GCC pop_options
#endif /* __DISABLE_FSGSBASE__ */
#ifndef __RDRND__
#pragma GCC push_options
#pragma GCC target("rdrnd")
#define __DISABLE_RDRND__
#endif /* __RDRND__ */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdrand64_step(unsigned long long *__P) {
return __builtin_ia32_rdrand64_step(__P);
}
#ifdef __DISABLE_RDRND__
#undef __DISABLE_RDRND__
#pragma GCC pop_options
#endif /* __DISABLE_RDRND__ */
#endif /* __x86_64__ */
#ifndef __PTWRITE__
#pragma GCC push_options
#pragma GCC target("ptwrite")
#define __DISABLE_PTWRITE__
#endif
#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_ptwrite64(unsigned long long __B) {
__builtin_ia32_ptwrite64(__B);
}
#endif /* __x86_64__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_ptwrite32(unsigned __B) {
__builtin_ia32_ptwrite32(__B);
}
#ifdef __DISABLE_PTWRITE__
#undef __DISABLE_PTWRITE__
#pragma GCC pop_options
#endif /* __DISABLE_PTWRITE__ */
#endif /* _IMMINTRIN_H_INCLUDED */
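A minimal usage sketch for the RDRAND wrappers above (not part of the diff; the helper name and the retry bound of 10 are illustrative assumptions). RDRAND can transiently fail with carry clear, so callers must loop:
__attribute__((__target__("rdrnd"))) static int get_random_u64(
    unsigned long long *out) {
  for (int i = 0; i < 10; ++i) {
    if (_rdrand64_step(out)) return 1; /* carry set: *out is valid */
  }
  return 0; /* hardware entropy transiently unavailable */
}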

6
third_party/intel/intel.mk vendored Normal file
View file

@ -0,0 +1,6 @@
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
PKGS += THIRD_PARTY_INTEL
THIRD_PARTY_INTEL_HDRS = $(filter %.h,$(THIRD_PARTY_INTEL_FILES))
THIRD_PARTY_INTEL_FILES := $(wildcard third_party/intel/*)

82
third_party/intel/lwpintrin.internal.h vendored Normal file
View file

@ -0,0 +1,82 @@
#ifndef _X86INTRIN_H_INCLUDED
#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _LWPINTRIN_H_INCLUDED
#define _LWPINTRIN_H_INCLUDED
#ifndef __LWP__
#pragma GCC push_options
#pragma GCC target("lwp")
#define __DISABLE_LWP__
#endif /* __LWP__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__llwpcb(void *__pcbAddress) {
__builtin_ia32_llwpcb(__pcbAddress);
}
extern __inline void *__attribute__((__gnu_inline__, __always_inline__,
__artificial__)) __slwpcb(void) {
return __builtin_ia32_slwpcb();
}
#ifdef __OPTIMIZE__
extern __inline void __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
__lwpval32(unsigned int __data2, unsigned int __data1, unsigned int __flags) {
__builtin_ia32_lwpval32(__data2, __data1, __flags);
}
#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lwpval64(unsigned long long __data2, unsigned int __data1,
unsigned int __flags) {
__builtin_ia32_lwpval64(__data2, __data1, __flags);
}
#endif
#else
#define __lwpval32(D2, D1, F) \
(__builtin_ia32_lwpval32((unsigned int)(D2), (unsigned int)(D1), \
(unsigned int)(F)))
#ifdef __x86_64__
#define __lwpval64(D2, D1, F) \
(__builtin_ia32_lwpval64((unsigned long long)(D2), (unsigned int)(D1), \
(unsigned int)(F)))
#endif
#endif
#ifdef __OPTIMIZE__
extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
__lwpins32(unsigned int __data2, unsigned int __data1, unsigned int __flags) {
return __builtin_ia32_lwpins32(__data2, __data1, __flags);
}
#ifdef __x86_64__
extern __inline unsigned char
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lwpins64(unsigned long long __data2, unsigned int __data1,
unsigned int __flags) {
return __builtin_ia32_lwpins64(__data2, __data1, __flags);
}
#endif
#else
#define __lwpins32(D2, D1, F) \
(__builtin_ia32_lwpins32((unsigned int)(D2), (unsigned int)(D1), \
(unsigned int)(F)))
#ifdef __x86_64__
#define __lwpins64(D2, D1, F) \
(__builtin_ia32_lwpins64((unsigned long long)(D2), (unsigned int)(D1), \
(unsigned int)(F)))
#endif
#endif
#ifdef __DISABLE_LWP__
#undef __DISABLE_LWP__
#pragma GCC pop_options
#endif /* __DISABLE_LWP__ */
#endif /* _LWPINTRIN_H_INCLUDED */

51
third_party/intel/lzcntintrin.internal.h vendored Normal file
View file

@ -0,0 +1,51 @@
#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _LZCNTINTRIN_H_INCLUDED
#define _LZCNTINTRIN_H_INCLUDED
#ifndef __LZCNT__
#pragma GCC push_options
#pragma GCC target("lzcnt")
#define __DISABLE_LZCNT__
#endif /* __LZCNT__ */
extern __inline unsigned short
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lzcnt16(unsigned short __X) {
return __builtin_ia32_lzcnt_u16(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lzcnt32(unsigned int __X) {
return __builtin_ia32_lzcnt_u32(__X);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_lzcnt_u32(unsigned int __X) {
return __builtin_ia32_lzcnt_u32(__X);
}
#ifdef __x86_64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__lzcnt64(unsigned long long __X) {
return __builtin_ia32_lzcnt_u64(__X);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_lzcnt_u64(unsigned long long __X) {
return __builtin_ia32_lzcnt_u64(__X);
}
#endif
#ifdef __DISABLE_LZCNT__
#undef __DISABLE_LZCNT__
#pragma GCC pop_options
#endif /* __DISABLE_LZCNT__ */
#endif /* _LZCNTINTRIN_H_INCLUDED */
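For instance, floor(log2 x) falls out of the wrapper above; a hedged sketch (the helper name is hypothetical, and x must be nonzero since _lzcnt_u32(0) returns 32):
__attribute__((__target__("lzcnt"))) static unsigned floor_log2_u32(
    unsigned x) {
  return 31 - _lzcnt_u32(x); /* index of the highest set bit */
}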

209
third_party/intel/mm3dnow.internal.h vendored Normal file
View file

@ -0,0 +1,209 @@
#ifndef _MM3DNOW_H_INCLUDED
#define _MM3DNOW_H_INCLUDED
#include "third_party/intel/mmintrin.internal.h"
#include "third_party/intel/prfchwintrin.internal.h"
#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW__
#pragma GCC push_options
#ifdef __x86_64__
#pragma GCC target("sse,3dnow")
#else
#pragma GCC target("3dnow")
#endif
#define __DISABLE_3dNOW__
#endif /* __3dNOW__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_femms(void) {
__builtin_ia32_femms();
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pavgusb(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pavgusb((__v8qi)__A, (__v8qi)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pf2id(__m64 __A) {
return (__m64)__builtin_ia32_pf2id((__v2sf)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfacc(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfacc((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfadd(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfadd((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfcmpeq(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfcmpge(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfcmpge((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfcmpgt(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfmax(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfmax((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfmin(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfmin((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfmul(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfmul((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrcp(__m64 __A) {
return (__m64)__builtin_ia32_pfrcp((__v2sf)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrcpit1(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrcpit2(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrsqrt(__m64 __A) {
return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfrsqit1(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfsub(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfsub((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfsubr(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfsubr((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pi2fd(__m64 __A) {
return (__m64)__builtin_ia32_pi2fd((__v2si)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhrw(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pmulhrw((__v4hi)__A, (__v4hi)__B);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_prefetch(void *__P) {
__builtin_prefetch(__P, 0, 3 /* _MM_HINT_T0 */);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_float(float __A) {
return __extension__(__m64)(__v2sf){__A, 0.0f};
}
extern __inline float
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_float(__m64 __A) {
union {
__v2sf v;
float a[2];
} __tmp;
__tmp.v = (__v2sf)__A;
return __tmp.a[0];
}
#ifdef __DISABLE_3dNOW__
#undef __DISABLE_3dNOW__
#pragma GCC pop_options
#endif /* __DISABLE_3dNOW__ */
#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW_A__
#pragma GCC push_options
#ifdef __x86_64__
#pragma GCC target("sse,3dnowa")
#else
#pragma GCC target("3dnowa")
#endif
#define __DISABLE_3dNOW_A__
#endif /* __3dNOW_A__ */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pf2iw(__m64 __A) {
return (__m64)__builtin_ia32_pf2iw((__v2sf)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfnacc(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfnacc((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pfpnacc(__m64 __A, __m64 __B) {
return (__m64)__builtin_ia32_pfpnacc((__v2sf)__A, (__v2sf)__B);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pi2fw(__m64 __A) {
return (__m64)__builtin_ia32_pi2fw((__v2si)__A);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pswapd(__m64 __A) {
return (__m64)__builtin_ia32_pswapdsf((__v2sf)__A);
}
#ifdef __DISABLE_3dNOW_A__
#undef __DISABLE_3dNOW_A__
#pragma GCC pop_options
#endif /* __DISABLE_3dNOW_A__ */
#endif /* _MM3DNOW_H_INCLUDED */
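A hedged sketch of the 3dNow! wrappers above (AMD-only and long deprecated; shown only to illustrate the _m_femms() protocol, which must clear the MMX state before x87 code runs again):
__attribute__((__target__("3dnow"))) static float pfadd_lane0(float a,
                                                              float b) {
  __m64 r = _m_pfadd(_m_from_float(a), _m_from_float(b));
  float out = _m_to_float(r); /* extract lane 0 before leaving MMX */
  _m_femms();
  return out;
}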

27
third_party/intel/mm_malloc.internal.h vendored Normal file
View file

@ -0,0 +1,27 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
#include "libc/mem/mem.h"
#ifndef __cplusplus
extern int _mm_posix_memalign(void **, size_t, size_t)
#else
extern "C" int _mm_posix_memalign(void **, size_t, size_t) throw()
#endif
__asm__("posix_memalign");
static __inline void *_mm_malloc(size_t __size, size_t __alignment) {
void *__ptr;
if (__alignment == 1) return malloc(__size);
if (__alignment == 2 || (sizeof(void *) == 8 && __alignment == 4))
__alignment = sizeof(void *);
if (_mm_posix_memalign(&__ptr, __alignment, __size) == 0)
return __ptr;
else
return NULL;
}
static __inline void _mm_free(void *__ptr) {
free(__ptr);
}
#endif /* _MM_MALLOC_H_INCLUDED */
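Usage follows the classic <mm_malloc.h> contract; a minimal sketch (the helper name and the 64-byte alignment are illustrative):
static float *alloc_cacheline_floats(size_t n) {
  float *p = _mm_malloc(n * sizeof(float), 64); /* 64-byte aligned */
  return p; /* release with _mm_free(p), never plain free() */
}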

832
third_party/intel/mmintrin.internal.h vendored Normal file
View file

@ -0,0 +1,832 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED
#if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
#pragma GCC push_options
#ifdef __x86_64__
#pragma GCC target("sse,mmx")
#else
#pragma GCC target("mmx")
#endif
#define __DISABLE_MMX__
#endif /* __MMX__ */
typedef int __m64 __attribute__((__vector_size__(8), __may_alias__));
typedef int __m64_u
__attribute__((__vector_size__(8), __may_alias__, __aligned__(1)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef float __v2sf __attribute__((__vector_size__(8)));
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty(void) {
__builtin_ia32_emms();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty(void) {
_mm_empty();
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64(int __i) {
return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int(int __i) {
return _mm_cvtsi32_si64(__i);
}
#ifdef __x86_64__
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64(long long __i) {
return (__m64)__i;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64(long long __i) {
return (__m64)__i;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64(long long __i) {
return (__m64)__i;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x(long long __i) {
return (__m64)__i;
}
#endif
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32(__m64 __i) {
return __builtin_ia32_vec_ext_v2si((__v2si)__i, 0);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int(__m64 __i) {
return _mm_cvtsi64_si32(__i);
}
#ifdef __x86_64__
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64(__m64 __i) {
return (long long)__i;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64(__m64 __i) {
return (long long)__i;
}
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x(__m64 __i) {
return (long long)__i;
}
#endif
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb(__m64 __m1, __m64 __m2) {
return _mm_packs_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw(__m64 __m1, __m64 __m2) {
return _mm_packs_pi32(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb(__m64 __m1, __m64 __m2) {
return _mm_packs_pu16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw(__m64 __m1, __m64 __m2) {
return _mm_unpackhi_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd(__m64 __m1, __m64 __m2) {
return _mm_unpackhi_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq(__m64 __m1, __m64 __m2) {
return _mm_unpackhi_pi32(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw(__m64 __m1, __m64 __m2) {
return _mm_unpacklo_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd(__m64 __m1, __m64 __m2) {
return _mm_unpacklo_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq(__m64 __m1, __m64 __m2) {
return _mm_unpacklo_pi32(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb(__m64 __m1, __m64 __m2) {
return _mm_add_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw(__m64 __m1, __m64 __m2) {
return _mm_add_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd(__m64 __m1, __m64 __m2) {
return _mm_add_pi32(__m1, __m2);
}
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2,mmx")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddq((__v1di)__m1, (__v1di)__m2);
}
#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb(__m64 __m1, __m64 __m2) {
return _mm_adds_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw(__m64 __m1, __m64 __m2) {
return _mm_adds_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb(__m64 __m1, __m64 __m2) {
return _mm_adds_pu8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw(__m64 __m1, __m64 __m2) {
return _mm_adds_pu16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb(__m64 __m1, __m64 __m2) {
return _mm_sub_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw(__m64 __m1, __m64 __m2) {
return _mm_sub_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd(__m64 __m1, __m64 __m2) {
return _mm_sub_pi32(__m1, __m2);
}
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2,mmx")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubq((__v1di)__m1, (__v1di)__m2);
}
#ifdef __DISABLE_SSE2__
#undef __DISABLE_SSE2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE2__ */
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb(__m64 __m1, __m64 __m2) {
return _mm_subs_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw(__m64 __m1, __m64 __m2) {
return _mm_subs_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb(__m64 __m1, __m64 __m2) {
return _mm_subs_pu8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw(__m64 __m1, __m64 __m2) {
return _mm_subs_pu16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd(__m64 __m1, __m64 __m2) {
return _mm_madd_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw(__m64 __m1, __m64 __m2) {
return _mm_mulhi_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw(__m64 __m1, __m64 __m2) {
return _mm_mullo_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_psllw((__v4hi)__m, (__v4hi)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw(__m64 __m, __m64 __count) {
return _mm_sll_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi(__m64 __m, int __count) {
return _mm_slli_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_pslld((__v2si)__m, (__v2si)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld(__m64 __m, __m64 __count) {
return _mm_sll_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32(__m64 __m, int __count) {
return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi(__m64 __m, int __count) {
return _mm_slli_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_psllq((__v1di)__m, (__v1di)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq(__m64 __m, __m64 __count) {
return _mm_sll_si64(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi(__m64 __m, int __count) {
return _mm_slli_si64(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_psraw((__v4hi)__m, (__v4hi)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw(__m64 __m, __m64 __count) {
return _mm_sra_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi(__m64 __m, int __count) {
return _mm_srai_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_psrad((__v2si)__m, (__v2si)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad(__m64 __m, __m64 __count) {
return _mm_sra_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi(__m64 __m, int __count) {
return _mm_srai_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_psrlw((__v4hi)__m, (__v4hi)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw(__m64 __m, __m64 __count) {
return _mm_srl_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi(__m64 __m, int __count) {
return _mm_srli_pi16(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_psrld((__v2si)__m, (__v2si)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld(__m64 __m, __m64 __count) {
return _mm_srl_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi(__m64 __m, int __count) {
return _mm_srli_pi32(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64(__m64 __m, __m64 __count) {
return (__m64)__builtin_ia32_psrlq((__v1di)__m, (__v1di)__count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq(__m64 __m, __m64 __count) {
return _mm_srl_si64(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64(__m64 __m, int __count) {
return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi(__m64 __m, int __count) {
return _mm_srli_si64(__m, __count);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64(__m64 __m1, __m64 __m2) {
return __builtin_ia32_pand(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand(__m64 __m1, __m64 __m2) {
return _mm_and_si64(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64(__m64 __m1, __m64 __m2) {
return __builtin_ia32_pandn(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn(__m64 __m1, __m64 __m2) {
return _mm_andnot_si64(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64(__m64 __m1, __m64 __m2) {
return __builtin_ia32_por(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por(__m64 __m1, __m64 __m2) {
return _mm_or_si64(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64(__m64 __m1, __m64 __m2) {
return __builtin_ia32_pxor(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor(__m64 __m1, __m64 __m2) {
return _mm_xor_si64(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb(__m64 __m1, __m64 __m2) {
return _mm_cmpeq_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb(__m64 __m1, __m64 __m2) {
return _mm_cmpgt_pi8(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw(__m64 __m1, __m64 __m2) {
return _mm_cmpeq_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw(__m64 __m1, __m64 __m2) {
return _mm_cmpgt_pi16(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd(__m64 __m1, __m64 __m2) {
return _mm_cmpeq_pi32(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd(__m64 __m1, __m64 __m2) {
return _mm_cmpgt_pi32(__m1, __m2);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64(void) {
return (__m64)0LL;
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32(int __i1, int __i0) {
return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
return (__m64)__builtin_ia32_vec_init_v4hi(__w0, __w1, __w2, __w3);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
char __b2, char __b1, char __b0) {
return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, __b4, __b5,
__b6, __b7);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32(int __i0, int __i1) {
return _mm_set_pi32(__i1, __i0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
return _mm_set_pi16(__w3, __w2, __w1, __w0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
char __b5, char __b6, char __b7) {
return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32(int __i) {
return _mm_set_pi32(__i, __i);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16(short __w) {
return _mm_set_pi16(__w, __w, __w, __w);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8(char __b) {
return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
}
#ifdef __DISABLE_MMX__
#undef __DISABLE_MMX__
#pragma GCC pop_options
#endif /* __DISABLE_MMX__ */
#endif /* _MMINTRIN_H_INCLUDED */
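A hedged sketch of the MMX wrappers above: saturating-add four int16 lanes, then leave the MMX state with _mm_empty() so later x87 floating point is safe (the helper name and memcpy store are illustrative):
__attribute__((__target__("mmx"))) static void adds_pi16_4(
    short out[4], const short a[4], const short b[4]) {
  __m64 va = _mm_set_pi16(a[3], a[2], a[1], a[0]); /* MSW argument first */
  __m64 vb = _mm_set_pi16(b[3], b[2], b[1], b[0]);
  __m64 vr = _mm_adds_pi16(va, vb);
  __builtin_memcpy(out, &vr, sizeof(vr));
  _mm_empty();
}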

48
third_party/intel/movdirintrin.internal.h vendored Normal file
View file

@ -0,0 +1,48 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _MOVDIRINTRIN_H_INCLUDED
#define _MOVDIRINTRIN_H_INCLUDED
#ifndef __MOVDIRI__
#pragma GCC push_options
#pragma GCC target("movdiri")
#define __DISABLE_MOVDIRI__
#endif /* __MOVDIRI__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_directstoreu_u32(void *__P, unsigned int __A) {
__builtin_ia32_directstoreu_u32((unsigned int *)__P, __A);
}
#ifdef __x86_64__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_directstoreu_u64(void *__P, unsigned long long __A) {
__builtin_ia32_directstoreu_u64((unsigned long long *)__P, __A);
}
#endif
#ifdef __DISABLE_MOVDIRI__
#undef __DISABLE_MOVDIRI__
#pragma GCC pop_options
#endif /* __DISABLE_MOVDIRI__ */
#ifndef __MOVDIR64B__
#pragma GCC push_options
#pragma GCC target("movdir64b")
#define __DISABLE_MOVDIR64B__
#endif /* __MOVDIR64B__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_movdir64b(void *__P, const void *__Q) {
__builtin_ia32_movdir64b(__P, __Q);
}
#ifdef __DISABLE_MOVDIR64B__
#undef __DISABLE_MOVDIR64B__
#pragma GCC pop_options
#endif /* __DISABLE_MOVDIR64B__ */
#endif /* _MOVDIRINTRIN_H_INCLUDED */
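A hedged sketch of _directstoreu_u32(): direct stores are weakly ordered writes aimed at things like MMIO doorbells, so ordering fences remain the caller's responsibility (the mmio parameter here is a hypothetical device register):
__attribute__((__target__("movdiri"))) static void ring_doorbell(
    volatile unsigned *mmio, unsigned value) {
  _directstoreu_u32((void *)mmio, value);
}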

27
third_party/intel/mwaitxintrin.internal.h vendored Normal file
View file

@ -0,0 +1,27 @@
#ifndef _MWAITXINTRIN_H_INCLUDED
#define _MWAITXINTRIN_H_INCLUDED
#ifndef __MWAITX__
#pragma GCC push_options
#pragma GCC target("mwaitx")
#define __DISABLE_MWAITX__
#endif /* __MWAITX__ */
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_monitorx(void const* __P, unsigned int __E, unsigned int __H) {
__builtin_ia32_monitorx(__P, __E, __H);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mwaitx(unsigned int __E, unsigned int __H, unsigned int __C) {
__builtin_ia32_mwaitx(__E, __H, __C);
}
#ifdef __DISABLE_MWAITX__
#undef __DISABLE_MWAITX__
#pragma GCC pop_options
#endif /* __DISABLE_MWAITX__ */
#endif /* _MWAITXINTRIN_H_INCLUDED */
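The canonical pattern pairs the two wrappers: arm the monitor, recheck the flag to close the race, then sleep. A hedged sketch with the extension, hint, and cycle arguments all zeroed (timer disabled):
__attribute__((__target__("mwaitx"))) static void spin_wait(
    volatile unsigned *flag) {
  while (!*flag) {
    _mm_monitorx((const void *)flag, 0, 0); /* watch this cache line */
    if (!*flag) _mm_mwaitx(0, 0, 0);        /* sleep until it is written */
  }
}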

4
third_party/intel/nmmintrin.internal.h vendored Normal file
View file

@ -0,0 +1,4 @@
#ifndef _NMMINTRIN_H_INCLUDED
#define _NMMINTRIN_H_INCLUDED
#include "third_party/intel/smmintrin.internal.h"
#endif /* _NMMINTRIN_H_INCLUDED */

54
third_party/intel/pconfigintrin.internal.h vendored Normal file
View file

@ -0,0 +1,54 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <pconfigintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _PCONFIGINTRIN_H_INCLUDED
#define _PCONFIGINTRIN_H_INCLUDED
#ifndef __PCONFIG__
#pragma GCC push_options
#pragma GCC target("pconfig")
#define __DISABLE_PCONFIG__
#endif /* __PCONFIG__ */
#define __pconfig_b(leaf, b, retval) \
  __asm__ __volatile__("pconfig\n\t" \
                       : "=a"(retval) \
                       : "a"(leaf), "b"(b) \
                       : "cc")
#define __pconfig_generic(leaf, b, c, d, retval) \
__asm__ __volatile__("pconfig\n\t" \
: "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
: "a"(leaf), "b"(b), "c"(c), "d"(d) \
: "cc")
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_pconfig_u32(const unsigned int __L, size_t __D[]) {
enum __pconfig_type {
__PCONFIG_KEY_PROGRAM = 0x01,
};
unsigned int __R = 0;
if (!__builtin_constant_p(__L))
__pconfig_generic(__L, __D[0], __D[1], __D[2], __R);
else
switch (__L) {
case __PCONFIG_KEY_PROGRAM:
__pconfig_b(__L, __D[0], __R);
break;
default:
__pconfig_generic(__L, __D[0], __D[1], __D[2], __R);
}
return __R;
}
#ifdef __DISABLE_PCONFIG__
#undef __DISABLE_PCONFIG__
#pragma GCC pop_options
#endif /* __DISABLE_PCONFIG__ */
#endif /* _PCONFIGINTRIN_H_INCLUDED */

31
third_party/intel/pkuintrin.internal.h vendored Normal file
View file

@ -0,0 +1,31 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _PKUINTRIN_H_INCLUDED
#define _PKUINTRIN_H_INCLUDED
#ifndef __PKU__
#pragma GCC push_options
#pragma GCC target("pku")
#define __DISABLE_PKU__
#endif /* __PKU__ */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdpkru_u32(void) {
return __builtin_ia32_rdpkru();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_wrpkru(unsigned int __key) {
__builtin_ia32_wrpkru(__key);
}
#ifdef __DISABLE_PKU__
#undef __DISABLE_PKU__
#pragma GCC pop_options
#endif /* __DISABLE_PKU__ */
#endif /* _PKUINTRIN_H_INCLUDED */
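A hedged sketch of the PKRU read-modify-write idiom built on the two wrappers above; the helper is hypothetical, and the two-bits-per-key layout (bit 0 access-disable, bit 1 write-disable) follows the Intel SDM:
__attribute__((__target__("pku"))) static void deny_writes_for_key(
    unsigned key) {
  unsigned pkru = _rdpkru_u32();
  pkru |= 1u << (2 * key + 1); /* set the WD bit for this key */
  _wrpkru(pkru);
}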

102
third_party/intel/pmmintrin.internal.h vendored Normal file
View file

@ -0,0 +1,102 @@
#ifndef _PMMINTRIN_H_INCLUDED
#define _PMMINTRIN_H_INCLUDED
#include "third_party/intel/emmintrin.internal.h"
#ifndef __SSE3__
#pragma GCC push_options
#pragma GCC target("sse3")
#define __DISABLE_SSE3__
#endif /* __SSE3__ */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000
#define _MM_SET_DENORMALS_ZERO_MODE(mode) \
_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (mode))
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_ps(__m128 __X, __m128 __Y) {
return (__m128)__builtin_ia32_addsubps((__v4sf)__X, (__v4sf)__Y);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_ps(__m128 __X, __m128 __Y) {
return (__m128)__builtin_ia32_haddps((__v4sf)__X, (__v4sf)__Y);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_ps(__m128 __X, __m128 __Y) {
return (__m128)__builtin_ia32_hsubps((__v4sf)__X, (__v4sf)__Y);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps(__m128 __X) {
return (__m128)__builtin_ia32_movshdup((__v4sf)__X);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps(__m128 __X) {
return (__m128)__builtin_ia32_movsldup((__v4sf)__X);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_addsub_pd(__m128d __X, __m128d __Y) {
return (__m128d)__builtin_ia32_addsubpd((__v2df)__X, (__v2df)__Y);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pd(__m128d __X, __m128d __Y) {
return (__m128d)__builtin_ia32_haddpd((__v2df)__X, (__v2df)__Y);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pd(__m128d __X, __m128d __Y) {
return (__m128d)__builtin_ia32_hsubpd((__v2df)__X, (__v2df)__Y);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd(double const *__P) {
return _mm_load1_pd(__P);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movedup_pd(__m128d __X) {
return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_lddqu_si128(__m128i const *__P) {
return (__m128i)__builtin_ia32_lddqu((char const *)__P);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_monitor(void const *__P, unsigned int __E, unsigned int __H) {
__builtin_ia32_monitor(__P, __E, __H);
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mwait(unsigned int __E, unsigned int __H) {
__builtin_ia32_mwait(__E, __H);
}
#ifdef __DISABLE_SSE3__
#undef __DISABLE_SSE3__
#pragma GCC pop_options
#endif /* __DISABLE_SSE3__ */
#endif /* _PMMINTRIN_H_INCLUDED */

29
third_party/intel/popcntintrin.internal.h vendored Normal file
View file

@ -0,0 +1,29 @@
#ifndef _POPCNTINTRIN_H_INCLUDED
#define _POPCNTINTRIN_H_INCLUDED
#ifndef __POPCNT__
#pragma GCC push_options
#pragma GCC target("popcnt")
#define __DISABLE_POPCNT__
#endif /* __POPCNT__ */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_u32(unsigned int __X) {
return __builtin_popcount(__X);
}
#ifdef __x86_64__
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_popcnt_u64(unsigned long long __X) {
return __builtin_popcountll(__X);
}
#endif
#ifdef __DISABLE_POPCNT__
#undef __DISABLE_POPCNT__
#pragma GCC pop_options
#endif /* __DISABLE_POPCNT__ */
#endif /* _POPCNTINTRIN_H_INCLUDED */
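For example, Hamming distance falls out of the wrapper directly; a hedged one-liner (the helper name is illustrative):
__attribute__((__target__("popcnt"))) static int hamming32(unsigned a,
                                                           unsigned b) {
  return _mm_popcnt_u32(a ^ b); /* count differing bit positions */
}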

15
third_party/intel/prfchwintrin.internal.h vendored Normal file
View file

@ -0,0 +1,15 @@
#if !defined _IMMINTRIN_H_INCLUDED && !defined _MM3DNOW_H_INCLUDED
#error \
"Never use <prfchwintrin.h> directly; include <immintrin.h> or <mm3dnow.h> instead."
#endif
#ifndef _PRFCHWINTRIN_H_INCLUDED
#define _PRFCHWINTRIN_H_INCLUDED
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_prefetchw(void *__P) {
__builtin_prefetch(__P, 1, 3 /* _MM_HINT_T0 */);
}
#endif /* _PRFCHWINTRIN_H_INCLUDED */

39
third_party/intel/rdseedintrin.internal.h vendored Normal file
View file

@ -0,0 +1,39 @@
#if !defined _IMMINTRIN_H_INCLUDED
#error "Never use <rdseedintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _RDSEEDINTRIN_H_INCLUDED
#define _RDSEEDINTRIN_H_INCLUDED
#ifndef __RDSEED__
#pragma GCC push_options
#pragma GCC target("rdseed")
#define __DISABLE_RDSEED__
#endif /* __RDSEED__ */
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdseed16_step(unsigned short *__p) {
return __builtin_ia32_rdseed_hi_step(__p);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdseed32_step(unsigned int *__p) {
return __builtin_ia32_rdseed_si_step(__p);
}
#ifdef __x86_64__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_rdseed64_step(unsigned long long *__p) {
return __builtin_ia32_rdseed_di_step(__p);
}
#endif
#ifdef __DISABLE_RDSEED__
#undef __DISABLE_RDSEED__
#pragma GCC pop_options
#endif /* __DISABLE_RDSEED__ */
#endif /* _RDSEEDINTRIN_H_INCLUDED */
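RDSEED fails more often than RDRAND because it draws from the entropy conditioner, so retries want a backoff; a hedged sketch (the retry bound of 100 is arbitrary):
__attribute__((__target__("rdseed"))) static int get_seed64(
    unsigned long long *out) {
  for (int i = 0; i < 100; ++i) {
    if (_rdseed64_step(out)) return 1;
    __builtin_ia32_pause(); /* brief backoff between attempts */
  }
  return 0;
}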

50
third_party/intel/rtmintrin.internal.h vendored Normal file
View file

@ -0,0 +1,50 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _RTMINTRIN_H_INCLUDED
#define _RTMINTRIN_H_INCLUDED
#ifndef __RTM__
#pragma GCC push_options
#pragma GCC target("rtm")
#define __DISABLE_RTM__
#endif /* __RTM__ */
#define _XBEGIN_STARTED (~0u)
#define _XABORT_EXPLICIT (1 << 0)
#define _XABORT_RETRY (1 << 1)
#define _XABORT_CONFLICT (1 << 2)
#define _XABORT_CAPACITY (1 << 3)
#define _XABORT_DEBUG (1 << 4)
#define _XABORT_NESTED (1 << 5)
#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xbegin(void) {
return __builtin_ia32_xbegin();
}
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xend(void) {
__builtin_ia32_xend();
}
#ifdef __OPTIMIZE__
extern __inline void
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_xabort(const unsigned int __imm) {
__builtin_ia32_xabort(__imm);
}
#else
#define _xabort(N) __builtin_ia32_xabort(N)
#endif /* __OPTIMIZE__ */
#ifdef __DISABLE_RTM__
#undef __DISABLE_RTM__
#pragma GCC pop_options
#endif /* __DISABLE_RTM__ */
#endif /* _RTMINTRIN_H_INCLUDED */
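The canonical RTM shape is transaction-with-fallback, since _xbegin() may abort for any reason; a hedged sketch that falls back to an atomic increment rather than a real lock:
__attribute__((__target__("rtm"))) static void tx_increment(int *n) {
  if (_xbegin() == _XBEGIN_STARTED) {
    ++*n;    /* transactional path */
    _xend(); /* commit */
  } else {
    __atomic_fetch_add(n, 1, __ATOMIC_SEQ_CST); /* abort fallback */
  }
}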

219
third_party/intel/sgxintrin.internal.h vendored Normal file
View file

@ -0,0 +1,219 @@
#ifndef _SGXINTRIN_H_INCLUDED
#define _SGXINTRIN_H_INCLUDED
#ifndef __SGX__
#pragma GCC push_options
#pragma GCC target("sgx")
#define __DISABLE_SGX__
#endif /* __SGX__ */
#define __encls_bc(leaf, b, c, retval) \
__asm__ __volatile__("encls\n\t" \
: "=a"(retval) \
: "a"(leaf), "b"(b), "c"(c) \
: "cc")
#define __encls_bcd(leaf, b, c, d, retval) \
__asm__ __volatile__("encls\n\t" \
: "=a"(retval) \
: "a"(leaf), "b"(b), "c"(c), "d"(d) \
: "cc")
#define __encls_c(leaf, c, retval) \
__asm__ __volatile__("encls\n\t" : "=a"(retval) : "a"(leaf), "c"(c) : "cc")
#define __encls_edbgrd(leaf, b, c, retval) \
__asm__ __volatile__("encls\n\t" : "=a"(retval), "=b"(b) : "a"(leaf), "c"(c))
#define __encls_generic(leaf, b, c, d, retval) \
__asm__ __volatile__("encls\n\t" \
: "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
: "a"(leaf), "b"(b), "c"(c), "d"(d) \
: "cc")
#define __enclu_bc(leaf, b, c, retval) \
__asm__ __volatile__("enclu\n\t" \
: "=a"(retval) \
: "a"(leaf), "b"(b), "c"(c) \
: "cc")
#define __enclu_bcd(leaf, b, c, d, retval) \
__asm__ __volatile__("enclu\n\t" \
: "=a"(retval) \
: "a"(leaf), "b"(b), "c"(c), "d"(d) \
: "cc")
#define __enclu_eenter(leaf, b, c, retval) \
__asm__ __volatile__("enclu\n\t" \
: "=a"(retval), "=c"(c) \
: "a"(leaf), "b"(b), "c"(c) \
: "cc")
#define __enclu_eexit(leaf, b, c, retval) \
__asm__ __volatile__("enclu\n\t" \
: "=a"(retval), "=c"(c) \
: "a"(leaf), "b"(b) \
: "cc")
#define __enclu_generic(leaf, b, c, d, retval) \
__asm__ __volatile__("enclu\n\t" \
: "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
: "a"(leaf), "b"(b), "c"(c), "d"(d) \
: "cc")
#define __enclv_bc(leaf, b, c, retval) \
__asm__ __volatile__("enclv\n\t" \
: "=a"(retval) \
: "a"(leaf), "b"(b), "c"(c) \
: "cc")
#define __enclv_cd(leaf, c, d, retval) \
__asm__ __volatile__("enclv\n\t" \
: "=a"(retval) \
: "a"(leaf), "c"(c), "d"(d) \
: "cc")
#define __enclv_generic(leaf, b, c, d, retval) \
  __asm__ __volatile__("enclv\n\t" \
                       : "=a"(retval), "=b"(b), "=c"(c), "=d"(d) \
                       : "a"(leaf), "b"(b), "c"(c), "d"(d) \
                       : "cc")
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_encls_u32(const unsigned int __L, size_t __D[]) {
enum __encls_type {
__SGX_ECREATE = 0x00,
__SGX_EADD = 0x01,
__SGX_EINIT = 0x02,
__SGX_EREMOVE = 0x03,
__SGX_EDBGRD = 0x04,
__SGX_EDBGWR = 0x05,
__SGX_EEXTEND = 0x06,
__SGX_ELDB = 0x07,
__SGX_ELDU = 0x08,
__SGX_EBLOCK = 0x09,
__SGX_EPA = 0x0A,
__SGX_EWB = 0x0B,
__SGX_ETRACK = 0x0C,
__SGX_EAUG = 0x0D,
__SGX_EMODPR = 0x0E,
__SGX_EMODT = 0x0F,
__SGX_ERDINFO = 0x10,
__SGX_ETRACKC = 0x11,
__SGX_ELDBC = 0x12,
__SGX_ELDUC = 0x13
};
enum __encls_type __T = (enum __encls_type)__L;
unsigned int __R = 0;
if (!__builtin_constant_p(__T))
__encls_generic(__L, __D[0], __D[1], __D[2], __R);
else
switch (__T) {
case __SGX_ECREATE:
case __SGX_EADD:
case __SGX_EDBGWR:
case __SGX_EEXTEND:
case __SGX_EPA:
case __SGX_EMODPR:
case __SGX_EMODT:
case __SGX_EAUG:
case __SGX_ERDINFO:
__encls_bc(__L, __D[0], __D[1], __R);
break;
case __SGX_EINIT:
case __SGX_ELDB:
case __SGX_ELDU:
case __SGX_EWB:
case __SGX_ELDBC:
case __SGX_ELDUC:
__encls_bcd(__L, __D[0], __D[1], __D[2], __R);
break;
case __SGX_EREMOVE:
case __SGX_EBLOCK:
case __SGX_ETRACK:
case __SGX_ETRACKC:
__encls_c(__L, __D[1], __R);
break;
case __SGX_EDBGRD:
__encls_edbgrd(__L, __D[0], __D[1], __R);
break;
default:
__encls_generic(__L, __D[0], __D[1], __D[2], __R);
}
return __R;
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_enclu_u32(const unsigned int __L, size_t __D[]) {
enum __enclu_type {
__SGX_EREPORT = 0x00,
__SGX_EGETKEY = 0x01,
__SGX_EENTER = 0x02,
__SGX_ERESUME = 0x03,
__SGX_EEXIT = 0x04,
__SGX_EACCEPT = 0x05,
__SGX_EMODPE = 0x06,
__SGX_EACCEPTCOPY = 0x07
};
enum __enclu_type __T = (enum __enclu_type)__L;
unsigned int __R = 0;
if (!__builtin_constant_p(__T))
__enclu_generic(__L, __D[0], __D[1], __D[2], __R);
else
switch (__T) {
case __SGX_EREPORT:
case __SGX_EACCEPTCOPY:
__enclu_bcd(__L, __D[0], __D[1], __D[2], __R);
break;
case __SGX_EGETKEY:
case __SGX_ERESUME:
case __SGX_EACCEPT:
case __SGX_EMODPE:
__enclu_bc(__L, __D[0], __D[1], __R);
break;
case __SGX_EENTER:
__enclu_eenter(__L, __D[0], __D[1], __R);
break;
case __SGX_EEXIT:
__enclu_eexit(__L, __D[0], __D[1], __R);
break;
default:
__enclu_generic(__L, __D[0], __D[1], __D[2], __R);
}
return __R;
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_enclv_u32(const unsigned int __L, size_t __D[]) {
enum __enclv_type {
__SGX_EDECVIRTCHILD = 0x00,
__SGX_EINCVIRTCHILD = 0x01,
__SGX_ESETCONTEXT = 0x02
};
unsigned int __R = 0;
if (!__builtin_constant_p(__L))
__enclv_generic(__L, __D[0], __D[1], __D[2], __R);
else
switch (__L) {
case __SGX_EDECVIRTCHILD:
case __SGX_EINCVIRTCHILD:
__enclv_bc(__L, __D[0], __D[1], __R);
break;
case __SGX_ESETCONTEXT:
__enclv_cd(__L, __D[1], __D[2], __R);
break;
default:
__enclv_generic(__L, __D[0], __D[1], __D[2], __R);
}
return __R;
}
#ifdef __DISABLE_SGX__
#undef __DISABLE_SGX__
#pragma GCC pop_options
#endif /* __DISABLE_SGX__ */
#endif /* _SGXINTRIN_H_INCLUDED */

68
third_party/intel/shaintrin.internal.h vendored Normal file
View file

@ -0,0 +1,68 @@
#ifndef _IMMINTRIN_H_INCLUDED
#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef _SHAINTRIN_H_INCLUDED
#define _SHAINTRIN_H_INCLUDED
#ifndef __SHA__
#pragma GCC push_options
#pragma GCC target("sha")
#define __DISABLE_SHA__
#endif /* __SHA__ */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1msg1_epu32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_sha1msg1((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1msg2_epu32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_sha1msg2((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1nexte_epu32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_sha1nexte((__v4si)__A, (__v4si)__B);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha1rnds4_epu32(__m128i __A, __m128i __B, const int __I) {
return (__m128i)__builtin_ia32_sha1rnds4((__v4si)__A, (__v4si)__B, __I);
}
#else
#define _mm_sha1rnds4_epu32(A, B, I) \
((__m128i)__builtin_ia32_sha1rnds4((__v4si)(__m128i)A, (__v4si)(__m128i)B, \
(int)I))
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha256msg1_epu32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_sha256msg1((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha256msg2_epu32(__m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_sha256msg2((__v4si)__A, (__v4si)__B);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sha256rnds2_epu32(__m128i __A, __m128i __B, __m128i __C) {
return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__A, (__v4si)__B,
(__v4si)__C);
}
#ifdef __DISABLE_SHA__
#undef __DISABLE_SHA__
#pragma GCC pop_options
#endif /* __DISABLE_SHA__ */
#endif /* _SHAINTRIN_H_INCLUDED */

705
third_party/intel/smmintrin.internal.h vendored Normal file
View file

@ -0,0 +1,705 @@
#ifndef _SMMINTRIN_H_INCLUDED
#define _SMMINTRIN_H_INCLUDED
#include "third_party/intel/tmmintrin.internal.h"
#ifndef __SSE4_1__
#pragma GCC push_options
#pragma GCC target("sse4.1")
#define __DISABLE_SSE4_1__
#endif /* __SSE4_1__ */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
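/* Illustrative example (not part of the vendored header): combining the
   controls above, _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)
   rounds each lane toward -inf without raising precision exceptions;
   _mm_floor_ps(x) below is the same direction but with
   _MM_FROUND_RAISE_EXC set. */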
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testz_si128(__m128i __M, __m128i __V) {
return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testc_si128(__m128i __M, __m128i __V) {
return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_testnzc_si128(__m128i __M, __m128i __V) {
return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
}
#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
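/* Illustrative example: the PTEST wrappers make branchless predicates,
   e.g. _mm_test_all_ones(v) computes CF of PTEST(v, v == v), which is 1
   iff every bit of v is set:
     if (_mm_test_all_ones(mask)) { ... fast path ... }  */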
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_pd(__m128d __V, const int __M) {
return (__m128d)__builtin_ia32_roundpd((__v2df)__V, __M);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_sd(__m128d __D, __m128d __V, const int __M) {
return (__m128d)__builtin_ia32_roundsd((__v2df)__D, (__v2df)__V, __M);
}
#else
#define _mm_round_pd(V, M) \
((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(V), (int)(M)))
#define _mm_round_sd(D, V, M) \
((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(D), (__v2df)(__m128d)(V), \
(int)(M)))
#endif
#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ps(__m128 __V, const int __M) {
return (__m128)__builtin_ia32_roundps((__v4sf)__V, __M);
}
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_round_ss(__m128 __D, __m128 __V, const int __M) {
return (__m128)__builtin_ia32_roundss((__v4sf)__D, (__v4sf)__V, __M);
}
#else
#define _mm_round_ps(V, M) \
((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(V), (int)(M)))
#define _mm_round_ss(D, V, M) \
((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(D), (__v4sf)(__m128)(V), \
(int)(M)))
#endif
#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)
#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)
#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi16(__m128i __X, __m128i __Y, const int __M) {
return (__m128i)__builtin_ia32_pblendw128((__v8hi)__X, (__v8hi)__Y, __M);
}
#else
#define _mm_blend_epi16(X, Y, M) \
((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(X), \
(__v8hi)(__m128i)(Y), (int)(M)))
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_epi8(__m128i __X, __m128i __Y, __m128i __M) {
return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__X, (__v16qi)__Y,
(__v16qi)__M);
}
#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_ps(__m128 __X, __m128 __Y, const int __M) {
return (__m128)__builtin_ia32_blendps((__v4sf)__X, (__v4sf)__Y, __M);
}
#else
#define _mm_blend_ps(X, Y, M) \
((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(int)(M)))
#endif
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_ps(__m128 __X, __m128 __Y, __m128 __M) {
return (__m128)__builtin_ia32_blendvps((__v4sf)__X, (__v4sf)__Y, (__v4sf)__M);
}
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_pd(__m128d __X, __m128d __Y, const int __M) {
return (__m128d)__builtin_ia32_blendpd((__v2df)__X, (__v2df)__Y, __M);
}
#else
#define _mm_blend_pd(X, Y, M) \
((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
(int)(M)))
#endif
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_blendv_pd(__m128d __X, __m128d __Y, __m128d __M) {
return (__m128d)__builtin_ia32_blendvpd((__v2df)__X, (__v2df)__Y,
(__v2df)__M);
}
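/* Illustrative example: _mm_blendv_ps picks each lane from __Y when the
   mask lane's sign bit is set, else from __X, so a branch-free per-lane
   minimum is
     __m128 m = _mm_cmplt_ps(a, b);
     __m128 lo = _mm_blendv_ps(b, a, m);
   while _mm_blend_ps/_mm_blend_pd bake the lane choice into an
   immediate. */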
#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_ps(__m128 __X, __m128 __Y, const int __M) {
return (__m128)__builtin_ia32_dpps((__v4sf)__X, (__v4sf)__Y, __M);
}
extern __inline __m128d
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_dp_pd(__m128d __X, __m128d __Y, const int __M) {
return (__m128d)__builtin_ia32_dppd((__v2df)__X, (__v2df)__Y, __M);
}
#else
#define _mm_dp_ps(X, Y, M) \
((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
(int)(M)))
#define _mm_dp_pd(X, Y, M) \
((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
(int)(M)))
#endif
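/* Illustrative example: the DPPS immediate selects multiplicands in its
   high nibble and result lanes in its low nibble, so a full 4-float dot
   product broadcast only into lane 0 is
     float dot = _mm_cvtss_f32(_mm_dp_ps(x, y, 0xF1));  */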
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
return (__m128i)((__v2di)__X == (__v2di)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi8(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pminsb128((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi8(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pmaxsb128((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pminuw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pmaxuw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pminsd128((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pmaxsd128((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_min_epu32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pminud128((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_max_epu32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pmaxud128((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32(__m128i __X, __m128i __Y) {
return (__m128i)((__v4su)__X * (__v4su)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pmuldq128((__v4si)__X, (__v4si)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m128
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_ps(__m128 __D, __m128 __S, const int __N) {
return (__m128)__builtin_ia32_insertps128((__v4sf)__D, (__v4sf)__S, __N);
}
#else
#define _mm_insert_ps(D, S, N) \
((__m128)__builtin_ia32_insertps128((__v4sf)(__m128)(D), \
(__v4sf)(__m128)(S), (int)(N)))
#endif
#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
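/* Illustrative example: _MM_MK_INSERTPS_NDX packs the INSERTPS immediate
   as source lane (bits 7:6), destination lane (bits 5:4), and zero mask
   (bits 3:0), so copying lane 2 of s into lane 0 of d is
     _mm_insert_ps(d, s, _MM_MK_INSERTPS_NDX(2, 0, 0x0));  */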
#ifdef __OPTIMIZE__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_ps(__m128 __X, const int __N) {
union {
int i;
float f;
} __tmp;
__tmp.f = __builtin_ia32_vec_ext_v4sf((__v4sf)__X, __N);
return __tmp.i;
}
#else
#define _mm_extract_ps(X, N) \
(__extension__({ \
union { \
int i; \
float f; \
} __tmp; \
__tmp.f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
__tmp.i; \
}))
#endif
#define _MM_EXTRACT_FLOAT(D, S, N) \
{ (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(S), (N)); }
#define _MM_PICK_OUT_PS(X, N) \
_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi8(__m128i __D, int __S, const int __N) {
return (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)__D, __S, __N);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi32(__m128i __D, int __S, const int __N) {
return (__m128i)__builtin_ia32_vec_set_v4si((__v4si)__D, __S, __N);
}
#ifdef __x86_64__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_insert_epi64(__m128i __D, long long __S, const int __N) {
return (__m128i)__builtin_ia32_vec_set_v2di((__v2di)__D, __S, __N);
}
#endif
#else
#define _mm_insert_epi8(D, S, N) \
((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(D), (int)(S), \
(int)(N)))
#define _mm_insert_epi32(D, S, N) \
((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(D), (int)(S), \
(int)(N)))
#ifdef __x86_64__
#define _mm_insert_epi64(D, S, N) \
((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(D), (long long)(S), \
(int)(N)))
#endif
#endif
#ifdef __OPTIMIZE__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi8(__m128i __X, const int __N) {
return (unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)__X, __N);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi32(__m128i __X, const int __N) {
return __builtin_ia32_vec_ext_v4si((__v4si)__X, __N);
}
#ifdef __x86_64__
extern __inline long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_extract_epi64(__m128i __X, const int __N) {
return __builtin_ia32_vec_ext_v2di((__v2di)__X, __N);
}
#endif
#else
#define _mm_extract_epi8(X, N) \
((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
(int)(N)))
#define _mm_extract_epi32(X, N) \
((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
#ifdef __x86_64__
#define _mm_extract_epi64(X, N) \
((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
#endif
#endif
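/* Illustrative example: the insert/extract accessors round-trip, e.g.
     __m128i v2 = _mm_insert_epi32(v, 42, 3);
     int x = _mm_extract_epi32(v2, 3);            (x == 42)
   note that _mm_extract_epi8 zero-extends the selected byte. */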
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_minpos_epu16(__m128i __X) {
return (__m128i)__builtin_ia32_phminposuw128((__v8hi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi32(__m128i __X) {
return (__m128i)__builtin_ia32_pmovsxbd128((__v16qi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi32(__m128i __X) {
return (__m128i)__builtin_ia32_pmovsxwd128((__v8hi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi64(__m128i __X) {
return (__m128i)__builtin_ia32_pmovsxbq128((__v16qi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi32_epi64(__m128i __X) {
return (__m128i)__builtin_ia32_pmovsxdq128((__v4si)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi16_epi64(__m128i __X) {
return (__m128i)__builtin_ia32_pmovsxwq128((__v8hi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepi8_epi16(__m128i __X) {
return (__m128i)__builtin_ia32_pmovsxbw128((__v16qi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi32(__m128i __X) {
return (__m128i)__builtin_ia32_pmovzxbd128((__v16qi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi32(__m128i __X) {
return (__m128i)__builtin_ia32_pmovzxwd128((__v8hi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi64(__m128i __X) {
return (__m128i)__builtin_ia32_pmovzxbq128((__v16qi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu32_epi64(__m128i __X) {
return (__m128i)__builtin_ia32_pmovzxdq128((__v4si)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu16_epi64(__m128i __X) {
return (__m128i)__builtin_ia32_pmovzxwq128((__v8hi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtepu8_epi16(__m128i __X) {
return (__m128i)__builtin_ia32_pmovzxbw128((__v16qi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packus_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_packusdw128((__v4si)__X, (__v4si)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mpsadbw_epu8(__m128i __X, __m128i __Y, const int __M) {
return (__m128i)__builtin_ia32_mpsadbw128((__v16qi)__X, (__v16qi)__Y, __M);
}
#else
#define _mm_mpsadbw_epu8(X, Y, M) \
((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_stream_load_si128(__m128i *__X) {
return (__m128i)__builtin_ia32_movntdqa((__v2di *)__X);
}
#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */
#define _SIDD_UBYTE_OPS 0x00
#define _SIDD_UWORD_OPS 0x01
#define _SIDD_SBYTE_OPS 0x02
#define _SIDD_SWORD_OPS 0x03
#define _SIDD_CMP_EQUAL_ANY 0x00
#define _SIDD_CMP_RANGES 0x04
#define _SIDD_CMP_EQUAL_EACH 0x08
#define _SIDD_CMP_EQUAL_ORDERED 0x0c
#define _SIDD_POSITIVE_POLARITY 0x00
#define _SIDD_NEGATIVE_POLARITY 0x10
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
#define _SIDD_LEAST_SIGNIFICANT 0x00
#define _SIDD_MOST_SIGNIFICANT 0x40
#define _SIDD_BIT_MASK 0x00
#define _SIDD_UNIT_MASK 0x40
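/* Illustrative example (hedged): a memchr-style scan with the controls
   above, assuming 16 readable bytes at hay:
     const int ctl = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                     _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT;
     int i = _mm_cmpistri(needles, hay, ctl);     (i == 16 means no match)
   where needles holds the NUL-terminated byte set being searched for. */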
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrm(__m128i __X, __m128i __Y, const int __M) {
return (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)__X, (__v16qi)__Y, __M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistri(__m128i __X, __m128i __Y, const int __M) {
return __builtin_ia32_pcmpistri128((__v16qi)__X, (__v16qi)__Y, __M);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrm(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
return (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)__X, __LX, (__v16qi)__Y,
__LY, __M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestri(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
return __builtin_ia32_pcmpestri128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
__M);
}
#else
#define _mm_cmpistrm(X, Y, M) \
((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistri(X, Y, M) \
((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpestrm(X, LX, Y, LY, M) \
((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(X), (int)(LX), \
(__v16qi)(__m128i)(Y), (int)(LY), \
(int)(M)))
#define _mm_cmpestri(X, LX, Y, LY, M) \
((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(X), (int)(LX), \
(__v16qi)(__m128i)(Y), (int)(LY), \
(int)(M)))
#endif
#ifdef __OPTIMIZE__
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistra(__m128i __X, __m128i __Y, const int __M) {
return __builtin_ia32_pcmpistria128((__v16qi)__X, (__v16qi)__Y, __M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrc(__m128i __X, __m128i __Y, const int __M) {
return __builtin_ia32_pcmpistric128((__v16qi)__X, (__v16qi)__Y, __M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistro(__m128i __X, __m128i __Y, const int __M) {
return __builtin_ia32_pcmpistrio128((__v16qi)__X, (__v16qi)__Y, __M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrs(__m128i __X, __m128i __Y, const int __M) {
return __builtin_ia32_pcmpistris128((__v16qi)__X, (__v16qi)__Y, __M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpistrz(__m128i __X, __m128i __Y, const int __M) {
return __builtin_ia32_pcmpistriz128((__v16qi)__X, (__v16qi)__Y, __M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestra(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
return __builtin_ia32_pcmpestria128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
__M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrc(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
return __builtin_ia32_pcmpestric128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
__M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestro(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
return __builtin_ia32_pcmpestrio128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
__M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrs(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
return __builtin_ia32_pcmpestris128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
__M);
}
extern __inline int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpestrz(__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) {
return __builtin_ia32_pcmpestriz128((__v16qi)__X, __LX, (__v16qi)__Y, __LY,
__M);
}
#else
#define _mm_cmpistra(X, Y, M) \
((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrc(X, Y, M) \
((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistro(X, Y, M) \
((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrs(X, Y, M) \
((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpistrz(X, Y, M) \
((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(X), \
(__v16qi)(__m128i)(Y), (int)(M)))
#define _mm_cmpestra(X, LX, Y, LY, M) \
((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(X), (int)(LX), \
(__v16qi)(__m128i)(Y), (int)(LY), \
(int)(M)))
#define _mm_cmpestrc(X, LX, Y, LY, M) \
((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(X), (int)(LX), \
(__v16qi)(__m128i)(Y), (int)(LY), \
(int)(M)))
#define _mm_cmpestro(X, LX, Y, LY, M) \
((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(X), (int)(LX), \
(__v16qi)(__m128i)(Y), (int)(LY), \
(int)(M)))
#define _mm_cmpestrs(X, LX, Y, LY, M) \
((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(X), (int)(LX), \
(__v16qi)(__m128i)(Y), (int)(LY), \
(int)(M)))
#define _mm_cmpestrz(X, LX, Y, LY, M) \
((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(X), (int)(LX), \
(__v16qi)(__m128i)(Y), (int)(LY), \
(int)(M)))
#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
return (__m128i)((__v2di)__X > (__v2di)__Y);
}
#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */
#ifdef __DISABLE_SSE4_1__
#undef __DISABLE_SSE4_1__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_1__ */
#include "third_party/intel/popcntintrin.internal.h"
#ifndef __SSE4_1__
#pragma GCC push_options
#pragma GCC target("sse4.1")
#define __DISABLE_SSE4_1__
#endif /* __SSE4_1__ */
#ifndef __SSE4_2__
#pragma GCC push_options
#pragma GCC target("sse4.2")
#define __DISABLE_SSE4_2__
#endif /* __SSE4_2__ */
/* Accumulate CRC32 (polynomial 0x11EDC6F41) value. */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u8(unsigned int __C, unsigned char __V) {
return __builtin_ia32_crc32qi(__C, __V);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u16(unsigned int __C, unsigned short __V) {
return __builtin_ia32_crc32hi(__C, __V);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u32(unsigned int __C, unsigned int __V) {
return __builtin_ia32_crc32si(__C, __V);
}
#ifdef __x86_64__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_crc32_u64(unsigned long long __C, unsigned long long __V) {
return __builtin_ia32_crc32di(__C, __V);
}
#endif
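/* Illustrative example (hedged): CRC32C of a buffer, eight bytes per
   step on x86_64; note this is the Castagnoli polynomial, not zlib's:
     unsigned long long c = 0xffffffff;
     for (; n >= 8; p += 8, n -= 8)
       c = _mm_crc32_u64(c, *(const unsigned long long *)p);
     while (n--) c = _mm_crc32_u8(c, *p++);
   with the result conventionally finalized as ~(unsigned)c. */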
#ifdef __DISABLE_SSE4_2__
#undef __DISABLE_SSE4_2__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_2__ */
#ifdef __DISABLE_SSE4_1__
#undef __DISABLE_SSE4_1__
#pragma GCC pop_options
#endif /* __DISABLE_SSE4_1__ */
#endif /* _SMMINTRIN_H_INCLUDED */

third_party/intel/tbmintrin.internal.h vendored Normal file

@@ -0,0 +1,154 @@
#ifndef _X86INTRIN_H_INCLUDED
#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
#endif
#ifndef _TBMINTRIN_H_INCLUDED
#define _TBMINTRIN_H_INCLUDED
#ifndef __TBM__
#pragma GCC push_options
#pragma GCC target("tbm")
#define __DISABLE_TBM__
#endif /* __TBM__ */
#ifdef __OPTIMIZE__
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextri_u32(unsigned int __X, const unsigned int __I) {
return __builtin_ia32_bextri_u32(__X, __I);
}
#else
#define __bextri_u32(X, I) \
((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(X), \
(unsigned int)(I)))
#endif /* __OPTIMIZE__ */
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcfill_u32(unsigned int __X) {
return __X & (__X + 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blci_u32(unsigned int __X) {
return __X | ~(__X + 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcic_u32(unsigned int __X) {
return ~__X & (__X + 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcmsk_u32(unsigned int __X) {
return __X ^ (__X + 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcs_u32(unsigned int __X) {
return __X | (__X + 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsfill_u32(unsigned int __X) {
return __X | (__X - 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsic_u32(unsigned int __X) {
return ~__X | (__X - 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__t1mskc_u32(unsigned int __X) {
return ~__X | (__X + 1);
}
extern __inline unsigned int
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzmsk_u32(unsigned int __X) {
return ~__X & (__X - 1);
}
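/* Worked examples (illustrative): with __X = 0b1000,
     __blsfill_u32(__X) = 0b1000 | 0b0111 = 0b1111  (fill below lowest set bit)
     __tzmsk_u32(__X)   = ~0b1000 & 0b0111 = 0b0111 (mask of trailing zeros)
   and with __X = 0b0111,
     __blcmsk_u32(__X)  = 0b0111 ^ 0b1000 = 0b1111  (mask through lowest 0). */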
#ifdef __x86_64__
#ifdef __OPTIMIZE__
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__bextri_u64(unsigned long long __X, const unsigned int __I) {
return __builtin_ia32_bextri_u64(__X, __I);
}
#else
#define __bextri_u64(X, I) \
((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(X), \
(unsigned long long)(I)))
#endif /* __OPTIMIZE__ */
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcfill_u64(unsigned long long __X) {
return __X & (__X + 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blci_u64(unsigned long long __X) {
return __X | ~(__X + 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcic_u64(unsigned long long __X) {
return ~__X & (__X + 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcmsk_u64(unsigned long long __X) {
return __X ^ (__X + 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blcs_u64(unsigned long long __X) {
return __X | (__X + 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsfill_u64(unsigned long long __X) {
return __X | (__X - 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__blsic_u64(unsigned long long __X) {
return ~__X | (__X - 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__t1mskc_u64(unsigned long long __X) {
return ~__X | (__X + 1);
}
extern __inline unsigned long long
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
__tzmsk_u64(unsigned long long __X) {
return ~__X & (__X - 1);
}
#endif /* __x86_64__ */
#ifdef __DISABLE_TBM__
#undef __DISABLE_TBM__
#pragma GCC pop_options
#endif /* __DISABLE_TBM__ */
#endif /* _TBMINTRIN_H_INCLUDED */

third_party/intel/tmmintrin.internal.h vendored Normal file

@@ -0,0 +1,217 @@
#ifndef _TMMINTRIN_H_INCLUDED
#define _TMMINTRIN_H_INCLUDED
#include "third_party/intel/pmmintrin.internal.h"
#ifndef __SSSE3__
#pragma GCC push_options
#pragma GCC target("ssse3")
#define __DISABLE_SSSE3__
#endif /* __SSSE3__ */
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_phaddw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_phaddd128((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_phaddw((__v4hi)__X, (__v4hi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_phaddd((__v2si)__X, (__v2si)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_phaddsw((__v4hi)__X, (__v4hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_phsubw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_phsubd128((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_phsubw((__v4hi)__X, (__v4hi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_phsubd((__v2si)__X, (__v2si)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_phsubsw((__v4hi)__X, (__v4hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__X, (__v8qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__X, (__v4hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_pshufb((__v8qi)__X, (__v8qi)__Y);
}
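/* Illustrative example: PSHUFB is a byte table lookup, where each control
   byte selects a source lane and a set sign bit zeroes the output byte:
     __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                 7, 6, 5, 4, 3, 2, 1, 0);
     __m128i y = _mm_shuffle_epi8(x, rev);        (byte-reversed x)  */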
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psignb128((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psignw128((__v8hi)__X, (__v8hi)__Y);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32(__m128i __X, __m128i __Y) {
return (__m128i)__builtin_ia32_psignd128((__v4si)__X, (__v4si)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_psignb((__v8qi)__X, (__v8qi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_psignw((__v4hi)__X, (__v4hi)__Y);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32(__m64 __X, __m64 __Y) {
return (__m64)__builtin_ia32_psignd((__v2si)__X, (__v2si)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) {
return (__m128i)__builtin_ia32_palignr128((__v2di)__X, (__v2di)__Y, __N * 8);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) {
return (__m64)__builtin_ia32_palignr((__v1di)__X, (__v1di)__Y, __N * 8);
}
#else
#define _mm_alignr_epi8(X, Y, N) \
((__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(X), \
(__v2di)(__m128i)(Y), (int)(N)*8))
#define _mm_alignr_pi8(X, Y, N) \
((__m64)__builtin_ia32_palignr((__v1di)(__m64)(X), (__v1di)(__m64)(Y), \
(int)(N)*8))
#endif
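/* Illustrative example: PALIGNR concatenates __X:__Y (__X high) and
   shifts the combined value right by __N bytes, so stitching two
   adjacent 16-byte loads at a 4-byte offset is
     __m128i w = _mm_alignr_epi8(hi, lo, 4);   (bytes lo[4..15], hi[0..3]) */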
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8(__m128i __X) {
return (__m128i)__builtin_ia32_pabsb128((__v16qi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16(__m128i __X) {
return (__m128i)__builtin_ia32_pabsw128((__v8hi)__X);
}
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32(__m128i __X) {
return (__m128i)__builtin_ia32_pabsd128((__v4si)__X);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8(__m64 __X) {
return (__m64)__builtin_ia32_pabsb((__v8qi)__X);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16(__m64 __X) {
return (__m64)__builtin_ia32_pabsw((__v4hi)__X);
}
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32(__m64 __X) {
return (__m64)__builtin_ia32_pabsd((__v2si)__X);
}
#ifdef __DISABLE_SSSE3__
#undef __DISABLE_SSSE3__
#pragma GCC pop_options
#endif /* __DISABLE_SSSE3__ */
#endif /* _TMMINTRIN_H_INCLUDED */

Some files were not shown because too many files have changed in this diff.