From 1006c47993d35db3480d23819bdf216e9741aeff Mon Sep 17 00:00:00 2001
From: wangguokun
Date: Mon, 8 Jul 2024 02:22:49 +0000
Subject: [PATCH] lib-arm-intrinsics: port lib-arm-intrinsics to tenonos

Signed-off-by: wangguokun
---
 README.md              |     1 -
 include/gcc/arm_bf16.h |    52 +
 include/gcc/arm_fp16.h |     2 +-
 include/gcc/arm_neon.h | 24561 ++++++++++++++++++++++++++-----------------
 4 files changed, 13788 insertions(+), 10828 deletions(-)
 create mode 100644 include/gcc/arm_bf16.h

diff --git a/README.md b/README.md
index ed6ae7c..4bdbc53 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 libarm\_intriniscs for Unikraft
 ===================
-
 Please refer to the `README.md` as well as the documentation in the `doc/`
 subdirectory of the main unikraft repository.

diff --git a/include/gcc/arm_bf16.h b/include/gcc/arm_bf16.h
new file mode 100644
index 0000000..59ed67f
--- /dev/null
+++ b/include/gcc/arm_bf16.h
@@ -0,0 +1,52 @@
+/* Arm BF16 intrinsics include file.
+
+   Copyright (C) 2019-2021 Free Software Foundation, Inc.
+   Contributed by Arm.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _AARCH64_BF16_H_
+#define _AARCH64_BF16_H_
+
+typedef __bf16 bfloat16_t;
+typedef float float32_t;
+
+#pragma GCC push_options
+#pragma GCC target ("+nothing+bf16+nosimd")
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvth_bf16_f32 (float32_t __a)
+{
+  return __builtin_aarch64_bfcvtbf (__a);
+}
+
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtah_f32_bf16 (bfloat16_t __a)
+{
+  return __builtin_aarch64_bfcvtsf (__a);
+}
+
+#pragma GCC pop_options
+
+#endif

diff --git a/include/gcc/arm_fp16.h b/include/gcc/arm_fp16.h
index 55f9b8c..2afbd12 100644
--- a/include/gcc/arm_fp16.h
+++ b/include/gcc/arm_fp16.h
@@ -1,6 +1,6 @@
 /* ARM FP16 scalar intrinsics include file.
 
-   Copyright (C) 2016-2017 Free Software Foundation, Inc.
+   Copyright (C) 2016-2021 Free Software Foundation, Inc.
    Contributed by ARM Ltd.
 
    This file is part of GCC.

diff --git a/include/gcc/arm_neon.h b/include/gcc/arm_neon.h
index 96e740f..baa30bd 100644
--- a/include/gcc/arm_neon.h
+++ b/include/gcc/arm_neon.h
@@ -1,6 +1,6 @@
 /* ARM NEON intrinsics include file.
 
-   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Copyright (C) 2011-2021 Free Software Foundation, Inc.
    Contributed by ARM Ltd.
 
    This file is part of GCC.
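As a usage sketch (not part of the patch): the two scalar conversions added in
arm_bf16.h above can be exercised with a round trip like the following. The
input value, file name, and compiler flags are illustrative assumptions, not
taken from the patch.

    /* Round-trip a float32 through bfloat16 via the new intrinsics.
       Assumes a BF16-capable AArch64 toolchain, e.g.:
         gcc -march=armv8.6-a+bf16 -O2 -o bf16_roundtrip bf16_roundtrip.c  */
    #include <arm_bf16.h>
    #include <stdio.h>

    int
    main (void)
    {
      float32_t in = 1.5f;                  /* exactly representable in bfloat16 */
      bfloat16_t h = vcvth_bf16_f32 (in);   /* narrow: float32 -> bfloat16 */
      float32_t out = vcvtah_f32_bf16 (h);  /* widen: bfloat16 -> float32 */
      printf ("in=%f out=%f\n", (double) in, (double) out);
      return out == in ? 0 : 1;              /* exact for this input */
    }

Because arm_bf16.h is built with "+nothing+bf16+nosimd", only these scalar
conversions are available from it; the vector bfloat16 types and intrinsics
come from the arm_neon.h changes below.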
@@ -73,6 +73,39 @@ typedef __fp16 float16_t; typedef float float32_t; typedef double float64_t; +typedef __Bfloat16x4_t bfloat16x4_t; +typedef __Bfloat16x8_t bfloat16x8_t; + +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + typedef struct int8x8x2_t { int8x8_t val[2]; @@ -6055,6 +6088,20 @@ vreinterpretq_u32_p128 (poly128_t __a) return (uint32x4_t)__a; } +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_p128 (poly128_t __a) +{ + return (float64x2_t) __a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_f64 (float64x2_t __a) +{ + return (poly128_t) __a; +} + /* vset_lane */ __extension__ extern __inline float16x4_t @@ -6255,216 +6302,203 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index) return __aarch64_vset_lane_any (__elem, __vec, __index); } -#define __GET_LOW(__TYPE) \ - uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a); \ - uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0)); \ - return vreinterpret_##__TYPE##_u64 (lo); - __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f16 (float16x8_t __a) { - __GET_LOW (f16); + return __builtin_aarch64_get_lowv8hf (__a); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f32 (float32x4_t __a) { - __GET_LOW (f32); + return __builtin_aarch64_get_lowv4sf (__a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_f64 (float64x2_t __a) { - return (float64x1_t) {vgetq_lane_f64 (__a, 0)}; + return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p8 (poly8x16_t __a) { - __GET_LOW (p8); + return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p16 (poly16x8_t __a) { - __GET_LOW (p16); + return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a); } __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_p64 (poly64x2_t __a) { - __GET_LOW (p64); + return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s8 (int8x16_t __a) { - __GET_LOW (s8); + return __builtin_aarch64_get_lowv16qi (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s16 (int16x8_t __a) { - __GET_LOW (s16); + return __builtin_aarch64_get_lowv8hi (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s32 (int32x4_t __a) { - __GET_LOW (s32); + return __builtin_aarch64_get_lowv4si (__a); } __extension__ 
extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_s64 (int64x2_t __a) { - __GET_LOW (s64); + return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u8 (uint8x16_t __a) { - __GET_LOW (u8); + return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u16 (uint16x8_t __a) { - __GET_LOW (u16); + return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u32 (uint32x4_t __a) { - __GET_LOW (u32); + return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_low_u64 (uint64x2_t __a) { - return vcreate_u64 (vgetq_lane_u64 (__a, 0)); + return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)}; } -#undef __GET_LOW - -#define __GET_HIGH(__TYPE) \ - uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a); \ - uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1)); \ - return vreinterpret_##__TYPE##_u64 (hi); - __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f16 (float16x8_t __a) { - __GET_HIGH (f16); + return __builtin_aarch64_get_highv8hf (__a); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f32 (float32x4_t __a) { - __GET_HIGH (f32); + return __builtin_aarch64_get_highv4sf (__a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_f64 (float64x2_t __a) { - __GET_HIGH (f64); + return (float64x1_t) {__builtin_aarch64_get_highv2df (__a)}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p8 (poly8x16_t __a) { - __GET_HIGH (p8); + return (poly8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p16 (poly16x8_t __a) { - __GET_HIGH (p16); + return (poly16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a); } __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_p64 (poly64x2_t __a) { - __GET_HIGH (p64); + return (poly64x1_t) __builtin_aarch64_get_highv2di ((int64x2_t) __a); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s8 (int8x16_t __a) { - __GET_HIGH (s8); + return __builtin_aarch64_get_highv16qi (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s16 (int16x8_t __a) { - __GET_HIGH (s16); + return __builtin_aarch64_get_highv8hi (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s32 (int32x4_t __a) { - __GET_HIGH (s32); + return __builtin_aarch64_get_highv4si (__a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_s64 (int64x2_t __a) { - __GET_HIGH (s64); + return (int64x1_t) 
{__builtin_aarch64_get_highv2di (__a)}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u8 (uint8x16_t __a) { - __GET_HIGH (u8); + return (uint8x8_t) __builtin_aarch64_get_highv16qi ((int8x16_t) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u16 (uint16x8_t __a) { - __GET_HIGH (u16); + return (uint16x4_t) __builtin_aarch64_get_highv8hi ((int16x8_t) __a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u32 (uint32x4_t __a) { - __GET_HIGH (u32); + return (uint32x2_t) __builtin_aarch64_get_highv4si ((int32x4_t) __a); } -#undef __GET_HIGH - __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vget_high_u64 (uint64x2_t __a) { - return vcreate_u64 (vgetq_lane_u64 (__a, 1)); + return (uint64x1_t) {__builtin_aarch64_get_highv2di ((int64x2_t) __a)}; } + __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vcombine_s8 (int8x8_t __a, int8x8_t __b) @@ -6572,4374 +6606,2808 @@ vcombine_p64 (poly64x1_t __a, poly64x1_t __b) __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - int8x8_t result; - __asm__ ("saba %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabav8qi (__a, __b, __c); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x4_t result; - __asm__ ("saba %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabav4hi (__a, __b, __c); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - int32x2_t result; - __asm__ ("saba %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabav2si (__a, __b, __c); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint8x8_t result; - __asm__ ("uaba %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabav8qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint16x4_t result; - __asm__ ("uaba %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabav4hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaba_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - 
uint32x2_t result; - __asm__ ("uaba %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabav2si_uuuu (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +vabal_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) { - int16x8_t result; - __asm__ ("sabal2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabal2v16qi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +vabal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t result; - __asm__ ("sabal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabal2v8hi (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +vabal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t result; - __asm__ ("sabal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabal2v4si (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +vabal_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) { - uint16x8_t result; - __asm__ ("uabal2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabal2v16qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +vabal_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) { - uint32x4_t result; - __asm__ ("uabal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabal2v8hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +vabal_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) { - uint64x2_t result; - __asm__ ("uabal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabal2v4si_uuuu (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { - int16x8_t result; - __asm__ ("sabal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabalv8qi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { - int32x4_t result; - __asm__ ("sabal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No 
clobbers */); - return result; + return __builtin_aarch64_sabalv4hi (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - int64x2_t result; - __asm__ ("sabal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabalv2si (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint16x8_t result; - __asm__ ("uabal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabalv8qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint32x4_t result; - __asm__ ("uabal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabalv4hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint64x2_t result; - __asm__ ("uabal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabalv2si_uuuu (__a, __b, __c); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { - int8x16_t result; - __asm__ ("saba %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabav16qi (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int16x8_t result; - __asm__ ("saba %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabav8hi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int32x4_t result; - __asm__ ("saba %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabav4si (__a, __b, __c); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - uint8x16_t result; - __asm__ ("uaba %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabav16qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vabaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint16x8_t result; - __asm__ ("uaba %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabav8hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - uint32x4_t result; - __asm__ ("uaba %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabav4si_uuuu (__a, __b, __c); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_s8 (int8x8_t a, int8x8_t b) +vabd_s8 (int8x8_t __a, int8x8_t __b) { - int8x8_t result; - __asm__ ("sabd %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_s16 (int16x4_t a, int16x4_t b) +vabd_s16 (int16x4_t __a, int16x4_t __b) { - int16x4_t result; - __asm__ ("sabd %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_s32 (int32x2_t a, int32x2_t b) +vabd_s32 (int32x2_t __a, int32x2_t __b) { - int32x2_t result; - __asm__ ("sabd %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdv2si (__a, __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_u8 (uint8x8_t a, uint8x8_t b) +vabd_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t result; - __asm__ ("uabd %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdv8qi_uuu (__a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_u16 (uint16x4_t a, uint16x4_t b) +vabd_u16 (uint16x4_t __a, uint16x4_t __b) { - uint16x4_t result; - __asm__ ("uabd %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdv4hi_uuu (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_u32 (uint32x2_t a, uint32x2_t b) +vabd_u32 (uint32x2_t __a, uint32x2_t __b) { - uint32x2_t result; - __asm__ ("uabd %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdv2si_uuu (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_s8 (int8x16_t a, int8x16_t b) +vabdl_high_s8 (int8x16_t __a, int8x16_t __b) { - int16x8_t result; - __asm__ ("sabdl2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdl2v16qi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_s16 (int16x8_t a, int16x8_t b) 
+vabdl_high_s16 (int16x8_t __a, int16x8_t __b) { - int32x4_t result; - __asm__ ("sabdl2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdl2v8hi (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_s32 (int32x4_t a, int32x4_t b) +vabdl_high_s32 (int32x4_t __a, int32x4_t __b) { - int64x2_t result; - __asm__ ("sabdl2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdl2v4si (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_u8 (uint8x16_t a, uint8x16_t b) +vabdl_high_u8 (uint8x16_t __a, uint8x16_t __b) { - uint16x8_t result; - __asm__ ("uabdl2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdl2v16qi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_u16 (uint16x8_t a, uint16x8_t b) +vabdl_high_u16 (uint16x8_t __a, uint16x8_t __b) { - uint32x4_t result; - __asm__ ("uabdl2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdl2v8hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_high_u32 (uint32x4_t a, uint32x4_t b) +vabdl_high_u32 (uint32x4_t __a, uint32x4_t __b) { - uint64x2_t result; - __asm__ ("uabdl2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdl2v4si_uuu (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_s8 (int8x8_t a, int8x8_t b) +vabdl_s8 (int8x8_t __a, int8x8_t __b) { - int16x8_t result; - __asm__ ("sabdl %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdlv8qi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_s16 (int16x4_t a, int16x4_t b) +vabdl_s16 (int16x4_t __a, int16x4_t __b) { - int32x4_t result; - __asm__ ("sabdl %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdlv4hi (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_s32 (int32x2_t a, int32x2_t b) +vabdl_s32 (int32x2_t __a, int32x2_t __b) { - int64x2_t result; - __asm__ ("sabdl %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdlv2si (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_u8 (uint8x8_t a, uint8x8_t b) +vabdl_u8 (uint8x8_t __a, uint8x8_t __b) { - uint16x8_t result; - __asm__ ("uabdl %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdlv8qi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_u16 (uint16x4_t a, uint16x4_t b) +vabdl_u16 (uint16x4_t __a, uint16x4_t __b) { - uint32x4_t result; - __asm__ ("uabdl 
%0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdlv4hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdl_u32 (uint32x2_t a, uint32x2_t b) +vabdl_u32 (uint32x2_t __a, uint32x2_t __b) { - uint64x2_t result; - __asm__ ("uabdl %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdlv2si_uuu (__a, __b); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_s8 (int8x16_t a, int8x16_t b) +vabdq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t result; - __asm__ ("sabd %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_s16 (int16x8_t a, int16x8_t b) +vabdq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t result; - __asm__ ("sabd %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_s32 (int32x4_t a, int32x4_t b) +vabdq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t result; - __asm__ ("sabd %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sabdv4si (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_u8 (uint8x16_t a, uint8x16_t b) +vabdq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t result; - __asm__ ("uabd %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdv16qi_uuu (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_u16 (uint16x8_t a, uint16x8_t b) +vabdq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8_t result; - __asm__ ("uabd %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdv8hi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_u32 (uint32x4_t a, uint32x4_t b) +vabdq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t result; - __asm__ ("uabd %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uabdv4si_uuu (__a, __b); } __extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_s8 (int8x8_t a) +vaddlv_s8 (int8x8_t __a) { - int16_t result; - __asm__ ("saddlv %h0,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_saddlvv8qi (__a); } __extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_s16 (int16x4_t a) +vaddlv_s16 (int16x4_t __a) { - int32_t result; - __asm__ ("saddlv %s0,%1.4h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_saddlvv4hi (__a); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vaddlv_u8 (uint8x8_t a) +vaddlv_u8 (uint8x8_t __a) { - uint16_t result; - __asm__ ("uaddlv %h0,%1.8b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_uaddlvv8qi_uu (__a); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_u16 (uint16x4_t a) +vaddlv_u16 (uint16x4_t __a) { - uint32_t result; - __asm__ ("uaddlv %s0,%1.4h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_uaddlvv4hi_uu (__a); } __extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_s8 (int8x16_t a) +vaddlvq_s8 (int8x16_t __a) { - int16_t result; - __asm__ ("saddlv %h0,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_saddlvv16qi (__a); } __extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_s16 (int16x8_t a) +vaddlvq_s16 (int16x8_t __a) { - int32_t result; - __asm__ ("saddlv %s0,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_saddlvv8hi (__a); } __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_s32 (int32x4_t a) +vaddlvq_s32 (int32x4_t __a) { - int64_t result; - __asm__ ("saddlv %d0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_saddlvv4si (__a); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_u8 (uint8x16_t a) +vaddlvq_u8 (uint8x16_t __a) { - uint16_t result; - __asm__ ("uaddlv %h0,%1.16b" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_uaddlvv16qi_uu (__a); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_u16 (uint16x8_t a) +vaddlvq_u16 (uint16x8_t __a) { - uint32_t result; - __asm__ ("uaddlv %s0,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_uaddlvv8hi_uu (__a); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlvq_u32 (uint32x4_t a) +vaddlvq_u32 (uint32x4_t __a) { - uint64_t result; - __asm__ ("uaddlv %d0,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_uaddlvv4si_uu (__a); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtx_f32_f64 (float64x2_t a) +vcvtx_f32_f64 (float64x2_t __a) { - float32x2_t result; + float32x2_t __result; __asm__ ("fcvtxn %0.2s,%1.2d" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtx_high_f32_f64 (float32x2_t a, float64x2_t b) +vcvtx_high_f32_f64 (float32x2_t __a, float64x2_t __b) { - float32x4_t result; + float32x4_t __result; __asm__ ("fcvtxn2 %0.4s,%1.2d" - : "=w"(result) - : "w" (b), "0"(a) + : "=w"(__result) + : "w" (__b), "0"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtxd_f32_f64 (float64_t a) +vcvtxd_f32_f64 (float64_t __a) 
{ - float32_t result; + float32_t __result; __asm__ ("fcvtxn %s0,%d1" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) +vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - float32x2_t result; - float32x2_t t1; + float32x2_t __result; + float32x2_t __t1; __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fadd %0.2s, %0.2s, %1.2s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) { - int16x4_t result; - __asm__ ("mla %0.4h,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mla_nv4hi (__a, __b, __c); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) { - int32x2_t result; - __asm__ ("mla %0.2s,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mla_nv2si (__a, __b, __c); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) { - uint16x4_t result; - __asm__ ("mla %0.4h,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return (uint16x4_t) __builtin_aarch64_mla_nv4hi ((int16x4_t) __a, + (int16x4_t) __b, + (int16_t) __c); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) { - uint32x2_t result; - __asm__ ("mla %0.2s,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint32x2_t) __builtin_aarch64_mla_nv2si ((int32x2_t) __a, + (int32x2_t) __b, + (int32_t) __c); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - int8x8_t result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlav8qi (__a, __b, __c); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x4_t result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlav4hi (__a, __b, __c); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - 
int32x2_t result; - __asm__ ("mla %0.2s, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlav2si (__a, __b, __c); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint8x8_t result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint8x8_t) __builtin_aarch64_mlav8qi ((int8x8_t) __a, + (int8x8_t) __b, + (int8x8_t) __c); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint16x4_t result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint16x4_t) __builtin_aarch64_mlav4hi ((int16x4_t) __a, + (int16x4_t) __b, + (int16x4_t) __c); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmla_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint32x2_t result; - __asm__ ("mla %0.2s, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint32x2_t) __builtin_aarch64_mlav2si ((int32x2_t) __a, + (int32x2_t) __b, + (int32x2_t) __c); } -#define vmlal_high_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - 
uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_high_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +vmlal_high_lane_s16(int32x4_t __a, int16x8_t __b, int16x4_t __v, + const int __lane) { - int32x4_t result; - __asm__ ("smlal2 %0.4s,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hi_lanev8hi (__a, __b, __v, __lane); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +vmlal_high_lane_s32(int64x2_t __a, int32x4_t __b, int32x2_t __v, + const int __lane) { - int64x2_t result; - __asm__ ("smlal2 %0.2d,%2.4s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hi_lanev4si (__a, __b, __v, __lane); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +vmlal_high_lane_u16(uint32x4_t __a, uint16x8_t __b, uint16x4_t __v, + const int __lane) { - uint32x4_t result; - __asm__ ("umlal2 %0.4s,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hi_lanev8hi_uuuus (__a, __b, __v, __lane); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) -{ - uint64x2_t result; - __asm__ ("umlal2 %0.2d,%2.4s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +vmlal_high_lane_u32(uint64x2_t __a, uint32x4_t __b, uint32x2_t __v, + const int __lane) { - int16x8_t result; - __asm__ ("smlal2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hi_lanev4si_uuuus (__a, __b, __v, __lane); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +vmlal_high_laneq_s16(int32x4_t __a, int16x8_t __b, int16x8_t __v, + const int __lane) { - int32x4_t result; - __asm__ ("smlal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hi_laneqv8hi (__a, __b, __v, __lane); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) -{ - int64x2_t result; - __asm__ ("smlal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return 
result; -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +vmlal_high_laneq_s32(int64x2_t __a, int32x4_t __b, int32x4_t __v, + const int __lane) { - uint16x8_t result; - __asm__ ("umlal2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hi_laneqv4si (__a, __b, __v, __lane); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +vmlal_high_laneq_u16(uint32x4_t __a, uint16x8_t __b, uint16x8_t __v, + const int __lane) { - uint32x4_t result; - __asm__ ("umlal2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hi_laneqv8hi_uuuus (__a, __b, __v, __lane); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +vmlal_high_laneq_u32(uint64x2_t __a, uint32x4_t __b, uint32x4_t __v, + const int __lane) { - uint64x2_t result; - __asm__ ("umlal2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hi_laneqv4si_uuuus (__a, __b, __v, __lane); } -#define vmlal_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlal_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ 
- : /* No clobbers */); \ - result; \ - }) - -#define vmlal_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +vmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) { - int32x4_t result; - __asm__ ("smlal %0.4s,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hi_nv8hi (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +vmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) { - int64x2_t result; - __asm__ ("smlal %0.2d,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hi_nv4si (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +vmlal_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c) { - uint32x4_t result; - __asm__ ("umlal %0.4s,%2.4h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hi_nv8hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +vmlal_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c) { - uint64x2_t result; - __asm__ ("umlal %0.2d,%2.2s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hi_nv4si_uuuu (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +vmlal_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) { - int16x8_t result; - __asm__ ("smlal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hiv16qi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +vmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t result; - __asm__ ("smlal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hiv8hi (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +vmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t result; - __asm__ ("smlal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_hiv4si (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +vmlal_high_u8 (uint16x8_t __a, 
uint8x16_t __b, uint8x16_t __c) { - uint16x8_t result; - __asm__ ("umlal %0.8h,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hiv16qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +vmlal_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) { - uint32x4_t result; - __asm__ ("umlal %0.4s,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hiv8hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlal_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +vmlal_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) { - uint64x2_t result; - __asm__ ("umlal %0.2d,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_hiv4si_uuuu (__a, __b, __c); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +vmlal_lane_s16 (int32x4_t __acc, int16x4_t __a, int16x4_t __b, const int __c) { - float32x4_t result; - float32x4_t t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_smlal_lane_v4hi (__acc, __a, __b, __c); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) +vmlal_lane_s32 (int64x2_t __acc, int32x2_t __a, int32x2_t __b, const int __c) { - int16x8_t result; - __asm__ ("mla %0.8h,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_smlal_lane_v2si (__acc, __a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +vmlal_lane_u16 (uint32x4_t __acc, uint16x4_t __a, uint16x4_t __b, const int __c) { - int32x4_t result; - __asm__ ("mla %0.4s,%2.4s,%3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlal_lane_v4hi_uuuus (__acc, __a, __b, __c); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +vmlal_lane_u32 (uint64x2_t __acc, uint32x2_t __a, uint32x2_t __b, const int __c) { - uint16x8_t result; - __asm__ ("mla %0.8h,%2.8h,%3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlal_lane_v2si_uuuus (__acc, __a, __b, __c); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +vmlal_laneq_s16 (int32x4_t __acc, int16x4_t __a, int16x8_t __b, const int __c) { - uint32x4_t result; - __asm__ ("mla %0.4s,%2.4s,%3.s[0]" - : 
"=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_smlal_laneq_v4hi (__acc, __a, __b, __c); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +vmlal_laneq_s32 (int64x2_t __acc, int32x2_t __a, int32x4_t __b, const int __c) { - int8x16_t result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_smlal_laneq_v2si (__acc, __a, __b, __c); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vmlal_laneq_u16 (uint32x4_t __acc, uint16x4_t __a, uint16x8_t __b, const int __c) { - int16x8_t result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlal_laneq_v4hi_uuuus (__acc, __a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +vmlal_laneq_u32 (uint64x2_t __acc, uint32x2_t __a, uint32x4_t __b, const int __c) { - int32x4_t result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlal_laneq_v2si_uuuus (__acc, __a, __b, __c); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) { - uint8x16_t result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_nv4hi (__a, __b, __c); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) { - uint16x8_t result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlal_nv2si (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlaq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) -{ - uint32x4_t result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ extern __inline float32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_f32 (float32x2_t a, float32x2_t b, float32_t c) -{ - float32x2_t result; - float32x2_t t1; - __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_s16 (int16x4_t a, int16x4_t b, int16_t c) +vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) { - int16x4_t result; - __asm__ ("mls 
%0.4h, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_nv4hi_uuuu (__a, __b, __c); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_s32 (int32x2_t a, int32x2_t b, int32_t c) +vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) { - int32x2_t result; - __asm__ ("mls %0.2s, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlal_nv2si_uuuu (__a, __b, __c); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_u16 (uint16x4_t a, uint16x4_t b, uint16_t c) +vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { - uint16x4_t result; - __asm__ ("mls %0.4h, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlalv8qi (__a, __b, __c); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_n_u32 (uint32x2_t a, uint32x2_t b, uint32_t c) +vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { - uint32x2_t result; - __asm__ ("mls %0.2s, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlalv4hi (__a, __b, __c); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_s8 (int8x8_t a, int8x8_t b, int8x8_t c) +vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - int8x8_t result; - __asm__ ("mls %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlalv2si (__a, __b, __c); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_s16 (int16x4_t a, int16x4_t b, int16x4_t c) +vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { - int16x4_t result; - __asm__ ("mls %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlalv8qi_uuuu (__a, __b, __c); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_s32 (int32x2_t a, int32x2_t b, int32x2_t c) +vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { - int32x2_t result; - __asm__ ("mls %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlalv4hi_uuuu (__a, __b, __c); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_u8 (uint8x8_t a, uint8x8_t b, uint8x8_t c) +vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint8x8_t result; - __asm__ ("mls %0.8b,%2.8b,%3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlalv2si_uuuu (__a, __b, __c); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vmls_u16 (uint16x4_t a, uint16x4_t b, uint16x4_t c) +vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - uint16x4_t result; - __asm__ ("mls %0.4h,%2.4h,%3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + float32x4_t __result; + float32x4_t __t1; + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fadd %0.4s, %0.4s, %1.4s" + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmls_u32 (uint32x2_t a, uint32x2_t b, uint32x2_t c) +vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) { - uint32x2_t result; - __asm__ ("mls %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mla_nv8hi (__a, __b, __c); } -#define vmlsl_high_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x8_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x4_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x8_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_high_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x4_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-vmlsl_high_n_s16 (int32x4_t a, int16x8_t b, int16_t c) +vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) { - int32x4_t result; - __asm__ ("smlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mla_nv4si (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_n_s32 (int64x2_t a, int32x4_t b, int32_t c) +vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) { - int64x2_t result; - __asm__ ("smlsl2 %0.2d, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint16x8_t) __builtin_aarch64_mla_nv8hi ((int16x8_t) __a, + (int16x8_t) __b, + (int16_t) __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_n_u16 (uint32x4_t a, uint16x8_t b, uint16_t c) +vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) { - uint32x4_t result; - __asm__ ("umlsl2 %0.4s, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return (uint32x4_t) __builtin_aarch64_mla_nv4si ((int32x4_t) __a, + (int32x4_t) __b, + (int32_t) __c); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_n_u32 (uint64x2_t a, uint32x4_t b, uint32_t c) +vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { - uint64x2_t result; - __asm__ ("umlsl2 %0.2d, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlav16qi (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_s8 (int16x8_t a, int8x16_t b, int8x16_t c) +vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int16x8_t result; - __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlav8hi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_s16 (int32x4_t a, int16x8_t b, int16x8_t c) +vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int32x4_t result; - __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlav4si (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_s32 (int64x2_t a, int32x4_t b, int32x4_t c) +vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - int64x2_t result; - __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint8x16_t) __builtin_aarch64_mlav16qi ((int8x16_t) __a, + (int8x16_t) __b, + (int8x16_t) __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_u8 (uint16x8_t a, uint8x16_t b, uint8x16_t c) +vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint16x8_t result; - __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint16x8_t) 
__builtin_aarch64_mlav8hi ((int16x8_t) __a, + (int16x8_t) __b, + (int16x8_t) __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_u16 (uint32x4_t a, uint16x8_t b, uint16x8_t c) +vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - uint32x4_t result; - __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint32x4_t) __builtin_aarch64_mlav4si ((int32x4_t) __a, + (int32x4_t) __b, + (int32x4_t) __c); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_high_u32 (uint64x2_t a, uint32x4_t b, uint32x4_t c) +vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) { - uint64x2_t result; - __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) + float32x2_t __result; + float32x2_t __t1; + __asm__ ("fmul %1.2s, %3.2s, %4.s[0]; fsub %0.2s, %0.2s, %1.2s" + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } -#define vmlsl_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlsl %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmlsl_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlsl %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ 
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c) +{ + return __builtin_aarch64_mls_nv4hi (__a, __b, __c); +} -__extension__ extern __inline int32x4_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_s16 (int32x4_t a, int16x4_t b, int16_t c) +vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c) { - int32x4_t result; - __asm__ ("smlsl %0.4s, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mls_nv2si (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_s32 (int64x2_t a, int32x2_t b, int32_t c) +vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c) { - int64x2_t result; - __asm__ ("smlsl %0.2d, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint16x4_t) __builtin_aarch64_mls_nv4hi ((int16x4_t) __a, + (int16x4_t) __b, + (int16_t) __c); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_u16 (uint32x4_t a, uint16x4_t b, uint16_t c) +vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c) { - uint32x4_t result; - __asm__ ("umlsl %0.4s, %2.4h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return (uint32x2_t) __builtin_aarch64_mls_nv2si ((int32x2_t) __a, + (int32x2_t) __b, + (int32_t) __c); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_n_u32 (uint64x2_t a, uint32x2_t b, uint32_t c) +vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - uint64x2_t result; - __asm__ ("umlsl %0.2d, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlsv8qi (__a, __b, __c); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_s8 (int16x8_t a, int8x8_t b, int8x8_t c) +vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x8_t result; - __asm__ ("smlsl %0.8h, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlsv4hi (__a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_s16 (int32x4_t a, int16x4_t b, int16x4_t c) +vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - int32x4_t result; - __asm__ ("smlsl %0.4s, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlsv2si (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_s32 (int64x2_t a, int32x2_t b, int32x2_t c) +vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - int64x2_t result; - __asm__ ("smlsl %0.2d, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No 
clobbers */); - return result; + return (uint8x8_t) __builtin_aarch64_mlsv8qi ((int8x8_t) __a, + (int8x8_t) __b, + (int8x8_t) __c); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_u8 (uint16x8_t a, uint8x8_t b, uint8x8_t c) +vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint16x8_t result; - __asm__ ("umlsl %0.8h, %2.8b, %3.8b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint16x4_t) __builtin_aarch64_mlsv4hi ((int16x4_t) __a, + (int16x4_t) __b, + (int16x4_t) __c); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_u16 (uint32x4_t a, uint16x4_t b, uint16x4_t c) +vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint32x4_t result; - __asm__ ("umlsl %0.4s, %2.4h, %3.4h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return (uint32x2_t) __builtin_aarch64_mlsv2si ((int32x2_t) __a, + (int32x2_t) __b, + (int32x2_t) __c); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c) +vmlsl_high_lane_s16(int32x4_t __a, int16x8_t __b, int16x4_t __v, + const int __lane) { - uint64x2_t result; - __asm__ ("umlsl %0.2d, %2.2s, %3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hi_lanev8hi (__a, __b, __v, __lane); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) +vmlsl_high_lane_s32(int64x2_t __a, int32x4_t __b, int32x2_t __v, + const int __lane) { - float32x4_t result; - float32x4_t t1; - __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" - : "=w"(result), "=w"(t1) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hi_lanev4si (__a, __b, __v, __lane); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_s16 (int16x8_t a, int16x8_t b, int16_t c) +vmlsl_high_lane_u16(uint32x4_t __a, uint16x8_t __b, uint16x4_t __v, + const int __lane) { - int16x8_t result; - __asm__ ("mls %0.8h, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hi_lanev8hi_uuuus (__a, __b, __v, __lane); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_s32 (int32x4_t a, int32x4_t b, int32_t c) +vmlsl_high_lane_u32(uint64x2_t __a, uint32x4_t __b, uint32x2_t __v, + const int __lane) { - int32x4_t result; - __asm__ ("mls %0.4s, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hi_lanev4si_uuuus (__a, __b, __v, __lane); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_u16 (uint16x8_t a, uint16x8_t b, uint16_t c) +vmlsl_high_laneq_s16(int32x4_t __a, int16x8_t __b, int16x8_t __v, + 
const int __lane) { - uint16x8_t result; - __asm__ ("mls %0.8h, %2.8h, %3.h[0]" - : "=w"(result) - : "0"(a), "w"(b), "x"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hi_laneqv8hi (__a, __b, __v, __lane); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_n_u32 (uint32x4_t a, uint32x4_t b, uint32_t c) +vmlsl_high_laneq_s32(int64x2_t __a, int32x4_t __b, int32x4_t __v, + const int __lane) { - uint32x4_t result; - __asm__ ("mls %0.4s, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hi_laneqv4si (__a, __b, __v, __lane); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_s8 (int8x16_t a, int8x16_t b, int8x16_t c) +vmlsl_high_laneq_u16(uint32x4_t __a, uint16x8_t __b, uint16x8_t __v, + const int __lane) { - int8x16_t result; - __asm__ ("mls %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hi_laneqv8hi_uuuus (__a, __b, __v, __lane); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_s16 (int16x8_t a, int16x8_t b, int16x8_t c) +vmlsl_high_laneq_u32(uint64x2_t __a, uint32x4_t __b, uint32x4_t __v, + const int __lane) { - int16x8_t result; - __asm__ ("mls %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hi_laneqv4si_uuuus (__a, __b, __v, __lane); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_s32 (int32x4_t a, int32x4_t b, int32x4_t c) +vmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) { - int32x4_t result; - __asm__ ("mls %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hi_nv8hi (__a, __b, __c); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) +vmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) { - uint8x16_t result; - __asm__ ("mls %0.16b,%2.16b,%3.16b" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hi_nv4si (__a, __b, __c); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_u16 (uint16x8_t a, uint16x8_t b, uint16x8_t c) +vmlsl_high_n_u16 (uint32x4_t __a, uint16x8_t __b, uint16_t __c) { - uint16x8_t result; - __asm__ ("mls %0.8h,%2.8h,%3.8h" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hi_nv8hi_uuuu (__a, __b, __c); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmlsq_u32 (uint32x4_t a, uint32x4_t b, uint32x4_t c) +vmlsl_high_n_u32 (uint64x2_t __a, uint32x4_t __b, uint32_t __c) { - uint32x4_t result; - __asm__ ("mls %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - 
return result; + return __builtin_aarch64_umlsl_hi_nv4si_uuuu (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_s8 (int8x16_t a) +vmlsl_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) { - int16x8_t result; - __asm__ ("sshll2 %0.8h,%1.16b,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hiv16qi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_s16 (int16x8_t a) +vmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t result; - __asm__ ("sshll2 %0.4s,%1.8h,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hiv8hi (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_s32 (int32x4_t a) +vmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t result; - __asm__ ("sshll2 %0.2d,%1.4s,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_hiv4si (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_u8 (uint8x16_t a) +vmlsl_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) { - uint16x8_t result; - __asm__ ("ushll2 %0.8h,%1.16b,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hiv16qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_u16 (uint16x8_t a) +vmlsl_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) { - uint32x4_t result; - __asm__ ("ushll2 %0.4s,%1.8h,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hiv8hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_high_u32 (uint32x4_t a) +vmlsl_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) { - uint64x2_t result; - __asm__ ("ushll2 %0.2d,%1.4s,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlsl_hiv4si_uuuu (__a, __b, __c); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_s8 (int8x8_t a) +vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __v, const int __lane) { - int16x8_t result; - __asm__ ("sshll %0.8h,%1.8b,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_smlsl_lane_v4hi (__a, __b, __v, __lane); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_s16 (int16x4_t a) +vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __v, const int __lane) { - int32x4_t result; - __asm__ ("sshll %0.4s,%1.4h,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_smlsl_lane_v2si (__a, __b, __v, __lane); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_s32 (int32x2_t a) +vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, 
uint16x4_t __v, + const int __lane) { - int64x2_t result; - __asm__ ("sshll %0.2d,%1.2s,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlsl_lane_v4hi_uuuus (__a, __b, __v, __lane); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_u8 (uint8x8_t a) +vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __v, + const int __lane) { - uint16x8_t result; - __asm__ ("ushll %0.8h,%1.8b,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlsl_lane_v2si_uuuus (__a, __b, __v, __lane); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __v, const int __lane) +{ + return __builtin_aarch64_vec_smlsl_laneq_v4hi (__a, __b, __v, __lane); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __v, const int __lane) +{ + return __builtin_aarch64_vec_smlsl_laneq_v2si (__a, __b, __v, __lane); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_u16 (uint16x4_t a) +vmlsl_laneq_u16 (uint32x4_t __a, uint16x4_t __b, uint16x8_t __v, + const int __lane) { - uint32x4_t result; - __asm__ ("ushll %0.4s,%1.4h,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlsl_laneq_v4hi_uuuus (__a, __b, __v, __lane); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovl_u32 (uint32x2_t a) +vmlsl_laneq_u32 (uint64x2_t __a, uint32x2_t __b, uint32x4_t __v, + const int __lane) { - uint64x2_t result; - __asm__ ("ushll %0.2d,%1.2s,#0" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_umlsl_laneq_v2si_uuuus (__a, __b, __v, __lane); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_s16 (int8x8_t a, int16x8_t b) +vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) { - int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlsl_nv4hi (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +{ + return __builtin_aarch64_smlsl_nv2si (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c) +{ + return __builtin_aarch64_umlsl_nv4hi_uuuu (__a, __b, __c); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c) +{ + return __builtin_aarch64_umlsl_nv2si_uuuu (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_s32 (int16x4_t a, int32x4_t b) +vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { - int16x8_t result = vcombine_s16 (a, 
vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlslv8qi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_s64 (int32x2_t a, int64x2_t b) +vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlslv4hi (__a, __b, __c); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_u16 (uint8x8_t a, uint16x8_t b) +vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_smlslv2si (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_u32 (uint16x4_t a, uint32x4_t b) +vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlslv8qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_high_u64 (uint32x2_t a, uint64x2_t b) +vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlslv4hi_uuuu (__a, __b, __c); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_s16 (int16x8_t a) +vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { - int8x8_t result; - __asm__ ("xtn %0.8b,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_umlslv2si_uuuu (__a, __b, __c); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_s32 (int32x4_t a) +vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) { - int16x4_t result; - __asm__ ("xtn %0.4h,%1.4s" - : "=w"(result) - : "w"(a) + float32x4_t __result; + float32x4_t __t1; + __asm__ ("fmul %1.4s, %3.4s, %4.s[0]; fsub %0.4s, %0.4s, %1.4s" + : "=w"(__result), "=w"(__t1) + : "0"(__a), "w"(__b), "w"(__c) : /* No clobbers */); - return result; + return __result; } -__extension__ extern __inline int32x2_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_s64 (int64x2_t a) +vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c) { - int32x2_t result; - __asm__ ("xtn %0.2s,%1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_mls_nv8hi (__a, __b, __c); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vmovn_u16 (uint16x8_t a) +vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c) { - uint8x8_t result; - __asm__ ("xtn %0.8b,%1.8h" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return __builtin_aarch64_mls_nv4si (__a, __b, __c); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_u32 (uint32x4_t a) +vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c) { - uint16x4_t result; - __asm__ ("xtn %0.4h,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return (uint16x8_t) __builtin_aarch64_mls_nv8hi ((int16x8_t) __a, + (int16x8_t) __b, + (int16_t) __c); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovn_u64 (uint64x2_t a) +vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c) { - uint32x2_t result; - __asm__ ("xtn %0.2s,%1.2d" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return (uint32x4_t) __builtin_aarch64_mls_nv4si ((int32x4_t) __a, + (int32x4_t) __b, + (int32_t) __c); } -#define vmull_high_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x8_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x8_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull2 %0.4s, %1.8h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_high_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull2 %0.2d, %1.4s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_s8 (int8x16_t 
__a, int8x16_t __b, int8x16_t __c) +{ + return __builtin_aarch64_mlsv16qi (__a, __b, __c); +} -__extension__ extern __inline int32x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_s16 (int16x8_t a, int16_t b) +vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlsv8hi (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_s32 (int32x4_t a, int32_t b) +vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_mlsv4si (__a, __b, __c); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_u16 (uint16x8_t a, uint16_t b) +vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; + return (uint8x16_t) __builtin_aarch64_mlsv16qi ((int8x16_t) __a, + (int8x16_t) __b, + (int8x16_t) __c); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_n_u32 (uint32x4_t a, uint32_t b) +vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint16x8_t) __builtin_aarch64_mlsv8hi ((int16x8_t) __a, + (int16x8_t) __b, + (int16x8_t) __c); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_p8 (poly8x16_t a, poly8x16_t b) +vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - poly16x8_t result; - __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint32x4_t) __builtin_aarch64_mlsv4si ((int32x4_t) __a, + (int32x4_t) __b, + (int32x4_t) __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_s8 (int8x16_t a, int8x16_t b) +vmovl_high_s8 (int8x16_t __a) { - int16x8_t result; - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_unpacks_hi_v16qi (__a); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_s16 (int16x8_t a, int16x8_t b) +vmovl_high_s16 (int16x8_t __a) { - int32x4_t result; - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_unpacks_hi_v8hi (__a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_s32 (int32x4_t a, int32x4_t b) +vmovl_high_s32 (int32x4_t __a) { - int64x2_t result; - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - 
return result; + return __builtin_aarch64_vec_unpacks_hi_v4si (__a); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_u8 (uint8x16_t a, uint8x16_t b) +vmovl_high_u8 (uint8x16_t __a) { - uint16x8_t result; - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_unpacku_hi_v16qi_uu (__a); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_u16 (uint16x8_t a, uint16x8_t b) +vmovl_high_u16 (uint16x8_t __a) { - uint32x4_t result; - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_unpacku_hi_v8hi_uu (__a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_u32 (uint32x4_t a, uint32x4_t b) +vmovl_high_u32 (uint32x4_t __a) { - uint64x2_t result; - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_vec_unpacku_hi_v4si_uu (__a); } -#define vmull_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vmull_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_s8 (int8x8_t __a) +{ + return 
__builtin_aarch64_sxtlv8hi (__a); +} __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_s16 (int16x4_t a, int16_t b) +vmovl_s16 (int16x4_t __a) { - int32x4_t result; - __asm__ ("smull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sxtlv4si (__a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_s32 (int32x2_t a, int32_t b) +vmovl_s32 (int32x2_t __a) { - int64x2_t result; - __asm__ ("smull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sxtlv2di (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovl_u8 (uint8x8_t __a) +{ + return __builtin_aarch64_uxtlv8hi_uu (__a); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_u16 (uint16x4_t a, uint16_t b) +vmovl_u16 (uint16x4_t __a) { - uint32x4_t result; - __asm__ ("umull %0.4s,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uxtlv4si_uu (__a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_n_u32 (uint32x2_t a, uint32_t b) +vmovl_u32 (uint32x2_t __a) { - uint64x2_t result; - __asm__ ("umull %0.2d,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uxtlv2di_uu (__a); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_p8 (poly8x8_t a, poly8x8_t b) +vmovn_high_s16 (int8x8_t __a, int16x8_t __b) { - poly16x8_t result; - __asm__ ("pmull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_xtn2v8hi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_s8 (int8x8_t a, int8x8_t b) +vmovn_high_s32 (int16x4_t __a, int32x4_t __b) { - int16x8_t result; - __asm__ ("smull %0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_xtn2v4si (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_s16 (int16x4_t a, int16x4_t b) +vmovn_high_s64 (int32x2_t __a, int64x2_t __b) { - int32x4_t result; - __asm__ ("smull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_xtn2v2di (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_s32 (int32x2_t a, int32x2_t b) +vmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) { - int64x2_t result; - __asm__ ("smull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint8x16_t) + __builtin_aarch64_xtn2v8hi ((int8x8_t) __a, (int16x8_t) __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_u8 (uint8x8_t a, uint8x8_t b) +vmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) { - uint16x8_t result; - __asm__ ("umull 
%0.8h, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint16x8_t) + __builtin_aarch64_xtn2v4si ((int16x4_t) __a, (int32x4_t) __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_u16 (uint16x4_t a, uint16x4_t b) +vmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) { - uint32x4_t result; - __asm__ ("umull %0.4s, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint32x4_t) + __builtin_aarch64_xtn2v2di ((int32x2_t) __a, (int64x2_t) __b); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_u32 (uint32x2_t a, uint32x2_t b) +vmovn_s16 (int16x8_t __a) { - uint64x2_t result; - __asm__ ("umull %0.2d, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_xtnv8hi (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_s8 (int16x4_t a, int8x8_t b) +vmovn_s32 (int32x4_t __a) { - int16x4_t result; - __asm__ ("sadalp %0.4h,%2.8b" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_xtnv4si (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_s16 (int32x2_t a, int16x4_t b) +vmovn_s64 (int64x2_t __a) { - int32x2_t result; - __asm__ ("sadalp %0.2s,%2.4h" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_xtnv2di (__a); } -__extension__ extern __inline int64x1_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_s32 (int64x1_t a, int32x2_t b) +vmovn_u16 (uint16x8_t __a) { - int64x1_t result; - __asm__ ("sadalp %0.1d,%2.2s" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint8x8_t)__builtin_aarch64_xtnv8hi ((int16x8_t) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_u8 (uint16x4_t a, uint8x8_t b) +vmovn_u32 (uint32x4_t __a) { - uint16x4_t result; - __asm__ ("uadalp %0.4h,%2.8b" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint16x4_t) __builtin_aarch64_xtnv4si ((int32x4_t )__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_u16 (uint32x2_t a, uint16x4_t b) +vmovn_u64 (uint64x2_t __a) { - uint32x2_t result; - __asm__ ("uadalp %0.2s,%2.4h" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint32x2_t) __builtin_aarch64_xtnv2di ((int64x2_t) __a); } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadal_u32 (uint64x1_t a, uint32x2_t b) +vshrn_n_s16 (int16x8_t __a, const int __b) { - uint64x1_t result; - __asm__ ("uadalp %0.1d,%2.2s" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_shrnv8hi (__a, __b); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_s8 (int16x8_t a, int8x16_t b) +vshrn_n_s32 (int32x4_t __a, const int __b) { - int16x8_t result; - 
__asm__ ("sadalp %0.8h,%2.16b" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_shrnv4si (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_s16 (int32x4_t a, int16x8_t b) +vshrn_n_s64 (int64x2_t __a, const int __b) { - int32x4_t result; - __asm__ ("sadalp %0.4s,%2.8h" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_shrnv2di (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_s32 (int64x2_t a, int32x4_t b) +vshrn_n_u16 (uint16x8_t __a, const int __b) { - int64x2_t result; - __asm__ ("sadalp %0.2d,%2.4s" - : "=w"(result) - : "0"(a), "w"(b) - : /* No clobbers */); - return result; + return (uint8x8_t)__builtin_aarch64_shrnv8hi ((int16x8_t)__a, __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint16x4_t)__builtin_aarch64_shrnv4si ((int32x4_t)__a, __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint32x2_t)__builtin_aarch64_shrnv2di ((int64x2_t)__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_s16 (int16x8_t __a, int16x4_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_lanev8hi (__a, __v, __lane); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_s32 (int32x4_t __a, int32x2_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_lanev4si (__a, __v, __lane); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_u16 (uint16x8_t __a, uint16x4_t __v, const int __lane) +{ + return __builtin_aarch64_umull_hi_lanev8hi_uuus (__a, __v, __lane); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_lane_u32 (uint32x4_t __a, uint32x2_t __v, const int __lane) +{ + return __builtin_aarch64_umull_hi_lanev4si_uuus (__a, __v, __lane); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_s16 (int16x8_t __a, int16x8_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_laneqv8hi (__a, __v, __lane); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_s32 (int32x4_t __a, int32x4_t __v, const int __lane) +{ + return __builtin_aarch64_smull_hi_laneqv4si (__a, __v, __lane); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_u16 (uint16x8_t __a, uint16x8_t __v, const int __lane) +{ + return __builtin_aarch64_umull_hi_laneqv8hi_uuus (__a, __v, __lane); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_laneq_u32 (uint32x4_t __a, uint32x4_t __v, const int __lane) +{ + return 
__builtin_aarch64_umull_hi_laneqv4si_uuus (__a, __v, __lane); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_s16 (int16x8_t __a, int16_t __b) +{ + return __builtin_aarch64_smull_hi_nv8hi (__a, __b); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_s32 (int32x4_t __a, int32_t __b) +{ + return __builtin_aarch64_smull_hi_nv4si (__a, __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return __builtin_aarch64_umull_hi_nv8hi_uuu (__a, __b); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_n_u32 (uint32x4_t __a, uint32_t __b) +{ + return __builtin_aarch64_umull_hi_nv4si_uuu (__a, __b); +} + +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_u8 (uint16x8_t a, uint8x16_t b) +vmull_high_p8 (poly8x16_t __a, poly8x16_t __b) { - uint16x8_t result; - __asm__ ("uadalp %0.8h,%2.16b" - : "=w"(result) - : "0"(a), "w"(b) + poly16x8_t __result; + __asm__ ("pmull2 %0.8h,%1.16b,%2.16b" + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_s8 (int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_s32 (int32x4_t __a, int32x4_t __b) +{ + return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_high_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_lane_v4hi (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_lane_v2si (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_lane_v4hi_uuus (__a, __b, __c); +} + 
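For reference, a minimal usage sketch of the widening lane-multiply intrinsics this hunk moves from inline asm to compiler builtins; the helper name and values are hypothetical and not part of the patch:

#include <arm_neon.h>

/* Hypothetical helper: multiply each 16-bit lane of a by lane 0 of b,
   widening each product to 32 bits.  With the builtin-based
   vmull_lane_s16 the compiler can const-fold and schedule the multiply;
   the old inline-asm form was opaque to the optimizer.  */
static int32x4_t
widen_mul_by_lane0 (int16x4_t a, int16x4_t b)
{
  return vmull_lane_s16 (a, b, 0);  /* lane index must be a constant 0..3 */
}

The same asm-to-builtin pattern repeats for the _laneq, _n, and _high variants that follow.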
+__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_lane_v2si_uuus (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_laneq_v4hi (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_laneq_v2si (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_laneq_v4hi_uuus (__a, __b, __c); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_laneq_v2si_uuus (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_n_s16 (int16x4_t __a, int16_t __b) +{ + return __builtin_aarch64_smull_nv4hi (__a, __b); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_n_s32 (int32x2_t __a, int32_t __b) +{ + return __builtin_aarch64_smull_nv2si (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_u16 (uint32x4_t a, uint16x8_t b) +vmull_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return __builtin_aarch64_umull_nv4hi_uuu (__a, __b); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return __builtin_aarch64_umull_nv2si_uuu (__a, __b); +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_p8 (poly8x8_t __a, poly8x8_t __b) { - uint32x4_t result; - __asm__ ("uadalp %0.4s,%2.8h" - : "=w"(result) - : "0"(a), "w"(b) + poly16x8_t __result; + __asm__ ("pmull %0.8h, %1.8b, %2.8b" + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_intrinsic_vec_smult_lo_v8qi (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_intrinsic_vec_smult_lo_v4hi (__a, __b); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_intrinsic_vec_smult_lo_v2si (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return __builtin_aarch64_intrinsic_vec_umult_lo_v8qi_uuu (__a, __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vmull_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return __builtin_aarch64_intrinsic_vec_umult_lo_v4hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadalq_u32 (uint64x2_t a, uint32x4_t b) +vmull_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return __builtin_aarch64_intrinsic_vec_umult_lo_v2si_uuu (__a, __b); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_s8 (int16x4_t __a, int8x8_t __b) +{ + return __builtin_aarch64_sadalpv8qi (__a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_s16 (int32x2_t __a, int16x4_t __b) +{ + return __builtin_aarch64_sadalpv4hi (__a, __b); +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_s32 (int64x1_t __a, int32x2_t __b) +{ + int64x1_t __result; + __asm__ ("sadalp %0.1d,%2.2s" + : "=w"(__result) + : "0"(__a), "w"(__b) + : /* No clobbers */); + return __result; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_u8 (uint16x4_t __a, uint8x8_t __b) +{ + return __builtin_aarch64_uadalpv8qi_uuu (__a, __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_u16 (uint32x2_t __a, uint16x4_t __b) +{ + return __builtin_aarch64_uadalpv4hi_uuu (__a, __b); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadal_u32 (uint64x1_t __a, uint32x2_t __b) { - uint64x2_t result; - __asm__ ("uadalp %0.2d,%2.4s" - : "=w"(result) - : "0"(a), "w"(b) + uint64x1_t __result; + __asm__ ("uadalp %0.1d,%2.2s" + : "=w"(__result) + : "0"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_s8 (int16x8_t __a, int8x16_t __b) +{ + return __builtin_aarch64_sadalpv16qi (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_s16 (int32x4_t __a, int16x8_t __b) +{ + return __builtin_aarch64_sadalpv8hi (__a, __b); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_s32 (int64x2_t __a, int32x4_t __b) +{ + return __builtin_aarch64_sadalpv4si (__a, __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_u8 (uint16x8_t __a, uint8x16_t __b) +{ + return __builtin_aarch64_uadalpv16qi_uuu (__a, __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_u16 (uint32x4_t __a, uint16x8_t __b) +{ + return __builtin_aarch64_uadalpv8hi_uuu (__a, __b); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadalq_u32 (uint64x2_t __a, uint32x4_t __b) +{ + return __builtin_aarch64_uadalpv4si_uuu (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_s8 (int8x8_t a) +vpaddl_s8 (int8x8_t __a) { - int16x4_t result; + int16x4_t __result; __asm__ ("saddlp %0.4h,%1.8b" - : "=w"(result) - : "w"(a) + : 
"=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_s16 (int16x4_t a) +vpaddl_s16 (int16x4_t __a) { - int32x2_t result; + int32x2_t __result; __asm__ ("saddlp %0.2s,%1.4h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_s32 (int32x2_t a) +vpaddl_s32 (int32x2_t __a) { - int64x1_t result; + int64x1_t __result; __asm__ ("saddlp %0.1d,%1.2s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_u8 (uint8x8_t a) +vpaddl_u8 (uint8x8_t __a) { - uint16x4_t result; + uint16x4_t __result; __asm__ ("uaddlp %0.4h,%1.8b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_u16 (uint16x4_t a) +vpaddl_u16 (uint16x4_t __a) { - uint32x2_t result; + uint32x2_t __result; __asm__ ("uaddlp %0.2s,%1.4h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddl_u32 (uint32x2_t a) +vpaddl_u32 (uint32x2_t __a) { - uint64x1_t result; + uint64x1_t __result; __asm__ ("uaddlp %0.1d,%1.2s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_s8 (int8x16_t a) +vpaddlq_s8 (int8x16_t __a) { - int16x8_t result; + int16x8_t __result; __asm__ ("saddlp %0.8h,%1.16b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_s16 (int16x8_t a) +vpaddlq_s16 (int16x8_t __a) { - int32x4_t result; + int32x4_t __result; __asm__ ("saddlp %0.4s,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_s32 (int32x4_t a) +vpaddlq_s32 (int32x4_t __a) { - int64x2_t result; + int64x2_t __result; __asm__ ("saddlp %0.2d,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_u8 (uint8x16_t a) +vpaddlq_u8 (uint8x16_t __a) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("uaddlp %0.8h,%1.16b" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_u16 (uint16x8_t a) +vpaddlq_u16 (uint16x8_t __a) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("uaddlp 
%0.4s,%1.8h" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddlq_u32 (uint32x4_t a) +vpaddlq_u32 (uint32x4_t __a) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("uaddlp %0.2d,%1.4s" - : "=w"(result) - : "w"(a) + : "=w"(__result) + : "w"(__a) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s8 (int8x16_t a, int8x16_t b) +vpaddq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t result; + int8x16_t __result; __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s16 (int16x8_t a, int16x8_t b) +vpaddq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s32 (int32x4_t a, int32x4_t b) +vpaddq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_s64 (int64x2_t a, int64x2_t b) +vpaddq_s64 (int64x2_t __a, int64x2_t __b) { - int64x2_t result; + int64x2_t __result; __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u8 (uint8x16_t a, uint8x16_t b) +vpaddq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("addp %0.16b,%1.16b,%2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u16 (uint16x8_t a, uint16x8_t b) +vpaddq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8_t result; + uint16x8_t __result; __asm__ ("addp %0.8h,%1.8h,%2.8h" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u32 (uint32x4_t a, uint32x4_t b) +vpaddq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t result; + uint32x4_t __result; __asm__ ("addp %0.4s,%1.4s,%2.4s" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_u64 (uint64x2_t a, uint64x2_t b) +vpaddq_u64 (uint64x2_t __a, uint64x2_t 
__b) { - uint64x2_t result; + uint64x2_t __result; __asm__ ("addp %0.2d,%1.2d,%2.2d" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulh_n_s16 (int16x4_t a, int16_t b) +vqdmulh_n_s16 (int16x4_t __a, int16_t __b) { - int16x4_t result; + int16x4_t __result; __asm__ ("sqdmulh %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulh_n_s32 (int32x2_t a, int32_t b) +vqdmulh_n_s32 (int32x2_t __a, int32_t __b) { - int32x2_t result; + int32x2_t __result; __asm__ ("sqdmulh %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhq_n_s16 (int16x8_t a, int16_t b) +vqdmulhq_n_s16 (int16x8_t __a, int16_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sqdmulh %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhq_n_s32 (int32x4_t a, int32_t b) +vqdmulhq_n_s32 (int32x4_t __a, int32_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sqdmulh %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_s16 (int8x8_t a, int16x8_t b) +vqmovn_high_s16 (int8x8_t __a, int16x8_t __b) { - int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtn2 %0.16b, %1.8h" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sqxtn2v8hi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_s32 (int16x4_t a, int32x4_t b) +vqmovn_high_s32 (int16x4_t __a, int32x4_t __b) { - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtn2 %0.8h, %1.4s" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sqxtn2v4si (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_s64 (int32x2_t a, int64x2_t b) +vqmovn_high_s64 (int32x2_t __a, int64x2_t __b) { - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtn2 %0.4s, %1.2d" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sqxtn2v2di (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_u16 (uint8x8_t a, uint16x8_t b) +vqmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) { - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("uqxtn2 %0.16b, %1.8h" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - 
return result; + return __builtin_aarch64_uqxtn2v8hi_uuu (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_u32 (uint16x4_t a, uint32x4_t b) +vqmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) { - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("uqxtn2 %0.8h, %1.4s" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uqxtn2v4si_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_high_u64 (uint32x2_t a, uint64x2_t b) +vqmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) { - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("uqxtn2 %0.4s, %1.2d" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_uqxtn2v2di_uuu (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_high_s16 (uint8x8_t a, int16x8_t b) +vqmovun_high_s16 (uint8x8_t __a, int16x8_t __b) { - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtun2 %0.16b, %1.8h" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sqxtun2v8hi_uus (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_high_s32 (uint16x4_t a, int32x4_t b) +vqmovun_high_s32 (uint16x4_t __a, int32x4_t __b) { - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtun2 %0.8h, %1.4s" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sqxtun2v4si_uus (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_high_s64 (uint32x2_t a, int64x2_t b) +vqmovun_high_s64 (uint32x2_t __a, int64x2_t __b) { - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtun2 %0.4s, %1.2d" - : "+w"(result) - : "w"(b) - : /* No clobbers */); - return result; + return __builtin_aarch64_sqxtun2v2di_uus (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulh_n_s16 (int16x4_t a, int16_t b) +vqrdmulh_n_s16 (int16x4_t __a, int16_t __b) { - int16x4_t result; + int16x4_t __result; __asm__ ("sqrdmulh %0.4h,%1.4h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulh_n_s32 (int32x2_t a, int32_t b) +vqrdmulh_n_s32 (int32x2_t __a, int32_t __b) { - int32x2_t result; + int32x2_t __result; __asm__ ("sqrdmulh %0.2s,%1.2s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhq_n_s16 (int16x8_t a, int16_t b) +vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b) { - int16x8_t result; + int16x8_t __result; __asm__ ("sqrdmulh %0.8h,%1.8h,%2.h[0]" - : "=w"(result) - : "w"(a), "x"(b) + : "=w"(__result) + : "w"(__a), "x"(__b) : /* No clobbers */); - return result; + return __result; } 
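The vqmovn_high_* and vqmovun_high_* definitions above replace asm that had to pre-build the result with vcombine_* and pin it through a "+w" constraint; the sqxtn2/uqxtn2 builtins leave that plumbing to the compiler. A minimal sketch of the usual low/high pairing (hypothetical helper name; assumes an AArch64 toolchain with this header on the include path):

    #include <arm_neon.h>

    /* Saturate-narrow two s16x8 vectors into one s8x16 result:
       vqmovn_s16 fills the low eight lanes (SQXTN) and
       vqmovn_high_s16 appends the high eight lanes (SQXTN2).  */
    static inline int8x16_t
    narrow_pair_s16 (int16x8_t lo, int16x8_t hi)
    {
      return vqmovn_high_s16 (vqmovn_s16 (lo), hi);
    }
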
__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhq_n_s32 (int32x4_t a, int32_t b) +vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b) { - int32x4_t result; + int32x4_t __result; __asm__ ("sqrdmulh %0.4s,%1.4s,%2.s[0]" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } -#define vqrshrn_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x16_t result = vcombine_s8 \ - (a_, vcreate_s8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrn2_nv8hi (__a, __b, __c); +} -#define vqrshrn_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x8_t result = vcombine_s16 \ - (a_, vcreate_s16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrn2_nv4si (__a, __b, __c); +} -#define vqrshrn_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x4_t result = vcombine_s32 \ - (a_, vcreate_s32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrn2_nv2di (__a, __b, __c); +} -#define vqrshrn_high_n_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_uqrshrn2_nv8hi_uuus (__a, __b, __c); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_uqrshrn2_nv4si_uuus (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) +{ + return __builtin_aarch64_uqrshrn2_nv2di_uuus (__a, __b, __c); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrun2_nv8hi_uuss (__a, __b, __c); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vqrshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrun2_nv4si_uuss (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrun2_nv2di_uuss (__a, __b, __c); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqshrn2_nv8hi (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqshrn2_nv4si (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqshrn2_nv2di (__a, __b, __c); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_uqshrn2_nv8hi_uuus (__a, __b, __c); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_uqshrn2_nv4si_uuus (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) +{ + return __builtin_aarch64_uqshrn2_nv2di_uuus (__a, __b, __c); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqshrun2_nv8hi_uuss (__a, __b, __c); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqshrun2_nv4si_uuss (__a, __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqshrun2_nv2di_uuss (__a, __b, __c); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_rshrn2v8hi (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_rshrn2v4si (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_rshrn2v2di (__a, __b, __c); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint8x16_t) 
__builtin_aarch64_rshrn2v8hi ((int8x8_t) __a, + (int16x8_t) __b, __c); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c) +{ + return (uint16x8_t) __builtin_aarch64_rshrn2v4si ((int16x4_t) __a, + (int32x4_t) __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint32x4_t) __builtin_aarch64_rshrn2v2di ((int32x2_t)__a, + (int64x2_t)__b, __c); +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_n_s16 (int16x8_t __a, const int __b) +{ + return __builtin_aarch64_rshrnv8hi (__a, __b); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_n_s32 (int32x4_t __a, const int __b) +{ + return __builtin_aarch64_rshrnv4si (__a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_n_s64 (int64x2_t __a, const int __b) +{ + return __builtin_aarch64_rshrnv2di (__a, __b); +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_n_u16 (uint16x8_t __a, const int __b) +{ + return (uint8x8_t) __builtin_aarch64_rshrnv8hi ((int16x8_t) __a, __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_n_u32 (uint32x4_t __a, const int __b) +{ + return (uint16x4_t) __builtin_aarch64_rshrnv4si ((int32x4_t) __a, __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrn_n_u64 (uint64x2_t __a, const int __b) +{ + return (uint32x2_t) __builtin_aarch64_rshrnv2di ((int64x2_t) __a, __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrte_u32 (uint32x2_t __a) +{ + return __builtin_aarch64_ursqrtev2si_uu (__a); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrsqrteq_u32 (uint32x4_t __a) +{ + return __builtin_aarch64_ursqrtev4si_uu (__a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_shrn2v8hi (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_shrn2v4si (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_shrn2v2di (__a, __b, __c); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c) +{ + return (uint8x16_t) + __builtin_aarch64_shrn2v8hi ((int8x8_t) __a, (int16x8_t) __b, __c); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c) +{ + return 
(uint16x8_t) + __builtin_aarch64_shrn2v4si ((int16x4_t) __a, (int32x4_t) __b, __c); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) +{ + return (uint32x4_t) + __builtin_aarch64_shrn2v2di ((int32x2_t) __a, (int64x2_t) __b, __c); +} -#define vqrshrn_high_n_u32(a, b, c) \ +#define vsli_n_p8(a, b, c) \ __extension__ \ ({ \ - uint32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ + poly8x8_t b_ = (b); \ + poly8x8_t a_ = (a); \ + poly8x8_t result; \ + __asm__ ("sli %0.8b,%2.8b,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vqrshrn_high_n_u64(a, b, c) \ +#define vsli_n_p16(a, b, c) \ __extension__ \ ({ \ - uint64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ + poly16x4_t b_ = (b); \ + poly16x4_t a_ = (a); \ + poly16x4_t result; \ + __asm__ ("sli %0.4h,%2.4h,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vqrshrun_high_n_s16(a, b, c) \ +#define vsliq_n_p8(a, b, c) \ __extension__ \ ({ \ - int16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ + poly8x16_t b_ = (b); \ + poly8x16_t a_ = (a); \ + poly8x16_t result; \ + __asm__ ("sli %0.16b,%2.16b,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vqrshrun_high_n_s32(a, b, c) \ +#define vsliq_n_p16(a, b, c) \ __extension__ \ ({ \ - int32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ + poly16x8_t b_ = (b); \ + poly16x8_t a_ = (a); \ + poly16x8_t result; \ + __asm__ ("sli %0.8h,%2.8h,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vqrshrun_high_n_s64(a, b, c) \ +#define vsri_n_p8(a, b, c) \ __extension__ \ ({ \ - int64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ + poly8x8_t b_ = (b); \ + poly8x8_t a_ = (a); \ + poly8x8_t result; \ + __asm__ ("sri %0.8b,%2.8b,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vqshrn_high_n_s16(a, b, c) \ +#define vsri_n_p16(a, b, c) \ __extension__ \ ({ \ - int16x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x16_t result = vcombine_s8 \ - (a_, vcreate_s8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ + poly16x4_t b_ = (b); \ + poly16x4_t a_ = (a); \ + poly16x4_t result; \ + __asm__ ("sri %0.4h,%2.4h,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vqshrn_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x8_t result = vcombine_s16 \ 
- (a_, vcreate_s16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vqshrn_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x4_t result = vcombine_s32 \ - (a_, vcreate_s32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vqshrn_high_n_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vqshrn_high_n_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vqshrn_high_n_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vqshrun_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vqshrun_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vqshrun_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x16_t result = vcombine_s8 \ - (a_, vcreate_s8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x8_t result = vcombine_s16 \ - (a_, vcreate_s16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x4_t result = vcombine_s32 \ - (a_, vcreate_s32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_high_n_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ 
- uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.16b,%1.8h,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_high_n_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.8h,%1.4s,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_high_n_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("rshrn2 %0.4s,%1.2d,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_n_s16(a, b) \ - __extension__ \ - ({ \ - int16x8_t a_ = (a); \ - int8x8_t result; \ - __asm__ ("rshrn %0.8b,%1.8h,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vrshrn_n_s32(a, b) \ - __extension__ \ - ({ \ - int32x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("rshrn %0.4h,%1.4s,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ +#define vsri_n_p64(a, b, c) \ + __extension__ \ + ({ \ + poly64x1_t b_ = (b); \ + poly64x1_t a_ = (a); \ + poly64x1_t result; \ + __asm__ ("sri %d0,%d2,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers. */); \ + result; \ }) -#define vrshrn_n_s64(a, b) \ +#define vsriq_n_p8(a, b, c) \ __extension__ \ ({ \ - int64x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("rshrn %0.2s,%1.2d,%2" \ + poly8x16_t b_ = (b); \ + poly8x16_t a_ = (a); \ + poly8x16_t result; \ + __asm__ ("sri %0.16b,%2.16b,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vrshrn_n_u16(a, b) \ +#define vsriq_n_p16(a, b, c) \ __extension__ \ ({ \ - uint16x8_t a_ = (a); \ - uint8x8_t result; \ - __asm__ ("rshrn %0.8b,%1.8h,%2" \ + poly16x8_t b_ = (b); \ + poly16x8_t a_ = (a); \ + poly16x8_t result; \ + __asm__ ("sri %0.8h,%2.8h,%3" \ : "=w"(result) \ - : "w"(a_), "i"(b) \ + : "0"(a_), "w"(b_), "i"(c) \ : /* No clobbers */); \ result; \ }) -#define vrshrn_n_u32(a, b) \ - __extension__ \ - ({ \ - uint32x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("rshrn %0.4h,%1.4s,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ +#define vsriq_n_p64(a, b, c) \ + __extension__ \ + ({ \ + poly64x2_t b_ = (b); \ + poly64x2_t a_ = (a); \ + poly64x2_t result; \ + __asm__ ("sri %0.2d,%2.2d,%3" \ + : "=w"(result) \ + : "0"(a_), "w"(b_), "i"(c) \ + : /* No clobbers. 
*/); \ + result; \ }) -#define vrshrn_n_u64(a, b) \ - __extension__ \ - ({ \ - uint64x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("rshrn %0.2s,%1.2d,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtst_p8 (poly8x8_t __a, poly8x8_t __b) +{ + return (uint8x8_t) ((((uint8x8_t) __a) & ((uint8x8_t) __b)) + != 0); +} -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrte_u32 (uint32x2_t a) +vtst_p16 (poly16x4_t __a, poly16x4_t __b) { - uint32x2_t result; - __asm__ ("ursqrte %0.2s,%1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return (uint16x4_t) ((((uint16x4_t) __a) & ((uint16x4_t) __b)) + != 0); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrteq_u32 (uint32x4_t a) +vtst_p64 (poly64x1_t __a, poly64x1_t __b) { - uint32x4_t result; - __asm__ ("ursqrte %0.4s,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; + return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0)); } -#define vshrn_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x16_t result = vcombine_s8 \ - (a_, vcreate_s8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.16b,%1.8h,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p8 (poly8x16_t __a, poly8x16_t __b) +{ + return (uint8x16_t) ((((uint8x16_t) __a) & ((uint8x16_t) __b)) + != 0); +} -#define vshrn_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x8_t result = vcombine_s16 \ - (a_, vcreate_s16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.8h,%1.4s,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p16 (poly16x8_t __a, poly16x8_t __b) +{ + return (uint16x8_t) ((((uint16x8_t) __a) & ((uint16x8_t) __b)) + != 0); +} -#define vshrn_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x4_t result = vcombine_s32 \ - (a_, vcreate_s32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.4s,%1.2d,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return (uint64x2_t) ((((uint64x2_t) __a) & ((uint64x2_t) __b)) + != __AARCH64_INT64_C (0)); +} -#define vshrn_high_n_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.16b,%1.8h,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +/* End of temporary inline asm implementations. 
*/ -#define vshrn_high_n_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.8h,%1.4s,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +/* Start of temporary inline asm for vldn, vstn and friends. */ -#define vshrn_high_n_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("shrn2 %0.4s,%1.2d,#%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +/* Create struct element types for duplicating loads. -#define vshrn_n_s16(a, b) \ - __extension__ \ - ({ \ - int16x8_t a_ = (a); \ - int8x8_t result; \ - __asm__ ("shrn %0.8b,%1.8h,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) + Create 2 element structures of: -#define vshrn_n_s32(a, b) \ - __extension__ \ - ({ \ - int32x4_t a_ = (a); \ - int16x4_t result; \ - __asm__ ("shrn %0.4h,%1.4s,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) + +------+----+----+----+----+ + | | 8 | 16 | 32 | 64 | + +------+----+----+----+----+ + |int | Y | Y | N | N | + +------+----+----+----+----+ + |uint | Y | Y | N | N | + +------+----+----+----+----+ + |float | - | Y | N | N | + +------+----+----+----+----+ + |poly | Y | Y | - | - | + +------+----+----+----+----+ -#define vshrn_n_s64(a, b) \ - __extension__ \ - ({ \ - int64x2_t a_ = (a); \ - int32x2_t result; \ - __asm__ ("shrn %0.2s,%1.2d,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) + Create 3 element structures of: -#define vshrn_n_u16(a, b) \ - __extension__ \ - ({ \ - uint16x8_t a_ = (a); \ - uint8x8_t result; \ - __asm__ ("shrn %0.8b,%1.8h,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) + +------+----+----+----+----+ + | | 8 | 16 | 32 | 64 | + +------+----+----+----+----+ + |int | Y | Y | Y | Y | + +------+----+----+----+----+ + |uint | Y | Y | Y | Y | + +------+----+----+----+----+ + |float | - | Y | Y | Y | + +------+----+----+----+----+ + |poly | Y | Y | - | - | + +------+----+----+----+----+ -#define vshrn_n_u32(a, b) \ - __extension__ \ - ({ \ - uint32x4_t a_ = (a); \ - uint16x4_t result; \ - __asm__ ("shrn %0.4h,%1.4s,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vshrn_n_u64(a, b) \ - __extension__ \ - ({ \ - uint64x2_t a_ = (a); \ - uint32x2_t result; \ - __asm__ ("shrn %0.2s,%1.2d,%2" \ - : "=w"(result) \ - : "w"(a_), "i"(b) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsli_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x8_t b_ = (b); \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("sli %0.8b,%2.8b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsli_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x4_t b_ = (b); \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("sli %0.4h,%2.4h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsliq_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("sli %0.16b,%2.16b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; 
\ - }) - -#define vsliq_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("sli %0.8h,%2.8h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsri_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x8_t b_ = (b); \ - poly8x8_t a_ = (a); \ - poly8x8_t result; \ - __asm__ ("sri %0.8b,%2.8b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsri_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x4_t b_ = (b); \ - poly16x4_t a_ = (a); \ - poly16x4_t result; \ - __asm__ ("sri %0.4h,%2.4h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsri_n_p64(a, b, c) \ - __extension__ \ - ({ \ - poly64x1_t b_ = (b); \ - poly64x1_t a_ = (a); \ - poly64x1_t result; \ - __asm__ ("sri %d0,%d2,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers. */); \ - result; \ - }) - -#define vsriq_n_p8(a, b, c) \ - __extension__ \ - ({ \ - poly8x16_t b_ = (b); \ - poly8x16_t a_ = (a); \ - poly8x16_t result; \ - __asm__ ("sri %0.16b,%2.16b,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsriq_n_p16(a, b, c) \ - __extension__ \ - ({ \ - poly16x8_t b_ = (b); \ - poly16x8_t a_ = (a); \ - poly16x8_t result; \ - __asm__ ("sri %0.8h,%2.8h,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -#define vsriq_n_p64(a, b, c) \ - __extension__ \ - ({ \ - poly64x2_t b_ = (b); \ - poly64x2_t a_ = (a); \ - poly64x2_t result; \ - __asm__ ("sri %0.2d,%2.2d,%3" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "i"(c) \ - : /* No clobbers. */); \ - result; \ - }) - -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_p8 (poly8x8_t a, poly8x8_t b) -{ - return (uint8x8_t) ((((uint8x8_t) a) & ((uint8x8_t) b)) - != 0); -} - -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_p16 (poly16x4_t a, poly16x4_t b) -{ - return (uint16x4_t) ((((uint16x4_t) a) & ((uint16x4_t) b)) - != 0); -} - -__extension__ extern __inline uint64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_p64 (poly64x1_t a, poly64x1_t b) -{ - return (uint64x1_t) ((a & b) != __AARCH64_INT64_C (0)); -} - -__extension__ extern __inline uint8x16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_p8 (poly8x16_t a, poly8x16_t b) -{ - return (uint8x16_t) ((((uint8x16_t) a) & ((uint8x16_t) b)) - != 0); -} - -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_p16 (poly16x8_t a, poly16x8_t b) -{ - return (uint16x8_t) ((((uint16x8_t) a) & ((uint16x8_t) b)) - != 0); -} - -__extension__ extern __inline uint64x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_p64 (poly64x2_t a, poly64x2_t b) -{ - return (uint64x2_t) ((((uint64x2_t) a) & ((uint64x2_t) b)) - != __AARCH64_INT64_C (0)); -} - -/* End of temporary inline asm implementations. */ - -/* Start of temporary inline asm for vldn, vstn and friends. */ - -/* Create struct element types for duplicating loads. 
- - Create 2 element structures of: - - +------+----+----+----+----+ - | | 8 | 16 | 32 | 64 | - +------+----+----+----+----+ - |int | Y | Y | N | N | - +------+----+----+----+----+ - |uint | Y | Y | N | N | - +------+----+----+----+----+ - |float | - | Y | N | N | - +------+----+----+----+----+ - |poly | Y | Y | - | - | - +------+----+----+----+----+ - - Create 3 element structures of: - - +------+----+----+----+----+ - | | 8 | 16 | 32 | 64 | - +------+----+----+----+----+ - |int | Y | Y | Y | Y | - +------+----+----+----+----+ - |uint | Y | Y | Y | Y | - +------+----+----+----+----+ - |float | - | Y | Y | Y | - +------+----+----+----+----+ - |poly | Y | Y | - | - | - +------+----+----+----+----+ - - Create 4 element structures of: + Create 4 element structures of: +------+----+----+----+----+ | | 8 | 16 | 32 | 64 | @@ -11043,8 +9511,7 @@ __ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32, __ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __ST2_LANE_FUNC -#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ __extension__ extern __inline void \ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ @@ -11056,20 +9523,20 @@ vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } -__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) -__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) -__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) -__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) -__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) -__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) -__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) -__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) -__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) -__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) -__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) -__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) -__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) -__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) +__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) +__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) +__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) +__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) +__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) +__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) +__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) +__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) +__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) +__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) +__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) +__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) +__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) +__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) #define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ qmode, ptr_mode, funcsuffix, signedtype) \ @@ -11128,8 +9595,7 @@ __ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, __ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __ST3_LANE_FUNC -#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +#define __ST3Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ __extension__ extern __inline void \ __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) \ vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ @@ -11141,20 +9607,20 @@ vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } -__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) -__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) -__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) -__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) -__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) -__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) -__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) -__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) -__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) -__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) -__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) -__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) -__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) -__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) +__ST3Q_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) +__ST3Q_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) +__ST3Q_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) +__ST3Q_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) +__ST3Q_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) +__ST3Q_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) +__ST3Q_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) +__ST3Q_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) +__ST3Q_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) +__ST3Q_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) +__ST3Q_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) +__ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) +__ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) +__ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) #define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ qmode, ptr_mode, funcsuffix, signedtype) \ @@ -11218,8 +9684,7 @@ __ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, __ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __ST4_LANE_FUNC -#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +#define __ST4Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ __extension__ extern __inline void \ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ @@ -11231,37 +9696,33 @@ vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } -__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) -__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) -__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) -__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) -__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) -__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) -__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) -__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) -__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) -__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) -__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) -__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) -__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) -__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) +__ST4Q_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) +__ST4Q_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) +__ST4Q_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) +__ST4Q_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) +__ST4Q_LANE_FUNC 
(poly16x8x4_t, poly16_t, v8hi, hi, p16) +__ST4Q_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) +__ST4Q_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) +__ST4Q_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) +__ST4Q_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) +__ST4Q_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) +__ST4Q_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) +__ST4Q_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) +__ST4Q_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) +__ST4Q_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_s32 (int32x2_t a) +vaddlv_s32 (int32x2_t __a) { - int64_t result; - __asm__ ("saddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return result; + return __builtin_aarch64_saddlvv2si (__a); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddlv_u32 (uint32x2_t a) +vaddlv_u32 (uint32x2_t __a) { - uint64_t result; - __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(result) : "w"(a) : ); - return result; + return __builtin_aarch64_uaddlvv2si_uu (__a); } __extension__ extern __inline int16x4_t @@ -11324,367 +9785,367 @@ vqrdmulhq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __c) __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_p8 (poly8x16_t a, uint8x8_t b) +vqtbl1_p8 (poly8x16_t __a, uint8x8_t __b) { - poly8x8_t result; + poly8x8_t __result; __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_s8 (int8x16_t a, uint8x8_t b) +vqtbl1_s8 (int8x16_t __a, uint8x8_t __b) { - int8x8_t result; + int8x8_t __result; __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1_u8 (uint8x16_t a, uint8x8_t b) +vqtbl1_u8 (uint8x16_t __a, uint8x8_t __b) { - uint8x8_t result; + uint8x8_t __result; __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_p8 (poly8x16_t a, uint8x16_t b) +vqtbl1q_p8 (poly8x16_t __a, uint8x16_t __b) { - poly8x16_t result; + poly8x16_t __result; __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_s8 (int8x16_t a, uint8x16_t b) +vqtbl1q_s8 (int8x16_t __a, uint8x16_t __b) { - int8x16_t result; + int8x16_t __result; __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl1q_u8 (uint8x16_t a, uint8x16_t b) +vqtbl1q_u8 (uint8x16_t __a, uint8x16_t 
__b) { - uint8x16_t result; + uint8x16_t __result; __asm__ ("tbl %0.16b, {%1.16b}, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) + : "=w"(__result) + : "w"(__a), "w"(__b) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1_s8 (int8x8_t r, int8x16_t tab, uint8x8_t idx) +vqtbx1_s8 (int8x8_t __r, int8x16_t __tab, uint8x8_t __idx) { - int8x8_t result = r; + int8x8_t __result = __r; __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1_u8 (uint8x8_t r, uint8x16_t tab, uint8x8_t idx) +vqtbx1_u8 (uint8x8_t __r, uint8x16_t __tab, uint8x8_t __idx) { - uint8x8_t result = r; + uint8x8_t __result = __r; __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1_p8 (poly8x8_t r, poly8x16_t tab, uint8x8_t idx) +vqtbx1_p8 (poly8x8_t __r, poly8x16_t __tab, uint8x8_t __idx) { - poly8x8_t result = r; + poly8x8_t __result = __r; __asm__ ("tbx %0.8b,{%1.16b},%2.8b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1q_s8 (int8x16_t r, int8x16_t tab, uint8x16_t idx) +vqtbx1q_s8 (int8x16_t __r, int8x16_t __tab, uint8x16_t __idx) { - int8x16_t result = r; + int8x16_t __result = __r; __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1q_u8 (uint8x16_t r, uint8x16_t tab, uint8x16_t idx) +vqtbx1q_u8 (uint8x16_t __r, uint8x16_t __tab, uint8x16_t __idx) { - uint8x16_t result = r; + uint8x16_t __result = __r; __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx1q_p8 (poly8x16_t r, poly8x16_t tab, uint8x16_t idx) +vqtbx1q_p8 (poly8x16_t __r, poly8x16_t __tab, uint8x16_t __idx) { - poly8x16_t result = r; + poly8x16_t __result = __r; __asm__ ("tbx %0.16b,{%1.16b},%2.16b" - : "+w"(result) - : "w"(tab), "w"(idx) + : "+w"(__result) + : "w"(__tab), "w"(__idx) : /* No clobbers */); - return result; + return __result; } /* V7 legacy table intrinsics. 
*/ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl1_s8 (int8x8_t tab, int8x8_t idx) +vtbl1_s8 (int8x8_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); + int8x8_t __result; + int8x16_t __temp = vcombine_s8 (__tab, vcreate_s8 (__AARCH64_UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl1_u8 (uint8x8_t tab, uint8x8_t idx) +vtbl1_u8 (uint8x8_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); + uint8x8_t __result; + uint8x16_t __temp = vcombine_u8 (__tab, vcreate_u8 (__AARCH64_UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl1_p8 (poly8x8_t tab, uint8x8_t idx) +vtbl1_p8 (poly8x8_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); + poly8x8_t __result; + poly8x16_t __temp = vcombine_p8 (__tab, vcreate_p8 (__AARCH64_UINT64_C (0x0))); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl2_s8 (int8x8x2_t tab, int8x8_t idx) +vtbl2_s8 (int8x8x2_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + int8x8_t __result; + int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl2_u8 (uint8x8x2_t tab, uint8x8_t idx) +vtbl2_u8 (uint8x8x2_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + uint8x8_t __result; + uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl2_p8 (poly8x8x2_t tab, uint8x8_t idx) +vtbl2_p8 (poly8x8x2_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + poly8x8_t __result; + poly8x16_t __temp = vcombine_p8 (__tab.val[0], __tab.val[1]); __asm__ ("tbl %0.8b, {%1.16b}, %2.8b" - : "=w"(result) - : "w"(temp), "w"(idx) + : "=w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl3_s8 (int8x8x3_t tab, int8x8_t idx) +vtbl3_s8 
(int8x8x3_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16x2_t temp; + int8x8_t __result; + int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); + __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_s8 (__tab.val[2], vcreate_s8 (__AARCH64_UINT64_C (0x0))); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbl3v8qi (__o, idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = __builtin_aarch64_tbl3v8qi (__o, __idx); + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl3_u8 (uint8x8x3_t tab, uint8x8_t idx) +vtbl3_u8 (uint8x8x3_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16x2_t temp; + uint8x8_t __result; + uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); + __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_u8 (__tab.val[2], vcreate_u8 (__AARCH64_UINT64_C (0x0))); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl3_p8 (poly8x8x3_t tab, uint8x8_t idx) +vtbl3_p8 (poly8x8x3_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16x2_t temp; + poly8x8_t __result; + poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); + __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_p8 (__tab.val[2], vcreate_p8 (__AARCH64_UINT64_C (0x0))); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl4_s8 (int8x8x4_t tab, int8x8_t idx) +vtbl4_s8 (int8x8x4_t __tab, int8x8_t __idx) { - int8x8_t result; - int8x16x2_t temp; + int8x8_t __result; + int8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_s8 (tab.val[2], tab.val[3]); + __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbl3v8qi 
(__o, idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = __builtin_aarch64_tbl3v8qi (__o, __idx); + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl4_u8 (uint8x8x4_t tab, uint8x8_t idx) +vtbl4_u8 (uint8x8x4_t __tab, uint8x8_t __idx) { - uint8x8_t result; - uint8x16x2_t temp; + uint8x8_t __result; + uint8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_u8 (tab.val[2], tab.val[3]); + __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbl4_p8 (poly8x8x4_t tab, uint8x8_t idx) +vtbl4_p8 (poly8x8x4_t __tab, uint8x8_t __idx) { - poly8x8_t result; - poly8x16x2_t temp; + poly8x8_t __result; + poly8x16x2_t __temp; __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (tab.val[0], tab.val[1]); - temp.val[1] = vcombine_p8 (tab.val[2], tab.val[3]); + __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); + (int8x16_t) __temp.val[0], 0); __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); - return result; + (int8x16_t) __temp.val[1], 1); + __result = (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx); + return __result; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx2_s8 (int8x8_t r, int8x8x2_t tab, int8x8_t idx) +vtbx2_s8 (int8x8_t __r, int8x8x2_t __tab, int8x8_t __idx) { - int8x8_t result = r; - int8x16_t temp = vcombine_s8 (tab.val[0], tab.val[1]); + int8x8_t __result = __r; + int8x16_t __temp = vcombine_s8 (__tab.val[0], __tab.val[1]); __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) + : "+w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx2_u8 (uint8x8_t r, uint8x8x2_t tab, uint8x8_t idx) +vtbx2_u8 (uint8x8_t __r, uint8x8x2_t __tab, uint8x8_t __idx) { - uint8x8_t result = r; - uint8x16_t temp = vcombine_u8 (tab.val[0], tab.val[1]); + uint8x8_t __result = __r; + uint8x16_t __temp = vcombine_u8 (__tab.val[0], __tab.val[1]); __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) + : "+w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx2_p8 (poly8x8_t r, poly8x8x2_t tab, uint8x8_t idx) +vtbx2_p8 (poly8x8_t __r, poly8x8x2_t __tab, uint8x8_t __idx) { - poly8x8_t result = r; - poly8x16_t temp = vcombine_p8 (tab.val[0], tab.val[1]); + poly8x8_t __result = __r; + poly8x16_t 
__temp = vcombine_p8 (__tab.val[0], __tab.val[1]); __asm__ ("tbx %0.8b, {%1.16b}, %2.8b" - : "+w"(result) - : "w"(temp), "w"(idx) + : "+w"(__result) + : "w"(__temp), "w"(__idx) : /* No clobbers */); - return result; + return __result; } /* End of temporary inline asm. */ @@ -11822,6 +10283,18 @@ vabsq_s64 (int64x2_t __a) return __builtin_aarch64_absv2di (__a); } +/* Try to avoid moving between integer and vector registers. + For why the cast to unsigned is needed check the vnegd_s64 intrinsic. + There is a testcase related to this issue: + gcc.target/aarch64/vabsd_s64.c. */ + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vabsd_s64 (int64_t __a) +{ + return __a < 0 ? - (uint64_t) __a : __a; +} + /* vadd */ __extension__ extern __inline int64_t @@ -12789,6 +11262,13 @@ vceqq_u64 (uint64x2_t __a, uint64x2_t __b) return (__a == __b); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return (__a == __b); +} + /* vceq - scalar. */ __extension__ extern __inline uint32_t @@ -12898,6 +11378,13 @@ vceqz_u64 (uint64x1_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqz_p64 (poly64x1_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqzq_f32 (float32x4_t __a) @@ -12975,6 +11462,13 @@ vceqzq_u64 (uint64x2_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqzq_p64 (poly64x2_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + /* vceqz - scalar. */ __extension__ extern __inline uint32_t @@ -14173,51 +12667,93 @@ vclsq_s32 (int32x4_t __a) return __builtin_aarch64_clrsbv4si (__a); } -/* vclz. 
*/ - __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclz_s8 (int8x8_t __a) +vcls_u8 (uint8x8_t __a) { - return __builtin_aarch64_clzv8qi (__a); + return __builtin_aarch64_clrsbv8qi ((int8x8_t) __a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclz_s16 (int16x4_t __a) +vcls_u16 (uint16x4_t __a) { - return __builtin_aarch64_clzv4hi (__a); + return __builtin_aarch64_clrsbv4hi ((int16x4_t) __a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclz_s32 (int32x2_t __a) +vcls_u32 (uint32x2_t __a) { - return __builtin_aarch64_clzv2si (__a); + return __builtin_aarch64_clrsbv2si ((int32x2_t) __a); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclz_u8 (uint8x8_t __a) +vclsq_u8 (uint8x16_t __a) { - return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a); + return __builtin_aarch64_clrsbv16qi ((int8x16_t) __a); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclz_u16 (uint16x4_t __a) +vclsq_u16 (uint16x8_t __a) { - return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a); + return __builtin_aarch64_clrsbv8hi ((int16x8_t) __a); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclz_u32 (uint32x2_t __a) +vclsq_u32 (uint32x4_t __a) { - return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a); + return __builtin_aarch64_clrsbv4si ((int32x4_t) __a); } -__extension__ extern __inline int8x16_t +/* vclz. 
*/ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_s8 (int8x8_t __a) +{ + return __builtin_aarch64_clzv8qi (__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_s16 (int16x4_t __a) +{ + return __builtin_aarch64_clzv4hi (__a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_s32 (int32x2_t __a) +{ + return __builtin_aarch64_clzv2si (__a); +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_u8 (uint8x8_t __a) +{ + return (uint8x8_t)__builtin_aarch64_clzv8qi ((int8x8_t)__a); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_u16 (uint16x4_t __a) +{ + return (uint16x4_t)__builtin_aarch64_clzv4hi ((int16x4_t)__a); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclz_u32 (uint32x2_t __a) +{ + return (uint32x2_t)__builtin_aarch64_clzv2si ((int32x2_t)__a); +} + +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vclzq_s8 (int8x16_t __a) { @@ -15657,7 +14193,7 @@ vdupq_n_f64 (float64_t __a) __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p8 (uint32_t __a) +vdupq_n_p8 (poly8_t __a) { return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15665,21 +14201,21 @@ vdupq_n_p8 (uint32_t __a) __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p16 (uint32_t __a) +vdupq_n_p16 (poly16_t __a) { return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p64 (uint64_t __a) +vdupq_n_p64 (poly64_t __a) { return (poly64x2_t) {__a, __a}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s8 (int32_t __a) +vdupq_n_s8 (int8_t __a) { return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15687,7 +14223,7 @@ vdupq_n_s8 (int32_t __a) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s16 (int32_t __a) +vdupq_n_s16 (int16_t __a) { return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -15708,7 +14244,7 @@ vdupq_n_s64 (int64_t __a) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u8 (uint32_t __a) +vdupq_n_u8 (uint8_t __a) { return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15716,7 +14252,7 @@ vdupq_n_u8 (uint32_t __a) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u16 (uint32_t __a) +vdupq_n_u16 (uint16_t __a) { return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -17051,1782 +15587,1710 @@ vld1_f16 (const float16_t *__a) __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f32 (const float32_t *a) +vld1_f32 (const float32_t *__a) { - return __builtin_aarch64_ld1v2sf ((const 
__builtin_aarch64_simd_sf *) a); + return __builtin_aarch64_ld1v2sf ((const __builtin_aarch64_simd_sf *) __a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_f64 (const float64_t *a) +vld1_f64 (const float64_t *__a) { - return (float64x1_t) {*a}; + return (float64x1_t) {*__a}; } __extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p8 (const poly8_t *a) +vld1_p8 (const poly8_t *__a) { return (poly8x8_t) - __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); + __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p16 (const poly16_t *a) +vld1_p16 (const poly16_t *__a) { return (poly16x4_t) - __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); + __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_p64 (const poly64_t *a) +vld1_p64 (const poly64_t *__a) { - return (poly64x1_t) {*a}; + return (poly64x1_t) {*__a}; } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s8 (const int8_t *a) +vld1_s8 (const int8_t *__a) { - return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); + return __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s16 (const int16_t *a) +vld1_s16 (const int16_t *__a) { - return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); + return __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s32 (const int32_t *a) +vld1_s32 (const int32_t *__a) { - return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a); + return __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_s64 (const int64_t *a) +vld1_s64 (const int64_t *__a) { - return (int64x1_t) {*a}; + return (int64x1_t) {*__a}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u8 (const uint8_t *a) +vld1_u8 (const uint8_t *__a) { return (uint8x8_t) - __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) a); + __builtin_aarch64_ld1v8qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u16 (const uint16_t *a) +vld1_u16 (const uint16_t *__a) { return (uint16x4_t) - __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) a); + __builtin_aarch64_ld1v4hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_u32 (const uint32_t *a) +vld1_u32 (const uint32_t *__a) { return (uint32x2_t) - __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) a); + __builtin_aarch64_ld1v2si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vld1_u64 (const uint64_t *a) +vld1_u64 (const uint64_t *__a) { - return (uint64x1_t) {*a}; + return (uint64x1_t) {*__a}; } -/* vld1q */ +/* vld1x3 */ -__extension__ extern __inline float16x8_t +__extension__ extern __inline uint8x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f16 (const float16_t *__a) +vld1_u8_x3 (const uint8_t *__a) { - return __builtin_aarch64_ld1v8hf (__a); + uint8x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = (__builtin_aarch64_simd_ci)__builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a); + __i.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + __i.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + __i.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return __i; } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int8x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f32 (const float32_t *a) +vld1_s8_x3 (const int8_t *__a) { - return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) a); + int8x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a); + __i.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + __i.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + __i.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return __i; } -__extension__ extern __inline float64x2_t +__extension__ extern __inline uint16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_f64 (const float64_t *a) +vld1_u16_x3 (const uint16_t *__a) { - return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) a); + uint16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a); + __i.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + __i.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + __i.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return __i; } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline int16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p8 (const poly8_t *a) +vld1_s16_x3 (const int16_t *__a) { - return (poly8x16_t) - __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); + int16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a); + __i.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + __i.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + __i.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return __i; } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline uint32x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p16 (const poly16_t *a) +vld1_u32_x3 (const uint32_t *__a) { - return (poly16x8_t) - __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); + uint32x2x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a); + __i.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + __i.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + __i.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return __i; } -__extension__ extern __inline poly64x2_t +__extension__ extern __inline int32x2x3_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_p64 (const poly64_t *a) +vld1_s32_x3 (const int32_t *__a) { - return (poly64x2_t) - __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); + int32x2x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v2si ((const __builtin_aarch64_simd_si *) __a); + __i.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + __i.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + __i.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return __i; } -__extension__ extern __inline int8x16_t +__extension__ extern __inline uint64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s8 (const int8_t *a) +vld1_u64_x3 (const uint64_t *__a) { - return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); + uint64x1x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a); + __i.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __i.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __i.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __i; } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s16 (const int16_t *a) +vld1_s64_x3 (const int64_t *__a) { - return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); + int64x1x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a); + __i.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __i.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __i.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + + return __i; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s32 (const int32_t *a) +vld1_f16_x3 (const float16_t *__a) { - return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a); + float16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4hf ((const __builtin_aarch64_simd_hf *) __a); + __i.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0); + __i.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1); + __i.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2); + return __i; } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float32x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_s64 (const int64_t *a) +vld1_f32_x3 (const float32_t *__a) { - return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); + float32x2x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v2sf ((const __builtin_aarch64_simd_sf *) __a); + __i.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0); + __i.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1); + __i.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2); + return __i; } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u8 (const uint8_t *a) +vld1_f64_x3 (const float64_t *__a) { - return (uint8x16_t) - __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a); + float64x1x3_t __i; + 
__builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3df ((const __builtin_aarch64_simd_df *) __a); + __i.val[0] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __i.val[1] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __i.val[2] = (float64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); + return __i; } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline poly8x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u16 (const uint16_t *a) +vld1_p8_x3 (const poly8_t *__a) { - return (uint16x8_t) - __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) a); + poly8x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8qi ((const __builtin_aarch64_simd_qi *) __a); + __i.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + __i.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + __i.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return __i; } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline poly16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u32 (const uint32_t *a) +vld1_p16_x3 (const poly16_t *__a) { - return (uint32x4_t) - __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) a); + poly16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4hi ((const __builtin_aarch64_simd_hi *) __a); + __i.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + __i.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + __i.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return __i; } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline poly64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_u64 (const uint64_t *a) +vld1_p64_x3 (const poly64_t *__a) { - return (uint64x2_t) - __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); -} + poly64x1x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3di ((const __builtin_aarch64_simd_di *) __a); + __i.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); + __i.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); + __i.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); -/* vld1_dup */ + return __i; +} -__extension__ extern __inline float16x4_t +__extension__ extern __inline uint8x16x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_f16 (const float16_t* __a) +vld1q_u8_x3 (const uint8_t *__a) { - return vdup_n_f16 (*__a); + uint8x16x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a); + __i.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + __i.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + __i.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return __i; } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int8x16x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_f32 (const float32_t* __a) +vld1q_s8_x3 (const int8_t *__a) { - return vdup_n_f32 (*__a); + int8x16x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a); + __i.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + __i.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + __i.val[2] = (int8x16_t) 
__builtin_aarch64_get_qregciv16qi (__o, 2); + return __i; } -__extension__ extern __inline float64x1_t +__extension__ extern __inline uint16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_f64 (const float64_t* __a) +vld1q_u16_x3 (const uint16_t *__a) { - return vdup_n_f64 (*__a); + uint16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a); + __i.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + __i.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + __i.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return __i; } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline int16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_p8 (const poly8_t* __a) +vld1q_s16_x3 (const int16_t *__a) { - return vdup_n_p8 (*__a); + int16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a); + __i.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + __i.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + __i.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return __i; } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline uint32x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_p16 (const poly16_t* __a) +vld1q_u32_x3 (const uint32_t *__a) { - return vdup_n_p16 (*__a); + uint32x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a); + __i.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + __i.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + __i.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return __i; } -__extension__ extern __inline poly64x1_t +__extension__ extern __inline int32x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_p64 (const poly64_t* __a) +vld1q_s32_x3 (const int32_t *__a) { - return vdup_n_p64 (*__a); + int32x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4si ((const __builtin_aarch64_simd_si *) __a); + __i.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + __i.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + __i.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return __i; } -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_s8 (const int8_t* __a) +vld1q_u64_x3 (const uint64_t *__a) { - return vdup_n_s8 (*__a); + uint64x2x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a); + __i.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); + __i.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + __i.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return __i; } -__extension__ extern __inline int16x4_t +__extension__ extern __inline int64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_s16 (const int16_t* __a) +vld1q_s64_x3 (const int64_t *__a) { - return vdup_n_s16 (*__a); + int64x2x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a); + __i.val[0] = (int64x2_t) 
__builtin_aarch64_get_qregciv2di (__o, 0); + __i.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + __i.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return __i; } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_s32 (const int32_t* __a) +vld1q_f16_x3 (const float16_t *__a) { - return vdup_n_s32 (*__a); + float16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8hf ((const __builtin_aarch64_simd_hf *) __a); + __i.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0); + __i.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1); + __i.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2); + return __i; } -__extension__ extern __inline int64x1_t +__extension__ extern __inline float32x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_s64 (const int64_t* __a) +vld1q_f32_x3 (const float32_t *__a) { - return vdup_n_s64 (*__a); + float32x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4sf ((const __builtin_aarch64_simd_sf *) __a); + __i.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0); + __i.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1); + __i.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2); + return __i; } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_u8 (const uint8_t* __a) +vld1q_f64_x3 (const float64_t *__a) { - return vdup_n_u8 (*__a); + float64x2x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v2df ((const __builtin_aarch64_simd_df *) __a); + __i.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0); + __i.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1); + __i.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2); + return __i; } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline poly8x16x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_u16 (const uint16_t* __a) +vld1q_p8_x3 (const poly8_t *__a) { - return vdup_n_u16 (*__a); + poly8x16x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v16qi ((const __builtin_aarch64_simd_qi *) __a); + __i.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + __i.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + __i.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return __i; } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline poly16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_u32 (const uint32_t* __a) +vld1q_p16_x3 (const poly16_t *__a) { - return vdup_n_u32 (*__a); + poly16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8hi ((const __builtin_aarch64_simd_hi *) __a); + __i.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + __i.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + __i.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return __i; } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline poly64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_dup_u64 (const uint64_t* __a) +vld1q_p64_x3 (const poly64_t *__a) { - return vdup_n_u64 (*__a); + 
poly64x2x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v2di ((const __builtin_aarch64_simd_di *) __a); + __i.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); + __i.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + __i.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return __i; } -/* vld1q_dup */ +/* vld1q */ __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_f16 (const float16_t* __a) +vld1q_f16 (const float16_t *__a) { - return vdupq_n_f16 (*__a); + return __builtin_aarch64_ld1v8hf (__a); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_f32 (const float32_t* __a) +vld1q_f32 (const float32_t *__a) { - return vdupq_n_f32 (*__a); + return __builtin_aarch64_ld1v4sf ((const __builtin_aarch64_simd_sf *) __a); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_f64 (const float64_t* __a) +vld1q_f64 (const float64_t *__a) { - return vdupq_n_f64 (*__a); + return __builtin_aarch64_ld1v2df ((const __builtin_aarch64_simd_df *) __a); } __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_p8 (const poly8_t* __a) +vld1q_p8 (const poly8_t *__a) { - return vdupq_n_p8 (*__a); + return (poly8x16_t) + __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_p16 (const poly16_t* __a) +vld1q_p16 (const poly16_t *__a) { - return vdupq_n_p16 (*__a); + return (poly16x8_t) + __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_p64 (const poly64_t* __a) +vld1q_p64 (const poly64_t *__a) { - return vdupq_n_p64 (*__a); + return (poly64x2_t) + __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); } - __extension__ extern __inline int8x16_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_s8 (const int8_t* __a) +vld1q_s8 (const int8_t *__a) { - return vdupq_n_s8 (*__a); + return __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_s16 (const int16_t* __a) +vld1q_s16 (const int16_t *__a) { - return vdupq_n_s16 (*__a); + return __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_s32 (const int32_t* __a) +vld1q_s32 (const int32_t *__a) { - return vdupq_n_s32 (*__a); + return __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_s64 (const int64_t* __a) +vld1q_s64 (const int64_t *__a) { - return vdupq_n_s64 (*__a); + return __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_u8 (const uint8_t* __a) +vld1q_u8 (const uint8_t *__a) { - return vdupq_n_u8 (*__a); + return 
(uint8x16_t) + __builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) __a); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_u16 (const uint16_t* __a) +vld1_u8_x2 (const uint8_t *__a) { - return vdupq_n_u16 (*__a); + uint8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_u32 (const uint32_t* __a) +vld1_s8_x2 (const int8_t *__a) { - return vdupq_n_u32 (*__a); + int8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline uint16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_dup_u64 (const uint64_t* __a) +vld1_u16_x2 (const uint16_t *__a) { - return vdupq_n_u64 (*__a); + uint16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; } -/* vld1_lane */ - -__extension__ extern __inline float16x4_t +__extension__ extern __inline int16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane) +vld1_s16_x2 (const int16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + int16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; } -__extension__ extern __inline float32x2_t +__extension__ extern __inline uint32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane) +vld1_u32_x2 (const uint32_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + uint32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + return ret; } -__extension__ extern __inline float64x1_t +__extension__ extern __inline int32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane) +vld1_s32_x2 (const int32_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); -} - -__extension__ extern __inline poly8x8_t + int32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = 
(int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + return ret; +} + +__extension__ extern __inline uint64x1x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_p8 (const poly8_t *__src, poly8x8_t __vec, const int __lane) +vld1_u64_x2 (const uint64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + uint64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return ret; } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline int64x1x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane) +vld1_s64_x2 (const int64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + int64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return ret; } -__extension__ extern __inline poly64x1_t +__extension__ extern __inline float16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_p64 (const poly64_t *__src, poly64x1_t __vec, const int __lane) +vld1_f16_x2 (const float16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + float16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 0); + ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1); + return ret; } -__extension__ extern __inline int8x8_t +__extension__ extern __inline float32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane) +vld1_f32_x2 (const float32_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + float32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1); + return ret; } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float64x1x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane) +vld1_f64_x2 (const float64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + float64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)}; + ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)}; + return ret; } -__extension__ extern __inline int32x2_t +__extension__ extern __inline poly8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane) +vld1_p8_x2 (const poly8_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + poly8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = 
(poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; } -__extension__ extern __inline int64x1_t +__extension__ extern __inline poly16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane) +vld1_p16_x2 (const poly16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + poly16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline poly64x1x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane) +vld1_p64_x2 (const poly64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + poly64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return ret; } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane) +vld1q_u8_x2 (const uint8_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + uint8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane) +vld1q_s8_x2 (const int8_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + int8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline uint16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane) +vld1q_u16_x2 (const uint16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + uint16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; } -/* vld1q_lane */ - -__extension__ extern __inline float16x8_t +__extension__ extern __inline int16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane) +vld1q_s16_x2 (const int16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + 
int16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; } -__extension__ extern __inline float32x4_t +__extension__ extern __inline uint32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane) +vld1q_u32_x2 (const uint32_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + uint32x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; } -__extension__ extern __inline float64x2_t +__extension__ extern __inline int32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane) +vld1q_s32_x2 (const int32_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + int32x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline uint64x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane) +vld1q_u64_x2 (const uint64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + uint64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); + return ret; } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline int64x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane) +vld1q_s64_x2 (const int64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + int64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); + return ret; } -__extension__ extern __inline poly64x2_t +__extension__ extern __inline float16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_p64 (const poly64_t *__src, poly64x2_t __vec, const int __lane) +vld1q_f16_x2 (const float16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + float16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 1); + return ret; } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_s8 (const int8_t *__src, 
int8x16_t __vec, const int __lane) +vld1q_f32_x2 (const float32_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + float32x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1); + return ret; } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float64x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane) +vld1q_f64_x2 (const float64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + float64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1); + return ret; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline poly8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane) +vld1q_p8_x2 (const poly8_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + poly8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; } -__extension__ extern __inline int64x2_t +__extension__ extern __inline poly16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane) +vld1q_p16_x2 (const poly16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + poly16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline poly64x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane) +vld1q_p64_x2 (const poly64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + poly64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); + return ret; } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane) +vld1q_u16 (const uint16_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (uint16x8_t) + __builtin_aarch64_ld1v8hi ((const __builtin_aarch64_simd_hi *) __a); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane) +vld1q_u32 (const uint32_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return 
(uint32x4_t) + __builtin_aarch64_ld1v4si ((const __builtin_aarch64_simd_si *) __a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld1q_lane_u64 (const uint64_t *__src, uint64x2_t __vec, const int __lane) +vld1q_u64 (const uint64_t *__a) { - return __aarch64_vset_lane_any (*__src, __vec, __lane); + return (uint64x2_t) + __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) __a); } -/* vldn */ +/* vld1(q)_x4. */ -__extension__ extern __inline int64x1x2_t +__extension__ extern __inline int8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_s64 (const int64_t * __a) +vld1_s8_x4 (const int8_t *__a) { - int64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); - ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); - return ret; + union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; } -__extension__ extern __inline uint64x1x2_t +__extension__ extern __inline int8x16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_u64 (const uint64_t * __a) +vld1q_s8_x4 (const int8_t *__a) { - uint64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); - ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); - return ret; + union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; } -__extension__ extern __inline float64x1x2_t +__extension__ extern __inline int16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_f64 (const float64_t * __a) +vld1_s16_x4 (const int16_t *__a) { - float64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)}; - ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)}; - return ret; + union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; } -__extension__ extern __inline int8x8x2_t +__extension__ extern __inline int16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_s8 (const int8_t * __a) +vld1q_s16_x4 (const int16_t *__a) { - int8x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); - ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); - return ret; + union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; } -__extension__ extern __inline poly8x8x2_t +__extension__ extern __inline int32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_p8 (const poly8_t * __a) +vld1_s32_x4 (const int32_t *__a) { - poly8x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); - 
ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); - ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); - return ret; + union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); + return __au.__i; } -__extension__ extern __inline poly64x1x2_t +__extension__ extern __inline int32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_p64 (const poly64_t * __a) +vld1q_s32_x4 (const int32_t *__a) { - poly64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0); - ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1); - return ret; + union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4si ((const __builtin_aarch64_simd_si *) __a); + return __au.__i; } -__extension__ extern __inline int16x4x2_t +__extension__ extern __inline uint8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_s16 (const int16_t * __a) +vld1_u8_x4 (const uint8_t *__a) { - int16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); - return ret; + union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; } -__extension__ extern __inline poly16x4x2_t +__extension__ extern __inline uint8x16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_p16 (const poly16_t * __a) +vld1q_u8_x4 (const uint8_t *__a) { - poly16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); - return ret; + union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; } -__extension__ extern __inline int32x2x2_t +__extension__ extern __inline uint16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_s32 (const int32_t * __a) +vld1_u16_x4 (const uint16_t *__a) { - int32x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); - ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); - return ret; + union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; } -__extension__ extern __inline uint8x8x2_t +__extension__ extern __inline uint16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_u8 (const uint8_t * __a) +vld1q_u16_x4 (const uint16_t *__a) { - uint8x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); - ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); - 
return ret; + union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; } -__extension__ extern __inline uint16x4x2_t +__extension__ extern __inline uint32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_u16 (const uint16_t * __a) +vld1_u32_x4 (const uint32_t *__a) { - uint16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); - return ret; + union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2si ((const __builtin_aarch64_simd_si *) __a); + return __au.__i; } -__extension__ extern __inline uint32x2x2_t +__extension__ extern __inline uint32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_u32 (const uint32_t * __a) +vld1q_u32_x4 (const uint32_t *__a) { - uint32x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); - ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); - return ret; + union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4si ((const __builtin_aarch64_simd_si *) __a); + return __au.__i; } -__extension__ extern __inline float16x4x2_t +__extension__ extern __inline float16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_f16 (const float16_t * __a) +vld1_f16_x4 (const float16_t *__a) { - float16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4hf (__a); - ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1); - return ret; + union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hf ((const __builtin_aarch64_simd_hf *) __a); + return __au.__i; } -__extension__ extern __inline float32x2x2_t +__extension__ extern __inline float16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_f32 (const float32_t * __a) +vld1q_f16_x4 (const float16_t *__a) { - float32x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0); - ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1); - return ret; + union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hf ((const __builtin_aarch64_simd_hf *) __a); + return __au.__i; } -__extension__ extern __inline int8x16x2_t +__extension__ extern __inline float32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_s8 (const int8_t * __a) +vld1_f32_x4 (const float32_t *__a) { - int8x16x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); - ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); - return ret; + union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2sf ((const __builtin_aarch64_simd_sf *) __a); + 
return __au.__i; } -__extension__ extern __inline poly8x16x2_t +__extension__ extern __inline float32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_p8 (const poly8_t * __a) +vld1q_f32_x4 (const float32_t *__a) { - poly8x16x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); - ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); - return ret; + union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4sf ((const __builtin_aarch64_simd_sf *) __a); + return __au.__i; } -__extension__ extern __inline int16x8x2_t +__extension__ extern __inline poly8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_s16 (const int16_t * __a) +vld1_p8_x4 (const poly8_t *__a) { - int16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); - ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); - return ret; + union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; } -__extension__ extern __inline poly16x8x2_t +__extension__ extern __inline poly8x16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_p16 (const poly16_t * __a) +vld1q_p8_x4 (const poly8_t *__a) { - poly16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); - ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); - return ret; + union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v16qi ((const __builtin_aarch64_simd_qi *) __a); + return __au.__i; } -__extension__ extern __inline poly64x2x2_t +__extension__ extern __inline poly16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_p64 (const poly64_t * __a) +vld1_p16_x4 (const poly16_t *__a) { - poly64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0); - ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1); - return ret; + union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; } -__extension__ extern __inline int32x4x2_t +__extension__ extern __inline poly16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_s32 (const int32_t * __a) +vld1q_p16_x4 (const poly16_t *__a) { - int32x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); - ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); - return ret; + union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8hi ((const __builtin_aarch64_simd_hi *) __a); + return __au.__i; } -__extension__ extern __inline int64x2x2_t +__extension__ extern __inline int64x1x4_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_s64 (const int64_t * __a) +vld1_s64_x4 (const int64_t *__a) { - int64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); - ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); - return ret; + union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; } -__extension__ extern __inline uint8x16x2_t +__extension__ extern __inline uint64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_u8 (const uint8_t * __a) +vld1_u64_x4 (const uint64_t *__a) { - uint8x16x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); - ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); - return ret; + union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; } -__extension__ extern __inline uint16x8x2_t +__extension__ extern __inline poly64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_u16 (const uint16_t * __a) +vld1_p64_x4 (const poly64_t *__a) { - uint16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); - ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); - return ret; + union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; } -__extension__ extern __inline uint32x4x2_t +__extension__ extern __inline int64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_u32 (const uint32_t * __a) +vld1q_s64_x4 (const int64_t *__a) { - uint32x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); - ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); - return ret; + union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; } -__extension__ extern __inline uint64x2x2_t +__extension__ extern __inline uint64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_u64 (const uint64_t * __a) +vld1q_u64_x4 (const uint64_t *__a) { - uint64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); - ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); - return ret; + union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; } -__extension__ extern __inline float16x8x2_t +__extension__ extern __inline poly64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_f16 (const float16_t * __a) +vld1q_p64_x4 (const poly64_t *__a) { - 
float16x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v8hf (__a); - ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1); - return ret; + union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2di ((const __builtin_aarch64_simd_di *) __a); + return __au.__i; } -__extension__ extern __inline float32x4x2_t +__extension__ extern __inline float64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_f32 (const float32_t * __a) +vld1_f64_x4 (const float64_t *__a) { - float32x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0); - ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1); - return ret; + union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4df ((const __builtin_aarch64_simd_df *) __a); + return __au.__i; } -__extension__ extern __inline float64x2x2_t +__extension__ extern __inline float64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_f64 (const float64_t * __a) +vld1q_f64_x4 (const float64_t *__a) { - float64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0); - ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1); - return ret; + union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v2df ((const __builtin_aarch64_simd_df *) __a); + return __au.__i; } -__extension__ extern __inline int64x1x3_t +/* vld1_dup */ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_s64 (const int64_t * __a) +vld1_dup_f16 (const float16_t* __a) { - int64x1x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); - ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); - ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); - return ret; + return vdup_n_f16 (*__a); } -__extension__ extern __inline uint64x1x3_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_u64 (const uint64_t * __a) +vld1_dup_f32 (const float32_t* __a) { - uint64x1x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); - ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); - ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); - return ret; + return vdup_n_f32 (*__a); } -__extension__ extern __inline float64x1x3_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_f64 (const float64_t * __a) +vld1_dup_f64 (const float64_t* __a) { - float64x1x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)}; - ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)}; - ret.val[2] = (float64x1_t) 
{__builtin_aarch64_get_dregcidf (__o, 2)}; - return ret; + return vdup_n_f64 (*__a); } -__extension__ extern __inline int8x8x3_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_s8 (const int8_t * __a) +vld1_dup_p8 (const poly8_t* __a) { - int8x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); - ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); - ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); - return ret; + return vdup_n_p8 (*__a); } -__extension__ extern __inline poly8x8x3_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_p8 (const poly8_t * __a) +vld1_dup_p16 (const poly16_t* __a) { - poly8x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); - ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); - ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); - return ret; + return vdup_n_p16 (*__a); } -__extension__ extern __inline int16x4x3_t +__extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_s16 (const int16_t * __a) +vld1_dup_p64 (const poly64_t* __a) { - int16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); - ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); - ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); - return ret; + return vdup_n_p64 (*__a); } -__extension__ extern __inline poly16x4x3_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_p16 (const poly16_t * __a) +vld1_dup_s8 (const int8_t* __a) { - poly16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); - ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); - ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); - return ret; + return vdup_n_s8 (*__a); } -__extension__ extern __inline int32x2x3_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_s32 (const int32_t * __a) +vld1_dup_s16 (const int16_t* __a) { - int32x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); - ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); - ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); - return ret; + return vdup_n_s16 (*__a); } -__extension__ extern __inline uint8x8x3_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_u8 (const uint8_t * __a) +vld1_dup_s32 (const int32_t* __a) { - uint8x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); - ret.val[1] = (uint8x8_t) 
__builtin_aarch64_get_dregciv8qi (__o, 1); - ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); - return ret; + return vdup_n_s32 (*__a); } -__extension__ extern __inline uint16x4x3_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_u16 (const uint16_t * __a) +vld1_dup_s64 (const int64_t* __a) { - uint16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); - ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); - ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); - return ret; + return vdup_n_s64 (*__a); } -__extension__ extern __inline uint32x2x3_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_u32 (const uint32_t * __a) +vld1_dup_u8 (const uint8_t* __a) { - uint32x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); - ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); - ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); - return ret; + return vdup_n_u8 (*__a); } -__extension__ extern __inline float16x4x3_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_f16 (const float16_t * __a) +vld1_dup_u16 (const uint16_t* __a) { - float16x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4hf (__a); - ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1); - ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2); - return ret; + return vdup_n_u16 (*__a); } -__extension__ extern __inline float32x2x3_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_f32 (const float32_t * __a) +vld1_dup_u32 (const uint32_t* __a) { - float32x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0); - ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1); - ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2); - return ret; + return vdup_n_u32 (*__a); } -__extension__ extern __inline poly64x1x3_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_p64 (const poly64_t * __a) +vld1_dup_u64 (const uint64_t* __a) { - poly64x1x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0); - ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1); - ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2); - return ret; + return vdup_n_u64 (*__a); } -__extension__ extern __inline int8x16x3_t +/* vld1q_dup */ + +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_s8 (const int8_t * __a) +vld1q_dup_f16 (const float16_t* __a) { - int8x16x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x16_t) 
__builtin_aarch64_get_qregciv16qi (__o, 0); - ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); - ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); - return ret; + return vdupq_n_f16 (*__a); } -__extension__ extern __inline poly8x16x3_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_p8 (const poly8_t * __a) +vld1q_dup_f32 (const float32_t* __a) { - poly8x16x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); - ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); - ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); - return ret; + return vdupq_n_f32 (*__a); } -__extension__ extern __inline int16x8x3_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_s16 (const int16_t * __a) +vld1q_dup_f64 (const float64_t* __a) { - int16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); - ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); - ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); - return ret; + return vdupq_n_f64 (*__a); } -__extension__ extern __inline poly16x8x3_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_p16 (const poly16_t * __a) +vld1q_dup_p8 (const poly8_t* __a) { - poly16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); - ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); - ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); - return ret; + return vdupq_n_p8 (*__a); } -__extension__ extern __inline int32x4x3_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_s32 (const int32_t * __a) +vld1q_dup_p16 (const poly16_t* __a) { - int32x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); - ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); - ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); - return ret; + return vdupq_n_p16 (*__a); } -__extension__ extern __inline int64x2x3_t +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_s64 (const int64_t * __a) +vld1q_dup_p64 (const poly64_t* __a) { - int64x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); - ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); - ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); - return ret; + return vdupq_n_p64 (*__a); } -__extension__ extern __inline uint8x16x3_t + __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_u8 (const uint8_t * __a) +vld1q_dup_s8 (const int8_t* __a) { - uint8x16x3_t ret; - 
__builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); - ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); - ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); - return ret; + return vdupq_n_s8 (*__a); } -__extension__ extern __inline uint16x8x3_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_u16 (const uint16_t * __a) +vld1q_dup_s16 (const int16_t* __a) { - uint16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); - ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); - ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); - return ret; + return vdupq_n_s16 (*__a); } -__extension__ extern __inline uint32x4x3_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_u32 (const uint32_t * __a) +vld1q_dup_s32 (const int32_t* __a) { - uint32x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); - ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); - ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); - return ret; + return vdupq_n_s32 (*__a); } -__extension__ extern __inline uint64x2x3_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_u64 (const uint64_t * __a) +vld1q_dup_s64 (const int64_t* __a) { - uint64x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); - ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); - ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); - return ret; + return vdupq_n_s64 (*__a); } -__extension__ extern __inline float16x8x3_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_f16 (const float16_t * __a) +vld1q_dup_u8 (const uint8_t* __a) { - float16x8x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v8hf (__a); - ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1); - ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2); - return ret; + return vdupq_n_u8 (*__a); } -__extension__ extern __inline float32x4x3_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_f32 (const float32_t * __a) +vld1q_dup_u16 (const uint16_t* __a) { - float32x4x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0); - ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1); - ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2); - return ret; + return vdupq_n_u16 (*__a); } -__extension__ extern __inline float64x2x3_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_f64 (const float64_t * 
__a) +vld1q_dup_u32 (const uint32_t* __a) { - float64x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0); - ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1); - ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2); - return ret; + return vdupq_n_u32 (*__a); } -__extension__ extern __inline poly64x2x3_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_p64 (const poly64_t * __a) +vld1q_dup_u64 (const uint64_t* __a) { - poly64x2x3_t ret; - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0); - ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1); - ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2); - return ret; + return vdupq_n_u64 (*__a); } -__extension__ extern __inline int64x1x4_t +/* vld1_lane */ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_s64 (const int64_t * __a) +vld1_lane_f16 (const float16_t *__src, float16x4_t __vec, const int __lane) { - int64x1x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); - ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); - ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); - ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint64x1x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_u64 (const uint64_t * __a) +vld1_lane_f32 (const float32_t *__src, float32x2_t __vec, const int __lane) { - uint64x1x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); - ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); - ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); - ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float64x1x4_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_f64 (const float64_t * __a) +vld1_lane_f64 (const float64_t *__src, float64x1_t __vec, const int __lane) { - float64x1x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)}; - ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)}; - ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)}; - ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)}; - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline int8x8x4_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_s8 (const int8_t * __a) +vld1_lane_p8 (const poly8_t *__src, poly8x8_t 
__vec, const int __lane) { - int8x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); - ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); - ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); - ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline poly8x8x4_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_p8 (const poly8_t * __a) +vld1_lane_p16 (const poly16_t *__src, poly16x4_t __vec, const int __lane) { - poly8x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); - ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); - ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); - ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline int16x4x4_t +__extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_s16 (const int16_t * __a) +vld1_lane_p64 (const poly64_t *__src, poly64x1_t __vec, const int __lane) { - int16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); - ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); - ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); - ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline poly16x4x4_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_p16 (const poly16_t * __a) +vld1_lane_s8 (const int8_t *__src, int8x8_t __vec, const int __lane) { - poly16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); - ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); - ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); - ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline int32x2x4_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_s32 (const int32_t * __a) +vld1_lane_s16 (const int16_t *__src, int16x4_t __vec, const int __lane) { - int32x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); - ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); - ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); - ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint8x8x4_t +__extension__ extern __inline 
int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_u8 (const uint8_t * __a) +vld1_lane_s32 (const int32_t *__src, int32x2_t __vec, const int __lane) { - uint8x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); - ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); - ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); - ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint16x4x4_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_u16 (const uint16_t * __a) +vld1_lane_s64 (const int64_t *__src, int64x1_t __vec, const int __lane) { - uint16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); - ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); - ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); - ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint32x2x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_u32 (const uint32_t * __a) +vld1_lane_u8 (const uint8_t *__src, uint8x8_t __vec, const int __lane) { - uint32x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); - ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); - ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); - ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float16x4x4_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_f16 (const float16_t * __a) +vld1_lane_u16 (const uint16_t *__src, uint16x4_t __vec, const int __lane) { - float16x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4hf (__a); - ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1); - ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2); - ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float32x2x4_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_f32 (const float32_t * __a) +vld1_lane_u32 (const uint32_t *__src, uint32x2_t __vec, const int __lane) { - float32x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0); - ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1); - ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2); - ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3); - return ret; + return 
__aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline poly64x1x4_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_p64 (const poly64_t * __a) +vld1_lane_u64 (const uint64_t *__src, uint64x1_t __vec, const int __lane) { - poly64x1x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0); - ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1); - ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2); - ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline int8x16x4_t +/* vld1q_lane */ + +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_s8 (const int8_t * __a) +vld1q_lane_f16 (const float16_t *__src, float16x8_t __vec, const int __lane) { - int8x16x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); - ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); - ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); - ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline poly8x16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_p8 (const poly8_t * __a) +vld1q_lane_f32 (const float32_t *__src, float32x4_t __vec, const int __lane) { - poly8x16x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); - ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); - ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); - ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline int16x8x4_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_s16 (const int16_t * __a) +vld1q_lane_f64 (const float64_t *__src, float64x2_t __vec, const int __lane) { - int16x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); - ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); - ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); - ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline poly16x8x4_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_p16 (const poly16_t * __a) +vld1q_lane_p8 (const poly8_t *__src, poly8x16_t __vec, const int __lane) { - poly16x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 
0); - ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); - ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); - ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline int32x4x4_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_s32 (const int32_t * __a) +vld1q_lane_p16 (const poly16_t *__src, poly16x8_t __vec, const int __lane) { - int32x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); - ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); - ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); - ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline int64x2x4_t +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_s64 (const int64_t * __a) +vld1q_lane_p64 (const poly64_t *__src, poly64x2_t __vec, const int __lane) { - int64x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); - ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); - ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); - ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint8x16x4_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_u8 (const uint8_t * __a) +vld1q_lane_s8 (const int8_t *__src, int8x16_t __vec, const int __lane) { - uint8x16x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); - ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); - ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); - ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint16x8x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_u16 (const uint16_t * __a) +vld1q_lane_s16 (const int16_t *__src, int16x8_t __vec, const int __lane) { - uint16x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); - ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); - ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); - ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint32x4x4_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_u32 (const uint32_t * __a) +vld1q_lane_s32 (const int32_t *__src, int32x4_t __vec, const int __lane) { - 
uint32x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); - ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); - ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); - ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); - ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline uint64x2x4_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_u64 (const uint64_t * __a) +vld1q_lane_s64 (const int64_t *__src, int64x2_t __vec, const int __lane) { - uint64x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); - ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); - ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); - ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float16x8x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_f16 (const float16_t * __a) +vld1q_lane_u8 (const uint8_t *__src, uint8x16_t __vec, const int __lane) { - float16x8x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v8hf (__a); - ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0); - ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1); - ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2); - ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float32x4x4_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_f32 (const float32_t * __a) +vld1q_lane_u16 (const uint16_t *__src, uint16x8_t __vec, const int __lane) { - float32x4x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0); - ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1); - ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2); - ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3); - return ret; + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float64x2x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_f64 (const float64_t * __a) +vld1q_lane_u32 (const uint32_t *__src, uint32x4_t __vec, const int __lane) { - float64x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0); - ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1); - ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2); - ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3); + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_lane_u64 (const 
uint64_t *__src, uint64x2_t __vec, const int __lane) +{ + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +/* vldn */ + +__extension__ extern __inline int64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_s64 (const int64_t * __a) +{ + int64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); return ret; } -__extension__ extern __inline poly64x2x4_t +__extension__ extern __inline uint64x1x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_p64 (const poly64_t * __a) +vld2_u64 (const uint64_t * __a) { - poly64x2x4_t ret; - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0); - ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1); - ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2); - ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3); + uint64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); return ret; } -/* vldn_dup */ +__extension__ extern __inline float64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_f64 (const float64_t * __a) +{ + float64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)}; + ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)}; + return ret; +} __extension__ extern __inline int8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_s8 (const int8_t * __a) +vld2_s8 (const int8_t * __a) { int8x8x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); return ret; } -__extension__ extern __inline int16x4x2_t +__extension__ extern __inline poly8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_s16 (const int16_t * __a) +vld2_p8 (const poly8_t * __a) { - int16x4x2_t ret; + poly8x8x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); return ret; } -__extension__ extern __inline int32x2x2_t +__extension__ extern __inline poly64x1x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_s32 (const int32_t * __a) +vld2_p64 (const poly64_t * __a) { - int32x2x2_t ret; + poly64x1x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2si ((const 
__builtin_aarch64_simd_si *) __a); - ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); - ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + __o = __builtin_aarch64_ld2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1); return ret; } -__extension__ extern __inline float16x4x2_t +__extension__ extern __inline int16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_f16 (const float16_t * __a) +vld2_s16 (const int16_t * __a) { - float16x4x2_t ret; + int16x4x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a); - ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0); - ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1); + __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); return ret; } -__extension__ extern __inline float32x2x2_t +__extension__ extern __inline poly16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_f32 (const float32_t * __a) +vld2_p16 (const poly16_t * __a) { - float32x2x2_t ret; + poly16x4x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2sf ((const __builtin_aarch64_simd_sf *) __a); - ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0); - ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1); + __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); return ret; } -__extension__ extern __inline float64x1x2_t +__extension__ extern __inline int32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_f64 (const float64_t * __a) +vld2_s32 (const int32_t * __a) { - float64x1x2_t ret; + int32x2x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rdf ((const __builtin_aarch64_simd_df *) __a); - ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)}; - ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)}; + __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); return ret; } __extension__ extern __inline uint8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_u8 (const uint8_t * __a) +vld2_u8 (const uint8_t * __a) { uint8x8x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld2v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); return ret; @@ -18834,11 +17298,11 @@ vld2_dup_u8 (const uint8_t * __a) __extension__ extern __inline uint16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_u16 (const uint16_t * __a) +vld2_u16 (const uint16_t * __a) { uint16x4x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); 
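
Not part of the patch, a reviewer sketch: the vld1q_lane_* bodies above all reduce to __aarch64_vset_lane_any, i.e. "load one element from memory and replace one lane of an existing vector". A minimal illustrative use (the function name patch_lane2 is made up):

#include <arm_neon.h>

/* Replace lane 2 of __v with the value at *__p; all other lanes of
   __v are returned unchanged.  */
float32x4_t
patch_lane2 (const float32_t *__p, float32x4_t __v)
{
  return vld1q_lane_f32 (__p, __v, 2);
}
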
+ __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); return ret; @@ -18846,84 +17310,47 @@ vld2_dup_u16 (const uint16_t * __a) __extension__ extern __inline uint32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_u32 (const uint32_t * __a) +vld2_u32 (const uint32_t * __a) { uint32x2x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); return ret; } -__extension__ extern __inline poly8x8x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_p8 (const poly8_t * __a) -{ - poly8x8x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); - ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); - ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); - return ret; -} - -__extension__ extern __inline poly16x4x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_p16 (const poly16_t * __a) -{ - poly16x4x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); - ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); - ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); - return ret; -} - -__extension__ extern __inline poly64x1x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_p64 (const poly64_t * __a) -{ - poly64x1x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0); - ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1); - return ret; -} - - -__extension__ extern __inline int64x1x2_t +__extension__ extern __inline float16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_s64 (const int64_t * __a) +vld2_f16 (const float16_t * __a) { - int64x1x2_t ret; + float16x4x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); - ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + __o = __builtin_aarch64_ld2v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1); return ret; } -__extension__ extern __inline uint64x1x2_t +__extension__ extern __inline float32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2_dup_u64 (const uint64_t * __a) +vld2_f32 (const float32_t * __a) { - uint64x1x2_t ret; + float32x2x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); - ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + __o = __builtin_aarch64_ld2v2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0); + ret.val[1] = (float32x2_t) 
__builtin_aarch64_get_dregoiv2sf (__o, 1); return ret; } __extension__ extern __inline int8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_s8 (const int8_t * __a) +vld2q_s8 (const int8_t * __a) { int8x16x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); return ret; @@ -18931,11 +17358,11 @@ vld2q_dup_s8 (const int8_t * __a) __extension__ extern __inline poly8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_p8 (const poly8_t * __a) +vld2q_p8 (const poly8_t * __a) { poly8x16x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); return ret; @@ -18943,11 +17370,11 @@ vld2q_dup_p8 (const poly8_t * __a) __extension__ extern __inline int16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_s16 (const int16_t * __a) +vld2q_s16 (const int16_t * __a) { int16x8x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); return ret; @@ -18955,23 +17382,35 @@ vld2q_dup_s16 (const int16_t * __a) __extension__ extern __inline poly16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_p16 (const poly16_t * __a) +vld2q_p16 (const poly16_t * __a) { poly16x8x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); return ret; } +__extension__ extern __inline poly64x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_p64 (const poly64_t * __a) +{ + poly64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1); + return ret; +} + __extension__ extern __inline int32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_s32 (const int32_t * __a) +vld2q_s32 (const int32_t * __a) { int32x4x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); return ret; @@ -18979,11 +17418,11 @@ vld2q_dup_s32 (const int32_t * __a) __extension__ extern __inline int64x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_s64 
(const int64_t * __a) +vld2q_s64 (const int64_t * __a) { int64x2x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); return ret; @@ -18991,11 +17430,11 @@ vld2q_dup_s64 (const int64_t * __a) __extension__ extern __inline uint8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_u8 (const uint8_t * __a) +vld2q_u8 (const uint8_t * __a) { uint8x16x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); return ret; @@ -19003,11 +17442,11 @@ vld2q_dup_u8 (const uint8_t * __a) __extension__ extern __inline uint16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_u16 (const uint16_t * __a) +vld2q_u16 (const uint16_t * __a) { uint16x8x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); return ret; @@ -19015,11 +17454,11 @@ vld2q_dup_u16 (const uint16_t * __a) __extension__ extern __inline uint32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_u32 (const uint32_t * __a) +vld2q_u32 (const uint32_t * __a) { uint32x4x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); return ret; @@ -19027,11 +17466,11 @@ vld2q_dup_u32 (const uint32_t * __a) __extension__ extern __inline uint64x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_u64 (const uint64_t * __a) +vld2q_u64 (const uint64_t * __a) { uint64x2x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); return ret; @@ -19039,23 +17478,23 @@ vld2q_dup_u64 (const uint64_t * __a) __extension__ extern __inline float16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_f16 (const float16_t * __a) +vld2q_f16 (const float16_t * __a) { float16x8x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a); - ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0); + __o = __builtin_aarch64_ld2v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0); ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1); return ret; } __extension__ extern __inline float32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
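
As an aside (illustrative, not from the patch): the vld2q_* loaders above keep the usual LD2 de-interleaving contract, so pairwise-packed data can be split in a single load. A sketch, assuming interleaved re/im floats and a made-up helper name:

#include <arm_neon.h>

/* Split four interleaved complex numbers (re0,im0,re1,im1,...) into
   a vector of real parts and a vector of imaginary parts.  */
void
split_complex4 (const float32_t *__iq, float32x4_t *__re, float32x4_t *__im)
{
  float32x4x2_t __v = vld2q_f32 (__iq);
  *__re = __v.val[0];   /* re0..re3 */
  *__im = __v.val[1];   /* im0..im3 */
}
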
-vld2q_dup_f32 (const float32_t * __a) +vld2q_f32 (const float32_t * __a) { float32x4x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv4sf ((const __builtin_aarch64_simd_sf *) __a); + __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a); ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0); ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1); return ret; @@ -19063,35 +17502,23 @@ vld2q_dup_f32 (const float32_t * __a) __extension__ extern __inline float64x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_f64 (const float64_t * __a) +vld2q_f64 (const float64_t * __a) { float64x2x2_t ret; __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2df ((const __builtin_aarch64_simd_df *) __a); + __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a); ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0); ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1); return ret; } -__extension__ extern __inline poly64x2x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld2q_dup_p64 (const poly64_t * __a) -{ - poly64x2x2_t ret; - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a); - ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0); - ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1); - return ret; -} - __extension__ extern __inline int64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_s64 (const int64_t * __a) +vld3_s64 (const int64_t * __a) { int64x1x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); @@ -19100,11 +17527,11 @@ vld3_dup_s64 (const int64_t * __a) __extension__ extern __inline uint64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_u64 (const uint64_t * __a) +vld3_u64 (const uint64_t * __a) { uint64x1x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0); ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1); ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2); @@ -19113,11 +17540,11 @@ vld3_dup_u64 (const uint64_t * __a) __extension__ extern __inline float64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_f64 (const float64_t * __a) +vld3_f64 (const float64_t * __a) { float64x1x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rdf ((const __builtin_aarch64_simd_df *) __a); + __o = __builtin_aarch64_ld3df ((const __builtin_aarch64_simd_df *) __a); ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)}; ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)}; ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)}; @@ -19126,11 +17553,11 @@ vld3_dup_f64 (const float64_t * __a) __extension__ extern __inline int8x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-vld3_dup_s8 (const int8_t * __a) +vld3_s8 (const int8_t * __a) { int8x8x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); @@ -19139,11 +17566,11 @@ vld3_dup_s8 (const int8_t * __a) __extension__ extern __inline poly8x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_p8 (const poly8_t * __a) +vld3_p8 (const poly8_t * __a) { poly8x8x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); @@ -19152,11 +17579,11 @@ vld3_dup_p8 (const poly8_t * __a) __extension__ extern __inline int16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_s16 (const int16_t * __a) +vld3_s16 (const int16_t * __a) { int16x4x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); @@ -19165,11 +17592,11 @@ vld3_dup_s16 (const int16_t * __a) __extension__ extern __inline poly16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_p16 (const poly16_t * __a) +vld3_p16 (const poly16_t * __a) { poly16x4x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); @@ -19178,11 +17605,11 @@ vld3_dup_p16 (const poly16_t * __a) __extension__ extern __inline int32x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_s32 (const int32_t * __a) +vld3_s32 (const int32_t * __a) { int32x2x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); @@ -19191,11 +17618,11 @@ vld3_dup_s32 (const int32_t * __a) __extension__ extern __inline uint8x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_u8 (const uint8_t * __a) +vld3_u8 (const uint8_t * __a) { uint8x8x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld3v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (uint8x8_t) 
__builtin_aarch64_get_dregciv8qi (__o, 0); ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); @@ -19204,11 +17631,11 @@ vld3_dup_u8 (const uint8_t * __a) __extension__ extern __inline uint16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_u16 (const uint16_t * __a) +vld3_u16 (const uint16_t * __a) { uint16x4x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld3v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); @@ -19217,11 +17644,11 @@ vld3_dup_u16 (const uint16_t * __a) __extension__ extern __inline uint32x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_u32 (const uint32_t * __a) +vld3_u32 (const uint32_t * __a) { uint32x2x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld3v2si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); @@ -19230,24 +17657,24 @@ vld3_dup_u32 (const uint32_t * __a) __extension__ extern __inline float16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_f16 (const float16_t * __a) +vld3_f16 (const float16_t * __a) { float16x4x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a); - ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0); - ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1); - ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2); + __o = __builtin_aarch64_ld3v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2); return ret; } __extension__ extern __inline float32x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_f32 (const float32_t * __a) +vld3_f32 (const float32_t * __a) { float32x2x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2sf ((const __builtin_aarch64_simd_sf *) __a); + __o = __builtin_aarch64_ld3v2sf ((const __builtin_aarch64_simd_sf *) __a); ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0); ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1); ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2); @@ -19256,11 +17683,11 @@ vld3_dup_f32 (const float32_t * __a) __extension__ extern __inline poly64x1x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3_dup_p64 (const poly64_t * __a) +vld3_p64 (const poly64_t * __a) { poly64x1x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld3di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0); ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1); ret.val[2] = (poly64x1_t) 
__builtin_aarch64_get_dregcidi_pss (__o, 2); @@ -19269,11 +17696,11 @@ vld3_dup_p64 (const poly64_t * __a) __extension__ extern __inline int8x16x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_s8 (const int8_t * __a) +vld3q_s8 (const int8_t * __a) { int8x16x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); @@ -19282,11 +17709,11 @@ vld3q_dup_s8 (const int8_t * __a) __extension__ extern __inline poly8x16x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_p8 (const poly8_t * __a) +vld3q_p8 (const poly8_t * __a) { poly8x16x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); @@ -19295,11 +17722,11 @@ vld3q_dup_p8 (const poly8_t * __a) __extension__ extern __inline int16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_s16 (const int16_t * __a) +vld3q_s16 (const int16_t * __a) { int16x8x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); @@ -19308,11 +17735,11 @@ vld3q_dup_s16 (const int16_t * __a) __extension__ extern __inline poly16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_p16 (const poly16_t * __a) +vld3q_p16 (const poly16_t * __a) { poly16x8x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); @@ -19321,11 +17748,11 @@ vld3q_dup_p16 (const poly16_t * __a) __extension__ extern __inline int32x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_s32 (const int32_t * __a) +vld3q_s32 (const int32_t * __a) { int32x4x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); @@ -19334,11 +17761,11 @@ vld3q_dup_s32 (const int32_t * __a) __extension__ extern __inline int64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_s64 (const int64_t * __a) +vld3q_s64 (const int64_t * 
__a) { int64x2x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); @@ -19347,11 +17774,11 @@ vld3q_dup_s64 (const int64_t * __a) __extension__ extern __inline uint8x16x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_u8 (const uint8_t * __a) +vld3q_u8 (const uint8_t * __a) { uint8x16x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); @@ -19360,11 +17787,11 @@ vld3q_dup_u8 (const uint8_t * __a) __extension__ extern __inline uint16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_u16 (const uint16_t * __a) +vld3q_u16 (const uint16_t * __a) { uint16x8x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); @@ -19373,11 +17800,11 @@ vld3q_dup_u16 (const uint16_t * __a) __extension__ extern __inline uint32x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_u32 (const uint32_t * __a) +vld3q_u32 (const uint32_t * __a) { uint32x4x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); @@ -19386,11 +17813,11 @@ vld3q_dup_u32 (const uint32_t * __a) __extension__ extern __inline uint64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_u64 (const uint64_t * __a) +vld3q_u64 (const uint64_t * __a) { uint64x2x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); @@ -19399,24 +17826,24 @@ vld3q_dup_u64 (const uint64_t * __a) __extension__ extern __inline float16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_f16 (const float16_t * __a) +vld3q_f16 (const float16_t * __a) { float16x8x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a); - ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0); - ret.val[1] = (float16x8_t) 
__builtin_aarch64_get_qregciv8hf (__o, 1); - ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2); + __o = __builtin_aarch64_ld3v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2); return ret; } __extension__ extern __inline float32x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_f32 (const float32_t * __a) +vld3q_f32 (const float32_t * __a) { float32x4x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv4sf ((const __builtin_aarch64_simd_sf *) __a); + __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a); ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0); ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1); ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2); @@ -19425,11 +17852,11 @@ vld3q_dup_f32 (const float32_t * __a) __extension__ extern __inline float64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_f64 (const float64_t * __a) +vld3q_f64 (const float64_t * __a) { float64x2x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2df ((const __builtin_aarch64_simd_df *) __a); + __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a); ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0); ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1); ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2); @@ -19438,11 +17865,11 @@ vld3q_dup_f64 (const float64_t * __a) __extension__ extern __inline poly64x2x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld3q_dup_p64 (const poly64_t * __a) +vld3q_p64 (const poly64_t * __a) { poly64x2x3_t ret; __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0); ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1); ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2); @@ -19451,11 +17878,11 @@ vld3q_dup_p64 (const poly64_t * __a) __extension__ extern __inline int64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_s64 (const int64_t * __a) +vld4_s64 (const int64_t * __a) { int64x1x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); @@ -19465,11 +17892,11 @@ vld4_dup_s64 (const int64_t * __a) __extension__ extern __inline uint64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_u64 (const uint64_t * __a) +vld4_u64 (const uint64_t * __a) { uint64x1x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); ret.val[2] = (uint64x1_t) 
__builtin_aarch64_get_dregxidi (__o, 2); @@ -19479,11 +17906,11 @@ vld4_dup_u64 (const uint64_t * __a) __extension__ extern __inline float64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_f64 (const float64_t * __a) +vld4_f64 (const float64_t * __a) { float64x1x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rdf ((const __builtin_aarch64_simd_df *) __a); + __o = __builtin_aarch64_ld4df ((const __builtin_aarch64_simd_df *) __a); ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)}; ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)}; ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)}; @@ -19493,11 +17920,11 @@ vld4_dup_f64 (const float64_t * __a) __extension__ extern __inline int8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_s8 (const int8_t * __a) +vld4_s8 (const int8_t * __a) { int8x8x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); @@ -19507,11 +17934,11 @@ vld4_dup_s8 (const int8_t * __a) __extension__ extern __inline poly8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_p8 (const poly8_t * __a) +vld4_p8 (const poly8_t * __a) { poly8x8x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); @@ -19521,11 +17948,11 @@ vld4_dup_p8 (const poly8_t * __a) __extension__ extern __inline int16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_s16 (const int16_t * __a) +vld4_s16 (const int16_t * __a) { int16x4x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); @@ -19535,11 +17962,11 @@ vld4_dup_s16 (const int16_t * __a) __extension__ extern __inline poly16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_p16 (const poly16_t * __a) +vld4_p16 (const poly16_t * __a) { poly16x4x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); @@ -19549,11 +17976,11 @@ vld4_dup_p16 (const poly16_t * __a) __extension__ extern __inline int32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_s32 (const int32_t * __a) +vld4_s32 (const int32_t * __a) { int32x2x4_t ret; 
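  /* Reviewer sketch, not part of the patch: the four-element vld4_*
     loads in these hunks de-interleave e.g. packed RGBA, so

       uint8x8x4_t __px = vld4_u8 (__rgba);

     yields __px.val[0] = R0..R7, val[1] = G, val[2] = B, val[3] = A
     from 32 interleaved bytes (__rgba is an assumed pixel pointer).  */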
__builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); @@ -19563,11 +17990,11 @@ vld4_dup_s32 (const int32_t * __a) __extension__ extern __inline uint8x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_u8 (const uint8_t * __a) +vld4_u8 (const uint8_t * __a) { uint8x8x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld4v8qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); @@ -19577,11 +18004,11 @@ vld4_dup_u8 (const uint8_t * __a) __extension__ extern __inline uint16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_u16 (const uint16_t * __a) +vld4_u16 (const uint16_t * __a) { uint16x4x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld4v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); @@ -19591,11 +18018,11 @@ vld4_dup_u16 (const uint16_t * __a) __extension__ extern __inline uint32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_u32 (const uint32_t * __a) +vld4_u32 (const uint32_t * __a) { uint32x2x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld4v2si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); @@ -19605,25 +18032,25 @@ vld4_dup_u32 (const uint32_t * __a) __extension__ extern __inline float16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_f16 (const float16_t * __a) +vld4_f16 (const float16_t * __a) { float16x4x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a); - ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0); - ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1); - ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2); - ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3); + __o = __builtin_aarch64_ld4v4hf (__a); + ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2); + ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3); return ret; } __extension__ extern __inline float32x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_f32 (const float32_t * __a) +vld4_f32 (const float32_t * __a) { float32x2x4_t ret; __builtin_aarch64_simd_xi __o; - __o = 
__builtin_aarch64_ld4rv2sf ((const __builtin_aarch64_simd_sf *) __a); + __o = __builtin_aarch64_ld4v2sf ((const __builtin_aarch64_simd_sf *) __a); ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0); ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1); ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2); @@ -19633,11 +18060,11 @@ vld4_dup_f32 (const float32_t * __a) __extension__ extern __inline poly64x1x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4_dup_p64 (const poly64_t * __a) +vld4_p64 (const poly64_t * __a) { - poly64x1x4_t ret; + poly64x1x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld4di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 0); ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1); ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2); @@ -19647,11 +18074,11 @@ vld4_dup_p64 (const poly64_t * __a) __extension__ extern __inline int8x16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_s8 (const int8_t * __a) +vld4q_s8 (const int8_t * __a) { int8x16x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); @@ -19661,11 +18088,11 @@ vld4q_dup_s8 (const int8_t * __a) __extension__ extern __inline poly8x16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_p8 (const poly8_t * __a) +vld4q_p8 (const poly8_t * __a) { poly8x16x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); @@ -19675,11 +18102,11 @@ vld4q_dup_p8 (const poly8_t * __a) __extension__ extern __inline int16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_s16 (const int16_t * __a) +vld4q_s16 (const int16_t * __a) { int16x8x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); @@ -19689,11 +18116,11 @@ vld4q_dup_s16 (const int16_t * __a) __extension__ extern __inline poly16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_p16 (const poly16_t * __a) +vld4q_p16 (const poly16_t * __a) { poly16x8x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); ret.val[1] = (poly16x8_t) 
__builtin_aarch64_get_qregxiv8hi (__o, 1); ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); @@ -19703,11 +18130,11 @@ vld4q_dup_p16 (const poly16_t * __a) __extension__ extern __inline int32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_s32 (const int32_t * __a) +vld4q_s32 (const int32_t * __a) { int32x4x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); @@ -19717,11 +18144,11 @@ vld4q_dup_s32 (const int32_t * __a) __extension__ extern __inline int64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_s64 (const int64_t * __a) +vld4q_s64 (const int64_t * __a) { int64x2x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); @@ -19731,11 +18158,11 @@ vld4q_dup_s64 (const int64_t * __a) __extension__ extern __inline uint8x16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_u8 (const uint8_t * __a) +vld4q_u8 (const uint8_t * __a) { uint8x16x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a); + __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a); ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); @@ -19745,11 +18172,11 @@ vld4q_dup_u8 (const uint8_t * __a) __extension__ extern __inline uint16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_u16 (const uint16_t * __a) +vld4q_u16 (const uint16_t * __a) { uint16x8x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a); + __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); @@ -19759,11 +18186,11 @@ vld4q_dup_u16 (const uint16_t * __a) __extension__ extern __inline uint32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_u32 (const uint32_t * __a) +vld4q_u32 (const uint32_t * __a) { uint32x4x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a); + __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a); ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); @@ -19773,11 +18200,11 @@ vld4q_dup_u32 (const uint32_t * __a) __extension__ extern __inline uint64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vld4q_dup_u64 (const uint64_t * __a) +vld4q_u64 (const uint64_t * __a) { uint64x2x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); @@ -19787,25 +18214,25 @@ vld4q_dup_u64 (const uint64_t * __a) __extension__ extern __inline float16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_f16 (const float16_t * __a) +vld4q_f16 (const float16_t * __a) { float16x8x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a); - ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0); - ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1); - ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2); - ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3); + __o = __builtin_aarch64_ld4v8hf (__a); + ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0); + ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1); + ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2); + ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3); return ret; } __extension__ extern __inline float32x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_f32 (const float32_t * __a) +vld4q_f32 (const float32_t * __a) { float32x4x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv4sf ((const __builtin_aarch64_simd_sf *) __a); + __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a); ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0); ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1); ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2); @@ -19815,11 +18242,11 @@ vld4q_dup_f32 (const float32_t * __a) __extension__ extern __inline float64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_f64 (const float64_t * __a) +vld4q_f64 (const float64_t * __a) { float64x2x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv2df ((const __builtin_aarch64_simd_df *) __a); + __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a); ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0); ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1); ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2); @@ -19829,11 +18256,11 @@ vld4q_dup_f64 (const float64_t * __a) __extension__ extern __inline poly64x2x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vld4q_dup_p64 (const poly64_t * __a) +vld4q_p64 (const poly64_t * __a) { - poly64x2x4_t ret; + poly64x2x4_t ret; __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a); ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0); ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1); ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2); @@ -19841,11798 +18268,16267 @@ vld4q_dup_p64 (const poly64_t * __a) return ret; } -/* vld2_lane */ - -#define __LD2_LANE_FUNC(intype, 
vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_oi __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregoi##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_ld2_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \ - return __b; \ +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vldrq_p128 (const poly128_t * __ptr) +{ + return *__ptr; } -__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di, - u64, int64x2_t) +/* vldn_dup */ -#undef __LD2_LANE_FUNC +__extension__ extern __inline int8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_s8 (const int8_t * __a) +{ + int8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; +} -/* vld2q_lane */ +__extension__ extern __inline int16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_s16 (const int16_t * __a) +{ + int16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; +} -#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ 
extern __inline intype \
-__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
-vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
-{ \
-  __builtin_aarch64_simd_oi __o; \
-  intype ret; \
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
-  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
-  __o = __builtin_aarch64_ld2_lane##mode ( \
-    (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
-  ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
-  ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
-  return ret; \
+__extension__ extern __inline int32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_s32 (const int32_t * __a)
+{
+  int32x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
+  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
+  return ret;
 }

-__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
-__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
-__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
+__extension__ extern __inline float16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_f16 (const float16_t * __a)
+{
+  float16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 0);
+  ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
+  return ret;
+}

-#undef __LD2_LANE_FUNC
+__extension__ extern __inline float32x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_f32 (const float32_t * __a)
+{
+  float32x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 0);
+  ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregoiv2sf (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline float64x1x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_f64 (const float64_t * __a)
+{
+  float64x1x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rdf ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 0)};
+  ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregoidf (__o, 1)};
+  return ret;
+}
+
+__extension__ extern __inline uint8x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_u8 (const uint8_t * __a)
+{
+ 
uint8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; +} + +__extension__ extern __inline uint16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_u16 (const uint16_t * __a) +{ + uint16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; +} + +__extension__ extern __inline uint32x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_u32 (const uint32_t * __a) +{ + uint32x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); + return ret; +} + +__extension__ extern __inline poly8x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_p8 (const poly8_t * __a) +{ + poly8x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregoiv8qi (__o, 1); + return ret; +} + +__extension__ extern __inline poly16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_p16 (const poly16_t * __a) +{ + poly16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); + return ret; +} + +__extension__ extern __inline poly64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_p64 (const poly64_t * __a) +{ + poly64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregoidi_pss (__o, 1); + return ret; +} + + +__extension__ extern __inline int64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_s64 (const int64_t * __a) +{ + int64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return ret; +} + +__extension__ extern __inline uint64x1x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_u64 (const uint64_t * __a) +{ + uint64x1x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rdi ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 0); + ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregoidi (__o, 1); + return ret; +} + +__extension__ extern __inline int8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_s8 (const int8_t * __a) +{ + int8x16x2_t ret; 
+ __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; +} + +__extension__ extern __inline poly8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_p8 (const poly8_t * __a) +{ + poly8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; +} + +__extension__ extern __inline int16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_s16 (const int16_t * __a) +{ + int16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; +} + +__extension__ extern __inline poly16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_p16 (const poly16_t * __a) +{ + poly16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; +} + +__extension__ extern __inline int32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_s32 (const int32_t * __a) +{ + int32x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1); + return ret; +} + +__extension__ extern __inline int64x2x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_s64 (const int64_t * __a) +{ + int64x2x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1); + return ret; +} + +__extension__ extern __inline uint8x16x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_u8 (const uint8_t * __a) +{ + uint8x16x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1); + return ret; +} + +__extension__ extern __inline uint16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_u16 (const uint16_t * __a) +{ + uint16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1); + return ret; +} + +__extension__ extern __inline uint32x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_u32 (const uint32_t * __a) +{ + uint32x4x2_t 
ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4si ((const __builtin_aarch64_simd_si *) __a);
+  ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
+  ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline uint64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_u64 (const uint64_t * __a)
+{
+  uint64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
+  ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline float16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f16 (const float16_t * __a)
+{
+  float16x8x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+  ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
+  ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline float32x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f32 (const float32_t * __a)
+{
+  float32x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv4sf ((const __builtin_aarch64_simd_sf *) __a);
+  ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
+  ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline float64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_f64 (const float64_t * __a)
+{
+  float64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2df ((const __builtin_aarch64_simd_df *) __a);
+  ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
+  ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline poly64x2x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_p64 (const poly64_t * __a)
+{
+  poly64x2x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2rv2di ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 0);
+  ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregoiv2di_pss (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline int64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_s64 (const int64_t * __a)
+{
+  int64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return ret;
+}
+
+__extension__ extern __inline uint64x1x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_u64 (const uint64_t * __a)
+{
+  uint64x1x3_t ret;
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_ld3rdi ((const __builtin_aarch64_simd_di *) __a);
+  ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 0);
+  ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 1);
+  ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregcidi (__o, 2);
+  return ret;
+}
+
+__extension__ extern 
__inline float64x1x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_f64 (const float64_t * __a) +{ + float64x1x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rdf ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 0)}; + ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 1)}; + ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregcidf (__o, 2)}; + return ret; +} + +__extension__ extern __inline int8x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_s8 (const int8_t * __a) +{ + int8x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return ret; +} + +__extension__ extern __inline poly8x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_p8 (const poly8_t * __a) +{ + poly8x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return ret; +} + +__extension__ extern __inline int16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_s16 (const int16_t * __a) +{ + int16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; +} + +__extension__ extern __inline poly16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_p16 (const poly16_t * __a) +{ + poly16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; +} + +__extension__ extern __inline int32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_s32 (const int32_t * __a) +{ + int32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return ret; +} + +__extension__ extern __inline uint8x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_u8 (const uint8_t * __a) +{ + uint8x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 1); + ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregciv8qi (__o, 2); + return ret; +} + 
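+/* Usage sketch (editorial, illustrative only; not part of the ported
+   GCC header).  The vldN_dup intrinsics in this block are replicate
+   loads: each reads one N-element tuple from memory and broadcasts
+   element i across every lane of result.val[i], mapping to the
+   LD2R/LD3R/LD4R instructions.  For example, with a hypothetical
+   three-byte buffer:
+
+     static const uint8_t rgb[3] = { 0x11, 0x22, 0x33 };
+     uint8x8x3_t t = vld3_dup_u8 (rgb);
+     // t.val[0] holds eight lanes of 0x11, t.val[1] of 0x22,
+     // t.val[2] of 0x33.  */
+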
+__extension__ extern __inline uint16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_u16 (const uint16_t * __a) +{ + uint16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 1); + ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregciv4hi (__o, 2); + return ret; +} + +__extension__ extern __inline uint32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_u32 (const uint32_t * __a) +{ + uint32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 1); + ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregciv2si (__o, 2); + return ret; +} + +__extension__ extern __inline float16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_f16 (const float16_t * __a) +{ + float16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0); + ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1); + ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2); + return ret; +} + +__extension__ extern __inline float32x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_f32 (const float32_t * __a) +{ + float32x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 1); + ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregciv2sf (__o, 2); + return ret; +} + +__extension__ extern __inline poly64x1x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_p64 (const poly64_t * __a) +{ + poly64x1x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 1); + ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregcidi_pss (__o, 2); + return ret; +} + +__extension__ extern __inline int8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_s8 (const int8_t * __a) +{ + int8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; +} + +__extension__ extern __inline poly8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_p8 (const poly8_t * __a) +{ + poly8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = 
(poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; +} + +__extension__ extern __inline int16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_s16 (const int16_t * __a) +{ + int16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; +} + +__extension__ extern __inline poly16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_p16 (const poly16_t * __a) +{ + poly16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; +} + +__extension__ extern __inline int32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_s32 (const int32_t * __a) +{ + int32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline int64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_s64 (const int64_t * __a) +{ + int64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return ret; +} + +__extension__ extern __inline uint8x16x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_u8 (const uint8_t * __a) +{ + uint8x16x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2); + return ret; +} + +__extension__ extern __inline uint16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_u16 (const uint16_t * __a) +{ + uint16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2); + return ret; +} + +__extension__ extern __inline uint32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_u32 (const uint32_t * __a) +{ + uint32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0); + ret.val[1] = (uint32x4_t) 
__builtin_aarch64_get_qregciv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2); + return ret; +} + +__extension__ extern __inline uint64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_u64 (const uint64_t * __a) +{ + uint64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2); + return ret; +} + +__extension__ extern __inline float16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_f16 (const float16_t * __a) +{ + float16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1); + ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2); + return ret; +} + +__extension__ extern __inline float32x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_f32 (const float32_t * __a) +{ + float32x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0); + ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1); + ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2); + return ret; +} + +__extension__ extern __inline float64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_f64 (const float64_t * __a) +{ + float64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1); + ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2); + return ret; +} + +__extension__ extern __inline poly64x2x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_p64 (const poly64_t * __a) +{ + poly64x2x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 1); + ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregciv2di_pss (__o, 2); + return ret; +} + +__extension__ extern __inline int64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_s64 (const int64_t * __a) +{ + int64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + ret.val[1] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + ret.val[2] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + ret.val[3] = (int64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return ret; +} + +__extension__ extern __inline uint64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_u64 (const uint64_t * __a) +{ + uint64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = 
__builtin_aarch64_ld4rdi ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 0); + ret.val[1] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 1); + ret.val[2] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 2); + ret.val[3] = (uint64x1_t) __builtin_aarch64_get_dregxidi (__o, 3); + return ret; +} + +__extension__ extern __inline float64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_f64 (const float64_t * __a) +{ + float64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rdf ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 0)}; + ret.val[1] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 1)}; + ret.val[2] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 2)}; + ret.val[3] = (float64x1_t) {__builtin_aarch64_get_dregxidf (__o, 3)}; + return ret; +} + +__extension__ extern __inline int8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_s8 (const int8_t * __a) +{ + int8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (int8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; +} + +__extension__ extern __inline poly8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_p8 (const poly8_t * __a) +{ + poly8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (poly8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; +} + +__extension__ extern __inline int16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_s16 (const int16_t * __a) +{ + int16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (int16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; +} + +__extension__ extern __inline poly16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_p16 (const poly16_t * __a) +{ + poly16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (poly16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; +} + +__extension__ extern __inline int32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_s32 (const int32_t * __a) +{ + int32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a); 
+ ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (int32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; +} + +__extension__ extern __inline uint8x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_u8 (const uint8_t * __a) +{ + uint8x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 0); + ret.val[1] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 1); + ret.val[2] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 2); + ret.val[3] = (uint8x8_t) __builtin_aarch64_get_dregxiv8qi (__o, 3); + return ret; +} + +__extension__ extern __inline uint16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_u16 (const uint16_t * __a) +{ + uint16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 0); + ret.val[1] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 1); + ret.val[2] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 2); + ret.val[3] = (uint16x4_t) __builtin_aarch64_get_dregxiv4hi (__o, 3); + return ret; +} + +__extension__ extern __inline uint32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_u32 (const uint32_t * __a) +{ + uint32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 0); + ret.val[1] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 1); + ret.val[2] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 2); + ret.val[3] = (uint32x2_t) __builtin_aarch64_get_dregxiv2si (__o, 3); + return ret; +} + +__extension__ extern __inline float16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_f16 (const float16_t * __a) +{ + float16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0); + ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1); + ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2); + ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3); + return ret; +} + +__extension__ extern __inline float32x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_f32 (const float32_t * __a) +{ + float32x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 0); + ret.val[1] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 1); + ret.val[2] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 2); + ret.val[3] = (float32x2_t) __builtin_aarch64_get_dregxiv2sf (__o, 3); + return ret; +} + +__extension__ extern __inline poly64x1x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_p64 (const poly64_t * __a) +{ + poly64x1x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x1_t) 
__builtin_aarch64_get_dregxidi_pss (__o, 0); + ret.val[1] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 1); + ret.val[2] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 2); + ret.val[3] = (poly64x1_t) __builtin_aarch64_get_dregxidi_pss (__o, 3); + return ret; +} + +__extension__ extern __inline int8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_s8 (const int8_t * __a) +{ + int8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; +} + +__extension__ extern __inline poly8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_p8 (const poly8_t * __a) +{ + poly8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; +} + +__extension__ extern __inline int16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_s16 (const int16_t * __a) +{ + int16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; +} + +__extension__ extern __inline poly16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_p16 (const poly16_t * __a) +{ + poly16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; +} + +__extension__ extern __inline int32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_s32 (const int32_t * __a) +{ + int32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline int64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_s64 (const int64_t * __a) +{ + int64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di 
(__o, 0); + ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); + ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); + ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); + return ret; +} + +__extension__ extern __inline uint8x16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_u8 (const uint8_t * __a) +{ + uint8x16x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv16qi ((const __builtin_aarch64_simd_qi *) __a); + ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0); + ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1); + ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2); + ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3); + return ret; +} + +__extension__ extern __inline uint16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_u16 (const uint16_t * __a) +{ + uint16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8hi ((const __builtin_aarch64_simd_hi *) __a); + ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0); + ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1); + ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2); + ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3); + return ret; +} + +__extension__ extern __inline uint32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_u32 (const uint32_t * __a) +{ + uint32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4si ((const __builtin_aarch64_simd_si *) __a); + ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0); + ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1); + ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2); + ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3); + return ret; +} + +__extension__ extern __inline uint64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_u64 (const uint64_t * __a) +{ + uint64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0); + ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1); + ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2); + ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3); + return ret; +} + +__extension__ extern __inline float16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_f16 (const float16_t * __a) +{ + float16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a); + ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0); + ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1); + ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2); + ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3); + return ret; +} + +__extension__ extern __inline float32x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_f32 (const float32_t * __a) +{ + float32x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4sf ((const __builtin_aarch64_simd_sf *) __a); + ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0); + 
ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1); + ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2); + ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3); + return ret; +} + +__extension__ extern __inline float64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_f64 (const float64_t * __a) +{ + float64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2df ((const __builtin_aarch64_simd_df *) __a); + ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0); + ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1); + ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2); + ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3); + return ret; +} + +__extension__ extern __inline poly64x2x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_p64 (const poly64_t * __a) +{ + poly64x2x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv2di ((const __builtin_aarch64_simd_di *) __a); + ret.val[0] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 0); + ret.val[1] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 1); + ret.val[2] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 2); + ret.val[3] = (poly64x2_t) __builtin_aarch64_get_qregxiv2di_pss (__o, 3); + return ret; +} + +/* vld2_lane */ + +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ + qmode, ptrmode, funcsuffix, signedtype) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_oi __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregoi##qmode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregoi##qmode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_ld2_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \ + return __b; \ +} + +__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf, + sf, f32, float32x4_t) +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df, + df, f64, float64x2_t) +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, + p16, int16x8_t) +__LD2_LANE_FUNC (poly64x1x2_t, poly64x1_t, poly64x2x2_t, poly64_t, di, + v2di_ssps, di, p64, poly64x2_t) +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, di, v2di, di, s64, + int64x2_t) +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) 
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, + u16, int16x8_t) +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, + u32, int32x4_t) +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di, + u64, int64x2_t) + +/* vld2q_lane */ + +#define __LD2Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_oi __o; \ + intype ret; \ + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \ + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \ + __o = __builtin_aarch64_ld2_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \ + ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \ + return ret; \ +} + +__LD2Q_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD2Q_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD2Q_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64) +__LD2Q_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD2Q_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD2Q_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD2Q_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD2Q_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD2Q_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32) +__LD2Q_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64) +__LD2Q_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD2Q_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD2Q_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD2Q_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64) + +/* vld3_lane */ + +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ + qmode, ptrmode, funcsuffix, signedtype) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __temp.val[2] = \ + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_set_qregci##qmode (__o, \ + (signedtype) __temp.val[2], \ + 2); \ + __o = __builtin_aarch64_ld3_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \ + __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \ + return __b; \ +} + +__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf, + v8hf, hf, f16, float16x8_t) +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf, + sf, f32, float32x4_t) +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, 
float64x2x3_t, float64_t, df, v2df, + df, f64, float64x2_t) +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, + int8x16_t) +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, + p16, int16x8_t) +__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di, + v2di_ssps, di, p64, poly64x2_t) +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, + int8x16_t) +__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, + int16x8_t) +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, + int32x4_t) +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64, + int64x2_t) +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, + int8x16_t) +__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, + u16, int16x8_t) +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si, + u32, int32x4_t) +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di, + u64, int64x2_t) + +/* vld3q_lane */ + +#define __LD3Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + intype ret; \ + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \ + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \ + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \ + __o = __builtin_aarch64_ld3_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \ + ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \ + ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \ + return ret; \ +} + +__LD3Q_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD3Q_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD3Q_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) +__LD3Q_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD3Q_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD3Q_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD3Q_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD3Q_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD3Q_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32) +__LD3Q_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64) +__LD3Q_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD3Q_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD3Q_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD3Q_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64) + +/* vld4_lane */ + +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ + qmode, ptrmode, funcsuffix, signedtype) \ +__extension__ extern __inline intype \ +__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_xi __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix 
(__b.val[1], vcreate_##funcsuffix (0)); \
+  __temp.val[2] = \
+    vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+  __temp.val[3] = \
+    vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+             (signedtype) __temp.val[0], \
+             0); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+             (signedtype) __temp.val[1], \
+             1); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+             (signedtype) __temp.val[2], \
+             2); \
+  __o = __builtin_aarch64_set_qregxi##qmode (__o, \
+             (signedtype) __temp.val[3], \
+             3); \
+  __o = __builtin_aarch64_ld4_lane##mode ( \
+    (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
+  __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
+  __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
+  __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
+  return __b; \
+}
+
+__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
+                 v8hf, hf, f16, float16x8_t)
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf,
+                 sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df,
+                 df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8,
+                 int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi,
+                 p16, int16x8_t)
+__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di,
+                 v2di_ssps, di, p64, poly64x2_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8,
+                 int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16,
+                 int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32,
+                 int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64,
+                 int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8,
+                 int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi,
+                 u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si,
+                 u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
+                 u64, int64x2_t)
+
+/* vld4q_lane */
+
+#define __LD4Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ extern __inline intype \
+__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+  __builtin_aarch64_simd_xi __o; \
+  intype ret; \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
+  __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
+  __o = __builtin_aarch64_ld4_lane##mode ( \
+    (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+  ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
+  ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
+  ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
+  ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
+  return ret; \
+}
+
+__LD4Q_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD4Q_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD4Q_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) +__LD4Q_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD4Q_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD4Q_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD4Q_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD4Q_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD4Q_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) +__LD4Q_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) +__LD4Q_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD4Q_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD4Q_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD4Q_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) + +/* vmax */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_smax_nanv2sf (__a, __b); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_f64 (float64x1_t __a, float64x1_t __b) +{ + return (float64x1_t) + { __builtin_aarch64_smax_nandf (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_smaxv8qi (__a, __b); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_smaxv4hi (__a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_smaxv2si (__a, __b); +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_umaxv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_umaxv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmax_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_umaxv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_smax_nanv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_smax_nanv2df (__a, __b); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_s8 (int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_smaxv16qi (__a, __b); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_s16 (int16x8_t __a, 
int16x8_t __b) +{ + return __builtin_aarch64_smaxv8hi (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_s32 (int32x4_t __a, int32x4_t __b) +{ + return __builtin_aarch64_smaxv4si (__a, __b); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_umaxv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_umaxv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a, + (int32x4_t) __b); +} +/* vmulx */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fmulxv2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fmulxv4sf (__a, __b); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_f64 (float64x1_t __a, float64x1_t __b) +{ + return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])}; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fmulxv2df (__a, __b); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxs_f32 (float32_t __a, float32_t __b) +{ + return __builtin_aarch64_fmulxsf (__a, __b); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxd_f64 (float64_t __a, float64_t __b) +{ + return __builtin_aarch64_fmulxdf (__a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane) +{ + return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane)); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane) +{ + return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane)); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_lane_f32 (float32x4_t __a, float32x2_t __v, const int __lane) +{ + return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane)); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane) +{ + return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane)); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane) +{ + 
return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane)); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane) +{ + return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane)); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane) +{ + return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane)); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane) +{ + return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane)); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane) +{ + return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane)); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane) +{ + return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane)); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane) +{ + return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane)); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane) +{ + return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane)); +} + +/* vpmax */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_smaxpv8qi (__a, __b); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_smaxpv4hi (__a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_smaxpv2si (__a, __b); +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_s8 (int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_smaxpv16qi (__a, __b); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vpmaxq_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_smaxpv8hi (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_s32 (int32x4_t __a, int32x4_t __b) +{ + return __builtin_aarch64_smaxpv4si (__a, __b); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmax_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_smax_nanpv2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_smax_nanpv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_smax_nanpv2df (__a, __b); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxqd_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxs_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a); +} + +/* vpmaxnm */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnm_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_smaxpv2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnmq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_smaxpv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnmq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_smaxpv2df (__a, __b); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnmqd_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v2df (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnms_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v2sf (__a); +} + +/* vpmin */ + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_sminpv8qi (__a, __b); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) +vpmin_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_sminpv4hi (__a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_sminpv2si (__a, __b); +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_s8 (int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_sminpv16qi (__a, __b); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_sminpv8hi (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_s32 (int32x4_t __a, int32x4_t __b) +{ + return __builtin_aarch64_sminpv4si (__a, __b); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmin_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_smin_nanpv2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_smin_nanpv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_smin_nanpv2df (__a, __b); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminqd_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmins_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a); +} + +/* vpminnm */ + 
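+/* A minimal usage sketch contrasting vpmin_f32 (above) with vpminnm_f32
+   (defined just below).  It is illustrative only and kept under "#if 0"
+   so it never reaches the compiler; the helper name and input values are
+   invented for the example.  vpmin_f32 maps to FMINP, which propagates a
+   NaN input lane, while vpminnm_f32 maps to FMINNMP, which follows the
+   IEEE minNum rule of ignoring a single quiet NaN.  */
+#if 0
+#include <math.h>
+
+static void
+pairwise_min_example (void)
+{
+  float32x2_t v = {4.0f, NAN};
+  float32x2_t r0 = vpmin_f32 (v, v);   /* both result lanes are NaN.  */
+  float32x2_t r1 = vpminnm_f32 (v, v); /* both result lanes are 4.0f.  */
+  (void) r0;
+  (void) r1;
+}
+#endif
+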
+__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnm_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_sminpv2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnmq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_sminpv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnmq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_sminpv2df (__a, __b); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnmqd_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v2df (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpminnms_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v2sf (__a); +} + +/* vmaxnm */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnm_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fmaxv2sf (__a, __b); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnm_f64 (float64x1_t __a, float64x1_t __b) +{ + return (float64x1_t) + { __builtin_aarch64_fmaxdf (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fmaxv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fmaxv2df (__a, __b); +} + +/* vmaxv */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_s8 (int8x8_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v8qi (__a); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_s16 (int16x4_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v4hi (__a); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_s32 (int32x2_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v2si (__a); +} + +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_u8 (uint8x8_t __a) +{ + return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a); +} + +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_u16 (uint16x4_t __a) +{ + return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxv_u32 (uint32x2_t __a) +{ + return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_f32 (float32x4_t 
__a) +{ + return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_s8 (int8x16_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v16qi (__a); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_s16 (int16x8_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v8hi (__a); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_s32 (int32x4_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v4si (__a); +} + +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_u8 (uint8x16_t __a) +{ + return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a); +} + +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_u16 (uint16x8_t __a) +{ + return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxvq_u32 (uint32x4_t __a) +{ + return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a); +} + +/* vmaxnmv */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmv_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v2sf (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmvq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v4sf (__a); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmaxnmvq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smax_scal_v2df (__a); +} + +/* vmin */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmin_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_smin_nanv2sf (__a, __b); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmin_f64 (float64x1_t __a, float64x1_t __b) +{ + return (float64x1_t) + { __builtin_aarch64_smin_nandf (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmin_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_sminv8qi (__a, __b); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmin_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_sminv4hi (__a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmin_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_sminv2si (__a, __b); +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmin_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) +vmin_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmin_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_smin_nanv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_smin_nanv2df (__a, __b); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_s8 (int8x16_t __a, int8x16_t __b) +{ + return __builtin_aarch64_sminv16qi (__a, __b); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_sminv8hi (__a, __b); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_s32 (int32x4_t __a, int32x4_t __b) +{ + return __builtin_aarch64_sminv4si (__a, __b); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +/* vminnm */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnm_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fminv2sf (__a, __b); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnm_f64 (float64x1_t __a, float64x1_t __b) +{ + return (float64x1_t) + { __builtin_aarch64_fmindf (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_fminv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fminv2df (__a, __b); +} + +/* vminv */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_s8 (int8x8_t __a) +{ + return 
__builtin_aarch64_reduc_smin_scal_v8qi (__a); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_s16 (int16x4_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v4hi (__a); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_s32 (int32x2_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v2si (__a); +} + +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_u8 (uint8x8_t __a) +{ + return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a); +} + +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_u16 (uint16x4_t __a) +{ + return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminv_u32 (uint32x2_t __a) +{ + return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_s8 (int8x16_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v16qi (__a); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_s16 (int16x8_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v8hi (__a); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_s32 (int32x4_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v4si (__a); +} + +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_u8 (uint8x16_t __a) +{ + return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a); +} + +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_u16 (uint16x8_t __a) +{ + return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminvq_u32 (uint32x4_t __a) +{ + return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a); +} + +/* vminnmv */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmv_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v2sf (__a); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmvq_f32 (float32x4_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v4sf (__a); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vminnmvq_f64 (float64x2_t __a) +{ + return __builtin_aarch64_reduc_smin_scal_v2df (__a); +} + +/* vmla */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return __a + __b * __c; +} + +__extension__ extern 
__inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) +{ + return __a + __b * __c; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return __a + __b * __c; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) +{ + return __a + __b * __c; +} + +/* vmla_lane */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_lane_f32 (float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_lane_s16 (int16x4_t __a, int16x4_t __b, + int16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_lane_s32 (int32x2_t __a, int32x2_t __b, + int32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +/* vmla_laneq */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_laneq_f32 (float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_laneq_s16 (int16x4_t __a, int16x4_t __b, + int16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_laneq_s32 (int32x2_t __a, int32x2_t __b, + int32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +/* vmlaq_lane */ + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) +{ + return (__a + (__b 
* __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, + int16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, + int32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, + uint32x2_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + + /* vmlaq_laneq */ + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b, + int16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b, + int32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c, const int __lane) +{ + return (__a + (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +/* vmls */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return __a - __b * __c; +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) +{ + return __a - __b * __c; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return __a - __b * __c; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) +{ + return __a - __b * __c; +} + +/* vmls_lane */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_lane_f32 (float32x2_t __a, float32x2_t __b, + float32x2_t __c, const int __lane) +{ + return 
(__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_lane_s16 (int16x4_t __a, int16x4_t __b, + int16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_lane_s32 (int32x2_t __a, int32x2_t __b, + int32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +/* vmls_laneq */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_laneq_f32 (float32x2_t __a, float32x2_t __b, + float32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_laneq_s16 (int16x4_t __a, int16x4_t __b, + int16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_laneq_s32 (int32x2_t __a, int32x2_t __b, + int32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b, + uint32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +/* vmlsq_lane */ + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, + float32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, + int16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, + int32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, 
__lane))); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, + uint32x2_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + + /* vmlsq_laneq */ + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b, + float32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b, + int16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b, + int32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, + uint16x8_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, + uint32x4_t __c, const int __lane) +{ + return (__a - (__b * __aarch64_vget_lane_any (__c, __lane))); +} + +/* vmov_n_ */ + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_f16 (float16_t __a) +{ + return vdup_n_f16 (__a); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_f32 (float32_t __a) +{ + return vdup_n_f32 (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_f64 (float64_t __a) +{ + return (float64x1_t) {__a}; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_p8 (poly8_t __a) +{ + return vdup_n_p8 (__a); +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_p16 (poly16_t __a) +{ + return vdup_n_p16 (__a); +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_p64 (poly64_t __a) +{ + return vdup_n_p64 (__a); +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_s8 (int8_t __a) +{ + return vdup_n_s8 (__a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_s16 (int16_t __a) +{ + return vdup_n_s16 (__a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_s32 (int32_t __a) +{ + return vdup_n_s32 (__a); +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_s64 (int64_t __a) +{ + return (int64x1_t) {__a}; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_u8 (uint8_t __a) +{ + return vdup_n_u8 (__a); +} + 
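+/* As the vmov_n_ definitions in this group show, each vmov_n_<type> simply
+   forwards to the matching vdup_n_<type> (the 64-bit scalar variants use a
+   one-element initializer instead), so the two spellings are
+   interchangeable.  A minimal sketch, never compiled ("#if 0"), with an
+   invented helper name:  */
+#if 0
+static int32x2_t
+splat_twice_example (int32_t x)
+{
+  int32x2_t a = vmov_n_s32 (x); /* same operation ...  */
+  int32x2_t b = vdup_n_s32 (x); /* ... as this.  */
+  return vadd_s32 (a, b);
+}
+#endif
+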
+__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_u16 (uint16_t __a) +{ + return vdup_n_u16 (__a); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_u32 (uint32_t __a) +{ + return vdup_n_u32 (__a); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmov_n_u64 (uint64_t __a) +{ + return (uint64x1_t) {__a}; +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_f16 (float16_t __a) +{ + return vdupq_n_f16 (__a); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_f32 (float32_t __a) +{ + return vdupq_n_f32 (__a); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_f64 (float64_t __a) +{ + return vdupq_n_f64 (__a); +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_p8 (poly8_t __a) +{ + return vdupq_n_p8 (__a); +} + +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_p16 (poly16_t __a) +{ + return vdupq_n_p16 (__a); +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_p64 (poly64_t __a) +{ + return vdupq_n_p64 (__a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_s8 (int8_t __a) +{ + return vdupq_n_s8 (__a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_s16 (int16_t __a) +{ + return vdupq_n_s16 (__a); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_s32 (int32_t __a) +{ + return vdupq_n_s32 (__a); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_s64 (int64_t __a) +{ + return vdupq_n_s64 (__a); +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_u8 (uint8_t __a) +{ + return vdupq_n_u8 (__a); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_u16 (uint16_t __a) +{ + return vdupq_n_u16 (__a); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_u32 (uint32_t __a) +{ + return vdupq_n_u32 (__a); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmovq_n_u64 (uint64_t __a) +{ + return vdupq_n_u64 (__a); +} + +/* vmul_lane */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane) +{ + return __a * __b; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_lane_s16 
(int16x4_t __a, int16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +/* vmuld_lane */ + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +/* vmuls_lane */ + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +/* vmul_laneq */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +/* vmul_n */ + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_n_f64 (float64x1_t __a, float64_t __b) +{ + return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; +} + +/* vmulq_lane */ + 
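+/* A minimal sketch of the by-lane multiply pattern defined above and
+   below: every vmul*_lane* intrinsic is a plain vector multiply by one
+   broadcast lane of its second operand, and the lane index must be a
+   compile-time constant.  Kept under "#if 0" so it is never compiled;
+   the helper name is invented for the example.  */
+#if 0
+static float32x4_t
+scale_by_lane_example (float32x4_t row, float32x2_t coeffs)
+{
+  /* Multiplies every lane of ROW by lane 1 of COEFFS; this typically
+     compiles to a single FMUL (vector, by element).  */
+  return vmulq_lane_f32 (row, coeffs, 1);
+}
+#endif
+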
+__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane) +{ + __AARCH64_LANE_CHECK (__a, __lane); + return __a * __b[0]; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +/* vmulq_laneq */ + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane) +{ + return __a * __aarch64_vget_lane_any (__b, __lane); +} + +/* vmul_n. 
*/ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_n_f32 (float32x2_t __a, float32_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_n_f32 (float32x4_t __a, float32_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_n_f64 (float64x2_t __a, float64_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_n_s16 (int16x4_t __a, int16_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_n_s16 (int16x8_t __a, int16_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_n_s32 (int32x2_t __a, int32_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_n_s32 (int32x4_t __a, int32_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_n_u16 (uint16x4_t __a, uint16_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_n_u16 (uint16x8_t __a, uint16_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmul_n_u32 (uint32x2_t __a, uint32_t __b) +{ + return __a * __b; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmulq_n_u32 (uint32x4_t __a, uint32_t __b) +{ + return __a * __b; +} + +/* vmvn */ + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvn_p8 (poly8x8_t __a) +{ + return (poly8x8_t) ~((int8x8_t) __a); +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvn_s8 (int8x8_t __a) +{ + return ~__a; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvn_s16 (int16x4_t __a) +{ + return ~__a; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvn_s32 (int32x2_t __a) +{ + return ~__a; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvn_u8 (uint8x8_t __a) +{ + return ~__a; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvn_u16 (uint16x4_t __a) +{ + return ~__a; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvn_u32 (uint32x2_t __a) +{ + return ~__a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvnq_p8 (poly8x16_t __a) +{ + return (poly8x16_t) ~((int8x16_t) __a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvnq_s8 (int8x16_t __a) +{ + return ~__a; +} + +__extension__ extern __inline int16x8_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvnq_s16 (int16x8_t __a) +{ + return ~__a; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvnq_s32 (int32x4_t __a) +{ + return ~__a; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvnq_u8 (uint8x16_t __a) +{ + return ~__a; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvnq_u16 (uint16x8_t __a) +{ + return ~__a; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmvnq_u32 (uint32x4_t __a) +{ + return ~__a; +} + +/* vneg */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vneg_f32 (float32x2_t __a) +{ + return -__a; +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vneg_f64 (float64x1_t __a) +{ + return -__a; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vneg_s8 (int8x8_t __a) +{ + return -__a; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vneg_s16 (int16x4_t __a) +{ + return -__a; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vneg_s32 (int32x2_t __a) +{ + return -__a; +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vneg_s64 (int64x1_t __a) +{ + return -__a; +} + +/* According to the ACLE, the negative of the minimum (signed) + value is itself. This leads to a semantics mismatch, as this is + undefined behaviour in C. The value range predictor is not + aware that the negation of a negative number can still be negative + and it may try to fold the expression. See the test in + gcc.target/aarch64/vnegd_s64.c for an example. + + The cast below tricks the value range predictor to include + INT64_MIN in the range it computes. So for x in the range + [INT64_MIN, y] the range prediction after vnegd_s64 (x) will + be ~[INT64_MIN + 1, y]. 
*/ + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vnegd_s64 (int64_t __a) +{ + return - (uint64_t) __a; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vnegq_f32 (float32x4_t __a) +{ + return -__a; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vnegq_f64 (float64x2_t __a) +{ + return -__a; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vnegq_s8 (int8x16_t __a) +{ + return -__a; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vnegq_s16 (int16x8_t __a) +{ + return -__a; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vnegq_s32 (int32x4_t __a) +{ + return -__a; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vnegq_s64 (int64x2_t __a) +{ + return -__a; +} + +/* vpadd */ + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadd_f32 (float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_faddpv2sf (__a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_f32 (float32x4_t __a, float32x4_t __b) +{ + return __builtin_aarch64_faddpv4sf (__a, __b); +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddq_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_faddpv2df (__a, __b); +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadd_s8 (int8x8_t __a, int8x8_t __b) +{ + return __builtin_aarch64_addpv8qi (__a, __b); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadd_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_addpv4hi (__a, __b); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadd_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_addpv2si (__a, __b); +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadd_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadd_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadd_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpadds_f32 (float32x2_t __a) +{ + return __builtin_aarch64_reduc_plus_scal_v2sf (__a); +} + +__extension__ extern __inline float64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddd_f64 (float64x2_t __a) +{ + return 
__builtin_aarch64_reduc_plus_scal_v2df (__a); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddd_s64 (int64x2_t __a) +{ + return __builtin_aarch64_addpdi (__a); +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpaddd_u64 (uint64x2_t __a) +{ + return __builtin_aarch64_addpdi ((int64x2_t) __a); +} + +/* vqabs */ + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabsq_s64 (int64x2_t __a) +{ + return (int64x2_t) __builtin_aarch64_sqabsv2di (__a); +} + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabsb_s8 (int8_t __a) +{ + return (int8_t) __builtin_aarch64_sqabsqi (__a); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabsh_s16 (int16_t __a) +{ + return (int16_t) __builtin_aarch64_sqabshi (__a); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabss_s32 (int32_t __a) +{ + return (int32_t) __builtin_aarch64_sqabssi (__a); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqabsd_s64 (int64_t __a) +{ + return __builtin_aarch64_sqabsdi (__a); +} + +/* vqadd */ + +__extension__ extern __inline int8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqaddb_s8 (int8_t __a, int8_t __b) +{ + return (int8_t) __builtin_aarch64_sqaddqi (__a, __b); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqaddh_s16 (int16_t __a, int16_t __b) +{ + return (int16_t) __builtin_aarch64_sqaddhi (__a, __b); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqadds_s32 (int32_t __a, int32_t __b) +{ + return (int32_t) __builtin_aarch64_sqaddsi (__a, __b); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqaddd_s64 (int64_t __a, int64_t __b) +{ + return __builtin_aarch64_sqadddi (__a, __b); +} + +__extension__ extern __inline uint8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqaddb_u8 (uint8_t __a, uint8_t __b) +{ + return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b); +} + +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqaddh_u16 (uint16_t __a, uint16_t __b) +{ + return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b); +} + +__extension__ extern __inline uint32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqadds_u32 (uint32_t __a, uint32_t __b) +{ + return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b); +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqaddd_u64 (uint64_t __a, uint64_t __b) +{ + return __builtin_aarch64_uqadddi_uuu (__a, __b); +} + +/* vqdmlal */ + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) +{ + 
return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) +{ + return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +{ + return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return __builtin_aarch64_sqdmlalv2si (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) +{ + return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) +{ + return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, 
int32_t __c) +{ + return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c) +{ + return __builtin_aarch64_sqdmlalhi (__a, __b, __c); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c) +{ + return __builtin_aarch64_sqdmlalsi (__a, __b, __c); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d); +} + +/* vqdmlsl */ + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +{ + return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c); +} -/* vld3_lane */ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) +{ + return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c); +} -#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __temp.val[2] = \ - vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_set_qregci##qmode (__o, \ - (signedtype) __temp.val[2], \ - 2); \ - __o = __builtin_aarch64_ld3_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \ - __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \ - return __b; \ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d); +} + 
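(Aside, not part of the diff: the vqdmlal/vqdmlsl family above computes a saturating doubling multiply-accumulate, i.e. __a plus or minus 2 * __b * __c, clamped to the destination type's range instead of wrapping on overflow. A minimal sketch of the scalar form defined above, assuming an AArch64 toolchain providing <arm_neon.h>:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  /* 2 * 30000 * 30000 = 1800000000, which still fits in int32_t.  */
  int32_t acc = vqdmlalh_s16 (0, 30000, 30000);
  /* Accumulating the same product again would give 3600000000;
     that exceeds INT32_MAX, so the result saturates to 2147483647
     rather than wrapping.  */
  int32_t sat = vqdmlalh_s16 (acc, 30000, 30000);
  printf ("%d %d\n", acc, sat);   /* prints: 1800000000 2147483647 */
  return 0;
}

The same clamping applies lane-wise to the vector forms.)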
+__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) +{ + return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +{ + return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +{ + return __builtin_aarch64_sqdmlslv2si (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) +{ + return __builtin_aarch64_sqdmlsl2v4si (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c, + int const __d) +{ + return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) +{ + return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) +{ + return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d); +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +{ + return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c) +{ + return __builtin_aarch64_sqdmlslhi (__a, __b, __c); +} + +__extension__ extern __inline 
int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c) +{ + return __builtin_aarch64_sqdmlslsi (__a, __b, __c); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d) +{ + return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d); +} + +/* vqdmulh */ + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhh_s16 (int16_t __a, int16_t __b) +{ + return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c); +} + +__extension__ extern __inline int16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhs_s32 (int32_t __a, int32_t __b) +{ + return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c); +} + +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) +{ + return 
__builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c); +} + +/* vqdmull */ + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_s16 (int16x4_t __a, int16x4_t __b) +{ + return __builtin_aarch64_sqdmullv4hi (__a, __b); } -__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD3_LANE_FUNC (poly64x1x3_t, poly64x1_t, poly64x2x3_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di, - u64, int64x2_t) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_s16 (int16x8_t __a, int16x8_t __b) +{ + return __builtin_aarch64_sqdmull2v8hi (__a, __b); +} -#undef __LD3_LANE_FUNC +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c); +} -/* vld3q_lane */ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c); +} -#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_ci __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \ - __o = __builtin_aarch64_ld3_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \ - ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \ - return ret; \ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_n_s16 (int16x8_t __a, int16_t __b) +{ + return __builtin_aarch64_sqdmull2_nv8hi (__a, __b); } 
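(Aside, not part of the diff: vqdmull_s16 above widens as well as doubles — each pair of int16_t lanes is multiplied, the product doubled, and the result delivered as int32_t, saturating on overflow. The only operand pair that can overflow is INT16_MIN times INT16_MIN, since 2 * 32768 * 32768 = 2^31. A minimal sketch, assuming an AArch64 toolchain and the usual vdup_n_s16/vgetq_lane_s32 helpers from the same header:

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main (void)
{
  int16x4_t a = vdup_n_s16 (INT16_MIN);   /* -32768 in every lane */
  /* 2 * (-32768) * (-32768) = 2^31 does not fit in int32_t, so every
     lane saturates to INT32_MAX instead of wrapping to INT32_MIN.  */
  int32x4_t r = vqdmull_s16 (a, a);
  printf ("%d\n", vgetq_lane_s32 (r, 0));   /* prints: 2147483647 */
  return 0;
})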
-__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) -__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32) -__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64) -__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c); +} -#undef __LD3_LANE_FUNC +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c); +} -/* vld4_lane */ +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_n_s16 (int16x4_t __a, int16_t __b) +{ + return __builtin_aarch64_sqdmull_nv4hi (__a, __b); +} -#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, mode, \ - qmode, ptrmode, funcsuffix, signedtype) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - largetype __temp; \ - __temp.val[0] = \ - vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ - __temp.val[1] = \ - vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ - __temp.val[2] = \ - vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ - __temp.val[3] = \ - vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[0], \ - 0); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[1], \ - 1); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[2], \ - 2); \ - __o = __builtin_aarch64_set_qregxi##qmode (__o, \ - (signedtype) __temp.val[3], \ - 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \ - __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \ - __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \ - __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \ - return __b; \ +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_sqdmullv2si (__a, __b); } -/* vld4q_lane */ +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_s32 (int32x4_t __a, int32x4_t __b) +{ 
+ return __builtin_aarch64_sqdmull2v4si (__a, __b); +} -__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf, - v8hf, hf, f16, float16x8_t) -__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf, - sf, f32, float32x4_t) -__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df, - df, f64, float64x2_t) -__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v8qi, v16qi, qi, p8, - int8x16_t) -__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v4hi, v8hi, hi, - p16, int16x8_t) -__LD4_LANE_FUNC (poly64x1x4_t, poly64x1_t, poly64x2x4_t, poly64_t, di, - v2di_ssps, di, p64, poly64x2_t) -__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v8qi, v16qi, qi, s8, - int8x16_t) -__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v4hi, v8hi, hi, s16, - int16x8_t) -__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v2si, v4si, si, s32, - int32x4_t) -__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, di, v2di, di, s64, - int64x2_t) -__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v8qi, v16qi, qi, u8, - int8x16_t) -__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v4hi, v8hi, hi, - u16, int16x8_t) -__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, si, - u32, int32x4_t) -__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di, - u64, int64x2_t) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c); +} -#undef __LD4_LANE_FUNC +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c); +} -/* vld4q_lane */ +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_high_n_s32 (int32x4_t __a, int32_t __b) +{ + return __builtin_aarch64_sqdmull2_nv4si (__a, __b); +} -#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ -__extension__ extern __inline intype \ -__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ -vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ -{ \ - __builtin_aarch64_simd_xi __o; \ - intype ret; \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \ - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \ - __o = __builtin_aarch64_ld4_lane##mode ( \ - (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ - ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \ - ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \ - ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \ - ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \ - return ret; \ +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c); } -__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, 
v8hf, hf, f16) -__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) -__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) -__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) -__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c) +{ + return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c); +} -#undef __LD4_LANE_FUNC +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmull_n_s32 (int32x2_t __a, int32_t __b) +{ + return __builtin_aarch64_sqdmull_nv2si (__a, __b); +} -/* vmax */ +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmullh_s16 (int16_t __a, int16_t __b) +{ + return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b); +} -__extension__ extern __inline float32x2_t +__extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_f32 (float32x2_t __a, float32x2_t __b) +vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) { - return __builtin_aarch64_smax_nanv2sf (__a, __b); + return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulls_s32 (int32_t __a, int32_t __b) +{ + return __builtin_aarch64_sqdmullsi (__a, __b); +} + +__extension__ extern __inline int64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c); +} + +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_f64 (float64x1_t __a, float64x1_t __b) +vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) { - return (float64x1_t) - { __builtin_aarch64_smax_nandf (vget_lane_f64 (__a, 0), - vget_lane_f64 (__b, 0)) }; + return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c); } +/* vqmovn */ + __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_s8 (int8x8_t __a, int8x8_t __b) +vqmovn_s16 (int16x8_t __a) { - return __builtin_aarch64_smaxv8qi (__a, __b); + return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_s16 (int16x4_t __a, int16x4_t __b) 
+vqmovn_s32 (int32x4_t __a) { - return __builtin_aarch64_smaxv4hi (__a, __b); + return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_s32 (int32x2_t __a, int32x2_t __b) +vqmovn_s64 (int64x2_t __a) { - return __builtin_aarch64_smaxv2si (__a, __b); + return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_u8 (uint8x8_t __a, uint8x8_t __b) +vqmovn_u16 (uint16x8_t __a) { - return (uint8x8_t) __builtin_aarch64_umaxv8qi ((int8x8_t) __a, - (int8x8_t) __b); + return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_u16 (uint16x4_t __a, uint16x4_t __b) +vqmovn_u32 (uint32x4_t __a) { - return (uint16x4_t) __builtin_aarch64_umaxv4hi ((int16x4_t) __a, - (int16x4_t) __b); + return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_u32 (uint32x2_t __a, uint32x2_t __b) +vqmovn_u64 (uint64x2_t __a) { - return (uint32x2_t) __builtin_aarch64_umaxv2si ((int32x2_t) __a, - (int32x2_t) __b); + return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_f32 (float32x4_t __a, float32x4_t __b) +vqmovnh_s16 (int16_t __a) { - return __builtin_aarch64_smax_nanv4sf (__a, __b); + return (int8_t) __builtin_aarch64_sqmovnhi (__a); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_f64 (float64x2_t __a, float64x2_t __b) +vqmovns_s32 (int32_t __a) { - return __builtin_aarch64_smax_nanv2df (__a, __b); + return (int16_t) __builtin_aarch64_sqmovnsi (__a); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_s8 (int8x16_t __a, int8x16_t __b) +vqmovnd_s64 (int64_t __a) { - return __builtin_aarch64_smaxv16qi (__a, __b); + return (int32_t) __builtin_aarch64_sqmovndi (__a); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_s16 (int16x8_t __a, int16x8_t __b) +vqmovnh_u16 (uint16_t __a) { - return __builtin_aarch64_smaxv8hi (__a, __b); + return (uint8_t) __builtin_aarch64_uqmovnhi (__a); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_s32 (int32x4_t __a, int32x4_t __b) +vqmovns_u32 (uint32_t __a) { - return __builtin_aarch64_smaxv4si (__a, __b); + return (uint16_t) __builtin_aarch64_uqmovnsi (__a); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_u8 (uint8x16_t __a, uint8x16_t __b) +vqmovnd_u64 (uint64_t __a) { - return (uint8x16_t) __builtin_aarch64_umaxv16qi ((int8x16_t) __a, - (int8x16_t) __b); + return (uint32_t) __builtin_aarch64_uqmovndi (__a); } -__extension__ extern __inline uint16x8_t +/* vqmovun */ + +__extension__ extern __inline uint8x8_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_u16 (uint16x8_t __a, uint16x8_t __b) +vqmovun_s16 (int16x8_t __a) { - return (uint16x8_t) __builtin_aarch64_umaxv8hi ((int16x8_t) __a, - (int16x8_t) __b); + return __builtin_aarch64_sqmovunv8hi_us (__a); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_u32 (uint32x4_t __a, uint32x4_t __b) +vqmovun_s32 (int32x4_t __a) { - return (uint32x4_t) __builtin_aarch64_umaxv4si ((int32x4_t) __a, - (int32x4_t) __b); + return __builtin_aarch64_sqmovunv4si_us (__a); } -/* vmulx */ -__extension__ extern __inline float32x2_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_f32 (float32x2_t __a, float32x2_t __b) +vqmovun_s64 (int64x2_t __a) { - return __builtin_aarch64_fmulxv2sf (__a, __b); + return __builtin_aarch64_sqmovunv2di_us (__a); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_f32 (float32x4_t __a, float32x4_t __b) +vqmovunh_s16 (int16_t __a) { - return __builtin_aarch64_fmulxv4sf (__a, __b); + return __builtin_aarch64_sqmovunhi_us (__a); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_f64 (float64x1_t __a, float64x1_t __b) +vqmovuns_s32 (int32_t __a) { - return (float64x1_t) {__builtin_aarch64_fmulxdf (__a[0], __b[0])}; + return __builtin_aarch64_sqmovunsi_us (__a); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_f64 (float64x2_t __a, float64x2_t __b) +vqmovund_s64 (int64_t __a) { - return __builtin_aarch64_fmulxv2df (__a, __b); + return __builtin_aarch64_sqmovundi_us (__a); } -__extension__ extern __inline float32_t +/* vqneg */ + +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxs_f32 (float32_t __a, float32_t __b) +vqnegq_s64 (int64x2_t __a) { - return __builtin_aarch64_fmulxsf (__a, __b); + return (int64x2_t) __builtin_aarch64_sqnegv2di (__a); } -__extension__ extern __inline float64_t +__extension__ extern __inline int8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxd_f64 (float64_t __a, float64_t __b) +vqnegb_s8 (int8_t __a) { - return __builtin_aarch64_fmulxdf (__a, __b); + return (int8_t) __builtin_aarch64_sqnegqi (__a); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_lane_f32 (float32x2_t __a, float32x2_t __v, const int __lane) +vqnegh_s16 (int16_t __a) { - return vmulx_f32 (__a, __aarch64_vdup_lane_f32 (__v, __lane)); + return (int16_t) __builtin_aarch64_sqneghi (__a); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_lane_f64 (float64x1_t __a, float64x1_t __v, const int __lane) +vqnegs_s32 (int32_t __a) { - return vmulx_f64 (__a, __aarch64_vdup_lane_f64 (__v, __lane)); + return (int32_t) __builtin_aarch64_sqnegsi (__a); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_lane_f32 (float32x4_t __a, 
float32x2_t __v, const int __lane) +vqnegd_s64 (int64_t __a) { - return vmulxq_f32 (__a, __aarch64_vdupq_lane_f32 (__v, __lane)); + return __builtin_aarch64_sqnegdi (__a); } -__extension__ extern __inline float64x2_t +/* vqrdmulh */ + +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_lane_f64 (float64x2_t __a, float64x1_t __v, const int __lane) +vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return vmulxq_f64 (__a, __aarch64_vdupq_lane_f64 (__v, __lane)); + return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_laneq_f32 (float32x2_t __a, float32x4_t __v, const int __lane) +vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return vmulx_f32 (__a, __aarch64_vdup_laneq_f32 (__v, __lane)); + return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_laneq_f64 (float64x1_t __a, float64x2_t __v, const int __lane) +vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) { - return vmulx_f64 (__a, __aarch64_vdup_laneq_f64 (__v, __lane)); + return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_laneq_f32 (float32x4_t __a, float32x4_t __v, const int __lane) +vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) { - return vmulxq_f32 (__a, __aarch64_vdupq_laneq_f32 (__v, __lane)); + return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_laneq_f64 (float64x2_t __a, float64x2_t __v, const int __lane) +vqrdmulhh_s16 (int16_t __a, int16_t __b) { - return vmulxq_f64 (__a, __aarch64_vdupq_laneq_f64 (__v, __lane)); + return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b); } -__extension__ extern __inline float32_t +__extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxs_lane_f32 (float32_t __a, float32x2_t __v, const int __lane) +vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) { - return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane)); + return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c); } -__extension__ extern __inline float32_t +__extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxs_laneq_f32 (float32_t __a, float32x4_t __v, const int __lane) +vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) { - return vmulxs_f32 (__a, __aarch64_vget_lane_any (__v, __lane)); + return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c); } -__extension__ extern __inline float64_t +__extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxd_lane_f64 (float64_t __a, float64x1_t __v, const int __lane) +vqrdmulhs_s32 (int32_t __a, int32_t __b) { - return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane)); + return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b); } -__extension__ extern __inline float64_t +__extension__ extern __inline 
int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxd_laneq_f64 (float64_t __a, float64x2_t __v, const int __lane) +vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c) { - return vmulxd_f64 (__a, __aarch64_vget_lane_any (__v, __lane)); + return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c); } -/* vpmax */ +__extension__ extern __inline int32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c); +} + +/* vqrshl */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_s8 (int8x8_t a, int8x8_t b) +vqrshl_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_smaxpv8qi (a, b); + return __builtin_aarch64_sqrshlv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_s16 (int16x4_t a, int16x4_t b) +vqrshl_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_smaxpv4hi (a, b); + return __builtin_aarch64_sqrshlv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_s32 (int32x2_t a, int32x2_t b) +vqrshl_s32 (int32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_sqrshlv2si (__a, __b); +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_s64 (int64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_smaxpv2si (a, b); + return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_u8 (uint8x8_t a, uint8x8_t b) +vqrshl_u8 (uint8x8_t __a, int8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_umaxpv8qi ((int8x8_t) a, - (int8x8_t) b); + return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_u16 (uint16x4_t a, uint16x4_t b) +vqrshl_u16 (uint16x4_t __a, int16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_umaxpv4hi ((int16x4_t) a, - (int16x4_t) b); + return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_u32 (uint32x2_t a, uint32x2_t b) +vqrshl_u32 (uint32x2_t __a, int32x2_t __b) +{ + return __builtin_aarch64_uqrshlv2si_uus ( __a, __b); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshl_u64 (uint64x1_t __a, int64x1_t __b) { - return (uint32x2_t) __builtin_aarch64_umaxpv2si ((int32x2_t) a, - (int32x2_t) b); + return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_s8 (int8x16_t a, int8x16_t b) +vqrshlq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_smaxpv16qi (a, b); + return __builtin_aarch64_sqrshlv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_s16 (int16x8_t a, int16x8_t b) +vqrshlq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_smaxpv8hi (a, b); + return __builtin_aarch64_sqrshlv8hi (__a, __b); } __extension__ extern __inline int32x4_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_s32 (int32x4_t a, int32x4_t b) +vqrshlq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_smaxpv4si (a, b); + return __builtin_aarch64_sqrshlv4si (__a, __b); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_u8 (uint8x16_t a, uint8x16_t b) +vqrshlq_s64 (int64x2_t __a, int64x2_t __b) { - return (uint8x16_t) __builtin_aarch64_umaxpv16qi ((int8x16_t) a, - (int8x16_t) b); + return __builtin_aarch64_sqrshlv2di (__a, __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_u16 (uint16x8_t a, uint16x8_t b) +vqrshlq_u8 (uint8x16_t __a, int8x16_t __b) { - return (uint16x8_t) __builtin_aarch64_umaxpv8hi ((int16x8_t) a, - (int16x8_t) b); + return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_u32 (uint32x4_t a, uint32x4_t b) +vqrshlq_u16 (uint16x8_t __a, int16x8_t __b) { - return (uint32x4_t) __builtin_aarch64_umaxpv4si ((int32x4_t) a, - (int32x4_t) b); + return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_f32 (float32x2_t a, float32x2_t b) +vqrshlq_u32 (uint32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_smax_nanpv2sf (a, b); + return __builtin_aarch64_uqrshlv4si_uus ( __a, __b); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_f32 (float32x4_t a, float32x4_t b) +vqrshlq_u64 (uint64x2_t __a, int64x2_t __b) { - return __builtin_aarch64_smax_nanpv4sf (a, b); + return __builtin_aarch64_uqrshlv2di_uus ( __a, __b); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline int8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_f64 (float64x2_t a, float64x2_t b) +vqrshlb_s8 (int8_t __a, int8_t __b) { - return __builtin_aarch64_smax_nanpv2df (a, b); + return __builtin_aarch64_sqrshlqi (__a, __b); } -__extension__ extern __inline float64_t +__extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxqd_f64 (float64x2_t a) +vqrshlh_s16 (int16_t __a, int16_t __b) { - return __builtin_aarch64_reduc_smax_nan_scal_v2df (a); + return __builtin_aarch64_sqrshlhi (__a, __b); } -__extension__ extern __inline float32_t +__extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxs_f32 (float32x2_t a) +vqrshls_s32 (int32_t __a, int32_t __b) { - return __builtin_aarch64_reduc_smax_nan_scal_v2sf (a); + return __builtin_aarch64_sqrshlsi (__a, __b); } -/* vpmaxnm */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnm_f32 (float32x2_t a, float32x2_t b) +vqrshld_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_smaxpv2sf (a, b); + return __builtin_aarch64_sqrshldi (__a, __b); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vpmaxnmq_f32 (float32x4_t a, float32x4_t b) +vqrshlb_u8 (uint8_t __a, int8_t __b) { - return __builtin_aarch64_smaxpv4sf (a, b); + return __builtin_aarch64_uqrshlqi_uus (__a, __b); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnmq_f64 (float64x2_t a, float64x2_t b) +vqrshlh_u16 (uint16_t __a, int16_t __b) { - return __builtin_aarch64_smaxpv2df (a, b); + return __builtin_aarch64_uqrshlhi_uus (__a, __b); } -__extension__ extern __inline float64_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnmqd_f64 (float64x2_t a) +vqrshls_u32 (uint32_t __a, int32_t __b) { - return __builtin_aarch64_reduc_smax_scal_v2df (a); + return __builtin_aarch64_uqrshlsi_uus (__a, __b); } -__extension__ extern __inline float32_t +__extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnms_f32 (float32x2_t a) +vqrshld_u64 (uint64_t __a, int64_t __b) { - return __builtin_aarch64_reduc_smax_scal_v2sf (a); + return __builtin_aarch64_uqrshldi_uus (__a, __b); } -/* vpmin */ +/* vqrshrn */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_s8 (int8x8_t a, int8x8_t b) +vqrshrn_n_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_sminpv8qi (a, b); + return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_s16 (int16x4_t a, int16x4_t b) +vqrshrn_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sminpv4hi (a, b); + return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_s32 (int32x2_t a, int32x2_t b) +vqrshrn_n_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_sminpv2si (a, b); + return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_u8 (uint8x8_t a, uint8x8_t b) +vqrshrn_n_u16 (uint16x8_t __a, const int __b) { - return (uint8x8_t) __builtin_aarch64_uminpv8qi ((int8x8_t) a, - (int8x8_t) b); + return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_u16 (uint16x4_t a, uint16x4_t b) +vqrshrn_n_u32 (uint32x4_t __a, const int __b) { - return (uint16x4_t) __builtin_aarch64_uminpv4hi ((int16x4_t) a, - (int16x4_t) b); + return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_u32 (uint32x2_t a, uint32x2_t b) +vqrshrn_n_u64 (uint64x2_t __a, const int __b) { - return (uint32x2_t) __builtin_aarch64_uminpv2si ((int32x2_t) a, - (int32x2_t) b); + return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline int8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_s8 (int8x16_t a, int8x16_t b) +vqrshrnh_n_s16 (int16_t __a, const int __b) { - return __builtin_aarch64_sminpv16qi (a, b); + return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b); } -__extension__ extern __inline int16x8_t +__extension__ 
+__extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminq_s16 (int16x8_t a, int16x8_t b)
+vqrshrns_n_s32 (int32_t __a, const int __b)
 {
-  return __builtin_aarch64_sminpv8hi (a, b);
+  return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b);
 }

-__extension__ extern __inline int32x4_t
+__extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminq_s32 (int32x4_t a, int32x4_t b)
+vqrshrnd_n_s64 (int64_t __a, const int __b)
 {
-  return __builtin_aarch64_sminpv4si (a, b);
+  return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b);
 }

-__extension__ extern __inline uint8x16_t
+__extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminq_u8 (uint8x16_t a, uint8x16_t b)
+vqrshrnh_n_u16 (uint16_t __a, const int __b)
 {
-  return (uint8x16_t) __builtin_aarch64_uminpv16qi ((int8x16_t) a,
-                                                    (int8x16_t) b);
+  return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b);
 }

-__extension__ extern __inline uint16x8_t
+__extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminq_u16 (uint16x8_t a, uint16x8_t b)
+vqrshrns_n_u32 (uint32_t __a, const int __b)
 {
-  return (uint16x8_t) __builtin_aarch64_uminpv8hi ((int16x8_t) a,
-                                                   (int16x8_t) b);
+  return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b);
 }

-__extension__ extern __inline uint32x4_t
+__extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminq_u32 (uint32x4_t a, uint32x4_t b)
+vqrshrnd_n_u64 (uint64_t __a, const int __b)
 {
-  return (uint32x4_t) __builtin_aarch64_uminpv4si ((int32x4_t) a,
-                                                   (int32x4_t) b);
+  return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b);
 }

-__extension__ extern __inline float32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpmin_f32 (float32x2_t a, float32x2_t b)
-{
-  return __builtin_aarch64_smin_nanpv2sf (a, b);
-}
+/* vqrshrun */

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminq_f32 (float32x4_t a, float32x4_t b)
+vqrshrun_n_s16 (int16x8_t __a, const int __b)
 {
-  return __builtin_aarch64_smin_nanpv4sf (a, b);
+  return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, __b);
 }

-__extension__ extern __inline float64x2_t
+__extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminq_f64 (float64x2_t a, float64x2_t b)
+vqrshrun_n_s32 (int32x4_t __a, const int __b)
 {
-  return __builtin_aarch64_smin_nanpv2df (a, b);
+  return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b);
 }

-__extension__ extern __inline float64_t
+__extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminqd_f64 (float64x2_t a)
+vqrshrun_n_s64 (int64x2_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v2df (a);
+  return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b);
 }

-__extension__ extern __inline float32_t
+__extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpmins_f32 (float32x2_t a)
+vqrshrunh_n_s16 (int16_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v2sf (a);
+  return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b);
 }

-/* vpminnm */
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminnm_f32 (float32x2_t a, float32x2_t b)
+vqrshruns_n_s32 (int32_t __a, const int __b)
 {
-  return __builtin_aarch64_sminpv2sf (a, b);
+  return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b);
 }

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminnmq_f32 (float32x4_t a, float32x4_t b)
+vqrshrund_n_s64 (int64_t __a, const int __b)
 {
-  return __builtin_aarch64_sminpv4sf (a, b);
+  return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b);
 }

-__extension__ extern __inline float64x2_t
+/* vqshl */
+
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminnmq_f64 (float64x2_t a, float64x2_t b)
+vqshl_s8 (int8x8_t __a, int8x8_t __b)
 {
-  return __builtin_aarch64_sminpv2df (a, b);
+  return __builtin_aarch64_sqshlv8qi (__a, __b);
 }

-__extension__ extern __inline float64_t
+__extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminnmqd_f64 (float64x2_t a)
+vqshl_s16 (int16x4_t __a, int16x4_t __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v2df (a);
+  return __builtin_aarch64_sqshlv4hi (__a, __b);
 }

-__extension__ extern __inline float32_t
+__extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vpminnms_f32 (float32x2_t a)
+vqshl_s32 (int32x2_t __a, int32x2_t __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v2sf (a);
+  return __builtin_aarch64_sqshlv2si (__a, __b);
 }

-/* vmaxnm */
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxnm_f32 (float32x2_t __a, float32x2_t __b)
+vqshl_s64 (int64x1_t __a, int64x1_t __b)
 {
-  return __builtin_aarch64_fmaxv2sf (__a, __b);
+  return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])};
 }

-__extension__ extern __inline float64x1_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxnm_f64 (float64x1_t __a, float64x1_t __b)
+vqshl_u8 (uint8x8_t __a, int8x8_t __b)
 {
-  return (float64x1_t)
-    { __builtin_aarch64_fmaxdf (vget_lane_f64 (__a, 0),
-                                vget_lane_f64 (__b, 0)) };
+  return __builtin_aarch64_uqshlv8qi_uus ( __a, __b);
 }

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxnmq_f32 (float32x4_t __a, float32x4_t __b)
+vqshl_u16 (uint16x4_t __a, int16x4_t __b)
 {
-  return __builtin_aarch64_fmaxv4sf (__a, __b);
+  return __builtin_aarch64_uqshlv4hi_uus ( __a, __b);
 }

-__extension__ extern __inline float64x2_t
+__extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxnmq_f64 (float64x2_t __a, float64x2_t __b)
+vqshl_u32 (uint32x2_t __a, int32x2_t __b)
 {
-  return __builtin_aarch64_fmaxv2df (__a, __b);
+  return __builtin_aarch64_uqshlv2si_uus ( __a, __b);
 }

-/* vmaxv */
-
-__extension__ extern __inline float32_t
+__extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxv_f32 (float32x2_t __a)
+vqshl_u64 (uint64x1_t __a, int64x1_t __b)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v2sf (__a);
+  return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])};
 }

-__extension__ extern __inline int8_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxv_s8 (int8x8_t __a)
+vqshlq_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v8qi (__a);
+  return __builtin_aarch64_sqshlv16qi (__a, __b);
 }

-__extension__ extern __inline int16_t
+__extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxv_s16 (int16x4_t __a)
+vqshlq_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v4hi (__a);
+  return __builtin_aarch64_sqshlv8hi (__a, __b);
 }

-__extension__ extern __inline int32_t
+__extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxv_s32 (int32x2_t __a)
+vqshlq_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2si (__a);
+  return __builtin_aarch64_sqshlv4si (__a, __b);
 }

-__extension__ extern __inline uint8_t
+__extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxv_u8 (uint8x8_t __a)
+vqshlq_s64 (int64x2_t __a, int64x2_t __b)
 {
-  return __builtin_aarch64_reduc_umax_scal_v8qi_uu (__a);
+  return __builtin_aarch64_sqshlv2di (__a, __b);
 }

-__extension__ extern __inline uint16_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxv_u16 (uint16x4_t __a)
+vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
 {
-  return __builtin_aarch64_reduc_umax_scal_v4hi_uu (__a);
+  return __builtin_aarch64_uqshlv16qi_uus ( __a, __b);
 }

-__extension__ extern __inline uint32_t
+__extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxv_u32 (uint32x2_t __a)
+vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
 {
-  return __builtin_aarch64_reduc_umax_scal_v2si_uu (__a);
+  return __builtin_aarch64_uqshlv8hi_uus ( __a, __b);
 }

-__extension__ extern __inline float32_t
+__extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_f32 (float32x4_t __a)
+vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v4sf (__a);
+  return __builtin_aarch64_uqshlv4si_uus ( __a, __b);
 }

-__extension__ extern __inline float64_t
+__extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_f64 (float64x2_t __a)
+vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
 {
-  return __builtin_aarch64_reduc_smax_nan_scal_v2df (__a);
+  return __builtin_aarch64_uqshlv2di_uus ( __a, __b);
 }

 __extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_s8 (int8x16_t __a)
+vqshlb_s8 (int8_t __a, int8_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v16qi (__a);
+  return __builtin_aarch64_sqshlqi (__a, __b);
 }

 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_s16 (int16x8_t __a)
+vqshlh_s16 (int16_t __a, int16_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v8hi (__a);
+  return __builtin_aarch64_sqshlhi (__a, __b);
 }

 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_s32 (int32x4_t __a)
-{
-  return __builtin_aarch64_reduc_smax_scal_v4si (__a);
-}
-
-__extension__ extern __inline uint8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_u8 (uint8x16_t __a)
-{
-  return __builtin_aarch64_reduc_umax_scal_v16qi_uu (__a);
-}
-
-__extension__ extern __inline uint16_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_u16 (uint16x8_t __a)
-{
-  return __builtin_aarch64_reduc_umax_scal_v8hi_uu (__a);
-}
-
-__extension__ extern __inline uint32_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxvq_u32 (uint32x4_t __a)
+vqshls_s32 (int32_t __a, int32_t __b)
 {
-  return __builtin_aarch64_reduc_umax_scal_v4si_uu (__a);
+  return __builtin_aarch64_sqshlsi (__a, __b);
 }

-/* vmaxnmv */
-
-__extension__ extern __inline float32_t
+__extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxnmv_f32 (float32x2_t __a)
+vqshld_s64 (int64_t __a, int64_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2sf (__a);
+  return __builtin_aarch64_sqshldi (__a, __b);
 }

-__extension__ extern __inline float32_t
+__extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxnmvq_f32 (float32x4_t __a)
+vqshlb_u8 (uint8_t __a, int8_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v4sf (__a);
+  return __builtin_aarch64_uqshlqi_uus (__a, __b);
 }

-__extension__ extern __inline float64_t
+__extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmaxnmvq_f64 (float64x2_t __a)
+vqshlh_u16 (uint16_t __a, int16_t __b)
 {
-  return __builtin_aarch64_reduc_smax_scal_v2df (__a);
+  return __builtin_aarch64_uqshlhi_uus (__a, __b);
 }

-/* vmin */
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_f32 (float32x2_t __a, float32x2_t __b)
+vqshls_u32 (uint32_t __a, int32_t __b)
 {
-  return __builtin_aarch64_smin_nanv2sf (__a, __b);
+  return __builtin_aarch64_uqshlsi_uus (__a, __b);
 }

-__extension__ extern __inline float64x1_t
+__extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_f64 (float64x1_t __a, float64x1_t __b)
+vqshld_u64 (uint64_t __a, int64_t __b)
 {
-  return (float64x1_t)
-    { __builtin_aarch64_smin_nandf (vget_lane_f64 (__a, 0),
-                                    vget_lane_f64 (__b, 0)) };
+  return __builtin_aarch64_uqshldi_uus (__a, __b);
 }

 __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_s8 (int8x8_t __a, int8x8_t __b)
+vqshl_n_s8 (int8x8_t __a, const int __b)
 {
-  return __builtin_aarch64_sminv8qi (__a, __b);
+  return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b);
 }

 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_s16 (int16x4_t __a, int16x4_t __b)
+vqshl_n_s16 (int16x4_t __a, const int __b)
 {
-  return __builtin_aarch64_sminv4hi (__a, __b);
+  return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b);
 }

 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_s32 (int32x2_t __a, int32x2_t __b)
+vqshl_n_s32 (int32x2_t __a, const int __b)
 {
-  return __builtin_aarch64_sminv2si (__a, __b);
+  return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b);
 }

-__extension__ extern __inline uint8x8_t
+__extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_u8 (uint8x8_t __a, uint8x8_t __b)
+vqshl_n_s64 (int64x1_t __a, const int __b)
 {
-  return (uint8x8_t) __builtin_aarch64_uminv8qi ((int8x8_t) __a,
-                                                 (int8x8_t) __b);
+  return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)};
 }

-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_u16 (uint16x4_t __a, uint16x4_t __b)
+vqshl_n_u8 (uint8x8_t __a, const int __b)
 {
-  return (uint16x4_t) __builtin_aarch64_uminv4hi ((int16x4_t) __a,
-                                                  (int16x4_t) __b);
+  return __builtin_aarch64_uqshl_nv8qi_uus (__a, __b);
 }

-__extension__ extern __inline uint32x2_t
+__extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmin_u32 (uint32x2_t __a, uint32x2_t __b)
+vqshl_n_u16 (uint16x4_t __a, const int __b)
 {
-  return (uint32x2_t) __builtin_aarch64_uminv2si ((int32x2_t) __a,
-                                                  (int32x2_t) __b);
+  return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b);
 }

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_f32 (float32x4_t __a, float32x4_t __b)
+vqshl_n_u32 (uint32x2_t __a, const int __b)
 {
-  return __builtin_aarch64_smin_nanv4sf (__a, __b);
+  return __builtin_aarch64_uqshl_nv2si_uus (__a, __b);
 }

-__extension__ extern __inline float64x2_t
+__extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_f64 (float64x2_t __a, float64x2_t __b)
+vqshl_n_u64 (uint64x1_t __a, const int __b)
 {
-  return __builtin_aarch64_smin_nanv2df (__a, __b);
+  return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)};
 }

 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_s8 (int8x16_t __a, int8x16_t __b)
+vqshlq_n_s8 (int8x16_t __a, const int __b)
 {
-  return __builtin_aarch64_sminv16qi (__a, __b);
+  return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b);
 }

 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_s16 (int16x8_t __a, int16x8_t __b)
+vqshlq_n_s16 (int16x8_t __a, const int __b)
 {
-  return __builtin_aarch64_sminv8hi (__a, __b);
+  return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b);
 }

 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_s32 (int32x4_t __a, int32x4_t __b)
+vqshlq_n_s32 (int32x4_t __a, const int __b)
 {
-  return __builtin_aarch64_sminv4si (__a, __b);
+  return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b);
+}
+
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshlq_n_s64 (int64x2_t __a, const int __b)
+{
+  return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b);
 }

 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_u8 (uint8x16_t __a, uint8x16_t __b)
+vqshlq_n_u8 (uint8x16_t __a, const int __b)
 {
-  return (uint8x16_t) __builtin_aarch64_uminv16qi ((int8x16_t) __a,
-                                                   (int8x16_t) __b);
+  return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b);
 }

 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_u16 (uint16x8_t __a, uint16x8_t __b)
+vqshlq_n_u16 (uint16x8_t __a, const int __b)
 {
-  return (uint16x8_t) __builtin_aarch64_uminv8hi ((int16x8_t) __a,
-                                                  (int16x8_t) __b);
+  return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b);
 }

 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminq_u32 (uint32x4_t __a, uint32x4_t __b)
+vqshlq_n_u32 (uint32x4_t __a, const int __b)
 {
-  return (uint32x4_t) __builtin_aarch64_uminv4si ((int32x4_t) __a,
-                                                  (int32x4_t) __b);
+  return __builtin_aarch64_uqshl_nv4si_uus (__a, __b);
 }

-/* vminnm */
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshlq_n_u64 (uint64x2_t __a, const int __b)
+{
+  return __builtin_aarch64_uqshl_nv2di_uus (__a, __b);
+}

-__extension__ extern __inline float32x2_t
+__extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminnm_f32 (float32x2_t __a, float32x2_t __b)
+vqshlb_n_s8 (int8_t __a, const int __b)
 {
-  return __builtin_aarch64_fminv2sf (__a, __b);
+  return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b);
 }

-__extension__ extern __inline float64x1_t
+__extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminnm_f64 (float64x1_t __a, float64x1_t __b)
+vqshlh_n_s16 (int16_t __a, const int __b)
 {
-  return (float64x1_t)
-    { __builtin_aarch64_fmindf (vget_lane_f64 (__a, 0),
-                                vget_lane_f64 (__b, 0)) };
+  return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b);
 }

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminnmq_f32 (float32x4_t __a, float32x4_t __b)
+vqshls_n_s32 (int32_t __a, const int __b)
 {
-  return __builtin_aarch64_fminv4sf (__a, __b);
+  return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b);
 }

-__extension__ extern __inline float64x2_t
+__extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminnmq_f64 (float64x2_t __a, float64x2_t __b)
+vqshld_n_s64 (int64_t __a, const int __b)
 {
-  return __builtin_aarch64_fminv2df (__a, __b);
+  return __builtin_aarch64_sqshl_ndi (__a, __b);
 }

-/* vminv */
+__extension__ extern __inline uint8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshlb_n_u8 (uint8_t __a, const int __b)
+{
+  return __builtin_aarch64_uqshl_nqi_uus (__a, __b);
+}

-__extension__ extern __inline float32_t
+__extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminv_f32 (float32x2_t __a)
+vqshlh_n_u16 (uint16_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v2sf (__a);
+  return __builtin_aarch64_uqshl_nhi_uus (__a, __b);
 }

-__extension__ extern __inline int8_t
+__extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminv_s8 (int8x8_t __a)
+vqshls_n_u32 (uint32_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v8qi (__a);
+  return __builtin_aarch64_uqshl_nsi_uus (__a, __b);
 }

-__extension__ extern __inline int16_t
+__extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminv_s16 (int16x4_t __a)
+vqshld_n_u64 (uint64_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v4hi (__a);
+  return __builtin_aarch64_uqshl_ndi_uus (__a, __b);
 }

-__extension__ extern __inline int32_t
+/* vqshlu */
+
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminv_s32 (int32x2_t __a)
+vqshlu_n_s8 (int8x8_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v2si (__a);
+  return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b);
 }

-__extension__ extern __inline uint8_t
+__extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminv_u8 (uint8x8_t __a)
+vqshlu_n_s16 (int16x4_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_umin_scal_v8qi_uu (__a);
+  return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b);
 }

-__extension__ extern __inline uint16_t
+__extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminv_u16 (uint16x4_t __a)
+vqshlu_n_s32 (int32x2_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_umin_scal_v4hi_uu (__a);
+  return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b);
 }

-__extension__ extern __inline uint32_t
+__extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminv_u32 (uint32x2_t __a)
+vqshlu_n_s64 (int64x1_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_umin_scal_v2si_uu (__a);
+  return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)};
 }

-__extension__ extern __inline float32_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_f32 (float32x4_t __a)
+vqshluq_n_s8 (int8x16_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v4sf (__a);
+  return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b);
 }

-__extension__ extern __inline float64_t
+__extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_f64 (float64x2_t __a)
+vqshluq_n_s16 (int16x8_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_nan_scal_v2df (__a);
+  return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b);
 }

-__extension__ extern __inline int8_t
+__extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_s8 (int8x16_t __a)
+vqshluq_n_s32 (int32x4_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v16qi (__a);
+  return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b);
 }

-__extension__ extern __inline int16_t
+__extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_s16 (int16x8_t __a)
+vqshluq_n_s64 (int64x2_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v8hi (__a);
+  return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b);
 }

-__extension__ extern __inline int32_t
+__extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_s32 (int32x4_t __a)
+vqshlub_n_s8 (int8_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v4si (__a);
+  return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b);
 }

-__extension__ extern __inline uint8_t
+__extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_u8 (uint8x16_t __a)
+vqshluh_n_s16 (int16_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_umin_scal_v16qi_uu (__a);
+  return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b);
 }

-__extension__ extern __inline uint16_t
+__extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_u16 (uint16x8_t __a)
+vqshlus_n_s32 (int32_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_umin_scal_v8hi_uu (__a);
+  return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b);
 }

-__extension__ extern __inline uint32_t
+__extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminvq_u32 (uint32x4_t __a)
+vqshlud_n_s64 (int64_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_umin_scal_v4si_uu (__a);
+  return __builtin_aarch64_sqshlu_ndi_uss (__a, __b);
 }

-/* vminnmv */
+/* vqshrn */

-__extension__ extern __inline float32_t
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminnmv_f32 (float32x2_t __a)
+vqshrn_n_s16 (int16x8_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v2sf (__a);
+  return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b);
 }

-__extension__ extern __inline float32_t
+__extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminnmvq_f32 (float32x4_t __a)
+vqshrn_n_s32 (int32x4_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v4sf (__a);
+  return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b);
 }

-__extension__ extern __inline float64_t
+__extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vminnmvq_f64 (float64x2_t __a)
+vqshrn_n_s64 (int64x2_t __a, const int __b)
 {
-  return __builtin_aarch64_reduc_smin_scal_v2df (__a);
+  return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b);
 }

-/* vmla */
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+  return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b);
+}

-__extension__ extern __inline float32x2_t
+__extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
+vqshrn_n_u32 (uint32x4_t __a, const int __b)
 {
-  return a + b * c;
+  return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b);
 }

-__extension__ extern __inline float64x1_t
+__extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+vqshrn_n_u64 (uint64x2_t __a, const int __b)
 {
-  return __a + __b * __c;
+  return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b);
 }

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
+vqshrnh_n_s16 (int16_t __a, const int __b)
 {
-  return a + b * c;
+  return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b);
 }

-__extension__ extern __inline float64x2_t
+__extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
+vqshrns_n_s32 (int32_t __a, const int __b)
 {
-  return a + b * c;
+  return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b);
 }

-/* vmla_lane */
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_lane_f32 (float32x2_t __a, float32x2_t __b,
-               float32x2_t __c, const int __lane)
+vqshrnd_n_s64 (int64_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b);
 }

-__extension__ extern __inline int16x4_t
+__extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_lane_s16 (int16x4_t __a, int16x4_t __b,
-               int16x4_t __c, const int __lane)
+vqshrnh_n_u16 (uint16_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return __builtin_aarch64_uqshrn_nhi_uus (__a, __b);
 }

-__extension__ extern __inline int32x2_t
+__extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_lane_s32 (int32x2_t __a, int32x2_t __b,
-               int32x2_t __c, const int __lane)
+vqshrns_n_u32 (uint32_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return __builtin_aarch64_uqshrn_nsi_uus (__a, __b);
 }

-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-               uint16x4_t __c, const int __lane)
+vqshrnd_n_u64 (uint64_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return __builtin_aarch64_uqshrn_ndi_uus (__a, __b);
 }

-__extension__ extern __inline uint32x2_t
+/* vqshrun */
+
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-               uint32x2_t __c, const int __lane)
+vqshrun_n_s16 (int16x8_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b);
 }

-/* vmla_laneq */
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_laneq_f32 (float32x2_t __a, float32x2_t __b,
-                float32x4_t __c, const int __lane)
+vqshrun_n_s32 (int32x4_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b);
 }

-__extension__ extern __inline int16x4_t
+__extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_laneq_s16 (int16x4_t __a, int16x4_t __b,
-                int16x8_t __c, const int __lane)
+vqshrun_n_s64 (int64x2_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b);
 }

-__extension__ extern __inline int32x2_t
+__extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_laneq_s32 (int32x2_t __a, int32x2_t __b,
-                int32x4_t __c, const int __lane)
+vqshrunh_n_s16 (int16_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b);
 }

-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-                uint16x8_t __c, const int __lane)
+vqshruns_n_s32 (int32_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b);
 }

-__extension__ extern __inline uint32x2_t
+__extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmla_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-                uint32x4_t __c, const int __lane)
+vqshrund_n_s64 (int64_t __a, const int __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b);
 }

-/* vmlaq_lane */
+/* vqsub */

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline int8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b,
-                float32x2_t __c, const int __lane)
+vqsubb_s8 (int8_t __a, int8_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (int8_t) __builtin_aarch64_sqsubqi (__a, __b);
 }

-__extension__ extern __inline int16x8_t
+__extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b,
-                int16x4_t __c, const int __lane)
+vqsubh_s16 (int16_t __a, int16_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (int16_t) __builtin_aarch64_sqsubhi (__a, __b);
 }

-__extension__ extern __inline int32x4_t
+__extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b,
-                int32x2_t __c, const int __lane)
+vqsubs_s32 (int32_t __a, int32_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (int32_t) __builtin_aarch64_sqsubsi (__a, __b);
 }

-__extension__ extern __inline uint16x8_t
+__extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-                uint16x4_t __c, const int __lane)
+vqsubd_s64 (int64_t __a, int64_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return __builtin_aarch64_sqsubdi (__a, __b);
 }

-__extension__ extern __inline uint32x4_t
+__extension__ extern __inline uint8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-                uint32x2_t __c, const int __lane)
+vqsubb_u8 (uint8_t __a, uint8_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b);
 }

- /* vmlaq_laneq */
-
-__extension__ extern __inline float32x4_t
+__extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-                 float32x4_t __c, const int __lane)
+vqsubh_u16 (uint16_t __a, uint16_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b);
 }

-__extension__ extern __inline int16x8_t
+__extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-                 int16x8_t __c, const int __lane)
+vqsubs_u32 (uint32_t __a, uint32_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b);
 }

-__extension__ extern __inline int32x4_t
+__extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-                 int32x4_t __c, const int __lane)
+vqsubd_u64 (uint64_t __a, uint64_t __b)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  return __builtin_aarch64_uqsubdi_uuu (__a, __b);
 }

-__extension__ extern __inline uint16x8_t
+/* vqtbl2 */
+
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-                 uint16x8_t __c, const int __lane)
+vqtbl2_s8 (int8x16x2_t __tab, uint8x8_t __idx)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1);
+  return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline uint32x4_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlaq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-                 uint32x4_t __c, const int __lane)
+vqtbl2_u8 (uint8x16x2_t __tab, uint8x8_t __idx)
 {
-  return (__a + (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx);
 }

-/* vmls */
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
+vqtbl2_p8 (poly8x16x2_t __tab, uint8x8_t __idx)
 {
-  return a - b * c;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline float64x1_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c)
+vqtbl2q_s8 (int8x16x2_t __tab, uint8x16_t __idx)
 {
-  return __a - __b * __c;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx);
 }

-__extension__ extern __inline float32x4_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
+vqtbl2q_u8 (uint8x16x2_t __tab, uint8x16_t __idx)
 {
-  return a - b * c;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx);
 }

-__extension__ extern __inline float64x2_t
+__extension__ extern __inline poly8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
+vqtbl2q_p8 (poly8x16x2_t __tab, uint8x16_t __idx)
 {
-  return a - b * c;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)__idx);
 }

-/* vmls_lane */
+/* vqtbl3 */

-__extension__ extern __inline float32x2_t
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_lane_f32 (float32x2_t __a, float32x2_t __b,
-               float32x2_t __c, const int __lane)
+vqtbl3_s8 (int8x16x3_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline int16x4_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_lane_s16 (int16x4_t __a, int16x4_t __b,
-               int16x4_t __c, const int __lane)
+vqtbl3_u8 (uint8x16x3_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline int32x2_t
+__extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_lane_s32 (int32x2_t __a, int32x2_t __b,
-               int32x2_t __c, const int __lane)
+vqtbl3_p8 (poly8x16x3_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b,
-               uint16x4_t __c, const int __lane)
+vqtbl3q_s8 (int8x16x3_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
 }

-__extension__ extern __inline uint32x2_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b,
-               uint32x2_t __c, const int __lane)
+vqtbl3q_u8 (uint8x16x3_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
 }

-/* vmls_laneq */
-
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline poly8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_laneq_f32 (float32x2_t __a, float32x2_t __b,
-                float32x4_t __c, const int __lane)
+vqtbl3q_p8 (poly8x16x3_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)__idx);
 }

-__extension__ extern __inline int16x4_t
+/* vqtbl4 */
+
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_laneq_s16 (int16x4_t __a, int16x4_t __b,
-                int16x8_t __c, const int __lane)
+vqtbl4_s8 (int8x16x4_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline int32x2_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_laneq_s32 (int32x2_t __a, int32x2_t __b,
-                int32x4_t __c, const int __lane)
+vqtbl4_u8 (uint8x16x4_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_laneq_u16 (uint16x4_t __a, uint16x4_t __b,
-                uint16x8_t __c, const int __lane)
+vqtbl4_p8 (poly8x16x4_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)__idx);
 }

-__extension__ extern __inline uint32x2_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmls_laneq_u32 (uint32x2_t __a, uint32x2_t __b,
-                uint32x4_t __c, const int __lane)
+vqtbl4q_s8 (int8x16x4_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
 }

-/* vmlsq_lane */
-
-__extension__ extern __inline float32x4_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b,
-                float32x2_t __c, const int __lane)
+vqtbl4q_u8 (uint8x16x4_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
 }

-__extension__ extern __inline int16x8_t
+__extension__ extern __inline poly8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b,
-                int16x4_t __c, const int __lane)
+vqtbl4q_p8 (poly8x16x4_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)__idx);
 }

-__extension__ extern __inline int32x4_t
+
+/* vqtbx2 */
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b,
-                int32x2_t __c, const int __lane)
+vqtbx2_s8 (int8x8_t __r, int8x16x2_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1);
+  return __builtin_aarch64_tbx4v8qi (__r, __o, (int8x8_t)__idx);
 }

-__extension__ extern __inline uint16x8_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b,
-                uint16x4_t __c, const int __lane)
+vqtbx2_u8 (uint8x8_t __r, uint8x16x2_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+                                                (int8x8_t)__idx);
 }

-__extension__ extern __inline uint32x4_t
+__extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b,
-                uint32x2_t __c, const int __lane)
+vqtbx2_p8 (poly8x8_t __r, poly8x16x2_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o,
+                                                (int8x8_t)__idx);
 }

- /* vmlsq_laneq */
-
-__extension__ extern __inline float32x4_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_laneq_f32 (float32x4_t __a, float32x4_t __b,
-                 float32x4_t __c, const int __lane)
+vqtbx2q_s8 (int8x16_t __r, int8x16x2_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, __tab.val[1], 1);
+  return __builtin_aarch64_tbx4v16qi (__r, __o, (int8x16_t)__idx);
 }

-__extension__ extern __inline int16x8_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_laneq_s16 (int16x8_t __a, int16x8_t __b,
-                 int16x8_t __c, const int __lane)
+vqtbx2q_u8 (uint8x16_t __r, uint8x16x2_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o,
+                                                  (int8x16_t)__idx);
 }

-__extension__ extern __inline int32x4_t
+__extension__ extern __inline poly8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_laneq_s32 (int32x4_t __a, int32x4_t __b,
-                 int32x4_t __c, const int __lane)
+vqtbx2q_p8 (poly8x16_t __r, poly8x16x2_t __tab, uint8x16_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)__r, __o,
+                                                  (int8x16_t)__idx);
 }

-__extension__ extern __inline uint16x8_t
+
+/* vqtbx3 */
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_laneq_u16 (uint16x8_t __a, uint16x8_t __b,
-                 uint16x8_t __c, const int __lane)
+vqtbx3_s8 (int8x8_t __r, int8x16x3_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2);
+  return __builtin_aarch64_qtbx3v8qi (__r, __o, (int8x8_t)__idx);
 }

-__extension__ extern __inline uint32x4_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
-                 uint32x4_t __c, const int __lane)
+vqtbx3_u8 (uint8x8_t __r, uint8x16x3_t __tab, uint8x8_t __idx)
 {
-  return (__a - (__b * __aarch64_vget_lane_any (__c, __lane)));
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o,
+                                                 (int8x8_t)__idx);
 }

-/* vmov_n_ */
-
-__extension__ extern __inline float16x4_t
+__extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_f16 (float16_t __a)
+vqtbx3_p8 (poly8x8_t __r, poly8x16x3_t __tab, uint8x8_t __idx)
 {
-  return vdup_n_f16 (__a);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)__r, __o,
+                                                 (int8x8_t)__idx);
 }

-__extension__ extern __inline float32x2_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_f32 (float32_t __a)
+vqtbx3q_s8 (int8x16_t __r, int8x16x3_t __tab, uint8x16_t __idx)
 {
-  return vdup_n_f32 (__a);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, __tab.val[2], 2);
+  return __builtin_aarch64_qtbx3v16qi (__r, __o, (int8x16_t)__idx);
 }

-__extension__ extern __inline float64x1_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_f64 (float64_t __a)
+vqtbx3q_u8 (uint8x16_t __r, uint8x16x3_t __tab, uint8x16_t __idx)
 {
-  return (float64x1_t) {__a};
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o,
+                                                   (int8x16_t)__idx);
 }

-__extension__ extern __inline poly8x8_t
+__extension__ extern __inline poly8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_p8 (poly8_t __a)
+vqtbx3q_p8 (poly8x16_t __r, poly8x16x3_t __tab, uint8x16_t __idx)
 {
-  return vdup_n_p8 (__a);
+  __builtin_aarch64_simd_ci __o;
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)__r, __o,
+                                                   (int8x16_t)__idx);
 }

-__extension__ extern __inline poly16x4_t
+/* vqtbx4 */
+
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_p16 (poly16_t __a)
+vqtbx4_s8 (int8x8_t __r, int8x16x4_t __tab, uint8x8_t __idx)
 {
-  return vdup_n_p16 (__a);
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3);
+  return __builtin_aarch64_qtbx4v8qi (__r, __o, (int8x8_t)__idx);
 }

-__extension__ extern __inline poly64x1_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_p64 (poly64_t __a)
+vqtbx4_u8 (uint8x8_t __r, uint8x16x4_t __tab, uint8x8_t __idx)
 {
-  return vdup_n_p64 (__a);
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o,
+                                                 (int8x8_t)__idx);
 }

-__extension__ extern __inline int8x8_t
+__extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_s8 (int8_t __a)
+vqtbx4_p8 (poly8x8_t __r, poly8x16x4_t __tab, uint8x8_t __idx)
 {
-  return vdup_n_s8 (__a);
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)__r, __o,
+                                                 (int8x8_t)__idx);
 }

-__extension__ extern __inline int16x4_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_s16 (int16_t __a)
+vqtbx4q_s8 (int8x16_t __r, int8x16x4_t __tab, uint8x16_t __idx)
 {
-  return vdup_n_s16 (__a);
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, __tab.val[3], 3);
+  return __builtin_aarch64_qtbx4v16qi (__r, __o, (int8x16_t)__idx);
 }

-__extension__ extern __inline int32x2_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_s32 (int32_t __a)
+vqtbx4q_u8 (uint8x16_t __r, uint8x16x4_t __tab, uint8x16_t __idx)
 {
-  return vdup_n_s32 (__a);
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o,
+                                                   (int8x16_t)__idx);
 }

-__extension__ extern __inline int64x1_t
+__extension__ extern __inline poly8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_s64 (int64_t __a)
+vqtbx4q_p8 (poly8x16_t __r, poly8x16x4_t __tab, uint8x16_t __idx)
 {
-  return (int64x1_t) {__a};
+  __builtin_aarch64_simd_xi __o;
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[0], 0);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[1], 1);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[2], 2);
+  __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)__tab.val[3], 3);
+  return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)__r, __o,
+                                                   (int8x16_t)__idx);
 }

-__extension__ extern __inline uint8x8_t
+/* vrbit */
+
+__extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_u8 (uint8_t __a)
+vrbit_p8 (poly8x8_t __a)
 {
-  return vdup_n_u8 (__a);
+  return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
 }

-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_u16 (uint16_t __a)
+vrbit_s8 (int8x8_t __a)
 {
-  return vdup_n_u16 (__a);
+  return __builtin_aarch64_rbitv8qi (__a);
 }

-__extension__ extern __inline uint32x2_t
+__extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_u32 (uint32_t __a)
+vrbit_u8 (uint8x8_t __a)
 {
-  return vdup_n_u32 (__a);
+  return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a);
 }

-__extension__ extern __inline uint64x1_t
+__extension__ extern __inline poly8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmov_n_u64 (uint64_t __a)
+vrbitq_p8 (poly8x16_t __a)
 {
-  return (uint64x1_t) {__a};
+  return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a);
 }

-__extension__ extern __inline float16x8_t
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vmovq_n_f16 (float16_t __a)
+vrbitq_s8 (int8x16_t __a)
 {
vdupq_n_f16 (__a); + return __builtin_aarch64_rbitv16qi (__a); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_f32 (float32_t __a) +vrbitq_u8 (uint8x16_t __a) { - return vdupq_n_f32 (__a); + return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a); } -__extension__ extern __inline float64x2_t +/* vrecpe */ + +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_f64 (float64_t __a) +vrecpe_u32 (uint32x2_t __a) { - return vdupq_n_f64 (__a); + return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_p8 (poly8_t __a) +vrecpeq_u32 (uint32x4_t __a) { - return vdupq_n_p8 (__a); + return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_p16 (poly16_t __a) +vrecpes_f32 (float32_t __a) { - return vdupq_n_p16 (__a); + return __builtin_aarch64_frecpesf (__a); } -__extension__ extern __inline poly64x2_t +__extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_p64 (poly64_t __a) +vrecped_f64 (float64_t __a) { - return vdupq_n_p64 (__a); + return __builtin_aarch64_frecpedf (__a); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_s8 (int8_t __a) +vrecpe_f32 (float32x2_t __a) { - return vdupq_n_s8 (__a); + return __builtin_aarch64_frecpev2sf (__a); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_s16 (int16_t __a) +vrecpe_f64 (float64x1_t __a) { - return vdupq_n_s16 (__a); + return (float64x1_t) { vrecped_f64 (vget_lane_f64 (__a, 0)) }; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_s32 (int32_t __a) +vrecpeq_f32 (float32x4_t __a) { - return vdupq_n_s32 (__a); + return __builtin_aarch64_frecpev4sf (__a); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_s64 (int64_t __a) +vrecpeq_f64 (float64x2_t __a) { - return vdupq_n_s64 (__a); + return __builtin_aarch64_frecpev2df (__a); } -__extension__ extern __inline uint8x16_t +/* vrecps */ + +__extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_u8 (uint8_t __a) +vrecpss_f32 (float32_t __a, float32_t __b) { - return vdupq_n_u8 (__a); + return __builtin_aarch64_frecpssf (__a, __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_u16 (uint16_t __a) +vrecpsd_f64 (float64_t __a, float64_t __b) { - return vdupq_n_u16 (__a); + return __builtin_aarch64_frecpsdf (__a, __b); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_u32 (uint32_t __a) +vrecps_f32 
(float32x2_t __a, float32x2_t __b) { - return vdupq_n_u32 (__a); + return __builtin_aarch64_frecpsv2sf (__a, __b); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmovq_n_u64 (uint64_t __a) +vrecps_f64 (float64x1_t __a, float64x1_t __b) { - return vdupq_n_u64 (__a); + return (float64x1_t) { vrecpsd_f64 (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0)) }; } -/* vmul_lane */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __lane) +vrecpsq_f32 (float32x4_t __a, float32x4_t __b) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_frecpsv4sf (__a, __b); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_lane_f64 (float64x1_t __a, float64x1_t __b, const int __lane) +vrecpsq_f64 (float64x2_t __a, float64x2_t __b) { - return __a * __b; + return __builtin_aarch64_frecpsv2df (__a, __b); } -__extension__ extern __inline int16x4_t +/* vrecpx */ + +__extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __lane) +vrecpxs_f32 (float32_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_frecpxsf (__a); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __lane) +vrecpxd_f64 (float64_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_frecpxdf (__a); } -__extension__ extern __inline uint16x4_t + +/* vrev */ + +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __lane) +vrev16_p8 (poly8x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __lane) +vrev16_s8 (int8x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -/* vmuld_lane */ - -__extension__ extern __inline float64_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmuld_lane_f64 (float64_t __a, float64x1_t __b, const int __lane) +vrev16_u8 (uint8x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ extern __inline float64_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmuld_laneq_f64 (float64_t __a, float64x2_t __b, const int __lane) +vrev16q_p8 (poly8x16_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } -/* vmuls_lane */ - -__extension__ extern __inline float32_t 
+__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmuls_lane_f32 (float32_t __a, float32x2_t __b, const int __lane) +vrev16q_s8 (int8x16_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } -__extension__ extern __inline float32_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmuls_laneq_f32 (float32_t __a, float32x4_t __b, const int __lane) +vrev16q_u8 (uint8x16_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, + (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); } -/* vmul_laneq */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_laneq_f32 (float32x2_t __a, float32x4_t __b, const int __lane) +vrev32_p8 (poly8x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_laneq_f64 (float64x1_t __a, float64x2_t __b, const int __lane) +vrev32_p16 (poly16x4_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __lane) +vrev32_s8 (int8x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __lane) +vrev32_s16 (int16x4_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __lane) +vrev32_u8 (uint8x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __lane) +vrev32_u16 (uint16x4_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x4_t) { 1, 0, 3, 2 }); } -/* vmul_n */ - -__extension__ extern __inline float64x1_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_n_f64 (float64x1_t __a, float64_t __b) +vrev32q_p8 (poly8x16_t __a) { - return (float64x1_t) { vget_lane_f64 (__a, 0) * __b }; + return __builtin_shuffle (__a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } -/* vmulq_lane */ - -__extension__ extern __inline float32x4_t +__extension__ extern __inline poly16x8_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __lane) +vrev32q_p16 (poly16x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_lane_f64 (float64x2_t __a, float64x1_t __b, const int __lane) +vrev32q_s8 (int8x16_t __a) { - __AARCH64_LANE_CHECK (__a, __lane); - return __a * __b[0]; + return __builtin_shuffle (__a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __lane) +vrev32q_s16 (int16x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __lane) +vrev32q_u8 (uint8x16_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, + (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __lane) +vrev32q_u16 (uint16x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __lane) +vrev64_f16 (float16x4_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } -/* vmulq_laneq */ - -__extension__ extern __inline float32x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_laneq_f32 (float32x4_t __a, float32x4_t __b, const int __lane) +vrev64_f32 (float32x2_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_laneq_f64 (float64x2_t __a, float64x2_t __b, const int __lane) +vrev64_p8 (poly8x8_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_laneq_s16 (int16x8_t __a, int16x8_t __b, const int __lane) +vrev64_p16 (poly16x4_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_laneq_s32 (int32x4_t __a, int32x4_t __b, const int __lane) +vrev64_s8 (int8x8_t __a) 
{ - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_laneq_u16 (uint16x8_t __a, uint16x8_t __b, const int __lane) +vrev64_s16 (int16x4_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_laneq_u32 (uint32x4_t __a, uint32x4_t __b, const int __lane) +vrev64_s32 (int32x2_t __a) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); } -/* vmul_n. */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_n_f32 (float32x2_t __a, float32_t __b) +vrev64_u8 (uint8x8_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_n_f32 (float32x4_t __a, float32_t __b) +vrev64_u16 (uint16x4_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_n_f64 (float64x2_t __a, float64_t __b) +vrev64_u32 (uint32x2_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint32x2_t) { 1, 0 }); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_n_s16 (int16x4_t __a, int16_t __b) +vrev64q_f16 (float16x8_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_n_s16 (int16x8_t __a, int16_t __b) +vrev64q_f32 (float32x4_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_n_s32 (int32x2_t __a, int32_t __b) +vrev64q_p8 (poly8x16_t __a) { - return __a * __b; + return __builtin_shuffle (__a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_n_s32 (int32x4_t __a, int32_t __b) +vrev64q_p16 (poly16x8_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_n_u16 (uint16x4_t __a, uint16_t __b) +vrev64q_s8 (int8x16_t __a) { - return __a * __b; + return __builtin_shuffle (__a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } -__extension__ extern __inline uint16x8_t +__extension__ extern 
__inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_n_u16 (uint16x8_t __a, uint16_t __b) +vrev64q_s16 (int16x8_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_n_u32 (uint32x2_t __a, uint32_t __b) +vrev64q_s32 (int32x4_t __a) { - return __a * __b; + return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_n_u32 (uint32x4_t __a, uint32_t __b) +vrev64q_u8 (uint8x16_t __a) { - return __a * __b; + return __builtin_shuffle (__a, + (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); } -/* vmvn */ - -__extension__ extern __inline poly8x8_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvn_p8 (poly8x8_t __a) +vrev64q_u16 (uint16x8_t __a) { - return (poly8x8_t) ~((int8x8_t) __a); + return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvn_s8 (int8x8_t __a) +vrev64q_u32 (uint32x4_t __a) { - return ~__a; + return __builtin_shuffle (__a, (uint32x4_t) { 1, 0, 3, 2 }); } -__extension__ extern __inline int16x4_t +/* vrnd */ + +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvn_s16 (int16x4_t __a) +vrnd_f32 (float32x2_t __a) { - return ~__a; + return __builtin_aarch64_btruncv2sf (__a); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvn_s32 (int32x2_t __a) +vrnd_f64 (float64x1_t __a) { - return ~__a; + return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvn_u8 (uint8x8_t __a) +vrndq_f32 (float32x4_t __a) { - return ~__a; + return __builtin_aarch64_btruncv4sf (__a); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvn_u16 (uint16x4_t __a) +vrndq_f64 (float64x2_t __a) { - return ~__a; + return __builtin_aarch64_btruncv2df (__a); } -__extension__ extern __inline uint32x2_t +/* vrnda */ + +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvn_u32 (uint32x2_t __a) +vrnda_f32 (float32x2_t __a) { - return ~__a; + return __builtin_aarch64_roundv2sf (__a); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvnq_p8 (poly8x16_t __a) +vrnda_f64 (float64x1_t __a) { - return (poly8x16_t) ~((int8x16_t) __a); + return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvnq_s8 (int8x16_t __a) +vrndaq_f32 (float32x4_t __a) { - return ~__a; + return 
__builtin_aarch64_roundv4sf (__a); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvnq_s16 (int16x8_t __a) +vrndaq_f64 (float64x2_t __a) { - return ~__a; + return __builtin_aarch64_roundv2df (__a); } -__extension__ extern __inline int32x4_t +/* vrndi */ + +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvnq_s32 (int32x4_t __a) +vrndi_f32 (float32x2_t __a) { - return ~__a; + return __builtin_aarch64_nearbyintv2sf (__a); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvnq_u8 (uint8x16_t __a) +vrndi_f64 (float64x1_t __a) { - return ~__a; + return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvnq_u16 (uint16x8_t __a) +vrndiq_f32 (float32x4_t __a) { - return ~__a; + return __builtin_aarch64_nearbyintv4sf (__a); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmvnq_u32 (uint32x4_t __a) +vrndiq_f64 (float64x2_t __a) { - return ~__a; + return __builtin_aarch64_nearbyintv2df (__a); } -/* vneg */ +/* vrndm */ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vneg_f32 (float32x2_t __a) +vrndm_f32 (float32x2_t __a) { - return -__a; + return __builtin_aarch64_floorv2sf (__a); } __extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vneg_f64 (float64x1_t __a) +vrndm_f64 (float64x1_t __a) { - return -__a; + return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vneg_s8 (int8x8_t __a) +vrndmq_f32 (float32x4_t __a) { - return -__a; + return __builtin_aarch64_floorv4sf (__a); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vneg_s16 (int16x4_t __a) +vrndmq_f64 (float64x2_t __a) { - return -__a; + return __builtin_aarch64_floorv2df (__a); } -__extension__ extern __inline int32x2_t +/* vrndn */ + +__extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vneg_s32 (int32x2_t __a) +vrndns_f32 (float32_t __a) { - return -__a; + return __builtin_aarch64_frintnsf (__a); } -__extension__ extern __inline int64x1_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vneg_s64 (int64x1_t __a) +vrndn_f32 (float32x2_t __a) { - return -__a; + return __builtin_aarch64_frintnv2sf (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndn_f64 (float64x1_t __a) +{ + return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])}; } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vnegq_f32 (float32x4_t __a) +vrndnq_f32 (float32x4_t __a) { - return -__a; + return __builtin_aarch64_frintnv4sf (__a); } __extension__ extern __inline float64x2_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vnegq_f64 (float64x2_t __a) +vrndnq_f64 (float64x2_t __a) { - return -__a; + return __builtin_aarch64_frintnv2df (__a); } -__extension__ extern __inline int8x16_t +/* vrndp */ + +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vnegq_s8 (int8x16_t __a) +vrndp_f32 (float32x2_t __a) { - return -__a; + return __builtin_aarch64_ceilv2sf (__a); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vnegq_s16 (int16x8_t __a) +vrndp_f64 (float64x1_t __a) { - return -__a; + return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vnegq_s32 (int32x4_t __a) +vrndpq_f32 (float32x4_t __a) { - return -__a; + return __builtin_aarch64_ceilv4sf (__a); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vnegq_s64 (int64x2_t __a) +vrndpq_f64 (float64x2_t __a) { - return -__a; + return __builtin_aarch64_ceilv2df (__a); } -/* vpadd */ +/* vrndx */ __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_f32 (float32x2_t __a, float32x2_t __b) +vrndx_f32 (float32x2_t __a) { - return __builtin_aarch64_faddpv2sf (__a, __b); + return __builtin_aarch64_rintv2sf (__a); +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndx_f64 (float64x1_t __a) +{ + return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_f32 (float32x4_t __a, float32x4_t __b) +vrndxq_f32 (float32x4_t __a) { - return __builtin_aarch64_faddpv4sf (__a, __b); + return __builtin_aarch64_rintv4sf (__a); } __extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_f64 (float64x2_t __a, float64x2_t __b) +vrndxq_f64 (float64x2_t __a) { - return __builtin_aarch64_faddpv2df (__a, __b); + return __builtin_aarch64_rintv2df (__a); } +/* vrshl */ + __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_s8 (int8x8_t __a, int8x8_t __b) +vrshl_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_addpv8qi (__a, __b); + return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_s16 (int16x4_t __a, int16x4_t __b) +vrshl_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_addpv4hi (__a, __b); + return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_s32 (int32x2_t __a, int32x2_t __b) +vrshl_s32 (int32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_addpv2si (__a, __b); + return (int32x2_t) __builtin_aarch64_srshlv2si (__a, __b); +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshl_s64 (int64x1_t __a, int64x1_t __b) +{ + return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], 
__b[0])}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_u8 (uint8x8_t __a, uint8x8_t __b) +vrshl_u8 (uint8x8_t __a, int8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_addpv8qi ((int8x8_t) __a, - (int8x8_t) __b); + return __builtin_aarch64_urshlv8qi_uus (__a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_u16 (uint16x4_t __a, uint16x4_t __b) +vrshl_u16 (uint16x4_t __a, int16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_addpv4hi ((int16x4_t) __a, - (int16x4_t) __b); + return __builtin_aarch64_urshlv4hi_uus (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_u32 (uint32x2_t __a, uint32x2_t __b) +vrshl_u32 (uint32x2_t __a, int32x2_t __b) { - return (uint32x2_t) __builtin_aarch64_addpv2si ((int32x2_t) __a, - (int32x2_t) __b); + return __builtin_aarch64_urshlv2si_uus (__a, __b); } -__extension__ extern __inline float32_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadds_f32 (float32x2_t __a) +vrshl_u64 (uint64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_reduc_plus_scal_v2sf (__a); + return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])}; } -__extension__ extern __inline float64_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddd_f64 (float64x2_t __a) +vrshlq_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_reduc_plus_scal_v2df (__a); + return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b); } -__extension__ extern __inline int64_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddd_s64 (int64x2_t __a) +vrshlq_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_addpdi (__a); + return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b); } -__extension__ extern __inline uint64_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddd_u64 (uint64x2_t __a) +vrshlq_s32 (int32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_addpdi ((int64x2_t) __a); + return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b); } -/* vqabs */ - __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabsq_s64 (int64x2_t __a) +vrshlq_s64 (int64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_sqabsv2di (__a); + return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabsb_s8 (int8_t __a) +vrshlq_u8 (uint8x16_t __a, int8x16_t __b) { - return (int8_t) __builtin_aarch64_sqabsqi (__a); + return __builtin_aarch64_urshlv16qi_uus (__a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabsh_s16 (int16_t __a) +vrshlq_u16 (uint16x8_t __a, int16x8_t __b) { - return (int16_t) __builtin_aarch64_sqabshi (__a); + return __builtin_aarch64_urshlv8hi_uus (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabss_s32 (int32_t __a) +vrshlq_u32 (uint32x4_t __a, 
int32x4_t __b) { - return (int32_t) __builtin_aarch64_sqabssi (__a); + return __builtin_aarch64_urshlv4si_uus (__a, __b); +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshlq_u64 (uint64x2_t __a, int64x2_t __b) +{ + return __builtin_aarch64_urshlv2di_uus (__a, __b); } __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqabsd_s64 (int64_t __a) +vrshld_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_sqabsdi (__a); + return __builtin_aarch64_srshldi (__a, __b); } -/* vqadd */ +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshld_u64 (uint64_t __a, int64_t __b) +{ + return __builtin_aarch64_urshldi_uus (__a, __b); +} -__extension__ extern __inline int8_t +/* vrshr */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqaddb_s8 (int8_t __a, int8_t __b) +vrshr_n_s8 (int8x8_t __a, const int __b) { - return (int8_t) __builtin_aarch64_sqaddqi (__a, __b); + return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqaddh_s16 (int16_t __a, int16_t __b) +vrshr_n_s16 (int16x4_t __a, const int __b) { - return (int16_t) __builtin_aarch64_sqaddhi (__a, __b); + return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqadds_s32 (int32_t __a, int32_t __b) +vrshr_n_s32 (int32x2_t __a, const int __b) { - return (int32_t) __builtin_aarch64_sqaddsi (__a, __b); + return (int32x2_t) __builtin_aarch64_srshr_nv2si (__a, __b); } -__extension__ extern __inline int64_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqaddd_s64 (int64_t __a, int64_t __b) +vrshr_n_s64 (int64x1_t __a, const int __b) { - return __builtin_aarch64_sqadddi (__a, __b); + return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)}; } -__extension__ extern __inline uint8_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqaddb_u8 (uint8_t __a, uint8_t __b) +vrshr_n_u8 (uint8x8_t __a, const int __b) { - return (uint8_t) __builtin_aarch64_uqaddqi_uuu (__a, __b); + return __builtin_aarch64_urshr_nv8qi_uus (__a, __b); } -__extension__ extern __inline uint16_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqaddh_u16 (uint16_t __a, uint16_t __b) +vrshr_n_u16 (uint16x4_t __a, const int __b) { - return (uint16_t) __builtin_aarch64_uqaddhi_uuu (__a, __b); + return __builtin_aarch64_urshr_nv4hi_uus (__a, __b); } -__extension__ extern __inline uint32_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqadds_u32 (uint32_t __a, uint32_t __b) +vrshr_n_u32 (uint32x2_t __a, const int __b) { - return (uint32_t) __builtin_aarch64_uqaddsi_uuu (__a, __b); + return __builtin_aarch64_urshr_nv2si_uus (__a, __b); +} + +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshr_n_u64 (uint64x1_t __a, const int __b) +{ + return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)}; +} + 
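(For reference, a minimal user-side sketch of the rounding-shift semantics these builtins map to. The lane values and the tiny driver below are illustrative only, assuming a standalone aarch64 toolchain with this header on the include path.)

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  /* vrshr_n_*: shift right by an immediate with rounding, i.e. each
     lane becomes (x + (1 << (n - 1))) >> n.  */
  int16x4_t v = {5, -5, 7, 8};
  int16x4_t r = vrshr_n_s16 (v, 1);              /* {3, -2, 4, 4} */

  /* vrshl_*: shift left by a per-lane signed count; negative counts
     shift right, again with rounding.  */
  int16x4_t s = vrshl_s16 (v, (int16x4_t) {1, -1, 0, -2});

  for (int i = 0; i < 4; i++)
    printf ("%d %d\n", r[i], s[i]);
  return 0;
}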
+__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrshrq_n_s8 (int8x16_t __a, const int __b) +{ + return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b); } -__extension__ extern __inline uint64_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqaddd_u64 (uint64_t __a, uint64_t __b) +vrshrq_n_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_uqadddi_uuu (__a, __b); + return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b); } -/* vqdmlal */ - __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +vrshrq_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlalv4hi (__a, __b, __c); + return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) +vrshrq_n_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2v8hi (__a, __b, __c); + return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c, - int const __d) +vrshrq_n_u8 (uint8x16_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_lanev8hi (__a, __b, __c, __d); + return __builtin_aarch64_urshr_nv16qi_uus (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c, - int const __d) +vrshrq_n_u16 (uint16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_laneqv8hi (__a, __b, __c, __d); + return __builtin_aarch64_urshr_nv8hi_uus (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) +vrshrq_n_u32 (uint32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmlal2_nv8hi (__a, __b, __c); + return __builtin_aarch64_urshr_nv4si_uus (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) +vrshrq_n_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_lanev4hi (__a, __b, __c, __d); + return __builtin_aarch64_urshr_nv2di_uus (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) +vrshrd_n_s64 (int64_t __a, const int __b) { - return __builtin_aarch64_sqdmlal_laneqv4hi (__a, __b, __c, __d); + return __builtin_aarch64_srshr_ndi (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +vrshrd_n_u64 (uint64_t __a, const int __b) 
{ - return __builtin_aarch64_sqdmlal_nv4hi (__a, __b, __c); + return __builtin_aarch64_urshr_ndi_uus (__a, __b); } -__extension__ extern __inline int64x2_t +/* vrsqrte. */ + +__extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +vrsqrtes_f32 (float32_t __a) { - return __builtin_aarch64_sqdmlalv2si (__a, __b, __c); + return __builtin_aarch64_rsqrtesf (__a); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) +vrsqrted_f64 (float64_t __a) { - return __builtin_aarch64_sqdmlal2v4si (__a, __b, __c); + return __builtin_aarch64_rsqrtedf (__a); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c, - int const __d) +vrsqrte_f32 (float32x2_t __a) { - return __builtin_aarch64_sqdmlal2_lanev4si (__a, __b, __c, __d); + return __builtin_aarch64_rsqrtev2sf (__a); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c, - int const __d) +vrsqrte_f64 (float64x1_t __a) { - return __builtin_aarch64_sqdmlal2_laneqv4si (__a, __b, __c, __d); + return (float64x1_t) {vrsqrted_f64 (vget_lane_f64 (__a, 0))}; } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) +vrsqrteq_f32 (float32x4_t __a) { - return __builtin_aarch64_sqdmlal2_nv4si (__a, __b, __c); + return __builtin_aarch64_rsqrtev4sf (__a); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) +vrsqrteq_f64 (float64x2_t __a) { - return __builtin_aarch64_sqdmlal_lanev2si (__a, __b, __c, __d); + return __builtin_aarch64_rsqrtev2df (__a); } -__extension__ extern __inline int64x2_t +/* vrsqrts. 
*/ + +__extension__ extern __inline float32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) +vrsqrtss_f32 (float32_t __a, float32_t __b) { - return __builtin_aarch64_sqdmlal_laneqv2si (__a, __b, __c, __d); + return __builtin_aarch64_rsqrtssf (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +vrsqrtsd_f64 (float64_t __a, float64_t __b) { - return __builtin_aarch64_sqdmlal_nv2si (__a, __b, __c); + return __builtin_aarch64_rsqrtsdf (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlalh_s16 (int32_t __a, int16_t __b, int16_t __c) +vrsqrts_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_sqdmlalhi (__a, __b, __c); + return __builtin_aarch64_rsqrtsv2sf (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlalh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d) +vrsqrts_f64 (float64x1_t __a, float64x1_t __b) { - return __builtin_aarch64_sqdmlal_lanehi (__a, __b, __c, __d); + return (float64x1_t) {vrsqrtsd_f64 (vget_lane_f64 (__a, 0), + vget_lane_f64 (__b, 0))}; } -__extension__ extern __inline int32_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlalh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d) +vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_sqdmlal_laneqhi (__a, __b, __c, __d); + return __builtin_aarch64_rsqrtsv4sf (__a, __b); } -__extension__ extern __inline int64_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlals_s32 (int64_t __a, int32_t __b, int32_t __c) +vrsqrtsq_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_sqdmlalsi (__a, __b, __c); + return __builtin_aarch64_rsqrtsv2df (__a, __b); } -__extension__ extern __inline int64_t +/* vrsra */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlals_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d) +vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return __builtin_aarch64_sqdmlal_lanesi (__a, __b, __c, __d); + return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c); } -__extension__ extern __inline int64_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlals_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d) +vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return __builtin_aarch64_sqdmlal_laneqsi (__a, __b, __c, __d); + return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c); } -/* vqdmlsl */ - -__extension__ extern __inline int32x4_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) +vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return __builtin_aarch64_sqdmlslv4hi (__a, __b, __c); + return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c); } -__extension__ extern 
__inline int32x4_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) +vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl2v8hi (__a, __b, __c); + return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)}; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_lane_s16 (int32x4_t __a, int16x8_t __b, int16x4_t __c, - int const __d) +vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl2_lanev8hi (__a, __b, __c, __d); + return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_laneq_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c, - int const __d) +vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl2_laneqv8hi (__a, __b, __c, __d); + return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_n_s16 (int32x4_t __a, int16x8_t __b, int16_t __c) +vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl2_nv8hi (__a, __b, __c); + return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, int const __d) +vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl_lanev4hi (__a, __b, __c, __d); + return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)}; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_laneq_s16 (int32x4_t __a, int16x4_t __b, int16x8_t __c, int const __d) +vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl_laneqv4hi (__a, __b, __c, __d); + return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c) +vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl_nv4hi (__a, __b, __c); + return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) +vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) { - return __builtin_aarch64_sqdmlslv2si (__a, __b, __c); + return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) +vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) { - return 
__builtin_aarch64_sqdmlsl2v4si (__a, __b, __c); + return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_lane_s32 (int64x2_t __a, int32x4_t __b, int32x2_t __c, - int const __d) +vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl2_lanev4si (__a, __b, __c, __d); + return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_laneq_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c, - int const __d) +vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl2_laneqv4si (__a, __b, __c, __d); + return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_high_n_s32 (int64x2_t __a, int32x4_t __b, int32_t __c) +vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl2_nv4si (__a, __b, __c); + return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, int const __d) +vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl_lanev2si (__a, __b, __c, __d); + return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_laneq_s32 (int64x2_t __a, int32x2_t __b, int32x4_t __c, int const __d) +vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl_laneqv2si (__a, __b, __c, __d); + return __builtin_aarch64_srsra_ndi (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c) +vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) { - return __builtin_aarch64_sqdmlsl_nv2si (__a, __b, __c); + return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c); } -__extension__ extern __inline int32_t +#pragma GCC push_options +#pragma GCC target ("+nothing+crypto") + +/* vsha1 */ + +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlslh_s16 (int32_t __a, int16_t __b, int16_t __c) +vsha1cq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - return __builtin_aarch64_sqdmlslhi (__a, __b, __c); + return __builtin_aarch64_crypto_sha1cv4si_uuuu (__hash_abcd, __hash_e, __wk); } -__extension__ extern __inline int32_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlslh_lane_s16 (int32_t __a, int16_t __b, int16x4_t __c, const int __d) +vsha1mq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - return __builtin_aarch64_sqdmlsl_lanehi (__a, __b, __c, __d); + return __builtin_aarch64_crypto_sha1mv4si_uuuu (__hash_abcd, __hash_e, __wk); } 
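(As a usage sketch only: ACLE code typically chains these SHA-1 builtins so that the rotated lane 0 of the old ABCD state, obtained via vsha1h_u32, becomes the E operand of the next quad-round. The helper name below is illustrative and assumes the crypto extension is enabled, e.g. -march=armv8-a+crypto.)

#include <arm_neon.h>

/* Advance the SHA-1 state by four "choose" rounds: vsha1cq_u32 updates
   ABCD, while vsha1h_u32 derives the E value for the following quad.  */
static inline uint32x4_t
sha1c_rounds (uint32x4_t abcd, uint32_t e, uint32x4_t wk, uint32_t *e_out)
{
  *e_out = vsha1h_u32 (vgetq_lane_u32 (abcd, 0));
  return vsha1cq_u32 (abcd, e, wk);
}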
-__extension__ extern __inline int32_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlslh_laneq_s16 (int32_t __a, int16_t __b, int16x8_t __c, const int __d) +vsha1pq_u32 (uint32x4_t __hash_abcd, uint32_t __hash_e, uint32x4_t __wk) { - return __builtin_aarch64_sqdmlsl_laneqhi (__a, __b, __c, __d); + return __builtin_aarch64_crypto_sha1pv4si_uuuu (__hash_abcd, __hash_e, __wk); } -__extension__ extern __inline int64_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsls_s32 (int64_t __a, int32_t __b, int32_t __c) +vsha1h_u32 (uint32_t __hash_e) { - return __builtin_aarch64_sqdmlslsi (__a, __b, __c); + return __builtin_aarch64_crypto_sha1hsi_uu (__hash_e); } -__extension__ extern __inline int64_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsls_lane_s32 (int64_t __a, int32_t __b, int32x2_t __c, const int __d) +vsha1su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7, uint32x4_t __w8_11) { - return __builtin_aarch64_sqdmlsl_lanesi (__a, __b, __c, __d); + return __builtin_aarch64_crypto_sha1su0v4si_uuuu (__w0_3, __w4_7, __w8_11); } -__extension__ extern __inline int64_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmlsls_laneq_s32 (int64_t __a, int32_t __b, int32x4_t __c, const int __d) +vsha1su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w12_15) { - return __builtin_aarch64_sqdmlsl_laneqsi (__a, __b, __c, __d); + return __builtin_aarch64_crypto_sha1su1v4si_uuu (__tw0_3, __w12_15); } -/* vqdmulh */ - -__extension__ extern __inline int16x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +vsha256hq_u32 (uint32x4_t __hash_abcd, uint32x4_t __hash_efgh, uint32x4_t __wk) { - return __builtin_aarch64_sqdmulh_lanev4hi (__a, __b, __c); + return __builtin_aarch64_crypto_sha256hv4si_uuuu (__hash_abcd, __hash_efgh, + __wk); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +vsha256h2q_u32 (uint32x4_t __hash_efgh, uint32x4_t __hash_abcd, uint32x4_t __wk) { - return __builtin_aarch64_sqdmulh_lanev2si (__a, __b, __c); + return __builtin_aarch64_crypto_sha256h2v4si_uuuu (__hash_efgh, __hash_abcd, + __wk); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +vsha256su0q_u32 (uint32x4_t __w0_3, uint32x4_t __w4_7) { - return __builtin_aarch64_sqdmulh_lanev8hi (__a, __b, __c); + return __builtin_aarch64_crypto_sha256su0v4si_uuu (__w0_3, __w4_7); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +vsha256su1q_u32 (uint32x4_t __tw0_3, uint32x4_t __w8_11, uint32x4_t __w12_15) { - return __builtin_aarch64_sqdmulh_lanev4si (__a, __b, __c); + return __builtin_aarch64_crypto_sha256su1v4si_uuuu (__tw0_3, __w8_11, + __w12_15); } -__extension__ extern __inline int16_t +__extension__ extern __inline poly128_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vqdmulhh_s16 (int16_t __a, int16_t __b) +vmull_p64 (poly64_t __a, poly64_t __b) { - return (int16_t) __builtin_aarch64_sqdmulhhi (__a, __b); + return + __builtin_aarch64_crypto_pmulldi_ppp (__a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline poly128_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +vmull_high_p64 (poly64x2_t __a, poly64x2_t __b) { - return __builtin_aarch64_sqdmulh_lanehi (__a, __b, __c); + return __builtin_aarch64_crypto_pmullv2di_ppp (__a, __b); } -__extension__ extern __inline int16_t +#pragma GCC pop_options + +/* vshl */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) +vshl_n_s8 (int8x8_t __a, const int __b) { - return __builtin_aarch64_sqdmulh_laneqhi (__a, __b, __c); + return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhs_s32 (int32_t __a, int32_t __b) +vshl_n_s16 (int16x4_t __a, const int __b) { - return (int32_t) __builtin_aarch64_sqdmulhsi (__a, __b); + return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +vshl_n_s32 (int32x2_t __a, const int __b) { - return __builtin_aarch64_sqdmulh_lanesi (__a, __b, __c); + return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) +vshl_n_s64 (int64x1_t __a, const int __b) { - return __builtin_aarch64_sqdmulh_laneqsi (__a, __b, __c); + return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)}; } -/* vqdmull */ - -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_s16 (int16x4_t __a, int16x4_t __b) +vshl_n_u8 (uint8x8_t __a, const int __b) { - return __builtin_aarch64_sqdmullv4hi (__a, __b); + return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_s16 (int16x8_t __a, int16x8_t __b) +vshl_n_u16 (uint16x4_t __a, const int __b) { - return __builtin_aarch64_sqdmull2v8hi (__a, __b); + return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_lane_s16 (int16x8_t __a, int16x4_t __b, int const __c) +vshl_n_u32 (uint32x2_t __a, const int __b) { - return __builtin_aarch64_sqdmull2_lanev8hi (__a, __b,__c); + return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_laneq_s16 (int16x8_t __a, int16x8_t __b, int const __c) +vshl_n_u64 (uint64x1_t __a, 
const int __b) { - return __builtin_aarch64_sqdmull2_laneqv8hi (__a, __b,__c); + return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)}; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_n_s16 (int16x8_t __a, int16_t __b) +vshlq_n_s8 (int8x16_t __a, const int __b) { - return __builtin_aarch64_sqdmull2_nv8hi (__a, __b); + return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, int const __c) +vshlq_n_s16 (int16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmull_lanev4hi (__a, __b, __c); + return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_laneq_s16 (int16x4_t __a, int16x8_t __b, int const __c) +vshlq_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmull_laneqv4hi (__a, __b, __c); + return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_n_s16 (int16x4_t __a, int16_t __b) +vshlq_n_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmull_nv4hi (__a, __b); + return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_s32 (int32x2_t __a, int32x2_t __b) +vshlq_n_u8 (uint8x16_t __a, const int __b) { - return __builtin_aarch64_sqdmullv2si (__a, __b); + return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_s32 (int32x4_t __a, int32x4_t __b) +vshlq_n_u16 (uint16x8_t __a, const int __b) { - return __builtin_aarch64_sqdmull2v4si (__a, __b); + return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_lane_s32 (int32x4_t __a, int32x2_t __b, int const __c) +vshlq_n_u32 (uint32x4_t __a, const int __b) { - return __builtin_aarch64_sqdmull2_lanev4si (__a, __b, __c); + return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_laneq_s32 (int32x4_t __a, int32x4_t __b, int const __c) +vshlq_n_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_sqdmull2_laneqv4si (__a, __b, __c); + return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_high_n_s32 (int32x4_t __a, int32_t __b) +vshld_n_s64 (int64_t __a, const int __b) { - return __builtin_aarch64_sqdmull2_nv4si (__a, __b); + return __builtin_aarch64_ashldi (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern 
__inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, int const __c) +vshld_n_u64 (uint64_t __a, const int __b) { - return __builtin_aarch64_sqdmull_lanev2si (__a, __b, __c); + return (uint64_t) __builtin_aarch64_ashldi (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_laneq_s32 (int32x2_t __a, int32x4_t __b, int const __c) +vshl_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sqdmull_laneqv2si (__a, __b, __c); + return __builtin_aarch64_sshlv8qi (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmull_n_s32 (int32x2_t __a, int32_t __b) +vshl_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_sqdmull_nv2si (__a, __b); + return __builtin_aarch64_sshlv4hi (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmullh_s16 (int16_t __a, int16_t __b) +vshl_s32 (int32x2_t __a, int32x2_t __b) { - return (int32_t) __builtin_aarch64_sqdmullhi (__a, __b); + return __builtin_aarch64_sshlv2si (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmullh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +vshl_s64 (int64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_sqdmull_lanehi (__a, __b, __c); + return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])}; } -__extension__ extern __inline int32_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmullh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) +vshl_u8 (uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sqdmull_laneqhi (__a, __b, __c); + return __builtin_aarch64_ushlv8qi_uus (__a, __b); } -__extension__ extern __inline int64_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulls_s32 (int32_t __a, int32_t __b) +vshl_u16 (uint16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_sqdmullsi (__a, __b); + return __builtin_aarch64_ushlv4hi_uus (__a, __b); } -__extension__ extern __inline int64_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulls_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +vshl_u32 (uint32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_sqdmull_lanesi (__a, __b, __c); + return __builtin_aarch64_ushlv2si_uus (__a, __b); } -__extension__ extern __inline int64_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqdmulls_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) +vshl_u64 (uint64x1_t __a, int64x1_t __b) { - return __builtin_aarch64_sqdmull_laneqsi (__a, __b, __c); + return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])}; } -/* vqmovn */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_s16 (int16x8_t __a) +vshlq_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x8_t) __builtin_aarch64_sqmovnv8hi (__a); + return __builtin_aarch64_sshlv16qi (__a, __b); } 
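[Usage sketch for the vshl family this hunk introduces — illustrative only, not part of the diff; the caller name shift_demo is made up. As the hunk shows, the immediate forms vshl_n_* lower to the __builtin_aarch64_ashl* builtins, while the register forms vshl_*/vshlq_* lower to __builtin_aarch64_sshl* (signed) or __builtin_aarch64_ushl*_uus (unsigned) and take a per-lane count vector whose negative lanes shift right:

    #include <arm_neon.h>

    /* Illustrative caller combining both flavours.  */
    int8x8_t
    shift_demo (int8x8_t v)
    {
      int8x8_t by_imm = vshl_n_s8 (v, 2);   /* immediate: every lane << 2      */
      int8x8_t counts = vdup_n_s8 (-1);     /* negative count => shift right   */
      return vshl_s8 (by_imm, counts);      /* per-lane variable shift (SSHL)  */
    }

The same pattern holds for the q-suffixed 128-bit variants and the scalar vshld_n_s64/vshld_n_u64 below.]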
-__extension__ extern __inline int16x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_s32 (int32x4_t __a) +vshlq_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x4_t) __builtin_aarch64_sqmovnv4si (__a); + return __builtin_aarch64_sshlv8hi (__a, __b); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_s64 (int64x2_t __a) +vshlq_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x2_t) __builtin_aarch64_sqmovnv2di (__a); + return __builtin_aarch64_sshlv4si (__a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_u16 (uint16x8_t __a) +vshlq_s64 (int64x2_t __a, int64x2_t __b) { - return (uint8x8_t) __builtin_aarch64_uqmovnv8hi ((int16x8_t) __a); + return __builtin_aarch64_sshlv2di (__a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_u32 (uint32x4_t __a) +vshlq_u8 (uint8x16_t __a, int8x16_t __b) { - return (uint16x4_t) __builtin_aarch64_uqmovnv4si ((int32x4_t) __a); + return __builtin_aarch64_ushlv16qi_uus (__a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovn_u64 (uint64x2_t __a) +vshlq_u16 (uint16x8_t __a, int16x8_t __b) { - return (uint32x2_t) __builtin_aarch64_uqmovnv2di ((int64x2_t) __a); + return __builtin_aarch64_ushlv8hi_uus (__a, __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovnh_s16 (int16_t __a) +vshlq_u32 (uint32x4_t __a, int32x4_t __b) { - return (int8_t) __builtin_aarch64_sqmovnhi (__a); + return __builtin_aarch64_ushlv4si_uus (__a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovns_s32 (int32_t __a) +vshlq_u64 (uint64x2_t __a, int64x2_t __b) { - return (int16_t) __builtin_aarch64_sqmovnsi (__a); + return __builtin_aarch64_ushlv2di_uus (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovnd_s64 (int64_t __a) +vshld_s64 (int64_t __a, int64_t __b) { - return (int32_t) __builtin_aarch64_sqmovndi (__a); + return __builtin_aarch64_sshldi (__a, __b); } -__extension__ extern __inline uint8_t +__extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovnh_u16 (uint16_t __a) +vshld_u64 (uint64_t __a, int64_t __b) { - return (uint8_t) __builtin_aarch64_uqmovnhi (__a); + return __builtin_aarch64_ushldi_uus (__a, __b); } -__extension__ extern __inline uint16_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovns_u32 (uint32_t __a) +vshll_high_n_s8 (int8x16_t __a, const int __b) { - return (uint16_t) __builtin_aarch64_uqmovnsi (__a); + return __builtin_aarch64_sshll2_nv16qi (__a, __b); } -__extension__ extern __inline uint32_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovnd_u64 (uint64_t __a) +vshll_high_n_s16 (int16x8_t __a, const int __b) { - 
return (uint32_t) __builtin_aarch64_uqmovndi (__a); + return __builtin_aarch64_sshll2_nv8hi (__a, __b); } -/* vqmovun */ - -__extension__ extern __inline uint8x8_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_s16 (int16x8_t __a) +vshll_high_n_s32 (int32x4_t __a, const int __b) { - return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a); + return __builtin_aarch64_sshll2_nv4si (__a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_s32 (int32x4_t __a) +vshll_high_n_u8 (uint8x16_t __a, const int __b) { - return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a); + return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovun_s64 (int64x2_t __a) +vshll_high_n_u16 (uint16x8_t __a, const int __b) { - return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a); + return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovunh_s16 (int16_t __a) +vshll_high_n_u32 (uint32x4_t __a, const int __b) { - return (int8_t) __builtin_aarch64_sqmovunhi (__a); + return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovuns_s32 (int32_t __a) +vshll_n_s8 (int8x8_t __a, const int __b) { - return (int16_t) __builtin_aarch64_sqmovunsi (__a); + return __builtin_aarch64_sshll_nv8qi (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqmovund_s64 (int64_t __a) +vshll_n_s16 (int16x4_t __a, const int __b) { - return (int32_t) __builtin_aarch64_sqmovundi (__a); + return __builtin_aarch64_sshll_nv4hi (__a, __b); } -/* vqneg */ - __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqnegq_s64 (int64x2_t __a) +vshll_n_s32 (int32x2_t __a, const int __b) { - return (int64x2_t) __builtin_aarch64_sqnegv2di (__a); + return __builtin_aarch64_sshll_nv2si (__a, __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqnegb_s8 (int8_t __a) +vshll_n_u8 (uint8x8_t __a, const int __b) { - return (int8_t) __builtin_aarch64_sqnegqi (__a); + return __builtin_aarch64_ushll_nv8qi_uus (__a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqnegh_s16 (int16_t __a) +vshll_n_u16 (uint16x4_t __a, const int __b) { - return (int16_t) __builtin_aarch64_sqneghi (__a); + return __builtin_aarch64_ushll_nv4hi_uus (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqnegs_s32 (int32_t __a) +vshll_n_u32 (uint32x2_t __a, const int __b) { - return (int32_t) __builtin_aarch64_sqnegsi (__a); + return __builtin_aarch64_ushll_nv2si_uus (__a, __b); } -__extension__ extern __inline int64_t 
+/* vshr */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqnegd_s64 (int64_t __a) +vshr_n_s8 (int8x8_t __a, const int __b) { - return __builtin_aarch64_sqnegdi (__a); + return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b); } -/* vqrdmulh */ - __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +vshr_n_s16 (int16x4_t __a, const int __b) { - return __builtin_aarch64_sqrdmulh_lanev4hi (__a, __b, __c); + return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +vshr_n_s32 (int32x2_t __a, const int __b) { - return __builtin_aarch64_sqrdmulh_lanev2si (__a, __b, __c); + return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c) +vshr_n_s64 (int64x1_t __a, const int __b) { - return __builtin_aarch64_sqrdmulh_lanev8hi (__a, __b, __c); + return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)}; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c) +vshr_n_u8 (uint8x8_t __a, const int __b) { - return __builtin_aarch64_sqrdmulh_lanev4si (__a, __b, __c); + return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhh_s16 (int16_t __a, int16_t __b) +vshr_n_u16 (uint16x4_t __a, const int __b) { - return (int16_t) __builtin_aarch64_sqrdmulhhi (__a, __b); + return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhh_lane_s16 (int16_t __a, int16x4_t __b, const int __c) +vshr_n_u32 (uint32x2_t __a, const int __b) { - return __builtin_aarch64_sqrdmulh_lanehi (__a, __b, __c); + return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhh_laneq_s16 (int16_t __a, int16x8_t __b, const int __c) +vshr_n_u64 (uint64x1_t __a, const int __b) { - return __builtin_aarch64_sqrdmulh_laneqhi (__a, __b, __c); + return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)}; } -__extension__ extern __inline int32_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhs_s32 (int32_t __a, int32_t __b) +vshrq_n_s8 (int8x16_t __a, const int __b) { - return (int32_t) __builtin_aarch64_sqrdmulhsi (__a, __b); + return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhs_lane_s32 (int32_t __a, int32x2_t __b, const int __c) +vshrq_n_s16 (int16x8_t __a, const 
int __b) { - return __builtin_aarch64_sqrdmulh_lanesi (__a, __b, __c); + return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrdmulhs_laneq_s32 (int32_t __a, int32x4_t __b, const int __c) +vshrq_n_s32 (int32x4_t __a, const int __b) { - return __builtin_aarch64_sqrdmulh_laneqsi (__a, __b, __c); + return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b); } -/* vqrshl */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_s8 (int8x8_t __a, int8x8_t __b) +vshrq_n_s64 (int64x2_t __a, const int __b) { - return __builtin_aarch64_sqrshlv8qi (__a, __b); + return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_s16 (int16x4_t __a, int16x4_t __b) +vshrq_n_u8 (uint8x16_t __a, const int __b) { - return __builtin_aarch64_sqrshlv4hi (__a, __b); + return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_s32 (int32x2_t __a, int32x2_t __b) +vshrq_n_u16 (uint16x8_t __a, const int __b) { - return __builtin_aarch64_sqrshlv2si (__a, __b); + return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b); } -__extension__ extern __inline int64x1_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_s64 (int64x1_t __a, int64x1_t __b) +vshrq_n_u32 (uint32x4_t __a, const int __b) { - return (int64x1_t) {__builtin_aarch64_sqrshldi (__a[0], __b[0])}; + return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_u8 (uint8x8_t __a, int8x8_t __b) +vshrq_n_u64 (uint64x2_t __a, const int __b) { - return __builtin_aarch64_uqrshlv8qi_uus ( __a, __b); + return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_u16 (uint16x4_t __a, int16x4_t __b) +vshrd_n_s64 (int64_t __a, const int __b) { - return __builtin_aarch64_uqrshlv4hi_uus ( __a, __b); + return __builtin_aarch64_ashr_simddi (__a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_u32 (uint32x2_t __a, int32x2_t __b) +vshrd_n_u64 (uint64_t __a, const int __b) { - return __builtin_aarch64_uqrshlv2si_uus ( __a, __b); + return __builtin_aarch64_lshr_simddi_uus (__a, __b); } -__extension__ extern __inline uint64x1_t +/* vsli */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshl_u64 (uint64x1_t __a, int64x1_t __b) +vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return (uint64x1_t) {__builtin_aarch64_uqrshldi_uus (__a[0], __b[0])}; + return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline int16x4_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_s8 (int8x16_t __a, int8x16_t __b) +vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return __builtin_aarch64_sqrshlv16qi (__a, __b); + return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_s16 (int16x8_t __a, int16x8_t __b) +vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return __builtin_aarch64_sqrshlv8hi (__a, __b); + return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_s32 (int32x4_t __a, int32x4_t __b) +vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return __builtin_aarch64_sqrshlv4si (__a, __b); + return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)}; } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_s64 (int64x2_t __a, int64x2_t __b) +vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return __builtin_aarch64_sqrshlv2di (__a, __b); + return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_u8 (uint8x16_t __a, int8x16_t __b) +vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { - return __builtin_aarch64_uqrshlv16qi_uus ( __a, __b); + return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_u16 (uint16x8_t __a, int16x8_t __b) +vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { - return __builtin_aarch64_uqrshlv8hi_uus ( __a, __b); + return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_u32 (uint32x4_t __a, int32x4_t __b) +vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) { - return __builtin_aarch64_uqrshlv4si_uus ( __a, __b); + return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)}; } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline poly64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlq_u64 (uint64x2_t __a, int64x2_t __b) +vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c) { - return __builtin_aarch64_uqrshlv2di_uus ( __a, __b); + return (poly64x1_t) {__builtin_aarch64_ssli_ndi_ppps (__a[0], __b[0], __c)}; } -__extension__ extern __inline int8_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlb_s8 (int8_t __a, int8_t __b) +vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) { - return __builtin_aarch64_sqrshlqi (__a, __b); + return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c); } -__extension__ extern __inline int16_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlh_s16 (int16_t __a, int16_t __b) +vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int 
__c) { - return __builtin_aarch64_sqrshlhi (__a, __b); + return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c); } -__extension__ extern __inline int32_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshls_s32 (int32_t __a, int32_t __b) +vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) { - return __builtin_aarch64_sqrshlsi (__a, __b); + return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c); } -__extension__ extern __inline int64_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshld_s64 (int64_t __a, int64_t __b) +vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) { - return __builtin_aarch64_sqrshldi (__a, __b); + return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c); } -__extension__ extern __inline uint8_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlb_u8 (uint8_t __a, uint8_t __b) +vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) { - return __builtin_aarch64_uqrshlqi_uus (__a, __b); + return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, __c); } -__extension__ extern __inline uint16_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlh_u16 (uint16_t __a, uint16_t __b) +vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) { - return __builtin_aarch64_uqrshlhi_uus (__a, __b); + return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c); } -__extension__ extern __inline uint32_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshls_u32 (uint32_t __a, uint32_t __b) +vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) { - return __builtin_aarch64_uqrshlsi_uus (__a, __b); + return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c); } -__extension__ extern __inline uint64_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshld_u64 (uint64_t __a, uint64_t __b) +vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) { - return __builtin_aarch64_uqrshldi_uus (__a, __b); + return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c); } -/* vqrshrn */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrn_n_s16 (int16x8_t __a, const int __b) +vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c) { - return (int8x8_t) __builtin_aarch64_sqrshrn_nv8hi (__a, __b); + return __builtin_aarch64_ssli_nv2di_ppps (__a, __b, __c); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrn_n_s32 (int32x4_t __a, const int __b) +vslid_n_s64 (int64_t __a, int64_t __b, const int __c) { - return (int16x4_t) __builtin_aarch64_sqrshrn_nv4si (__a, __b); + return __builtin_aarch64_ssli_ndi (__a, __b, __c); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrn_n_s64 (int64x2_t __a, const int __b) +vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c) { - return (int32x2_t) __builtin_aarch64_sqrshrn_nv2di (__a, __b); + return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c); } +/* vsqadd */ + __extension__ extern __inline uint8x8_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrn_n_u16 (uint16x8_t __a, const int __b) +vsqadd_u8 (uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_uqrshrn_nv8hi_uus ( __a, __b); + return __builtin_aarch64_usqaddv8qi_uus (__a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrn_n_u32 (uint32x4_t __a, const int __b) +vsqadd_u16 (uint16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_uqrshrn_nv4si_uus ( __a, __b); + return __builtin_aarch64_usqaddv4hi_uus (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrn_n_u64 (uint64x2_t __a, const int __b) +vsqadd_u32 (uint32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_uqrshrn_nv2di_uus ( __a, __b); + return __builtin_aarch64_usqaddv2si_uus (__a, __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrnh_n_s16 (int16_t __a, const int __b) +vsqadd_u64 (uint64x1_t __a, int64x1_t __b) { - return (int8_t) __builtin_aarch64_sqrshrn_nhi (__a, __b); + return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])}; } -__extension__ extern __inline int16_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrns_n_s32 (int32_t __a, const int __b) +vsqaddq_u8 (uint8x16_t __a, int8x16_t __b) { - return (int16_t) __builtin_aarch64_sqrshrn_nsi (__a, __b); + return __builtin_aarch64_usqaddv16qi_uus (__a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrnd_n_s64 (int64_t __a, const int __b) +vsqaddq_u16 (uint16x8_t __a, int16x8_t __b) { - return (int32_t) __builtin_aarch64_sqrshrn_ndi (__a, __b); + return __builtin_aarch64_usqaddv8hi_uus (__a, __b); } -__extension__ extern __inline uint8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrnh_n_u16 (uint16_t __a, const int __b) +vsqaddq_u32 (uint32x4_t __a, int32x4_t __b) { - return __builtin_aarch64_uqrshrn_nhi_uus (__a, __b); + return __builtin_aarch64_usqaddv4si_uus (__a, __b); } -__extension__ extern __inline uint16_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrns_n_u32 (uint32_t __a, const int __b) +vsqaddq_u64 (uint64x2_t __a, int64x2_t __b) { - return __builtin_aarch64_uqrshrn_nsi_uus (__a, __b); + return __builtin_aarch64_usqaddv2di_uus (__a, __b); } -__extension__ extern __inline uint32_t +__extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrnd_n_u64 (uint64_t __a, const int __b) +vsqaddb_u8 (uint8_t __a, int8_t __b) { - return __builtin_aarch64_uqrshrn_ndi_uus (__a, __b); + return __builtin_aarch64_usqaddqi_uus (__a, __b); } -/* vqrshrun */ +__extension__ extern __inline uint16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsqaddh_u16 (uint16_t __a, int16_t __b) +{ + return __builtin_aarch64_usqaddhi_uus (__a, __b); +} -__extension__ extern __inline uint8x8_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrun_n_s16 (int16x8_t __a, const int __b) +vsqadds_u32 (uint32_t __a, int32_t __b) { - return (uint8x8_t) __builtin_aarch64_sqrshrun_nv8hi (__a, 
__b); + return __builtin_aarch64_usqaddsi_uus (__a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrun_n_s32 (int32x4_t __a, const int __b) +vsqaddd_u64 (uint64_t __a, int64_t __b) { - return (uint16x4_t) __builtin_aarch64_sqrshrun_nv4si (__a, __b); + return __builtin_aarch64_usqadddi_uus (__a, __b); } -__extension__ extern __inline uint32x2_t +/* vsqrt */ +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrun_n_s64 (int64x2_t __a, const int __b) +vsqrt_f32 (float32x2_t __a) { - return (uint32x2_t) __builtin_aarch64_sqrshrun_nv2di (__a, __b); + return __builtin_aarch64_sqrtv2sf (__a); } -__extension__ extern __inline int8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrunh_n_s16 (int16_t __a, const int __b) +vsqrtq_f32 (float32x4_t __a) { - return (int8_t) __builtin_aarch64_sqrshrun_nhi (__a, __b); + return __builtin_aarch64_sqrtv4sf (__a); } -__extension__ extern __inline int16_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshruns_n_s32 (int32_t __a, const int __b) +vsqrt_f64 (float64x1_t __a) { - return (int16_t) __builtin_aarch64_sqrshrun_nsi (__a, __b); + return (float64x1_t) { __builtin_aarch64_sqrtdf (__a[0]) }; } -__extension__ extern __inline int32_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshrund_n_s64 (int64_t __a, const int __b) +vsqrtq_f64 (float64x2_t __a) { - return (int32_t) __builtin_aarch64_sqrshrun_ndi (__a, __b); + return __builtin_aarch64_sqrtv2df (__a); } -/* vqshl */ +/* vsra */ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_s8 (int8x8_t __a, int8x8_t __b) +vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return __builtin_aarch64_sqshlv8qi (__a, __b); + return (int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_s16 (int16x4_t __a, int16x4_t __b) +vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return __builtin_aarch64_sqshlv4hi (__a, __b); + return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_s32 (int32x2_t __a, int32x2_t __b) +vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return __builtin_aarch64_sqshlv2si (__a, __b); + return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_s64 (int64x1_t __a, int64x1_t __b) +vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return (int64x1_t) {__builtin_aarch64_sqshldi (__a[0], __b[0])}; + return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_u8 (uint8x8_t __a, int8x8_t __b) +vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return __builtin_aarch64_uqshlv8qi_uus ( __a, __b); + return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vqshl_u16 (uint16x4_t __a, int16x4_t __b) +vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { - return __builtin_aarch64_uqshlv4hi_uus ( __a, __b); + return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_u32 (uint32x2_t __a, int32x2_t __b) +vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { - return __builtin_aarch64_uqshlv2si_uus ( __a, __b); + return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_u64 (uint64x1_t __a, int64x1_t __b) +vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) { - return (uint64x1_t) {__builtin_aarch64_uqshldi_uus (__a[0], __b[0])}; + return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_s8 (int8x16_t __a, int8x16_t __b) +vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) { - return __builtin_aarch64_sqshlv16qi (__a, __b); + return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_s16 (int16x8_t __a, int16x8_t __b) +vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) { - return __builtin_aarch64_sqshlv8hi (__a, __b); + return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_s32 (int32x4_t __a, int32x4_t __b) +vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) { - return __builtin_aarch64_sqshlv4si (__a, __b); + return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_s64 (int64x2_t __a, int64x2_t __b) +vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) { - return __builtin_aarch64_sqshlv2di (__a, __b); + return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_u8 (uint8x16_t __a, int8x16_t __b) +vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) { - return __builtin_aarch64_uqshlv16qi_uus ( __a, __b); + return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_u16 (uint16x8_t __a, int16x8_t __b) +vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) { - return __builtin_aarch64_uqshlv8hi_uus ( __a, __b); + return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_u32 (uint32x4_t __a, int32x4_t __b) +vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) { - return __builtin_aarch64_uqshlv4si_uus ( __a, __b); + return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_u64 (uint64x2_t __a, int64x2_t __b) -{ - return __builtin_aarch64_uqshlv2di_uus ( __a, __b); -} - -__extension__ extern __inline int8_t -__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vqshlb_s8 (int8_t __a, int8_t __b) -{ - return __builtin_aarch64_sqshlqi (__a, __b); -} - -__extension__ extern __inline int16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlh_s16 (int16_t __a, int16_t __b) -{ - return __builtin_aarch64_sqshlhi (__a, __b); -} - -__extension__ extern __inline int32_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshls_s32 (int32_t __a, int32_t __b) +vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) { - return __builtin_aarch64_sqshlsi (__a, __b); + return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c); } __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshld_s64 (int64_t __a, int64_t __b) -{ - return __builtin_aarch64_sqshldi (__a, __b); -} - -__extension__ extern __inline uint8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlb_u8 (uint8_t __a, uint8_t __b) -{ - return __builtin_aarch64_uqshlqi_uus (__a, __b); -} - -__extension__ extern __inline uint16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlh_u16 (uint16_t __a, uint16_t __b) -{ - return __builtin_aarch64_uqshlhi_uus (__a, __b); -} - -__extension__ extern __inline uint32_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshls_u32 (uint32_t __a, uint32_t __b) +vsrad_n_s64 (int64_t __a, int64_t __b, const int __c) { - return __builtin_aarch64_uqshlsi_uus (__a, __b); + return __builtin_aarch64_ssra_ndi (__a, __b, __c); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshld_u64 (uint64_t __a, uint64_t __b) +vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) { - return __builtin_aarch64_uqshldi_uus (__a, __b); + return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c); } +/* vsri */ + __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_s8 (int8x8_t __a, const int __b) +vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) { - return (int8x8_t) __builtin_aarch64_sqshl_nv8qi (__a, __b); + return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_s16 (int16x4_t __a, const int __b) +vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) { - return (int16x4_t) __builtin_aarch64_sqshl_nv4hi (__a, __b); + return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_s32 (int32x2_t __a, const int __b) +vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) { - return (int32x2_t) __builtin_aarch64_sqshl_nv2si (__a, __b); + return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c); } __extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_s64 (int64x1_t __a, const int __b) +vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) { - return (int64x1_t) {__builtin_aarch64_sqshl_ndi (__a[0], __b)}; + return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)}; } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_u8 (uint8x8_t __a, const int __b) +vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) { - return __builtin_aarch64_uqshl_nv8qi_uus 
(__a, __b); + return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_u16 (uint16x4_t __a, const int __b) +vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) { - return __builtin_aarch64_uqshl_nv4hi_uus (__a, __b); + return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_u32 (uint32x2_t __a, const int __b) +vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) { - return __builtin_aarch64_uqshl_nv2si_uus (__a, __b); + return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshl_n_u64 (uint64x1_t __a, const int __b) +vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) { - return (uint64x1_t) {__builtin_aarch64_uqshl_ndi_uus (__a[0], __b)}; + return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_s8 (int8x16_t __a, const int __b) +vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) { - return (int8x16_t) __builtin_aarch64_sqshl_nv16qi (__a, __b); + return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_s16 (int16x8_t __a, const int __b) +vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) { - return (int16x8_t) __builtin_aarch64_sqshl_nv8hi (__a, __b); + return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_s32 (int32x4_t __a, const int __b) +vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) { - return (int32x4_t) __builtin_aarch64_sqshl_nv4si (__a, __b); + return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_s64 (int64x2_t __a, const int __b) +vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) { - return (int64x2_t) __builtin_aarch64_sqshl_nv2di (__a, __b); + return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_u8 (uint8x16_t __a, const int __b) +vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) { - return __builtin_aarch64_uqshl_nv16qi_uus (__a, __b); + return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_u16 (uint16x8_t __a, const int __b) +vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) { - return __builtin_aarch64_uqshl_nv8hi_uus (__a, __b); + return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_u32 (uint32x4_t __a, const int __b) +vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) { - return __builtin_aarch64_uqshl_nv4si_uus (__a, __b); + return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vqshlq_n_u64 (uint64x2_t __a, const int __b) -{ - return __builtin_aarch64_uqshl_nv2di_uus (__a, __b); -} - -__extension__ extern __inline int8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlb_n_s8 (int8_t __a, const int __b) -{ - return (int8_t) __builtin_aarch64_sqshl_nqi (__a, __b); -} - -__extension__ extern __inline int16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlh_n_s16 (int16_t __a, const int __b) -{ - return (int16_t) __builtin_aarch64_sqshl_nhi (__a, __b); -} - -__extension__ extern __inline int32_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshls_n_s32 (int32_t __a, const int __b) +vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) { - return (int32_t) __builtin_aarch64_sqshl_nsi (__a, __b); + return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c); } __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshld_n_s64 (int64_t __a, const int __b) -{ - return __builtin_aarch64_sqshl_ndi (__a, __b); -} - -__extension__ extern __inline uint8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlb_n_u8 (uint8_t __a, const int __b) -{ - return __builtin_aarch64_uqshl_nqi_uus (__a, __b); -} - -__extension__ extern __inline uint16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlh_n_u16 (uint16_t __a, const int __b) -{ - return __builtin_aarch64_uqshl_nhi_uus (__a, __b); -} - -__extension__ extern __inline uint32_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshls_n_u32 (uint32_t __a, const int __b) +vsrid_n_s64 (int64_t __a, int64_t __b, const int __c) { - return __builtin_aarch64_uqshl_nsi_uus (__a, __b); + return __builtin_aarch64_ssri_ndi (__a, __b, __c); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshld_n_u64 (uint64_t __a, const int __b) +vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c) { - return __builtin_aarch64_uqshl_ndi_uus (__a, __b); + return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c); } -/* vqshlu */ - -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlu_n_s8 (int8x8_t __a, const int __b) -{ - return __builtin_aarch64_sqshlu_nv8qi_uss (__a, __b); -} +/* vst1 */ -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlu_n_s16 (int16x4_t __a, const int __b) +vst1_f16 (float16_t *__a, float16x4_t __b) { - return __builtin_aarch64_sqshlu_nv4hi_uss (__a, __b); + __builtin_aarch64_st1v4hf (__a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlu_n_s32 (int32x2_t __a, const int __b) +vst1_f32 (float32_t *__a, float32x2_t __b) { - return __builtin_aarch64_sqshlu_nv2si_uss (__a, __b); + __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) __a, __b); } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlu_n_s64 (int64x1_t __a, const int __b) +vst1_f64 (float64_t *__a, float64x1_t __b) { - return (uint64x1_t) {__builtin_aarch64_sqshlu_ndi_uss (__a[0], __b)}; + *__a = __b[0]; } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshluq_n_s8 (int8x16_t __a, const int __b) +vst1_p8 (poly8_t *__a, poly8x8_t __b) { - return __builtin_aarch64_sqshlu_nv16qi_uss (__a, __b); + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, + (int8x8_t) __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshluq_n_s16 (int16x8_t __a, const int __b) +vst1_p16 (poly16_t *__a, poly16x4_t __b) { - return __builtin_aarch64_sqshlu_nv8hi_uss (__a, __b); + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, + (int16x4_t) __b); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshluq_n_s32 (int32x4_t __a, const int __b) +vst1_p64 (poly64_t *__a, poly64x1_t __b) { - return __builtin_aarch64_sqshlu_nv4si_uss (__a, __b); + *__a = __b[0]; } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshluq_n_s64 (int64x2_t __a, const int __b) +vst1_s8 (int8_t *__a, int8x8_t __b) { - return __builtin_aarch64_sqshlu_nv2di_uss (__a, __b); + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlub_n_s8 (int8_t __a, const int __b) +vst1_s16 (int16_t *__a, int16x4_t __b) { - return (int8_t) __builtin_aarch64_sqshlu_nqi_uss (__a, __b); + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshluh_n_s16 (int16_t __a, const int __b) +vst1_s32 (int32_t *__a, int32x2_t __b) { - return (int16_t) __builtin_aarch64_sqshlu_nhi_uss (__a, __b); + __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) __a, __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlus_n_s32 (int32_t __a, const int __b) +vst1_s64 (int64_t *__a, int64x1_t __b) { - return (int32_t) __builtin_aarch64_sqshlu_nsi_uss (__a, __b); + *__a = __b[0]; } -__extension__ extern __inline uint64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlud_n_s64 (int64_t __a, const int __b) +vst1_u8 (uint8_t *__a, uint8x8_t __b) { - return __builtin_aarch64_sqshlu_ndi_uss (__a, __b); + __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) __a, + (int8x8_t) __b); } -/* vqshrn */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrn_n_s16 (int16x8_t __a, const int __b) +vst1_u16 (uint16_t *__a, uint16x4_t __b) { - return (int8x8_t) __builtin_aarch64_sqshrn_nv8hi (__a, __b); + __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) __a, + (int16x4_t) __b); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrn_n_s32 (int32x4_t __a, const int __b) +vst1_u32 (uint32_t *__a, uint32x2_t __b) { - return (int16x4_t) __builtin_aarch64_sqshrn_nv4si (__a, __b); + __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) __a, + (int32x2_t) __b); } -__extension__ extern 
__inline int32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrn_n_s64 (int64x2_t __a, const int __b) +vst1_u64 (uint64_t *__a, uint64x1_t __b) { - return (int32x2_t) __builtin_aarch64_sqshrn_nv2di (__a, __b); + *__a = __b[0]; } -__extension__ extern __inline uint8x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrn_n_u16 (uint16x8_t __a, const int __b) -{ - return __builtin_aarch64_uqshrn_nv8hi_uus ( __a, __b); -} +/* vst1q */ -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrn_n_u32 (uint32x4_t __a, const int __b) +vst1q_f16 (float16_t *__a, float16x8_t __b) { - return __builtin_aarch64_uqshrn_nv4si_uus ( __a, __b); + __builtin_aarch64_st1v8hf (__a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrn_n_u64 (uint64x2_t __a, const int __b) +vst1q_f32 (float32_t *__a, float32x4_t __b) { - return __builtin_aarch64_uqshrn_nv2di_uus ( __a, __b); + __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) __a, __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrnh_n_s16 (int16_t __a, const int __b) +vst1q_f64 (float64_t *__a, float64x2_t __b) { - return (int8_t) __builtin_aarch64_sqshrn_nhi (__a, __b); + __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) __a, __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrns_n_s32 (int32_t __a, const int __b) +vst1q_p8 (poly8_t *__a, poly8x16_t __b) { - return (int16_t) __builtin_aarch64_sqshrn_nsi (__a, __b); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, + (int8x16_t) __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrnd_n_s64 (int64_t __a, const int __b) +vst1q_p16 (poly16_t *__a, poly16x8_t __b) { - return (int32_t) __builtin_aarch64_sqshrn_ndi (__a, __b); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, + (int16x8_t) __b); } -__extension__ extern __inline uint8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrnh_n_u16 (uint16_t __a, const int __b) +vst1q_p64 (poly64_t *__a, poly64x2_t __b) { - return __builtin_aarch64_uqshrn_nhi_uus (__a, __b); + __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) __a, + (poly64x2_t) __b); } -__extension__ extern __inline uint16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrns_n_u32 (uint32_t __a, const int __b) +vst1q_s8 (int8_t *__a, int8x16_t __b) { - return __builtin_aarch64_uqshrn_nsi_uus (__a, __b); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, __b); } -__extension__ extern __inline uint32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrnd_n_u64 (uint64_t __a, const int __b) +vst1q_s16 (int16_t *__a, int16x8_t __b) { - return __builtin_aarch64_uqshrn_ndi_uus (__a, __b); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, __b); } -/* vqshrun */ - -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vqshrun_n_s16 (int16x8_t __a, const int __b) +vst1q_s32 (int32_t *__a, int32x4_t __b) { - return (uint8x8_t) __builtin_aarch64_sqshrun_nv8hi (__a, __b); + __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) __a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrun_n_s32 (int32x4_t __a, const int __b) +vst1q_s64 (int64_t *__a, int64x2_t __b) { - return (uint16x4_t) __builtin_aarch64_sqshrun_nv4si (__a, __b); + __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) __a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrun_n_s64 (int64x2_t __a, const int __b) +vst1q_u8 (uint8_t *__a, uint8x16_t __b) { - return (uint32x2_t) __builtin_aarch64_sqshrun_nv2di (__a, __b); + __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) __a, + (int8x16_t) __b); } -__extension__ extern __inline int8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrunh_n_s16 (int16_t __a, const int __b) +vst1q_u16 (uint16_t *__a, uint16x8_t __b) { - return (int8_t) __builtin_aarch64_sqshrun_nhi (__a, __b); + __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) __a, + (int16x8_t) __b); } -__extension__ extern __inline int16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshruns_n_s32 (int32_t __a, const int __b) +vst1q_u32 (uint32_t *__a, uint32x4_t __b) { - return (int16_t) __builtin_aarch64_sqshrun_nsi (__a, __b); + __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) __a, + (int32x4_t) __b); } -__extension__ extern __inline int32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshrund_n_s64 (int64_t __a, const int __b) +vst1q_u64 (uint64_t *__a, uint64x2_t __b) { - return (int32_t) __builtin_aarch64_sqshrun_ndi (__a, __b); + __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) __a, + (int64x2_t) __b); } -/* vqsub */ +/* vst1_lane */ -__extension__ extern __inline int8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubb_s8 (int8_t __a, int8_t __b) +vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane) { - return (int8_t) __builtin_aarch64_sqsubqi (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline int16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubh_s16 (int16_t __a, int16_t __b) +vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane) { - return (int16_t) __builtin_aarch64_sqsubhi (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline int32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubs_s32 (int32_t __a, int32_t __b) +vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane) { - return (int32_t) __builtin_aarch64_sqsubsi (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline int64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubd_s64 (int64_t __a, int64_t __b) +vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane) { - return 
__builtin_aarch64_sqsubdi (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubb_u8 (uint8_t __a, uint8_t __b) +vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane) { - return (uint8_t) __builtin_aarch64_uqsubqi_uuu (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubh_u16 (uint16_t __a, uint16_t __b) +vst1_lane_p64 (poly64_t *__a, poly64x1_t __b, const int __lane) { - return (uint16_t) __builtin_aarch64_uqsubhi_uuu (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubs_u32 (uint32_t __a, uint32_t __b) +vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane) { - return (uint32_t) __builtin_aarch64_uqsubsi_uuu (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqsubd_u64 (uint64_t __a, uint64_t __b) +vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane) { - return __builtin_aarch64_uqsubdi_uuu (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -/* vqtbl2 */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2_s8 (int8x16x2_t tab, uint8x8_t idx) +vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2_u8 (uint8x16x2_t tab, uint8x8_t idx) +vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2_p8 (poly8x16x2_t tab, uint8x8_t idx) +vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbl3v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2q_s8 (int8x16x2_t tab, uint8x16_t idx) +vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = 
__builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return __builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2q_u8 (uint8x16x2_t tab, uint8x16_t idx) +vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl2q_p8 (poly8x16x2_t tab, uint8x16_t idx) +vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbl3v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -/* vqtbl3 */ +/* vst1q_lane */ -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3_s8 (int8x16x3_t tab, uint8x8_t idx) +vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return __builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3_u8 (uint8x16x3_t tab, uint8x8_t idx) +vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3_p8 (poly8x16x3_t tab, uint8x8_t idx) +vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x8_t)__builtin_aarch64_qtbl3v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3q_s8 (int8x16x3_t tab, uint8x16_t idx) +vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane) { - __builtin_aarch64_simd_ci __o; - __o = 
__builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return __builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3q_u8 (uint8x16x3_t tab, uint8x16_t idx) +vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl3q_p8 (poly8x16x3_t tab, uint8x16_t idx) +vst1q_lane_p64 (poly64_t *__a, poly64x2_t __b, const int __lane) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x16_t)__builtin_aarch64_qtbl3v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -/* vqtbl4 */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4_s8 (int8x16x4_t tab, uint8x8_t idx) +vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return __builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4_u8 (uint8x16x4_t tab, uint8x8_t idx) +vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4_p8 (poly8x16x4_t tab, uint8x8_t idx) +vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = 
__builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x8_t)__builtin_aarch64_qtbl4v8qi (__o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4q_s8 (int8x16x4_t tab, uint8x16_t idx) +vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return __builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4q_u8 (uint8x16x4_t tab, uint8x16_t idx) +vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbl4q_p8 (poly8x16x4_t tab, uint8x16_t idx) +vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x16_t)__builtin_aarch64_qtbl4v16qi (__o, (int8x16_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} -/* vqtbx2 */ -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2_s8 (int8x8_t r, int8x16x2_t tab, uint8x8_t idx) +vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbx4v8qi (r, __o, (int8x8_t)idx); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint8x8_t +/* vst1x2 */ + +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2_u8 (uint8x8_t r, uint8x16x2_t tab, uint8x8_t idx) +vst1_s64_x2 (int64_t * __a, int64x1x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - 
return (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + int64x2x2_t __temp; + __temp.val[0] + = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2_p8 (poly8x8_t r, poly8x16x2_t tab, uint8x8_t idx) +vst1_u64_x2 (uint64_t * __a, uint64x1x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + uint64x2x2_t __temp; + __temp.val[0] + = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2q_s8 (int8x16_t r, int8x16x2_t tab, uint8x16_t idx) +vst1_f64_x2 (float64_t * __a, float64x1x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, tab.val[1], 1); - return __builtin_aarch64_tbx4v16qi (r, __o, (int8x16_t)idx); + float64x2x2_t __temp; + __temp.val[0] + = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2q_u8 (uint8x16_t r, uint8x16x2_t tab, uint8x16_t idx) +vst1_s8_x2 (int8_t * __a, int8x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (uint8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + int8x16x2_t __temp; + __temp.val[0] + = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx2q_p8 (poly8x16_t r, poly8x16x2_t tab, uint8x16_t idx) +vst1_p8_x2 (poly8_t * __a, poly8x8x2_t 
__val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t)tab.val[1], 1); - return (poly8x16_t)__builtin_aarch64_tbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + poly8x16x2_t __temp; + __temp.val[0] + = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -/* vqtbx3 */ -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3_s8 (int8x8_t r, int8x16x3_t tab, uint8x8_t idx) +vst1_s16_x2 (int16_t * __a, int16x4x2_t __val) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); - return __builtin_aarch64_qtbx3v8qi (r, __o, (int8x8_t)idx); + __builtin_aarch64_simd_oi __o; + int16x8x2_t __temp; + __temp.val[0] + = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3_u8 (uint8x8_t r, uint8x16x3_t tab, uint8x8_t idx) +vst1_p16_x2 (poly16_t * __a, poly16x4x2_t __val) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __builtin_aarch64_simd_oi __o; + poly16x8x2_t __temp; + __temp.val[0] + = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] + = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3_p8 (poly8x8_t r, poly8x16x3_t tab, uint8x8_t idx) +vst1_s32_x2 (int32_t * __a, int32x2x2_t __val) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x8_t)__builtin_aarch64_qtbx3v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __builtin_aarch64_simd_oi __o; + int32x4x2_t __temp; + __temp.val[0] + = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] + = vcombine_s32 (__val.val[1], 
vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3q_s8 (int8x16_t r, int8x16x3_t tab, uint8x16_t idx) +vst1_u8_x2 (uint8_t * __a, uint8x8x2_t __val) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, tab.val[2], 2); - return __builtin_aarch64_qtbx3v16qi (r, __o, (int8x16_t)idx); + __builtin_aarch64_simd_oi __o; + uint8x16x2_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3q_u8 (uint8x16_t r, uint8x16x3_t tab, uint8x16_t idx) +vst1_u16_x2 (uint16_t * __a, uint16x4x2_t __val) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (uint8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __builtin_aarch64_simd_oi __o; + uint16x8x2_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx3q_p8 (poly8x16_t r, poly8x16x3_t tab, uint8x16_t idx) +vst1_u32_x2 (uint32_t * __a, uint32x2x2_t __val) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t)tab.val[2], 2); - return (poly8x16_t)__builtin_aarch64_qtbx3v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __builtin_aarch64_simd_oi __o; + uint32x4x2_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v2si ((__builtin_aarch64_simd_si *) __a, __o); } -/* vqtbx4 */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-vqtbx4_s8 (int8x8_t r, int8x16x4_t tab, uint8x8_t idx) +vst1_f16_x2 (float16_t * __a, float16x4x2_t __val) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); - return __builtin_aarch64_qtbx4v8qi (r, __o, (int8x8_t)idx); + __builtin_aarch64_simd_oi __o; + float16x8x2_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); + __builtin_aarch64_st1x2v4hf (__a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4_u8 (uint8x8_t r, uint8x16x4_t tab, uint8x8_t idx) +vst1_f32_x2 (float32_t * __a, float32x2x2_t __val) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __builtin_aarch64_simd_oi __o; + float32x4x2_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __builtin_aarch64_st1x2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4_p8 (poly8x8_t r, poly8x16x4_t tab, uint8x8_t idx) +vst1_p64_x2 (poly64_t * __a, poly64x1x2_t __val) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x8_t)__builtin_aarch64_qtbx4v8qi ((int8x8_t)r, __o, - (int8x8_t)idx); + __builtin_aarch64_simd_oi __o; + poly64x2x2_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __temp.val[1], 1); + __builtin_aarch64_st1x2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4q_s8 (int8x16_t r, int8x16x4_t tab, uint8x16_t idx) +vst1q_s8_x2 (int8_t * __a, int8x16x2_t __val) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[1], 
1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, tab.val[3], 3); - return __builtin_aarch64_qtbx4v16qi (r, __o, (int8x16_t)idx); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4q_u8 (uint8x16_t r, uint8x16x4_t tab, uint8x16_t idx) +vst1q_p8_x2 (poly8_t * __a, poly8x16x2_t __val) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (uint8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqtbx4q_p8 (poly8x16_t r, poly8x16x4_t tab, uint8x16_t idx) +vst1q_s16_x2 (int16_t * __a, int16x8x2_t __val) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t)tab.val[3], 3); - return (poly8x16_t)__builtin_aarch64_qtbx4v16qi ((int8x16_t)r, __o, - (int8x16_t)idx); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -/* vrbit */ - -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrbit_p8 (poly8x8_t __a) +vst1q_p16_x2 (poly16_t * __a, poly16x8x2_t __val) { - return (poly8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrbit_s8 (int8x8_t __a) +vst1q_s32_x2 (int32_t * __a, int32x4x2_t __val) { - return __builtin_aarch64_rbitv8qi (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vrbit_u8 (uint8x8_t __a) +vst1q_s64_x2 (int64_t * __a, int64x2x2_t __val) { - return (uint8x8_t) __builtin_aarch64_rbitv8qi ((int8x8_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrbitq_p8 (poly8x16_t __a) +vst1q_u8_x2 (uint8_t * __a, uint8x16x2_t __val) { - return (poly8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t)__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st1x2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrbitq_s8 (int8x16_t __a) +vst1q_u16_x2 (uint16_t * __a, uint16x8x2_t __val) { - return __builtin_aarch64_rbitv16qi (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st1x2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrbitq_u8 (uint8x16_t __a) +vst1q_u32_x2 (uint32_t * __a, uint32x4x2_t __val) { - return (uint8x16_t) __builtin_aarch64_rbitv16qi ((int8x16_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st1x2v4si ((__builtin_aarch64_simd_si *) __a, __o); } -/* vrecpe */ - -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpe_u32 (uint32x2_t __a) +vst1q_u64_x2 (uint64_t * __a, uint64x2x2_t __val) { - return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpeq_u32 (uint32x4_t __a) +vst1q_f16_x2 (float16_t * __a, float16x8x2_t __val) { - return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); + __builtin_aarch64_st1x2v8hf (__a, __o); } -__extension__ extern __inline float32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpes_f32 (float32_t __a) +vst1q_f32_x2 (float32_t * __a, float32x4x2_t __val) { - return __builtin_aarch64_frecpesf (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); + __o 
= __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); + __builtin_aarch64_st1x2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline float64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecped_f64 (float64_t __a) +vst1q_f64_x2 (float64_t * __a, float64x2x2_t __val) { - return __builtin_aarch64_frecpedf (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpe_f32 (float32x2_t __a) +vst1q_p64_x2 (poly64_t * __a, poly64x2x2_t __val) { - return __builtin_aarch64_frecpev2sf (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __val.val[1], 1); + __builtin_aarch64_st1x2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline float64x1_t +/* vst1x3 */ + +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpe_f64 (float64x1_t __a) +vst1_s64_x3 (int64_t * __a, int64x1x3_t __val) { - return (float64x1_t) { vrecped_f64 (vget_lane_f64 (__a, 0)) }; + __builtin_aarch64_simd_ci __o; + int64x2x3_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpeq_f32 (float32x4_t __a) +vst1_u64_x3 (uint64_t * __a, uint64x1x3_t __val) { - return __builtin_aarch64_frecpev4sf (__a); + __builtin_aarch64_simd_ci __o; + uint64x2x3_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpeq_f64 (float64x2_t __a) +vst1_f64_x3 (float64_t * __a, float64x1x3_t __val) { - return __builtin_aarch64_frecpev2df (__a); + __builtin_aarch64_simd_ci __o; + float64x2x3_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + 
__temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3df ((__builtin_aarch64_simd_df *) __a, __o); } -/* vrecps */ - -__extension__ extern __inline float32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpss_f32 (float32_t __a, float32_t __b) +vst1_s8_x3 (int8_t * __a, int8x8x3_t __val) { - return __builtin_aarch64_frecpssf (__a, __b); + __builtin_aarch64_simd_ci __o; + int8x16x3_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpsd_f64 (float64_t __a, float64_t __b) +vst1_p8_x3 (poly8_t * __a, poly8x8x3_t __val) { - return __builtin_aarch64_frecpsdf (__a, __b); + __builtin_aarch64_simd_ci __o; + poly8x16x3_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecps_f32 (float32x2_t __a, float32x2_t __b) +vst1_s16_x3 (int16_t * __a, int16x4x3_t __val) { - return __builtin_aarch64_frecpsv2sf (__a, __b); + __builtin_aarch64_simd_ci __o; + int16x8x3_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecps_f64 (float64x1_t __a, float64x1_t __b) +vst1_p16_x3 (poly16_t * __a, poly16x4x3_t __val) { - return (float64x1_t) { vrecpsd_f64 (vget_lane_f64 (__a, 0), - vget_lane_f64 (__b, 0)) }; + __builtin_aarch64_simd_ci __o; + poly16x8x3_t __temp; + __temp.val[0] = vcombine_p16 
(__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpsq_f32 (float32x4_t __a, float32x4_t __b) +vst1_s32_x3 (int32_t * __a, int32x2x3_t __val) { - return __builtin_aarch64_frecpsv4sf (__a, __b); + __builtin_aarch64_simd_ci __o; + int32x4x3_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpsq_f64 (float64x2_t __a, float64x2_t __b) +vst1_u8_x3 (uint8_t * __a, uint8x8x3_t __val) { - return __builtin_aarch64_frecpsv2df (__a, __b); + __builtin_aarch64_simd_ci __o; + uint8x16x3_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -/* vrecpx */ - -__extension__ extern __inline float32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpxs_f32 (float32_t __a) +vst1_u16_x3 (uint16_t * __a, uint16x4x3_t __val) { - return __builtin_aarch64_frecpxsf (__a); + __builtin_aarch64_simd_ci __o; + uint16x8x3_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpxd_f64 (float64_t __a) +vst1_u32_x3 (uint32_t * __a, uint32x2x3_t __val) { - return __builtin_aarch64_frecpxdf (__a); + __builtin_aarch64_simd_ci 
__o; + uint32x4x3_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v2si ((__builtin_aarch64_simd_si *) __a, __o); } - -/* vrev */ - -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16_p8 (poly8x8_t a) +vst1_f16_x3 (float16_t * __a, float16x4x3_t __val) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + __builtin_aarch64_simd_ci __o; + float16x8x3_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16_s8 (int8x8_t a) +vst1_f32_x3 (float32_t * __a, float32x2x3_t __val) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + __builtin_aarch64_simd_ci __o; + float32x4x3_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16_u8 (uint8x8_t a) +vst1_p64_x3 (poly64_t * __a, poly64x1x3_t __val) { - return __builtin_shuffle (a, (uint8x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + __builtin_aarch64_simd_ci __o; + poly64x2x3_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __temp.val[2], 2); + __builtin_aarch64_st1x3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16q_p8 (poly8x16_t 
a) +vst1q_s8_x3 (int8_t * __a, int8x16x3_t __val) { - return __builtin_shuffle (a, - (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16q_s8 (int8x16_t a) +vst1q_p8_x3 (poly8_t * __a, poly8x16x3_t __val) { - return __builtin_shuffle (a, - (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev16q_u8 (uint8x16_t a) +vst1q_s16_x3 (int16_t * __a, int16x8x3_t __val) { - return __builtin_shuffle (a, - (uint8x16_t) { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_p8 (poly8x8_t a) +vst1q_p16_x3 (poly16_t * __a, poly16x8x3_t __val) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_p16 (poly16x4_t a) +vst1q_s32_x3 (int32_t * __a, int32x4x3_t __val) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_s8 (int8x8_t a) +vst1q_s64_x3 (int64_t * __a, int64x2x3_t __val) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) 
__val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_s16 (int16x4_t a) +vst1q_u8_x3 (uint8_t * __a, uint8x16x3_t __val) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st1x3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_u8 (uint8x8_t a) +vst1q_u16_x3 (uint16_t * __a, uint16x8x3_t __val) { - return __builtin_shuffle (a, (uint8x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32_u16 (uint16x4_t a) +vst1q_u32_x3 (uint32_t * __a, uint32x4x3_t __val) { - return __builtin_shuffle (a, (uint16x4_t) { 1, 0, 3, 2 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_aarch64_st1x3v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_p8 (poly8x16_t a) +vst1q_u64_x3 (uint64_t * __a, uint64x2x3_t __val) { - return __builtin_shuffle (a, - (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_p16 (poly16x8_t a) +vst1q_f16_x3 (float16_t * __a, float16x8x3_t __val) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_s8 (int8x16_t a) +vst1q_f32_x3 
(float32_t * __a, float32x4x3_t __val) { - return __builtin_shuffle (a, - (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); + __builtin_aarch64_st1x3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_s16 (int16x8_t a) +vst1q_f64_x3 (float64_t * __a, float64x2x3_t __val) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_u8 (uint8x16_t a) +vst1q_p64_x3 (poly64_t * __a, poly64x2x3_t __val) { - return __builtin_shuffle (a, - (uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __val.val[2], 2); + __builtin_aarch64_st1x3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline uint16x8_t +/* vst1(q)_x4. 
*/ + +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev32q_u16 (uint16x8_t a) +vst1_s8_x4 (int8_t * __a, int8x8x4_t val) { - return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 }); + union { int8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_f16 (float16x4_t __a) +vst1q_s8_x4 (int8_t * __a, int8x16x4_t val) { - return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 }); + union { int8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_f32 (float32x2_t a) +vst1_s16_x4 (int16_t * __a, int16x4x4_t val) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + union { int16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_p8 (poly8x8_t a) +vst1q_s16_x4 (int16_t * __a, int16x8x4_t val) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + union { int16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_p16 (poly16x4_t a) +vst1_s32_x4 (int32_t * __a, int32x2x4_t val) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + union { int32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_s8 (int8x8_t a) +vst1q_s32_x4 (int32_t * __a, int32x4x4_t val) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + union { int32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_s16 (int16x4_t a) +vst1_u8_x4 (uint8_t * __a, uint8x8x4_t val) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + union { uint8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_s32 (int32x2_t a) +vst1q_u8_x4 (uint8_t * __a, uint8x16x4_t val) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + union { uint8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_u8 (uint8x8_t a) +vst1_u16_x4 (uint16_t * __a, uint16x4x4_t val) { - return __builtin_shuffle (a, (uint8x8_t) { 7, 6, 5, 4, 3, 2, 1, 0 }); + union { uint16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_u16 (uint16x4_t a) +vst1q_u16_x4 (uint16_t * __a, uint16x8x4_t val) { - return __builtin_shuffle (a, (uint16x4_t) { 3, 2, 1, 0 }); + union { uint16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64_u32 (uint32x2_t a) +vst1_u32_x4 (uint32_t * __a, uint32x2x4_t val) { - return __builtin_shuffle (a, (uint32x2_t) { 1, 0 }); + union { uint32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2si ((__builtin_aarch64_simd_si *) __a, __u.__o); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_f16 (float16x8_t __a) +vst1q_u32_x4 (uint32_t * __a, uint32x4x4_t val) { - return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + union { uint32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4si ((__builtin_aarch64_simd_si *) __a, __u.__o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_f32 (float32x4_t a) +vst1_f16_x4 (float16_t * __a, float16x4x4_t val) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + union { float16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_p8 (poly8x16_t a) +vst1q_f16_x4 (float16_t * __a, float16x8x4_t val) { - return __builtin_shuffle (a, - (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + union { float16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hf ((__builtin_aarch64_simd_hf *) __a, __u.__o); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_p16 (poly16x8_t a) +vst1_f32_x4 (float32_t * __a, float32x2x4_t val) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + union { float32x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_s8 (int8x16_t a) +vst1q_f32_x4 (float32_t * __a, float32x4x4_t val) { - return __builtin_shuffle (a, - (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + union { float32x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4sf ((__builtin_aarch64_simd_sf *) __a, __u.__o); } 
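Reviewer note (illustrative, not part of the patch): the vst1*_x4 intrinsics ported above each store a four-register tuple to consecutive memory through a single __builtin_aarch64_st1x4* call, using a union to reinterpret the tuple as the opaque __builtin_aarch64_simd_xi register type. A minimal sketch of how a caller exercises one of them; the function name store_rows and the dst buffer are assumptions for illustration only:

#include <arm_neon.h>

/* Store four q-registers (16 floats) back to back:
   dst[0..3]   = rows.val[0], dst[4..7]   = rows.val[1],
   dst[8..11]  = rows.val[2], dst[12..15] = rows.val[3].  */
static void store_rows (float32_t *dst, float32x4x4_t rows)
{
  vst1q_f32_x4 (dst, rows);
}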
-__extension__ extern __inline int16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_s16 (int16x8_t a) +vst1_p8_x4 (poly8_t * __a, poly8x8x4_t val) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + union { poly8x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_s32 (int32x4_t a) +vst1q_p8_x4 (poly8_t * __a, poly8x16x4_t val) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + union { poly8x16x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v16qi ((__builtin_aarch64_simd_qi *) __a, __u.__o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_u8 (uint8x16_t a) +vst1_p16_x4 (poly16_t * __a, poly16x4x4_t val) { - return __builtin_shuffle (a, - (uint8x16_t) { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 }); + union { poly16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_u16 (uint16x8_t a) +vst1q_p16_x4 (poly16_t * __a, poly16x8x4_t val) { - return __builtin_shuffle (a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 }); + union { poly16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8hi ((__builtin_aarch64_simd_hi *) __a, __u.__o); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrev64q_u32 (uint32x4_t a) +vst1_s64_x4 (int64_t * __a, int64x1x4_t val) { - return __builtin_shuffle (a, (uint32x4_t) { 1, 0, 3, 2 }); + union { int64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); } -/* vrnd */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrnd_f32 (float32x2_t __a) +vst1_u64_x4 (uint64_t * __a, uint64x1x4_t val) { - return __builtin_aarch64_btruncv2sf (__a); + union { uint64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrnd_f64 (float64x1_t __a) +vst1_p64_x4 (poly64_t * __a, poly64x1x4_t val) { - return vset_lane_f64 (__builtin_trunc (vget_lane_f64 (__a, 0)), __a, 0); + union { poly64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4di ((__builtin_aarch64_simd_di *) __a, __u.__o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndq_f32 (float32x4_t __a) +vst1q_s64_x4 (int64_t * __a, int64x2x4_t val) { - return __builtin_aarch64_btruncv4sf (__a); + union { int64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, 
__u.__o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndq_f64 (float64x2_t __a) +vst1q_u64_x4 (uint64_t * __a, uint64x2x4_t val) { - return __builtin_aarch64_btruncv2df (__a); + union { uint64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } -/* vrnda */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrnda_f32 (float32x2_t __a) +vst1q_p64_x4 (poly64_t * __a, poly64x2x4_t val) { - return __builtin_aarch64_roundv2sf (__a); + union { poly64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2di ((__builtin_aarch64_simd_di *) __a, __u.__o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrnda_f64 (float64x1_t __a) +vst1_f64_x4 (float64_t * __a, float64x1x4_t val) { - return vset_lane_f64 (__builtin_round (vget_lane_f64 (__a, 0)), __a, 0); + union { float64x1x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4df ((__builtin_aarch64_simd_df *) __a, __u.__o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndaq_f32 (float32x4_t __a) +vst1q_f64_x4 (float64_t * __a, float64x2x4_t val) { - return __builtin_aarch64_roundv4sf (__a); + union { float64x2x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v2df ((__builtin_aarch64_simd_df *) __a, __u.__o); } -__extension__ extern __inline float64x2_t +/* vstn */ + +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndaq_f64 (float64x2_t __a) +vst2_s64 (int64_t * __a, int64x1x2_t __val) { - return __builtin_aarch64_roundv2df (__a); + __builtin_aarch64_simd_oi __o; + int64x2x2_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } -/* vrndi */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndi_f32 (float32x2_t __a) +vst2_u64 (uint64_t * __a, uint64x1x2_t __val) { - return __builtin_aarch64_nearbyintv2sf (__a); + __builtin_aarch64_simd_oi __o; + uint64x2x2_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndi_f64 (float64x1_t __a) +vst2_f64 (float64_t * __a, float64x1x2_t __val) { - return vset_lane_f64 (__builtin_nearbyint (vget_lane_f64 (__a, 0)), __a, 0); + 
__builtin_aarch64_simd_oi __o; + float64x2x2_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndiq_f32 (float32x4_t __a) +vst2_s8 (int8_t * __a, int8x8x2_t __val) { - return __builtin_aarch64_nearbyintv4sf (__a); + __builtin_aarch64_simd_oi __o; + int8x16x2_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndiq_f64 (float64x2_t __a) +vst2_p8 (poly8_t * __a, poly8x8x2_t __val) { - return __builtin_aarch64_nearbyintv2df (__a); + __builtin_aarch64_simd_oi __o; + poly8x16x2_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -/* vrndm */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndm_f32 (float32x2_t __a) +vst2_s16 (int16_t * __a, int16x4x2_t __val) { - return __builtin_aarch64_floorv2sf (__a); + __builtin_aarch64_simd_oi __o; + int16x8x2_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndm_f64 (float64x1_t __a) +vst2_p16 (poly16_t * __a, poly16x4x2_t __val) { - return vset_lane_f64 (__builtin_floor (vget_lane_f64 (__a, 0)), __a, 0); + __builtin_aarch64_simd_oi __o; + poly16x8x2_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndmq_f32 (float32x4_t 
__a) +vst2_s32 (int32_t * __a, int32x2x2_t __val) { - return __builtin_aarch64_floorv4sf (__a); + __builtin_aarch64_simd_oi __o; + int32x4x2_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndmq_f64 (float64x2_t __a) +vst2_u8 (uint8_t * __a, uint8x8x2_t __val) { - return __builtin_aarch64_floorv2df (__a); + __builtin_aarch64_simd_oi __o; + uint8x16x2_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -/* vrndn */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndn_f32 (float32x2_t __a) +vst2_u16 (uint16_t * __a, uint16x4x2_t __val) { - return __builtin_aarch64_frintnv2sf (__a); + __builtin_aarch64_simd_oi __o; + uint16x8x2_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndn_f64 (float64x1_t __a) +vst2_u32 (uint32_t * __a, uint32x2x2_t __val) { - return (float64x1_t) {__builtin_aarch64_frintndf (__a[0])}; + __builtin_aarch64_simd_oi __o; + uint32x4x2_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1); + __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndnq_f32 (float32x4_t __a) +vst2_f16 (float16_t * __a, float16x4x2_t __val) { - return __builtin_aarch64_frintnv4sf (__a); + __builtin_aarch64_simd_oi __o; + float16x8x2_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __temp.val[1], 1); + __builtin_aarch64_st2v4hf (__a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vrndnq_f64 (float64x2_t __a) +vst2_f32 (float32_t * __a, float32x2x2_t __val) { - return __builtin_aarch64_frintnv2df (__a); + __builtin_aarch64_simd_oi __o; + float32x4x2_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } -/* vrndp */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndp_f32 (float32x2_t __a) +vst2_p64 (poly64_t * __a, poly64x1x2_t __val) { - return __builtin_aarch64_ceilv2sf (__a); + __builtin_aarch64_simd_oi __o; + poly64x2x2_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __temp.val[1], 1); + __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndp_f64 (float64x1_t __a) +vst2q_s8 (int8_t * __a, int8x16x2_t __val) { - return vset_lane_f64 (__builtin_ceil (vget_lane_f64 (__a, 0)), __a, 0); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndpq_f32 (float32x4_t __a) +vst2q_p8 (poly8_t * __a, poly8x16x2_t __val) { - return __builtin_aarch64_ceilv4sf (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndpq_f64 (float64x2_t __a) +vst2q_s16 (int16_t * __a, int16x8x2_t __val) { - return __builtin_aarch64_ceilv2df (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -/* vrndx */ - -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndx_f32 (float32x2_t __a) +vst2q_p16 (poly16_t * __a, poly16x8x2_t __val) { - return __builtin_aarch64_rintv2sf (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float64x1_t 
+__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndx_f64 (float64x1_t __a) +vst2q_s32 (int32_t * __a, int32x4x2_t __val) { - return vset_lane_f64 (__builtin_rint (vget_lane_f64 (__a, 0)), __a, 0); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndxq_f32 (float32x4_t __a) +vst2q_s64 (int64_t * __a, int64x2x2_t __val) { - return __builtin_aarch64_rintv4sf (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndxq_f64 (float64x2_t __a) +vst2q_u8 (uint8_t * __a, uint8x16x2_t __val) { - return __builtin_aarch64_rintv2df (__a); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) __val.val[1], 1); + __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -/* vrshl */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_s8 (int8x8_t __a, int8x8_t __b) +vst2q_u16 (uint16_t * __a, uint16x8x2_t __val) { - return (int8x8_t) __builtin_aarch64_srshlv8qi (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) __val.val[1], 1); + __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_s16 (int16x4_t __a, int16x4_t __b) +vst2q_u32 (uint32_t * __a, uint32x4x2_t __val) { - return (int16x4_t) __builtin_aarch64_srshlv4hi (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __val.val[1], 1); + __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_s32 (int32x2_t __a, int32x2_t __b) +vst2q_u64 (uint64_t * __a, uint64x2x2_t __val) { - return (int32x2_t) __builtin_aarch64_srshlv2si (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_s64 (int64x1_t __a, int64x1_t __b) +vst2q_f16 (float16_t * __a, float16x8x2_t __val) { - return (int64x1_t) {__builtin_aarch64_srshldi (__a[0], 
__b[0])}; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8hf (__o, __val.val[1], 1); + __builtin_aarch64_st2v8hf (__a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_u8 (uint8x8_t __a, int8x8_t __b) +vst2q_f32 (float32_t * __a, float32x4x2_t __val) { - return __builtin_aarch64_urshlv8qi_uus (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) __val.val[1], 1); + __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_u16 (uint16x4_t __a, int16x4_t __b) +vst2q_f64 (float64_t * __a, float64x2x2_t __val) { - return __builtin_aarch64_urshlv4hi_uus (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_u32 (uint32x2_t __a, int32x2_t __b) +vst2q_p64 (poly64_t * __a, poly64x2x2_t __val) { - return __builtin_aarch64_urshlv2si_uus (__a, __b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, + (poly64x2_t) __val.val[1], 1); + __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshl_u64 (uint64x1_t __a, int64x1_t __b) +vst3_s64 (int64_t * __a, int64x1x3_t __val) { - return (uint64x1_t) {__builtin_aarch64_urshldi_uus (__a[0], __b[0])}; + __builtin_aarch64_simd_ci __o; + int64x2x3_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_s8 (int8x16_t __a, int8x16_t __b) +vst3_u64 (uint64_t * __a, uint64x1x3_t __val) { - return (int8x16_t) __builtin_aarch64_srshlv16qi (__a, __b); + __builtin_aarch64_simd_ci __o; + uint64x2x3_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, 
(int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_s16 (int16x8_t __a, int16x8_t __b) +vst3_f64 (float64_t * __a, float64x1x3_t __val) { - return (int16x8_t) __builtin_aarch64_srshlv8hi (__a, __b); + __builtin_aarch64_simd_ci __o; + float64x2x3_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_s32 (int32x4_t __a, int32x4_t __b) +vst3_s8 (int8_t * __a, int8x8x3_t __val) { - return (int32x4_t) __builtin_aarch64_srshlv4si (__a, __b); + __builtin_aarch64_simd_ci __o; + int8x16x3_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_s64 (int64x2_t __a, int64x2_t __b) +vst3_p8 (poly8_t * __a, poly8x8x3_t __val) { - return (int64x2_t) __builtin_aarch64_srshlv2di (__a, __b); + __builtin_aarch64_simd_ci __o; + poly8x16x3_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_u8 (uint8x16_t __a, int8x16_t __b) +vst3_s16 (int16_t * __a, int16x4x3_t __val) { - return __builtin_aarch64_urshlv16qi_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + int16x8x3_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi 
(__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_u16 (uint16x8_t __a, int16x8_t __b) +vst3_p16 (poly16_t * __a, poly16x4x3_t __val) { - return __builtin_aarch64_urshlv8hi_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + poly16x8x3_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_u32 (uint32x4_t __a, int32x4_t __b) +vst3_s32 (int32_t * __a, int32x2x3_t __val) { - return __builtin_aarch64_urshlv4si_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + int32x4x3_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshlq_u64 (uint64x2_t __a, int64x2_t __b) +vst3_u8 (uint8_t * __a, uint8x8x3_t __val) { - return __builtin_aarch64_urshlv2di_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + uint8x16x3_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __temp.val[2], 2); + __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshld_s64 (int64_t __a, int64_t __b) +vst3_u16 (uint16_t * __a, uint16x4x3_t __val) { - return __builtin_aarch64_srshldi (__a, __b); + __builtin_aarch64_simd_ci __o; + uint16x8x3_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 
(__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshld_u64 (uint64_t __a, int64_t __b) +vst3_u32 (uint32_t * __a, uint32x2x3_t __val) { - return __builtin_aarch64_urshldi_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + uint32x4x3_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __temp.val[2], 2); + __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); } -/* vrshr */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_s8 (int8x8_t __a, const int __b) +vst3_f16 (float16_t * __a, float16x4x3_t __val) { - return (int8x8_t) __builtin_aarch64_srshr_nv8qi (__a, __b); + __builtin_aarch64_simd_ci __o; + float16x8x3_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_s16 (int16x4_t __a, const int __b) +vst3_f32 (float32_t * __a, float32x2x3_t __val) { - return (int16x4_t) __builtin_aarch64_srshr_nv4hi (__a, __b); + __builtin_aarch64_simd_ci __o; + float32x4x3_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __temp.val[2], 2); + __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_s32 (int32x2_t __a, const int __b) +vst3_p64 (poly64_t * __a, poly64x1x3_t __val) { - return (int32x2_t) __builtin_aarch64_srshr_nv2si (__a, __b); + __builtin_aarch64_simd_ci __o; + poly64x2x3_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + 
__temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __temp.val[2], 2); + __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_s64 (int64x1_t __a, const int __b) +vst3q_s8 (int8_t * __a, int8x16x3_t __val) { - return (int64x1_t) {__builtin_aarch64_srshr_ndi (__a[0], __b)}; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_u8 (uint8x8_t __a, const int __b) +vst3q_p8 (poly8_t * __a, poly8x16x3_t __val) { - return __builtin_aarch64_urshr_nv8qi_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_u16 (uint16x4_t __a, const int __b) +vst3q_s16 (int16_t * __a, int16x8x3_t __val) { - return __builtin_aarch64_urshr_nv4hi_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_u32 (uint32x2_t __a, const int __b) +vst3q_p16 (poly16_t * __a, poly16x8x3_t __val) { - return __builtin_aarch64_urshr_nv2si_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshr_n_u64 (uint64x1_t __a, const int __b) +vst3q_s32 (int32_t * __a, int32x4x3_t __val) { - return (uint64x1_t) {__builtin_aarch64_urshr_ndi_uus (__a[0], __b)}; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = 
__builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_s8 (int8x16_t __a, const int __b) +vst3q_s64 (int64_t * __a, int64x2x3_t __val) { - return (int8x16_t) __builtin_aarch64_srshr_nv16qi (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_s16 (int16x8_t __a, const int __b) +vst3q_u8 (uint8_t * __a, uint8x16x3_t __val) { - return (int16x8_t) __builtin_aarch64_srshr_nv8hi (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) __val.val[2], 2); + __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_s32 (int32x4_t __a, const int __b) +vst3q_u16 (uint16_t * __a, uint16x8x3_t __val) { - return (int32x4_t) __builtin_aarch64_srshr_nv4si (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_s64 (int64x2_t __a, const int __b) +vst3q_u32 (uint32_t * __a, uint32x4x3_t __val) { - return (int64x2_t) __builtin_aarch64_srshr_nv2di (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __val.val[2], 2); + __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_u8 (uint8x16_t __a, const int __b) +vst3q_u64 (uint64_t * __a, uint64x2x3_t __val) { - return __builtin_aarch64_urshr_nv16qi_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_u16 (uint16x8_t __a, const int __b) +vst3q_f16 (float16_t * __a, 
float16x8x3_t __val) { - return __builtin_aarch64_urshr_nv8hi_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_u32 (uint32x4_t __a, const int __b) +vst3q_f32 (float32_t * __a, float32x4x3_t __val) { - return __builtin_aarch64_urshr_nv4si_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) __val.val[2], 2); + __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrq_n_u64 (uint64x2_t __a, const int __b) +vst3q_f64 (float64_t * __a, float64x2x3_t __val) { - return __builtin_aarch64_urshr_nv2di_uus (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline int64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrd_n_s64 (int64_t __a, const int __b) +vst3q_p64 (poly64_t * __a, poly64x2x3_t __val) { - return __builtin_aarch64_srshr_ndi (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv2di_ssps (__o, + (poly64x2_t) __val.val[2], 2); + __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline uint64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrshrd_n_u64 (uint64_t __a, const int __b) +vst4_s64 (int64_t * __a, int64x1x4_t __val) { - return __builtin_aarch64_urshr_ndi_uus (__a, __b); + __builtin_aarch64_simd_xi __o; + int64x2x4_t __temp; + __temp.val[0] = vcombine_s64 (__val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s64 (__val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s64 (__val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s64 (__val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -/* vrsqrte. 
*/ - -__extension__ extern __inline float32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrtes_f32 (float32_t __a) +vst4_u64 (uint64_t * __a, uint64x1x4_t __val) { - return __builtin_aarch64_rsqrtesf (__a); + __builtin_aarch64_simd_xi __o; + uint64x2x4_t __temp; + __temp.val[0] = vcombine_u64 (__val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u64 (__val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u64 (__val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u64 (__val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline float64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrted_f64 (float64_t __a) +vst4_f64 (float64_t * __a, float64x1x4_t __val) { - return __builtin_aarch64_rsqrtedf (__a); + __builtin_aarch64_simd_xi __o; + float64x2x4_t __temp; + __temp.val[0] = vcombine_f64 (__val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f64 (__val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f64 (__val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f64 (__val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrte_f32 (float32x2_t __a) +vst4_s8 (int8_t * __a, int8x8x4_t __val) { - return __builtin_aarch64_rsqrtev2sf (__a); + __builtin_aarch64_simd_xi __o; + int8x16x4_t __temp; + __temp.val[0] = vcombine_s8 (__val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s8 (__val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s8 (__val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s8 (__val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrte_f64 (float64x1_t __a) +vst4_p8 (poly8_t * __a, poly8x8x4_t __val) { - return (float64x1_t) {vrsqrted_f64 (vget_lane_f64 (__a, 0))}; + __builtin_aarch64_simd_xi __o; + poly8x16x4_t __temp; + __temp.val[0] = vcombine_p8 (__val.val[0], vcreate_p8 
(__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p8 (__val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p8 (__val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p8 (__val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrteq_f32 (float32x4_t __a) +vst4_s16 (int16_t * __a, int16x4x4_t __val) { - return __builtin_aarch64_rsqrtev4sf (__a); + __builtin_aarch64_simd_xi __o; + int16x8x4_t __temp; + __temp.val[0] = vcombine_s16 (__val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s16 (__val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s16 (__val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s16 (__val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrteq_f64 (float64x2_t __a) +vst4_p16 (poly16_t * __a, poly16x4x4_t __val) { - return __builtin_aarch64_rsqrtev2df (__a); + __builtin_aarch64_simd_xi __o; + poly16x8x4_t __temp; + __temp.val[0] = vcombine_p16 (__val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p16 (__val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p16 (__val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p16 (__val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -/* vrsqrts. 
*/ - -__extension__ extern __inline float32_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrtss_f32 (float32_t __a, float32_t __b) +vst4_s32 (int32_t * __a, int32x2x4_t __val) { - return __builtin_aarch64_rsqrtssf (__a, __b); + __builtin_aarch64_simd_xi __o; + int32x4x4_t __temp; + __temp.val[0] = vcombine_s32 (__val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[1] = vcombine_s32 (__val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[2] = vcombine_s32 (__val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); + __temp.val[3] = vcombine_s32 (__val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline float64_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrtsd_f64 (float64_t __a, float64_t __b) +vst4_u8 (uint8_t * __a, uint8x8x4_t __val) { - return __builtin_aarch64_rsqrtsdf (__a, __b); + __builtin_aarch64_simd_xi __o; + uint8x16x4_t __temp; + __temp.val[0] = vcombine_u8 (__val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u8 (__val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u8 (__val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u8 (__val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __temp.val[3], 3); + __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrts_f32 (float32x2_t __a, float32x2_t __b) +vst4_u16 (uint16_t * __a, uint16x4x4_t __val) { - return __builtin_aarch64_rsqrtsv2sf (__a, __b); + __builtin_aarch64_simd_xi __o; + uint16x8x4_t __temp; + __temp.val[0] = vcombine_u16 (__val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u16 (__val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u16 (__val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u16 (__val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline float64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrts_f64 (float64x1_t __a, float64x1_t __b) +vst4_u32 (uint32_t * __a, uint32x2x4_t __val) { - return (float64x1_t) {vrsqrtsd_f64 (vget_lane_f64 (__a, 0), - vget_lane_f64 (__b, 0))}; + 
__builtin_aarch64_simd_xi __o; + uint32x4x4_t __temp; + __temp.val[0] = vcombine_u32 (__val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_u32 (__val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_u32 (__val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_u32 (__val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __temp.val[3], 3); + __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b) +vst4_f16 (float16_t * __a, float16x4x4_t __val) { - return __builtin_aarch64_rsqrtsv4sf (__a, __b); + __builtin_aarch64_simd_xi __o; + float16x8x4_t __temp; + __temp.val[0] = vcombine_f16 (__val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f16 (__val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f16 (__val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f16 (__val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrtsq_f64 (float64x2_t __a, float64x2_t __b) +vst4_f32 (float32_t * __a, float32x2x4_t __val) { - return __builtin_aarch64_rsqrtsv2df (__a, __b); + __builtin_aarch64_simd_xi __o; + float32x4x4_t __temp; + __temp.val[0] = vcombine_f32 (__val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_f32 (__val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_f32 (__val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_f32 (__val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __temp.val[3], 3); + __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); } -/* vrsra */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +vst4_p64 (poly64_t * __a, poly64x1x4_t __val) { - return (int8x8_t) __builtin_aarch64_srsra_nv8qi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + poly64x2x4_t __temp; + __temp.val[0] = vcombine_p64 (__val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_p64 (__val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_p64 (__val.val[2], 
vcreate_p64 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_p64 (__val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __temp.val[3], 3); + __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +vst4q_s8 (int8_t * __a, int8x16x4_t __val) { - return (int16x4_t) __builtin_aarch64_srsra_nv4hi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +vst4q_p8 (poly8_t * __a, poly8x16x4_t __val) { - return (int32x2_t) __builtin_aarch64_srsra_nv2si (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +vst4q_s16 (int16_t * __a, int16x8x4_t __val) { - return (int64x1_t) {__builtin_aarch64_srsra_ndi (__a[0], __b[0], __c)}; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +vst4q_p16 (poly16_t * __a, poly16x8x4_t __val) { - return __builtin_aarch64_ursra_nv8qi_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline 
void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +vst4q_s32 (int32_t * __a, int32x4x4_t __val) { - return __builtin_aarch64_ursra_nv4hi_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); + __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +vst4q_s64 (int64_t * __a, int64x2x4_t __val) { - return __builtin_aarch64_ursra_nv2si_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +vst4q_u8 (uint8_t * __a, uint8x16x4_t __val) { - return (uint64x1_t) {__builtin_aarch64_ursra_ndi_uuus (__a[0], __b[0], __c)}; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) __val.val[3], 3); + __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +vst4q_u16 (uint16_t * __a, uint16x8x4_t __val) { - return (int8x16_t) __builtin_aarch64_srsra_nv16qi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +vst4q_u32 (uint32_t * __a, uint32x4x4_t __val) { - return (int16x8_t) __builtin_aarch64_srsra_nv8hi (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __val.val[3], 3); + 
__builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +vst4q_u64 (uint64_t * __a, uint64x2x4_t __val) { - return (int32x4_t) __builtin_aarch64_srsra_nv4si (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) __val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +vst4q_f16 (float16_t * __a, float16x8x4_t __val) { - return (int64x2_t) __builtin_aarch64_srsra_nv2di (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +vst4q_f32 (float32_t * __a, float32x4x4_t __val) { - return __builtin_aarch64_ursra_nv16qi_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) __val.val[3], 3); + __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +vst4q_f64 (float64_t * __a, float64x2x4_t __val) { - return __builtin_aarch64_ursra_nv8hi_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) __val.val[3], 3); + __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +vst4q_p64 (poly64_t * __a, poly64x2x4_t __val) { - return __builtin_aarch64_ursra_nv4si_uuus (__a, __b, __c); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __val.val[1], 
1); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, + (poly64x2_t) __val.val[3], 3); + __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +vstrq_p128 (poly128_t * __ptr, poly128_t __val) { - return __builtin_aarch64_ursra_nv2di_uuus (__a, __b, __c); + *__ptr = __val; } +/* vsub */ + __extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsrad_n_s64 (int64_t __a, int64_t __b, const int __c) +vsubd_s64 (int64_t __a, int64_t __b) { - return __builtin_aarch64_srsra_ndi (__a, __b, __c); + return __a - __b; } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) +vsubd_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_ursra_ndi_uuus (__a, __b, __c); + return __a - __b; } -#pragma GCC push_options -#pragma GCC target ("+nothing+crypto") - -/* vsha1 */ +/* vtbx1 */ -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1cq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) +vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) { - return __builtin_aarch64_crypto_sha1cv4si_uuuu (hash_abcd, hash_e, wk); + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (8)); + int8x8_t __tbl = vtbl1_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1mq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) +vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_crypto_sha1mv4si_uuuu (hash_abcd, hash_e, wk); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1pq_u32 (uint32x4_t hash_abcd, uint32_t hash_e, uint32x4_t wk) +vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_crypto_sha1pv4si_uuuu (hash_abcd, hash_e, wk); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); + poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); } -__extension__ extern __inline uint32_t +/* vtbx3 */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1h_u32 (uint32_t hash_e) +vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) { - return __builtin_aarch64_crypto_sha1hsi_uu (hash_e); + uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), + vmov_n_u8 (24)); + int8x8_t __tbl = vtbl3_s8 (__tab, __idx); + + return vbsl_s8 (__mask, __tbl, __r); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7, uint32x4_t w8_11) +vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) { - return 
__builtin_aarch64_crypto_sha1su0v4si_uuuu (w0_3, w4_7, w8_11); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); + + return vbsl_u8 (__mask, __tbl, __r); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha1su1q_u32 (uint32x4_t tw0_3, uint32x4_t w12_15) +vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_crypto_sha1su1v4si_uuu (tw0_3, w12_15); + uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); + poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); + + return vbsl_p8 (__mask, __tbl, __r); } -__extension__ extern __inline uint32x4_t +/* vtbx4 */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256hq_u32 (uint32x4_t hash_abcd, uint32x4_t hash_efgh, uint32x4_t wk) +vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) { - return __builtin_aarch64_crypto_sha256hv4si_uuuu (hash_abcd, hash_efgh, wk); + int8x8_t __result; + int8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; + __temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) __temp.val[1], 1); + __result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); + return __result; } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256h2q_u32 (uint32x4_t hash_efgh, uint32x4_t hash_abcd, uint32x4_t wk) +vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_crypto_sha256h2v4si_uuuu (hash_efgh, hash_abcd, wk); + uint8x8_t __result; + uint8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; + __temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_u8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) __temp.val[1], 1); + __result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); + return __result; } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256su0q_u32 (uint32x4_t w0_3, uint32x4_t w4_7) +vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) { - return __builtin_aarch64_crypto_sha256su0v4si_uuu (w0_3, w4_7); + poly8x8_t __result; + poly8x16x2_t __temp; + __builtin_aarch64_simd_oi __o; + __temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); + __temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv16qi (__o, + (int8x16_t) __temp.val[1], 1); + __result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, + (int8x8_t)__idx); + return __result; } -__extension__ extern __inline uint32x4_t +/* vtrn */ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsha256su1q_u32 (uint32x4_t tw0_3, uint32x4_t w8_11, uint32x4_t w12_15) +vtrn1_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_crypto_sha256su1v4si_uuuu (tw0_3, w8_11, w12_15); +#ifdef __AARCH64EB__ + return 
__builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } -__extension__ extern __inline poly128_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_p64 (poly64_t a, poly64_t b) +vtrn1_f32 (float32x2_t __a, float32x2_t __b) { - return - __builtin_aarch64_crypto_pmulldi_ppp (a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ extern __inline poly128_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmull_high_p64 (poly64x2_t a, poly64x2_t b) +vtrn1_p8 (poly8x8_t __a, poly8x8_t __b) { - return __builtin_aarch64_crypto_pmullv2di_ppp (a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -#pragma GCC pop_options - -/* vshl */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_n_s8 (int8x8_t __a, const int __b) +vtrn1_p16 (poly16x4_t __a, poly16x4_t __b) { - return (int8x8_t) __builtin_aarch64_ashlv8qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } -__extension__ extern __inline int16x4_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_n_s16 (int16x4_t __a, const int __b) +vtrn1_s8 (int8x8_t __a, int8x8_t __b) { - return (int16x4_t) __builtin_aarch64_ashlv4hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ extern __inline int32x2_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_n_s32 (int32x2_t __a, const int __b) +vtrn1_s16 (int16x4_t __a, int16x4_t __b) { - return (int32x2_t) __builtin_aarch64_ashlv2si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } -__extension__ extern __inline int64x1_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_n_s64 (int64x1_t __a, const int __b) +vtrn1_s32 (int32x2_t __a, int32x2_t __b) { - return (int64x1_t) {__builtin_aarch64_ashldi (__a[0], __b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_n_u8 (uint8x8_t __a, const int __b) +vtrn1_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) __builtin_aarch64_ashlv8qi ((int8x8_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vshl_n_u16 (uint16x4_t __a, const int __b) +vtrn1_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint16x4_t) __builtin_aarch64_ashlv4hi ((int16x4_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); +#endif } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_n_u32 (uint32x2_t __a, const int __b) -{ - return (uint32x2_t) __builtin_aarch64_ashlv2si ((int32x2_t) __a, __b); -} - -__extension__ extern __inline uint64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_n_u64 (uint64x1_t __a, const int __b) +vtrn1_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint64x1_t) {__builtin_aarch64_ashldi ((int64_t) __a[0], __b)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_s8 (int8x16_t __a, const int __b) +vtrn1q_f16 (float16x8_t __a, float16x8_t __b) { - return (int8x16_t) __builtin_aarch64_ashlv16qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_s16 (int16x8_t __a, const int __b) +vtrn1q_f32 (float32x4_t __a, float32x4_t __b) { - return (int16x8_t) __builtin_aarch64_ashlv8hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); +#endif } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_s32 (int32x4_t __a, const int __b) +vtrn1q_f64 (float64x2_t __a, float64x2_t __b) { - return (int32x4_t) __builtin_aarch64_ashlv4si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ extern __inline int64x2_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_s64 (int64x2_t __a, const int __b) +vtrn1q_p8 (poly8x16_t __a, poly8x16_t __b) { - return (int64x2_t) __builtin_aarch64_ashlv2di (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); +#endif } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_u8 (uint8x16_t __a, const int __b) +vtrn1q_p16 (poly16x8_t __a, poly16x8_t __b) { - return (uint8x16_t) __builtin_aarch64_ashlv16qi ((int8x16_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 
14}); +#endif } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_u16 (uint16x8_t __a, const int __b) +vtrn1q_s8 (int8x16_t __a, int8x16_t __b) { - return (uint16x8_t) __builtin_aarch64_ashlv8hi ((int16x8_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); +#endif } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_u32 (uint32x4_t __a, const int __b) +vtrn1q_s16 (int16x8_t __a, int16x8_t __b) { - return (uint32x4_t) __builtin_aarch64_ashlv4si ((int32x4_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_n_u64 (uint64x2_t __a, const int __b) +vtrn1q_s32 (int32x4_t __a, int32x4_t __b) { - return (uint64x2_t) __builtin_aarch64_ashlv2di ((int64x2_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); +#endif } -__extension__ extern __inline int64_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshld_n_s64 (int64_t __a, const int __b) +vtrn1q_s64 (int64x2_t __a, int64x2_t __b) { - return __builtin_aarch64_ashldi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ extern __inline uint64_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshld_n_u64 (uint64_t __a, const int __b) +vtrn1q_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint64_t) __builtin_aarch64_ashldi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); +#endif } -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_s8 (int8x8_t __a, int8x8_t __b) +vtrn1q_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_sshlv8qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); +#endif } -__extension__ extern __inline int16x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_s16 (int16x4_t __a, int16x4_t __b) +vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_sshlv4hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); +#endif } 
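/* A minimal usage sketch of the vtrn1q/vtrn2q pair defined above, assuming
   a little-endian AArch64 target (the helper name below is hypothetical and
   not part of this patch): vtrn1q_u32 gathers the even-indexed lanes of both
   operands and vtrn2q_u32 the odd-indexed lanes, so together they perform a
   2x2 transpose of adjacent lane pairs.  */

static inline void
transpose_pairs_u32 (uint32x4_t __a, uint32x4_t __b,
                     uint32x4_t *__even, uint32x4_t *__odd)
{
  /* With __a = {a0, a1, a2, a3} and __b = {b0, b1, b2, b3}:  */
  *__even = vtrn1q_u32 (__a, __b);  /* {a0, b0, a2, b2}  */
  *__odd  = vtrn2q_u32 (__a, __b);  /* {a1, b1, a3, b3}  */
}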
-__extension__ extern __inline int32x2_t +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_s32 (int32x2_t __a, int32x2_t __b) +vtrn1q_p64 (poly64x2_t __a, poly64x2_t __b) { - return __builtin_aarch64_sshlv2si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif } -__extension__ extern __inline int64x1_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_s64 (int64x1_t __a, int64x1_t __b) +vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b) { - return (int64x1_t) {__builtin_aarch64_sshldi (__a[0], __b[0])}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_u8 (uint8x8_t __a, int8x8_t __b) +vtrn2_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_ushlv8qi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_u16 (uint16x4_t __a, int16x4_t __b) +vtrn2_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_ushlv4hi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_u32 (uint32x2_t __a, int32x2_t __b) +vtrn2_p8 (poly8x8_t __a, poly8x8_t __b) { - return __builtin_aarch64_ushlv2si_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshl_u64 (uint64x1_t __a, int64x1_t __b) +vtrn2_p16 (poly16x4_t __a, poly16x4_t __b) { - return (uint64x1_t) {__builtin_aarch64_ushldi_uus (__a[0], __b[0])}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ extern __inline int8x16_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_s8 (int8x16_t __a, int8x16_t __b) +vtrn2_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sshlv16qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ extern __inline int16x8_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_s16 (int16x8_t __a, int16x8_t __b) +vtrn2_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_sshlv8hi (__a, 
__b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_s32 (int32x4_t __a, int32x4_t __b) +vtrn2_s32 (int32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_sshlv4si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_s64 (int64x2_t __a, int64x2_t __b) +vtrn2_u8 (uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_sshlv2di (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_u8 (uint8x16_t __a, int8x16_t __b) +vtrn2_u16 (uint16x4_t __a, uint16x4_t __b) { - return __builtin_aarch64_ushlv16qi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); +#endif } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_u16 (uint16x8_t __a, int16x8_t __b) +vtrn2_u32 (uint32x2_t __a, uint32x2_t __b) { - return __builtin_aarch64_ushlv8hi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_u32 (uint32x4_t __a, int32x4_t __b) +vtrn2q_f16 (float16x8_t __a, float16x8_t __b) { - return __builtin_aarch64_ushlv4si_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshlq_u64 (uint64x2_t __a, int64x2_t __b) +vtrn2q_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_ushlv2di_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ extern __inline int64_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshld_s64 (int64_t __a, int64_t __b) +vtrn2q_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_sshldi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline uint64_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vshld_u64 (uint64_t __a, uint64_t __b) +vtrn2q_p8 (poly8x16_t __a, poly8x16_t __b) { - return __builtin_aarch64_ushldi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -__extension__ extern __inline int16x8_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_high_n_s8 (int8x16_t __a, const int __b) +vtrn2q_p16 (poly16x8_t __a, poly16x8_t __b) { - return __builtin_aarch64_sshll2_nv16qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ extern __inline int32x4_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_high_n_s16 (int16x8_t __a, const int __b) +vtrn2q_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sshll2_nv8hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); +#endif } -__extension__ extern __inline int64x2_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_high_n_s32 (int32x4_t __a, const int __b) +vtrn2q_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_sshll2_nv4si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_high_n_u8 (uint8x16_t __a, const int __b) +vtrn2q_s32 (int32x4_t __a, int32x4_t __b) { - return (uint16x8_t) __builtin_aarch64_ushll2_nv16qi ((int8x16_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_high_n_u16 (uint16x8_t __a, const int __b) +vtrn2q_s64 (int64x2_t __a, int64x2_t __b) { - return (uint32x4_t) __builtin_aarch64_ushll2_nv8hi ((int16x8_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_high_n_u32 (uint32x4_t __a, const int __b) +vtrn2q_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint64x2_t) __builtin_aarch64_ushll2_nv4si ((int32x4_t) __a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 
31}); +#endif } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_n_s8 (int8x8_t __a, const int __b) +vtrn2q_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_sshll_nv8qi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); +#endif } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_n_s16 (int16x4_t __a, const int __b) +vtrn2q_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_sshll_nv4hi (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); +#endif } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_n_s32 (int32x2_t __a, const int __b) +vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b) { - return __builtin_aarch64_sshll_nv2si (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline uint16x8_t + +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_n_u8 (uint8x8_t __a, const int __b) +vtrn2q_p64 (poly64x2_t __a, poly64x2_t __b) { - return __builtin_aarch64_ushll_nv8qi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_n_u16 (uint16x4_t __a, const int __b) +vtrn_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_ushll_nv4hi_uus (__a, __b); + return (float16x4x2_t) {vtrn1_f16 (__a, __b), vtrn2_f16 (__a, __b)}; } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshll_n_u32 (uint32x2_t __a, const int __b) +vtrn_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_ushll_nv2si_uus (__a, __b); + return (float32x2x2_t) {vtrn1_f32 (__a, __b), vtrn2_f32 (__a, __b)}; } -/* vshr */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline poly8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_s8 (int8x8_t __a, const int __b) +vtrn_p8 (poly8x8_t __a, poly8x8_t __b) { - return (int8x8_t) __builtin_aarch64_ashrv8qi (__a, __b); + return (poly8x8x2_t) {vtrn1_p8 (__a, __b), vtrn2_p8 (__a, __b)}; } -__extension__ extern __inline int16x4_t +__extension__ extern __inline poly16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_s16 (int16x4_t __a, const int __b) +vtrn_p16 (poly16x4_t __a, poly16x4_t __b) { - return (int16x4_t) __builtin_aarch64_ashrv4hi (__a, __b); + return (poly16x4x2_t) {vtrn1_p16 (__a, __b), vtrn2_p16 (__a, __b)}; } -__extension__ extern __inline int32x2_t +__extension__ extern __inline int8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_s32 (int32x2_t 
__a, const int __b) +vtrn_s8 (int8x8_t __a, int8x8_t __b) { - return (int32x2_t) __builtin_aarch64_ashrv2si (__a, __b); + return (int8x8x2_t) {vtrn1_s8 (__a, __b), vtrn2_s8 (__a, __b)}; } -__extension__ extern __inline int64x1_t +__extension__ extern __inline int16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_s64 (int64x1_t __a, const int __b) +vtrn_s16 (int16x4_t __a, int16x4_t __b) { - return (int64x1_t) {__builtin_aarch64_ashr_simddi (__a[0], __b)}; + return (int16x4x2_t) {vtrn1_s16 (__a, __b), vtrn2_s16 (__a, __b)}; } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline int32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_u8 (uint8x8_t __a, const int __b) +vtrn_s32 (int32x2_t __a, int32x2_t __b) { - return (uint8x8_t) __builtin_aarch64_lshrv8qi ((int8x8_t) __a, __b); + return (int32x2x2_t) {vtrn1_s32 (__a, __b), vtrn2_s32 (__a, __b)}; } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint8x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_u16 (uint16x4_t __a, const int __b) +vtrn_u8 (uint8x8_t __a, uint8x8_t __b) { - return (uint16x4_t) __builtin_aarch64_lshrv4hi ((int16x4_t) __a, __b); + return (uint8x8x2_t) {vtrn1_u8 (__a, __b), vtrn2_u8 (__a, __b)}; } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_u32 (uint32x2_t __a, const int __b) +vtrn_u16 (uint16x4_t __a, uint16x4_t __b) { - return (uint32x2_t) __builtin_aarch64_lshrv2si ((int32x2_t) __a, __b); + return (uint16x4x2_t) {vtrn1_u16 (__a, __b), vtrn2_u16 (__a, __b)}; } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline uint32x2x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshr_n_u64 (uint64x1_t __a, const int __b) +vtrn_u32 (uint32x2_t __a, uint32x2_t __b) { - return (uint64x1_t) {__builtin_aarch64_lshr_simddi_uus ( __a[0], __b)}; + return (uint32x2x2_t) {vtrn1_u32 (__a, __b), vtrn2_u32 (__a, __b)}; } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_s8 (int8x16_t __a, const int __b) +vtrnq_f16 (float16x8_t __a, float16x8_t __b) { - return (int8x16_t) __builtin_aarch64_ashrv16qi (__a, __b); + return (float16x8x2_t) {vtrn1q_f16 (__a, __b), vtrn2q_f16 (__a, __b)}; } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_s16 (int16x8_t __a, const int __b) +vtrnq_f32 (float32x4_t __a, float32x4_t __b) { - return (int16x8_t) __builtin_aarch64_ashrv8hi (__a, __b); + return (float32x4x2_t) {vtrn1q_f32 (__a, __b), vtrn2q_f32 (__a, __b)}; } -__extension__ extern __inline int32x4_t +__extension__ extern __inline poly8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_s32 (int32x4_t __a, const int __b) +vtrnq_p8 (poly8x16_t __a, poly8x16_t __b) { - return (int32x4_t) __builtin_aarch64_ashrv4si (__a, __b); + return (poly8x16x2_t) {vtrn1q_p8 (__a, __b), vtrn2q_p8 (__a, __b)}; } -__extension__ extern __inline int64x2_t +__extension__ extern __inline poly16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_s64 (int64x2_t __a, const int __b) +vtrnq_p16 (poly16x8_t __a, poly16x8_t __b) { - return (int64x2_t) __builtin_aarch64_ashrv2di (__a, __b); + return 
(poly16x8x2_t) {vtrn1q_p16 (__a, __b), vtrn2q_p16 (__a, __b)}; } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline int8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_u8 (uint8x16_t __a, const int __b) +vtrnq_s8 (int8x16_t __a, int8x16_t __b) { - return (uint8x16_t) __builtin_aarch64_lshrv16qi ((int8x16_t) __a, __b); + return (int8x16x2_t) {vtrn1q_s8 (__a, __b), vtrn2q_s8 (__a, __b)}; } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_u16 (uint16x8_t __a, const int __b) +vtrnq_s16 (int16x8_t __a, int16x8_t __b) { - return (uint16x8_t) __builtin_aarch64_lshrv8hi ((int16x8_t) __a, __b); + return (int16x8x2_t) {vtrn1q_s16 (__a, __b), vtrn2q_s16 (__a, __b)}; } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_u32 (uint32x4_t __a, const int __b) +vtrnq_s32 (int32x4_t __a, int32x4_t __b) { - return (uint32x4_t) __builtin_aarch64_lshrv4si ((int32x4_t) __a, __b); + return (int32x4x2_t) {vtrn1q_s32 (__a, __b), vtrn2q_s32 (__a, __b)}; } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline uint8x16x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrq_n_u64 (uint64x2_t __a, const int __b) +vtrnq_u8 (uint8x16_t __a, uint8x16_t __b) { - return (uint64x2_t) __builtin_aarch64_lshrv2di ((int64x2_t) __a, __b); + return (uint8x16x2_t) {vtrn1q_u8 (__a, __b), vtrn2q_u8 (__a, __b)}; } -__extension__ extern __inline int64_t +__extension__ extern __inline uint16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrd_n_s64 (int64_t __a, const int __b) +vtrnq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_ashr_simddi (__a, __b); + return (uint16x8x2_t) {vtrn1q_u16 (__a, __b), vtrn2q_u16 (__a, __b)}; } -__extension__ extern __inline uint64_t +__extension__ extern __inline uint32x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshrd_n_u64 (uint64_t __a, const int __b) +vtrnq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_lshr_simddi_uus (__a, __b); + return (uint32x4x2_t) {vtrn1q_u32 (__a, __b), vtrn2q_u32 (__a, __b)}; } -/* vsli */ +/* vtst */ -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +vtst_s8 (int8x8_t __a, int8x8_t __b) { - return (int8x8_t) __builtin_aarch64_ssli_nv8qi (__a, __b, __c); + return (uint8x8_t) ((__a & __b) != 0); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +vtst_s16 (int16x4_t __a, int16x4_t __b) { - return (int16x4_t) __builtin_aarch64_ssli_nv4hi (__a, __b, __c); + return (uint16x4_t) ((__a & __b) != 0); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +vtst_s32 (int32x2_t __a, int32x2_t __b) { - return (int32x2_t) __builtin_aarch64_ssli_nv2si (__a, __b, __c); + return (uint32x2_t) ((__a & __b) != 0); } -__extension__ extern __inline int64x1_t +__extension__ extern __inline uint64x1_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +vtst_s64 (int64x1_t __a, int64x1_t __b) { - return (int64x1_t) {__builtin_aarch64_ssli_ndi (__a[0], __b[0], __c)}; + return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0)); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +vtst_u8 (uint8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_usli_nv8qi_uuus (__a, __b, __c); + return ((__a & __b) != 0); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +vtst_u16 (uint16x4_t __a, uint16x4_t __b) { - return __builtin_aarch64_usli_nv4hi_uuus (__a, __b, __c); + return ((__a & __b) != 0); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +vtst_u32 (uint32x2_t __a, uint32x2_t __b) { - return __builtin_aarch64_usli_nv2si_uuus (__a, __b, __c); + return ((__a & __b) != 0); } __extension__ extern __inline uint64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) -{ - return (uint64x1_t) {__builtin_aarch64_usli_ndi_uuus (__a[0], __b[0], __c)}; -} - -__extension__ extern __inline poly64x1_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsli_n_p64 (poly64x1_t __a, poly64x1_t __b, const int __c) +vtst_u64 (uint64x1_t __a, uint64x1_t __b) { - return (poly64x1_t) {__builtin_aarch64_ssli_ndi_ppps (__a[0], __b[0], __c)}; + return ((__a & __b) != __AARCH64_UINT64_C (0)); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +vtstq_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x16_t) __builtin_aarch64_ssli_nv16qi (__a, __b, __c); + return (uint8x16_t) ((__a & __b) != 0); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +vtstq_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x8_t) __builtin_aarch64_ssli_nv8hi (__a, __b, __c); + return (uint16x8_t) ((__a & __b) != 0); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +vtstq_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x4_t) __builtin_aarch64_ssli_nv4si (__a, __b, __c); + return (uint32x4_t) ((__a & __b) != 0); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +vtstq_s64 (int64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_ssli_nv2di (__a, __b, __c); + return (uint64x2_t) ((__a & __b) != __AARCH64_INT64_C (0)); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +vtstq_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_usli_nv16qi_uuus (__a, __b, 
__c); + return ((__a & __b) != 0); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +vtstq_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_usli_nv8hi_uuus (__a, __b, __c); + return ((__a & __b) != 0); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +vtstq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_usli_nv4si_uuus (__a, __b, __c); + return ((__a & __b) != 0); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +vtstq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return ((__a & __b) != __AARCH64_UINT64_C (0)); +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstd_s64 (int64_t __a, int64_t __b) +{ + return (__a & __b) ? -1ll : 0ll; +} + +__extension__ extern __inline uint64_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtstd_u64 (uint64_t __a, uint64_t __b) { - return __builtin_aarch64_usli_nv2di_uuus (__a, __b, __c); + return (__a & __b) ? -1ll : 0ll; } -__extension__ extern __inline poly64x2_t +/* vuqadd */ + +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsliq_n_p64 (poly64x2_t __a, poly64x2_t __b, const int __c) +vuqadd_s8 (int8x8_t __a, uint8x8_t __b) { - return __builtin_aarch64_ssli_nv2di_ppps (__a, __b, __c); + return __builtin_aarch64_suqaddv8qi_ssu (__a, __b); } -__extension__ extern __inline int64_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vslid_n_s64 (int64_t __a, int64_t __b, const int __c) +vuqadd_s16 (int16x4_t __a, uint16x4_t __b) { - return __builtin_aarch64_ssli_ndi (__a, __b, __c); + return __builtin_aarch64_suqaddv4hi_ssu (__a, __b); } -__extension__ extern __inline uint64_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vslid_n_u64 (uint64_t __a, uint64_t __b, const int __c) +vuqadd_s32 (int32x2_t __a, uint32x2_t __b) { - return __builtin_aarch64_usli_ndi_uuus (__a, __b, __c); + return __builtin_aarch64_suqaddv2si_ssu (__a, __b); } -/* vsqadd */ - -__extension__ extern __inline uint8x8_t +__extension__ extern __inline int64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqadd_u8 (uint8x8_t __a, int8x8_t __b) +vuqadd_s64 (int64x1_t __a, uint64x1_t __b) { - return __builtin_aarch64_usqaddv8qi_uus (__a, __b); + return (int64x1_t) {__builtin_aarch64_suqadddi_ssu (__a[0], __b[0])}; } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqadd_u16 (uint16x4_t __a, int16x4_t __b) +vuqaddq_s8 (int8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_usqaddv4hi_uus (__a, __b); + return __builtin_aarch64_suqaddv16qi_ssu (__a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqadd_u32 (uint32x2_t __a, int32x2_t __b) +vuqaddq_s16 (int16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_usqaddv2si_uus (__a, __b); + return __builtin_aarch64_suqaddv8hi_ssu (__a, __b); 
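/* Usage sketch (illustration only, not part of the upstream header):
   vtst* performs a lane-wise bit test, returning an all-ones lane where
   (a & b) has any bit set and an all-zeros lane otherwise, e.g.

     uint8x8_t m = vtst_s8 (vdup_n_s8 (0x0f), vdup_n_s8 (0x10));
     // m = {0,...,0}: 0x0f and 0x10 share no set bits

   The scalar forms vtstd_s64/vtstd_u64 above likewise return
   UINT64_MAX or 0.  */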
} -__extension__ extern __inline uint64x1_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqadd_u64 (uint64x1_t __a, int64x1_t __b) +vuqaddq_s32 (int32x4_t __a, uint32x4_t __b) { - return (uint64x1_t) {__builtin_aarch64_usqadddi_uus (__a[0], __b[0])}; + return __builtin_aarch64_suqaddv4si_ssu (__a, __b); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqaddq_u8 (uint8x16_t __a, int8x16_t __b) +vuqaddq_s64 (int64x2_t __a, uint64x2_t __b) { - return __builtin_aarch64_usqaddv16qi_uus (__a, __b); + return __builtin_aarch64_suqaddv2di_ssu (__a, __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqaddq_u16 (uint16x8_t __a, int16x8_t __b) +vuqaddb_s8 (int8_t __a, uint8_t __b) { - return __builtin_aarch64_usqaddv8hi_uus (__a, __b); + return __builtin_aarch64_suqaddqi_ssu (__a, __b); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqaddq_u32 (uint32x4_t __a, int32x4_t __b) +vuqaddh_s16 (int16_t __a, uint16_t __b) { - return __builtin_aarch64_usqaddv4si_uus (__a, __b); + return __builtin_aarch64_suqaddhi_ssu (__a, __b); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline int32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqaddq_u64 (uint64x2_t __a, int64x2_t __b) +vuqadds_s32 (int32_t __a, uint32_t __b) { - return __builtin_aarch64_usqaddv2di_uus (__a, __b); + return __builtin_aarch64_suqaddsi_ssu (__a, __b); } -__extension__ extern __inline uint8_t +__extension__ extern __inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqaddb_u8 (uint8_t __a, int8_t __b) +vuqaddd_s64 (int64_t __a, uint64_t __b) { - return __builtin_aarch64_usqaddqi_uus (__a, __b); + return __builtin_aarch64_suqadddi_ssu (__a, __b); } -__extension__ extern __inline uint16_t +#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \ + __extension__ extern __inline rettype \ + __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ + v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \ + { \ + return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \ + v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \ + } + +#define __INTERLEAVE_LIST(op) \ + __DEFINTERLEAVE (op, float16x4x2_t, float16x4_t, f16,) \ + __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \ + __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \ + __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \ + __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \ + __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \ + __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \ + __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \ + __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \ + __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \ + __DEFINTERLEAVE (op, float16x8x2_t, float16x8_t, f16, q) \ + __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \ + __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \ + __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \ + __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \ + __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \ + __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \ + __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \ + 
__DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \ + __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q) + +/* vuzp */ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqaddh_u16 (uint16_t __a, int16_t __b) +vuzp1_f16 (float16x4_t __a, float16x4_t __b) { - return __builtin_aarch64_usqaddhi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ extern __inline uint32_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqadds_u32 (uint32_t __a, int32_t __b) +vuzp1_f32 (float32x2_t __a, float32x2_t __b) { - return __builtin_aarch64_usqaddsi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ extern __inline uint64_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqaddd_u64 (uint64_t __a, int64_t __b) +vuzp1_p8 (poly8x8_t __a, poly8x8_t __b) { - return __builtin_aarch64_usqadddi_uus (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -/* vsqrt */ -__extension__ extern __inline float32x2_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrt_f32 (float32x2_t a) +vuzp1_p16 (poly16x4_t __a, poly16x4_t __b) { - return __builtin_aarch64_sqrtv2sf (a); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrtq_f32 (float32x4_t a) +vuzp1_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_sqrtv4sf (a); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ extern __inline float64x1_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrt_f64 (float64x1_t a) +vuzp1_s16 (int16x4_t __a, int16x4_t __b) { - return (float64x1_t) { __builtin_aarch64_sqrtdf (a[0]) }; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ extern __inline float64x2_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrtq_f64 (float64x2_t a) +vuzp1_s32 (int32x2_t __a, int32x2_t __b) { - return __builtin_aarch64_sqrtv2df (a); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -/* vsra */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +vuzp1_u8 (uint8x8_t __a, uint8x8_t __b) { - return 
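/* Usage sketch (illustration only, not part of the upstream header):
   vuqadd* accumulates an unsigned vector into a signed one with signed
   saturation (the SUQADD instruction), e.g. for the scalar byte form:

     int8_t r = vuqaddb_s8 (100, 100);   // r = 127, saturated
     int8_t s = vuqaddb_s8 (-50, 100);   // s = 50, exact

   The __DEFINTERLEAVE/__INTERLEAVE_LIST macros above then generate the
   two-register vuzp/vzip wrappers from their 1/2-suffixed primitives.  */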
(int8x8_t) __builtin_aarch64_ssra_nv8qi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ extern __inline int16x4_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +vuzp1_u16 (uint16x4_t __a, uint16x4_t __b) { - return (int16x4_t) __builtin_aarch64_ssra_nv4hi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); +#endif } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +vuzp1_u32 (uint32x2_t __a, uint32x2_t __b) { - return (int32x2_t) __builtin_aarch64_ssra_nv2si (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ extern __inline int64x1_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +vuzp1q_f16 (float16x8_t __a, float16x8_t __b) { - return (int64x1_t) {__builtin_aarch64_ssra_ndi (__a[0], __b[0], __c)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +vuzp1q_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_usra_nv8qi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +vuzp1q_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_usra_nv4hi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b) { - return __builtin_aarch64_usra_nv2si_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +vuzp1q_p16 
(poly16x8_t __a, poly16x8_t __b) { - return (uint64x1_t) {__builtin_aarch64_usra_ndi_uuus (__a[0], __b[0], __c)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +vuzp1q_s8 (int8x16_t __a, int8x16_t __b) { - return (int8x16_t) __builtin_aarch64_ssra_nv16qi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +vuzp1q_s16 (int16x8_t __a, int16x8_t __b) { - return (int16x8_t) __builtin_aarch64_ssra_nv8hi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +vuzp1q_s32 (int32x4_t __a, int32x4_t __b) { - return (int32x4_t) __builtin_aarch64_ssra_nv4si (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +vuzp1q_s64 (int64x2_t __a, int64x2_t __b) { - return (int64x2_t) __builtin_aarch64_ssra_nv2di (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_usra_nv16qi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); +#endif } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b) { - return __builtin_aarch64_usra_nv8hi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); +#endif } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_usra_nv4si_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + 
return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); +#endif } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) -{ - return __builtin_aarch64_usra_nv2di_uuus (__a, __b, __c); -} - -__extension__ extern __inline int64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsrad_n_s64 (int64_t __a, int64_t __b, const int __c) -{ - return __builtin_aarch64_ssra_ndi (__a, __b, __c); -} - -__extension__ extern __inline uint64_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsrad_n_u64 (uint64_t __a, uint64_t __b, const int __c) +vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) { - return __builtin_aarch64_usra_ndi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -/* vsri */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c) +vuzp1q_p64 (poly64x2_t __a, poly64x2_t __b) { - return (int8x8_t) __builtin_aarch64_ssri_nv8qi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c) +vuzp2_f16 (float16x4_t __a, float16x4_t __b) { - return (int16x4_t) __builtin_aarch64_ssri_nv4hi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c) +vuzp2_f32 (float32x2_t __a, float32x2_t __b) { - return (int32x2_t) __builtin_aarch64_ssri_nv2si (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline int64x1_t +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c) +vuzp2_p8 (poly8x8_t __a, poly8x8_t __b) { - return (int64x1_t) {__builtin_aarch64_ssri_ndi (__a[0], __b[0], __c)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c) +vuzp2_p16 (poly16x4_t __a, poly16x4_t __b) { - return __builtin_aarch64_usri_nv8qi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ extern __inline 
uint16x4_t +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +vuzp2_s8 (int8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_usri_nv4hi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +vuzp2_s16 (int16x4_t __a, int16x4_t __b) { - return __builtin_aarch64_usri_nv2si_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c) +vuzp2_s32 (int32x2_t __a, int32x2_t __b) { - return (uint64x1_t) {__builtin_aarch64_usri_ndi_uuus (__a[0], __b[0], __c)}; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline int8x16_t +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c) +vuzp2_u8 (uint8x8_t __a, uint8x8_t __b) { - return (int8x16_t) __builtin_aarch64_ssri_nv16qi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c) +vuzp2_u16 (uint16x4_t __a, uint16x4_t __b) { - return (int16x8_t) __builtin_aarch64_ssri_nv8hi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); +#endif } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c) +vuzp2_u32 (uint32x2_t __a, uint32x2_t __b) { - return (int32x4_t) __builtin_aarch64_ssri_nv4si (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c) +vuzp2q_f16 (float16x8_t __a, float16x8_t __b) { - return (int64x2_t) __builtin_aarch64_ssri_nv2di (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline 
float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c) +vuzp2q_f32 (float32x4_t __a, float32x4_t __b) { - return __builtin_aarch64_usri_nv16qi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c) +vuzp2q_f64 (float64x2_t __a, float64x2_t __b) { - return __builtin_aarch64_usri_nv8hi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c) +vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b) { - return __builtin_aarch64_usri_nv4si_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c) +vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b) { - return __builtin_aarch64_usri_nv2di_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ extern __inline int64_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsrid_n_s64 (int64_t __a, int64_t __b, const int __c) +vuzp2q_s8 (int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_ssri_ndi (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, + (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ extern __inline uint64_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsrid_n_u64 (uint64_t __a, uint64_t __b, const int __c) +vuzp2q_s16 (int16x8_t __a, int16x8_t __b) { - return __builtin_aarch64_usri_ndi_uuus (__a, __b, __c); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -/* vst1 */ - -__extension__ extern __inline void +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f16 (float16_t *__a, float16x4_t __b) +vuzp2q_s32 (int32x4_t __a, int32x4_t __b) { - __builtin_aarch64_st1v4hf (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } 
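/* Usage sketch (illustration only, not part of the upstream header):
   vuzp1/vuzp2 de-interleave two vectors by selecting the even- and
   odd-indexed lanes of the concatenation a:b; the __AARCH64EB__ masks
   above encode the same selection under big-endian lane numbering.

     int8_t buf[16];
     for (int i = 0; i < 16; i++) buf[i] = i;    // {0,1,2,...,15}
     int8x16_t v = vld1q_s8 (buf);
     int8x16_t even = vuzp1q_s8 (v, v);  // {0,2,...,14,0,2,...,14}
     int8x16_t odd  = vuzp2q_s8 (v, v);  // {1,3,...,15,1,3,...,15}
*/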
-__extension__ extern __inline void +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f32 (float32_t *a, float32x2_t b) +vuzp2q_s64 (int64x2_t __a, int64x2_t __b) { - __builtin_aarch64_st1v2sf ((__builtin_aarch64_simd_sf *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_f64 (float64_t *a, float64x1_t b) +vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b) { - *a = b[0]; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p8 (poly8_t *a, poly8x8_t b) +vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p16 (poly16_t *a, poly16x4_t b) +vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_p64 (poly64_t *a, poly64x1_t b) +vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b) { - *a = b[0]; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s8 (int8_t *a, int8x8_t b) +vuzp2q_p64 (poly64x2_t __a, poly64x2_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__INTERLEAVE_LIST (uzp) + +/* vzip */ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s16 (int16_t *a, int16x4_t b) +vzip1_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s32 (int32_t *a, int32x2_t b) +vzip1_f32 (float32x2_t __a, float32x2_t __b) { - 
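/* Note (illustration only, not part of the upstream header):
   __INTERLEAVE_LIST (uzp) above expands __DEFINTERLEAVE once per
   element type, so the generated

     vuzp_s8 (int8x8_t a, int8x8_t b)

   simply returns (int8x8x2_t) {vuzp1_s8 (a, b), vuzp2_s8 (a, b)}, and
   the q-suffixed entries do the same for the 128-bit forms.  */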
__builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_s64 (int64_t *a, int64x1_t b) +vzip1_p8 (poly8x8_t __a, poly8x8_t __b) { - *a = b[0]; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u8 (uint8_t *a, uint8x8_t b) +vzip1_p16 (poly16x4_t __a, poly16x4_t __b) { - __builtin_aarch64_st1v8qi ((__builtin_aarch64_simd_qi *) a, - (int8x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u16 (uint16_t *a, uint16x4_t b) +vzip1_s8 (int8x8_t __a, int8x8_t __b) { - __builtin_aarch64_st1v4hi ((__builtin_aarch64_simd_hi *) a, - (int16x4_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u32 (uint32_t *a, uint32x2_t b) +vzip1_s16 (int16x4_t __a, int16x4_t __b) { - __builtin_aarch64_st1v2si ((__builtin_aarch64_simd_si *) a, - (int32x2_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_u64 (uint64_t *a, uint64x1_t b) +vzip1_s32 (int32x2_t __a, int32x2_t __b) { - *a = b[0]; +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -/* vst1q */ - -__extension__ extern __inline void +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f16 (float16_t *__a, float16x8_t __b) +vzip1_u8 (uint8x8_t __a, uint8x8_t __b) { - __builtin_aarch64_st1v8hf (__a, __b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f32 (float32_t *a, float32x4_t b) +vzip1_u16 (uint16x4_t __a, uint16x4_t __b) { - __builtin_aarch64_st1v4sf ((__builtin_aarch64_simd_sf *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint32x2_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_f64 (float64_t *a, float64x2_t b) +vzip1_u32 (uint32x2_t __a, uint32x2_t __b) { - __builtin_aarch64_st1v2df ((__builtin_aarch64_simd_df *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p8 (poly8_t *a, poly8x16_t b) +vzip1q_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint16x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, + (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p16 (poly16_t *a, poly16x8_t b) +vzip1q_f32 (float32x4_t __a, float32x4_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_p64 (poly64_t *a, poly64x2_t b) +vzip1q_f64 (float64x2_t __a, float64x2_t __b) { - __builtin_aarch64_st1v2di_sp ((__builtin_aarch64_simd_di *) a, - (poly64x2_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s8 (int8_t *a, int8x16_t b) +vzip1q_p8 (poly8x16_t __a, poly8x16_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s16 (int16_t *a, int16x8_t b) +vzip1q_p16 (poly16x8_t __a, poly16x8_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s32 (int32_t *a, int32x4_t b) +vzip1q_s8 (int8x16_t __a, int8x16_t __b) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline 
int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_s64 (int64_t *a, int64x2_t b) +vzip1q_s16 (int16x8_t __a, int16x8_t __b) { - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u8 (uint8_t *a, uint8x16_t b) +vzip1q_s32 (int32x4_t __a, int32x4_t __b) { - __builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a, - (int8x16_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u16 (uint16_t *a, uint16x8_t b) +vzip1q_s64 (int64x2_t __a, int64x2_t __b) { - __builtin_aarch64_st1v8hi ((__builtin_aarch64_simd_hi *) a, - (int16x8_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u32 (uint32_t *a, uint32x4_t b) +vzip1q_u8 (uint8x16_t __a, uint8x16_t __b) { - __builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, - (int32x4_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_u64 (uint64_t *a, uint64x2_t b) +vzip1q_u16 (uint16x8_t __a, uint16x8_t __b) { - __builtin_aarch64_st1v2di ((__builtin_aarch64_simd_di *) a, - (int64x2_t) b); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) + {12, 4, 13, 5, 14, 6, 15, 7}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); +#endif } -/* vst1_lane */ - -__extension__ extern __inline void +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_f16 (float16_t *__a, float16x4_t __b, const int __lane) +vzip1q_u32 (uint32x4_t __a, uint32x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_f32 (float32_t *__a, float32x2_t __b, const int __lane) +vzip1q_u64 (uint64x2_t __a, uint64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, 
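/* Usage sketch (illustration only, not part of the upstream header):
   vzip1/vzip2 interleave the low and high halves of two vectors, e.g.
   with the 128-bit byte forms:

     int8x16_t lo = vzip1q_s8 (a, b);   // {a0,b0,a1,b1,...,a7,b7}
     int8x16_t hi = vzip2q_s8 (a, b);   // {a8,b8,a9,b9,...,a15,b15}

   which matches the little-endian shuffle masks used in this block.  */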
__gnu_inline__, __artificial__)) -vst1_lane_f64 (float64_t *__a, float64x1_t __b, const int __lane) +vzip1q_p64 (poly64x2_t __a, poly64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_p8 (poly8_t *__a, poly8x8_t __b, const int __lane) +vzip2_f16 (float16x4_t __a, float16x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_p16 (poly16_t *__a, poly16x4_t __b, const int __lane) +vzip2_f32 (float32x2_t __a, float32x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_p64 (poly64_t *__a, poly64x1_t __b, const int __lane) +vzip2_p8 (poly8x8_t __a, poly8x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_s8 (int8_t *__a, int8x8_t __b, const int __lane) +vzip2_p16 (poly16x4_t __a, poly16x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_s16 (int16_t *__a, int16x4_t __b, const int __lane) +vzip2_s8 (int8x8_t __a, int8x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_s32 (int32_t *__a, int32x2_t __b, const int __lane) +vzip2_s16 (int16x4_t __a, int16x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_s64 (int64_t *__a, int64x1_t __b, const int __lane) +vzip2_s32 (int32x2_t __a, int32x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef 
__AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_u8 (uint8_t *__a, uint8x8_t __b, const int __lane) +vzip2_u8 (uint8x8_t __a, uint8x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_u16 (uint16_t *__a, uint16x4_t __b, const int __lane) +vzip2_u16 (uint16x4_t __a, uint16x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_u32 (uint32_t *__a, uint32x2_t __b, const int __lane) +vzip2_u32 (uint32x2_t __a, uint32x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_lane_u64 (uint64_t *__a, uint64x1_t __b, const int __lane) +vzip2q_f16 (float16x8_t __a, float16x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, + (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, + (uint16x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); +#endif } -/* vst1q_lane */ - -__extension__ extern __inline void +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_f16 (float16_t *__a, float16x8_t __b, const int __lane) +vzip2q_f32 (float32x4_t __a, float32x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_f32 (float32_t *__a, float32x4_t __b, const int __lane) +vzip2q_f64 (float64x2_t __a, float64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_f64 (float64_t *__a, float64x2_t __b, const int __lane) +vzip2q_p8 (poly8x16_t __a, poly8x16_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); +#else + return __builtin_shuffle 
(__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_p8 (poly8_t *__a, poly8x16_t __b, const int __lane) +vzip2q_p16 (poly16x8_t __a, poly16x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_p16 (poly16_t *__a, poly16x8_t __b, const int __lane) +vzip2q_s8 (int8x16_t __a, int8x16_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_p64 (poly64_t *__a, poly64x2_t __b, const int __lane) +vzip2q_s16 (int16x8_t __a, int16x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_s8 (int8_t *__a, int8x16_t __b, const int __lane) +vzip2q_s32 (int32x4_t __a, int32x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_s16 (int16_t *__a, int16x8_t __b, const int __lane) +vzip2q_s64 (int64x2_t __a, int64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_s32 (int32_t *__a, int32x4_t __b, const int __lane) +vzip2q_u8 (uint8x16_t __a, uint8x16_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint8x16_t) + {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); +#else + return __builtin_shuffle (__a, __b, (uint8x16_t) + {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_s64 (int64_t *__a, int64x2_t __b, const int __lane) +vzip2q_u16 (uint16x8_t __a, uint16x8_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, 
__b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); +#else + return __builtin_shuffle (__a, __b, (uint16x8_t) + {4, 12, 5, 13, 6, 14, 7, 15}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_u8 (uint8_t *__a, uint8x16_t __b, const int __lane) +vzip2q_u32 (uint32x4_t __a, uint32x4_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); +#else + return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_u16 (uint16_t *__a, uint16x8_t __b, const int __lane) +vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_u32 (uint32_t *__a, uint32x4_t __b, const int __lane) +vzip2q_p64 (poly64x2_t __a, poly64x2_t __b) { - *__a = __aarch64_vget_lane_any (__b, __lane); +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif } -__extension__ extern __inline void +__INTERLEAVE_LIST (zip) + +#undef __INTERLEAVE_LIST +#undef __DEFINTERLEAVE + +/* End of optimal implementations in approved order. */ + +#pragma GCC pop_options + +/* ARMv8.2-A FP16 intrinsics. */ + +#include "arm_fp16.h" + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+fp16") + +/* ARMv8.2-A FP16 one operand vector intrinsics. 
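   These are guarded by the "arch=armv8.2-a+fp16" target pragma above
   and operate lane-wise on float16x4_t/float16x8_t.  A usage sketch
   (illustration only, not part of the upstream header):

     float16x4_t x = vdup_n_f16 (-1.5);
     float16x4_t y = vabs_f16 (x);      // y = {1.5, 1.5, 1.5, 1.5}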
*/ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_lane_u64 (uint64_t *__a, uint64x2_t __b, const int __lane) +vabs_f16 (float16x4_t __a) { - *__a = __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_absv4hf (__a); } -/* vstn */ - -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s64 (int64_t * __a, int64x1x2_t val) +vabsq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - int64x2x2_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_absv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u64 (uint64_t * __a, uint64x1x2_t val) +vceqz_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - uint64x2x2_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_cmeqv4hf_uss (__a, vdup_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_f64 (float64_t * __a, float64x1x2_t val) +vceqzq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - float64x2x2_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) temp.val[1], 1); - __builtin_aarch64_st2df ((__builtin_aarch64_simd_df *) __a, __o); + return __builtin_aarch64_cmeqv8hf_uss (__a, vdupq_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s8 (int8_t * __a, int8x8x2_t val) +vcgez_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - int8x16x2_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_cmgev4hf_uss (__a, vdup_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_p8 (poly8_t * __a, poly8x8x2_t val) +vcgezq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - poly8x16x2_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - 
temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_cmgev8hf_uss (__a, vdupq_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s16 (int16_t * __a, int16x4x2_t val) +vcgtz_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - int16x8x2_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_cmgtv4hf_uss (__a, vdup_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_p16 (poly16_t * __a, poly16x4x2_t val) +vcgtzq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - poly16x8x2_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_cmgtv8hf_uss (__a, vdupq_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_s32 (int32_t * __a, int32x2x2_t val) +vclez_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - int32x4x2_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_cmlev4hf_uss (__a, vdup_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u8 (uint8_t * __a, uint8x8x2_t val) +vclezq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - uint8x16x2_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) temp.val[1], 1); - __builtin_aarch64_st2v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_cmlev8hf_uss (__a, vdupq_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u16 (uint16_t * __a, uint16x4x2_t val) +vcltz_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - uint16x8x2_t temp; - 
temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) temp.val[1], 1); - __builtin_aarch64_st2v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_cmltv4hf_uss (__a, vdup_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_u32 (uint32_t * __a, uint32x2x2_t val) +vcltzq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - uint32x4x2_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_cmltv8hf_uss (__a, vdupq_n_f16 (0.0f)); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_f16 (float16_t * __a, float16x4x2_t val) +vcvt_f16_s16 (int16x4_t __a) { - __builtin_aarch64_simd_oi __o; - float16x8x2_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1); - __builtin_aarch64_st2v4hf (__a, __o); + return __builtin_aarch64_floatv4hiv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_f32 (float32_t * __a, float32x2x2_t val) +vcvtq_f16_s16 (int16x8_t __a) { - __builtin_aarch64_simd_oi __o; - float32x4x2_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) temp.val[1], 1); - __builtin_aarch64_st2v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + return __builtin_aarch64_floatv8hiv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2_p64 (poly64_t * __a, poly64x1x2_t val) +vcvt_f16_u16 (uint16x4_t __a) { - __builtin_aarch64_simd_oi __o; - poly64x2x2_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); - __builtin_aarch64_st2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_floatunsv4hiv4hf ((int16x4_t) __a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s8 (int8_t * __a, int8x16x2_t val) +vcvtq_f16_u16 (uint16x8_t __a) { - __builtin_aarch64_simd_oi 
__o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_floatunsv8hiv8hf ((int16x8_t) __a); } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_p8 (poly8_t * __a, poly8x16x2_t val) +vcvt_s16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_lbtruncv4hfv4hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s16 (int16_t * __a, int16x8x2_t val) +vcvtq_s16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_lbtruncv8hfv8hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_p16 (poly16_t * __a, poly16x8x2_t val) +vcvt_u16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_lbtruncuv4hfv4hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s32 (int32_t * __a, int32x4x2_t val) +vcvtq_u16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_lbtruncuv8hfv8hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_s64 (int64_t * __a, int64x2x2_t val) +vcvta_s16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_lroundv4hfv4hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u8 (uint8_t * __a, uint8x16x2_t val) +vcvtaq_s16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1); - __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_lroundv8hfv8hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline 
uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u16 (uint16_t * __a, uint16x8x2_t val) +vcvta_u16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1); - __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_lrounduv4hfv4hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u32 (uint32_t * __a, uint32x4x2_t val) +vcvtaq_u16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_lrounduv8hfv8hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_u64 (uint64_t * __a, uint64x2x2_t val) +vcvtm_s16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_lfloorv4hfv4hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_f16 (float16_t * __a, float16x8x2_t val) +vcvtmq_s16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1); - __builtin_aarch64_st2v8hf (__a, __o); + return __builtin_aarch64_lfloorv8hfv8hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_f32 (float32_t * __a, float32x4x2_t val) +vcvtm_u16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1); - __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + return __builtin_aarch64_lflooruv4hfv4hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_f64 (float64_t * __a, float64x2x2_t val) +vcvtmq_u16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o); + return __builtin_aarch64_lflooruv8hfv8hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst2q_p64 (poly64_t * __a, poly64x2x2_t val) +vcvtn_s16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); - __builtin_aarch64_st2v2di 
((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_lfrintnv4hfv4hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s64 (int64_t * __a, int64x1x3_t val) +vcvtnq_s16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - int64x2x3_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_lfrintnv8hfv8hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u64 (uint64_t * __a, uint64x1x3_t val) +vcvtn_u16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - uint64x2x3_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_lfrintnuv4hfv4hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_f64 (float64_t * __a, float64x1x3_t val) +vcvtnq_u16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - float64x2x3_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) temp.val[2], 2); - __builtin_aarch64_st3df ((__builtin_aarch64_simd_df *) __a, __o); + return __builtin_aarch64_lfrintnuv8hfv8hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s8 (int8_t * __a, int8x8x3_t val) +vcvtp_s16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - int8x16x3_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return 
__builtin_aarch64_lceilv4hfv4hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_p8 (poly8_t * __a, poly8x8x3_t val) +vcvtpq_s16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - poly8x16x3_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_lceilv8hfv8hi (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s16 (int16_t * __a, int16x4x3_t val) +vcvtp_u16_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - int16x8x3_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_lceiluv4hfv4hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_p16 (poly16_t * __a, poly16x4x3_t val) +vcvtpq_u16_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - poly16x8x3_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_lceiluv8hfv8hi_us (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_s32 (int32_t * __a, int32x2x3_t val) +vneg_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - int32x4x3_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); + return -__a; } -__extension__ extern __inline void +__extension__ extern __inline 
float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u8 (uint8_t * __a, uint8x8x3_t val) +vnegq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - uint8x16x3_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) temp.val[2], 2); - __builtin_aarch64_st3v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return -__a; } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u16 (uint16_t * __a, uint16x4x3_t val) +vrecpe_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - uint16x8x3_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_frecpev4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_u32 (uint32_t * __a, uint32x2x3_t val) +vrecpeq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - uint32x4x3_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_frecpev8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_f16 (float16_t * __a, float16x4x3_t val) +vrnd_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - float16x8x3_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2); - __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o); + return __builtin_aarch64_btruncv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_f32 (float32_t * __a, 
float32x2x3_t val) +vrndq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - float32x4x3_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) temp.val[2], 2); - __builtin_aarch64_st3v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + return __builtin_aarch64_btruncv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3_p64 (poly64_t * __a, poly64x1x3_t val) +vrnda_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - poly64x2x3_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) temp.val[2], 2); - __builtin_aarch64_st3di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_roundv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s8 (int8_t * __a, int8x16x3_t val) +vrndaq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); - __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_roundv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_p8 (poly8_t * __a, poly8x16x3_t val) +vrndi_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); - __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_nearbyintv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s16 (int16_t * __a, int16x8x3_t val) +vrndiq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_nearbyintv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_p16 
(poly16_t * __a, poly16x8x3_t val) +vrndm_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_floorv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s32 (int32_t * __a, int32x4x3_t val) +vrndmq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); - __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_floorv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_s64 (int64_t * __a, int64x2x3_t val) +vrndn_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); - __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_frintnv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_u8 (uint8_t * __a, uint8x16x3_t val) +vrndnq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2); - __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_frintnv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_u16 (uint16_t * __a, uint16x8x3_t val) +vrndp_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_ceilv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_u32 (uint32_t * __a, uint32x4x3_t val) +vrndpq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2); - __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_ceilv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vst3q_u64 (uint64_t * __a, uint64x2x3_t val) +vrndx_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2); - __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_rintv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_f16 (float16_t * __a, float16x8x3_t val) +vrndxq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2); - __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o); + return __builtin_aarch64_rintv8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_f32 (float32_t * __a, float32x4x3_t val) +vrsqrte_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2); - __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + return __builtin_aarch64_rsqrtev4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_f64 (float64_t * __a, float64x2x3_t val) +vrsqrteq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2); - __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o); + return __builtin_aarch64_rsqrtev8hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst3q_p64 (poly64_t * __a, poly64x2x3_t val) +vsqrt_f16 (float16x4_t __a) { - __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregciv2di_ssps (__o, - (poly64x2_t) val.val[2], 2); - __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_sqrtv4hf (__a); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s64 (int64_t * __a, int64x1x4_t val) +vsqrtq_f16 (float16x8_t __a) { - __builtin_aarch64_simd_xi __o; - int64x2x4_t temp; - temp.val[0] = vcombine_s64 (val.val[0], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s64 (val.val[1], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s64 (val.val[2], vcreate_s64 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s64 (val.val[3], vcreate_s64 (__AARCH64_INT64_C (0))); - __o = 
__builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); - __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_sqrtv8hf (__a); } -__extension__ extern __inline void +/* ARMv8.2-A FP16 two operands vector intrinsics. */ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u64 (uint64_t * __a, uint64x1x4_t val) +vadd_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - uint64x2x4_t temp; - temp.val[0] = vcombine_u64 (val.val[0], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u64 (val.val[1], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u64 (val.val[2], vcreate_u64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u64 (val.val[3], vcreate_u64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) temp.val[3], 3); - __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); + return __a + __b; } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_f64 (float64_t * __a, float64x1x4_t val) +vaddq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - float64x2x4_t temp; - temp.val[0] = vcombine_f64 (val.val[0], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f64 (val.val[1], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f64 (val.val[2], vcreate_f64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f64 (val.val[3], vcreate_f64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) temp.val[3], 3); - __builtin_aarch64_st4df ((__builtin_aarch64_simd_df *) __a, __o); + return __a + __b; } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s8 (int8_t * __a, int8x8x4_t val) +vabd_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - int8x16x4_t temp; - temp.val[0] = vcombine_s8 (val.val[0], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s8 (val.val[1], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s8 (val.val[2], vcreate_s8 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s8 (val.val[3], vcreate_s8 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); - __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_fabdv4hf (__a, __b); } -__extension__ extern 
__inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_p8 (poly8_t * __a, poly8x8x4_t val) +vabdq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - poly8x16x4_t temp; - temp.val[0] = vcombine_p8 (val.val[0], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p8 (val.val[1], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p8 (val.val[2], vcreate_p8 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p8 (val.val[3], vcreate_p8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); - __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_fabdv8hf (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s16 (int16_t * __a, int16x4x4_t val) +vcage_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - int16x8x4_t temp; - temp.val[0] = vcombine_s16 (val.val[0], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s16 (val.val[1], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[2] = vcombine_s16 (val.val[2], vcreate_s16 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s16 (val.val[3], vcreate_s16 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_facgev4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_p16 (poly16_t * __a, poly16x4x4_t val) +vcageq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - poly16x8x4_t temp; - temp.val[0] = vcombine_p16 (val.val[0], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p16 (val.val[1], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p16 (val.val[2], vcreate_p16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p16 (val.val[3], vcreate_p16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_facgev8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_s32 (int32_t * __a, int32x2x4_t val) +vcagt_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - int32x4x4_t temp; - temp.val[0] = vcombine_s32 (val.val[0], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[1] = vcombine_s32 (val.val[1], vcreate_s32 (__AARCH64_INT64_C (0))); - 
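/* Another illustrative sketch under the same armv8.2-a+fp16 assumption:
   the vcage/vcagt family defined here compares absolute values, so
   operands that differ only in sign still satisfy the test:

     uint16x4_t m = vcage_f16 (vdup_n_f16 (-1.0f), vdup_n_f16 (1.0f));
     // |-1.0| >= |1.0| holds, so every lane of m is 0xffff

   whereas plain vcge_f16 on the same inputs would yield all-zero
   lanes.  */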
temp.val[2] = vcombine_s32 (val.val[2], vcreate_s32 (__AARCH64_INT64_C (0))); - temp.val[3] = vcombine_s32 (val.val[3], vcreate_s32 (__AARCH64_INT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); - __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_facgtv4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u8 (uint8_t * __a, uint8x8x4_t val) +vcagtq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - uint8x16x4_t temp; - temp.val[0] = vcombine_u8 (val.val[0], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u8 (val.val[1], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u8 (val.val[2], vcreate_u8 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u8 (val.val[3], vcreate_u8 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) temp.val[3], 3); - __builtin_aarch64_st4v8qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_facgtv8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u16 (uint16_t * __a, uint16x4x4_t val) +vcale_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - uint16x8x4_t temp; - temp.val[0] = vcombine_u16 (val.val[0], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u16 (val.val[1], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u16 (val.val[2], vcreate_u16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u16 (val.val[3], vcreate_u16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_faclev4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_u32 (uint32_t * __a, uint32x2x4_t val) +vcaleq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - uint32x4x4_t temp; - temp.val[0] = vcombine_u32 (val.val[0], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_u32 (val.val[1], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_u32 (val.val[2], vcreate_u32 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_u32 (val.val[3], vcreate_u32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[2], 2); - __o = 
__builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) temp.val[3], 3); - __builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_faclev8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_f16 (float16_t * __a, float16x4x4_t val) +vcalt_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - float16x8x4_t temp; - temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3); - __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o); + return __builtin_aarch64_facltv4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_f32 (float32_t * __a, float32x2x4_t val) +vcaltq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - float32x4x4_t temp; - temp.val[0] = vcombine_f32 (val.val[0], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_f32 (val.val[1], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_f32 (val.val[2], vcreate_f32 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_f32 (val.val[3], vcreate_f32 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) temp.val[3], 3); - __builtin_aarch64_st4v2sf ((__builtin_aarch64_simd_sf *) __a, __o); + return __builtin_aarch64_facltv8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4_p64 (poly64_t * __a, poly64x1x4_t val) +vceq_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - poly64x2x4_t temp; - temp.val[0] = vcombine_p64 (val.val[0], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[1] = vcombine_p64 (val.val[1], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[2] = vcombine_p64 (val.val[2], vcreate_p64 (__AARCH64_UINT64_C (0))); - temp.val[3] = vcombine_p64 (val.val[3], vcreate_p64 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) temp.val[3], 3); - __builtin_aarch64_st4di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_cmeqv4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s8 (int8_t * __a, 
int8x16x4_t val) +vceqq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); - __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_cmeqv8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_p8 (poly8_t * __a, poly8x16x4_t val) +vcge_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); - __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_cmgev4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s16 (int16_t * __a, int16x8x4_t val) +vcgeq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_cmgev8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_p16 (poly16_t * __a, poly16x8x4_t val) +vcgt_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_cmgtv4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s32 (int32_t * __a, int32x4x4_t val) +vcgtq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); - __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_cmgtv8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_s64 (int64_t * __a, int64x2x4_t val) +vcle_f16 (float16x4_t __a, float16x4_t __b) { - 
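/* Usage note for the fixed-point conversions that follow (a sketch, not
   normative): the second operand of the vcvt_n_* intrinsics is the
   number of fractional bits and must be a constant expression in the
   range 1..16, e.g.

     int16x4_t   raw = vdup_n_s16 (256);
     float16x4_t f   = vcvt_n_f16_s16 (raw, 8);   // 256 / 2^8 == 1.0  */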
__builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_cmlev4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u8 (uint8_t * __a, uint8x16x4_t val) +vcleq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3); - __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o); + return __builtin_aarch64_cmlev8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u16 (uint16_t * __a, uint16x8x4_t val) +vclt_f16 (float16x4_t __a, float16x4_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o); + return __builtin_aarch64_cmltv4hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u32 (uint32_t * __a, uint32x4x4_t val) +vcltq_f16 (float16x8_t __a, float16x8_t __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3); - __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o); + return __builtin_aarch64_cmltv8hf_uss (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_u64 (uint64_t * __a, uint64x2x4_t val) +vcvt_n_f16_s16 (int16x4_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_scvtfv4hi (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_f16 (float16_t * __a, float16x8x4_t val) +vcvtq_n_f16_s16 (int16x8_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = 
__builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3); - __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o); + return __builtin_aarch64_scvtfv8hi (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_f32 (float32_t * __a, float32x4x4_t val) +vcvt_n_f16_u16 (uint16x4_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3); - __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o); + return __builtin_aarch64_ucvtfv4hi_sus (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_f64 (float64_t * __a, float64x2x4_t val) +vcvtq_n_f16_u16 (uint16x8_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o); + return __builtin_aarch64_ucvtfv8hi_sus (__a, __b); } -__extension__ extern __inline void +__extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst4q_p64 (poly64_t * __a, poly64x2x4_t val) +vcvt_n_s16_f16 (float16x4_t __a, const int __b) { - __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv2di_ssps (__o, - (poly64x2_t) val.val[3], 3); - __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); + return __builtin_aarch64_fcvtzsv4hf (__a, __b); } -/* vsub */ - -__extension__ extern __inline int64_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsubd_s64 (int64_t __a, int64_t __b) +vcvtq_n_s16_f16 (float16x8_t __a, const int __b) { - return __a - __b; + return __builtin_aarch64_fcvtzsv8hf (__a, __b); } -__extension__ extern __inline uint64_t +__extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsubd_u64 (uint64_t __a, uint64_t __b) +vcvt_n_u16_f16 (float16x4_t __a, const int __b) { - return __a - __b; + return __builtin_aarch64_fcvtzuv4hf_uss (__a, __b); } -/* vtbx1 */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx1_s8 (int8x8_t __r, int8x8_t __tab, int8x8_t __idx) +vcvtq_n_u16_f16 (float16x8_t __a, const int __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - 
vmov_n_u8 (8)); - int8x8_t __tbl = vtbl1_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); + return __builtin_aarch64_fcvtzuv8hf_uss (__a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx1_u8 (uint8x8_t __r, uint8x8_t __tab, uint8x8_t __idx) +vdiv_f16 (float16x4_t __a, float16x4_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - uint8x8_t __tbl = vtbl1_u8 (__tab, __idx); - - return vbsl_u8 (__mask, __tbl, __r); + return __a / __b; } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx1_p8 (poly8x8_t __r, poly8x8_t __tab, uint8x8_t __idx) +vdivq_f16 (float16x8_t __a, float16x8_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (8)); - poly8x8_t __tbl = vtbl1_p8 (__tab, __idx); - - return vbsl_p8 (__mask, __tbl, __r); + return __a / __b; } -/* vtbx3 */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx3_s8 (int8x8_t __r, int8x8x3_t __tab, int8x8_t __idx) +vmax_f16 (float16x4_t __a, float16x4_t __b) { - uint8x8_t __mask = vclt_u8 (vreinterpret_u8_s8 (__idx), - vmov_n_u8 (24)); - int8x8_t __tbl = vtbl3_s8 (__tab, __idx); - - return vbsl_s8 (__mask, __tbl, __r); + return __builtin_aarch64_smax_nanv4hf (__a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx3_u8 (uint8x8_t __r, uint8x8x3_t __tab, uint8x8_t __idx) +vmaxq_f16 (float16x8_t __a, float16x8_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - uint8x8_t __tbl = vtbl3_u8 (__tab, __idx); - - return vbsl_u8 (__mask, __tbl, __r); + return __builtin_aarch64_smax_nanv8hf (__a, __b); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx3_p8 (poly8x8_t __r, poly8x8x3_t __tab, uint8x8_t __idx) +vmaxnm_f16 (float16x4_t __a, float16x4_t __b) { - uint8x8_t __mask = vclt_u8 (__idx, vmov_n_u8 (24)); - poly8x8_t __tbl = vtbl3_p8 (__tab, __idx); - - return vbsl_p8 (__mask, __tbl, __r); + return __builtin_aarch64_fmaxv4hf (__a, __b); } -/* vtbx4 */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx4_s8 (int8x8_t __r, int8x8x4_t __tab, int8x8_t __idx) +vmaxnmq_f16 (float16x8_t __a, float16x8_t __b) { - int8x8_t result; - int8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_s8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_s8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = __builtin_aarch64_tbx4v8qi (__r, __o, __idx); - return result; + return __builtin_aarch64_fmaxv8hf (__a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx4_u8 (uint8x8_t __r, uint8x8x4_t __tab, uint8x8_t __idx) +vmin_f16 (float16x4_t __a, float16x4_t __b) { - uint8x8_t result; - uint8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_u8 (__tab.val[0], __tab.val[1]); - temp.val[1] = 
vcombine_u8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (uint8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); - return result; + return __builtin_aarch64_smin_nanv4hf (__a, __b); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx) +vminq_f16 (float16x8_t __a, float16x8_t __b) { - poly8x8_t result; - poly8x16x2_t temp; - __builtin_aarch64_simd_oi __o; - temp.val[0] = vcombine_p8 (__tab.val[0], __tab.val[1]); - temp.val[1] = vcombine_p8 (__tab.val[2], __tab.val[3]); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv16qi (__o, - (int8x16_t) temp.val[1], 1); - result = (poly8x8_t)__builtin_aarch64_tbx4v8qi ((int8x8_t)__r, __o, - (int8x8_t)__idx); - return result; + return __builtin_aarch64_smin_nanv8hf (__a, __b); } -/* vtrn */ - __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_f16 (float16x4_t __a, float16x4_t __b) +vminnm_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); -#endif + return __builtin_aarch64_fminv4hf (__a, __b); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_f32 (float32x2_t __a, float32x2_t __b) +vminnmq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); -#endif + return __builtin_aarch64_fminv8hf (__a, __b); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_p8 (poly8x8_t __a, poly8x8_t __b) +vmul_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); -#endif + return __a * __b; } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_p16 (poly16x4_t __a, poly16x4_t __b) +vmulq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); -#endif + return __a * __b; } -__extension__ extern __inline int8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_s8 (int8x8_t __a, int8x8_t __b) +vmulx_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); -#endif + return __builtin_aarch64_fmulxv4hf (__a, __b); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vtrn1_s16 (int16x4_t __a, int16x4_t __b) +vmulxq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); -#endif + return __builtin_aarch64_fmulxv8hf (__a, __b); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_s32 (int32x2_t __a, int32x2_t __b) +vpadd_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); -#endif + return __builtin_aarch64_faddpv4hf (__a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_u8 (uint8x8_t __a, uint8x8_t __b) +vpaddq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); -#endif + return __builtin_aarch64_faddpv8hf (__a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_u16 (uint16x4_t __a, uint16x4_t __b) +vpmax_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6}); -#endif + return __builtin_aarch64_smax_nanpv4hf (__a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1_u32 (uint32x2_t __a, uint32x2_t __b) +vpmaxq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); -#endif + return __builtin_aarch64_smax_nanpv8hf (__a, __b); } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_f16 (float16x8_t __a, float16x8_t __b) -{ -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); -#endif +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vpmaxnm_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_aarch64_smaxpv4hf (__a, __b); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_f32 (float32x4_t __a, float32x4_t __b) +vpmaxnmq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); -#endif + return __builtin_aarch64_smaxpv8hf (__a, __b); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_f64 (float64x2_t __a, float64x2_t __b) +vpmin_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, 
(uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_smin_nanpv4hf (__a, __b); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_p8 (poly8x16_t __a, poly8x16_t __b) +vpminq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); -#endif + return __builtin_aarch64_smin_nanpv8hf (__a, __b); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_p16 (poly16x8_t __a, poly16x8_t __b) +vpminnm_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); -#endif + return __builtin_aarch64_sminpv4hf (__a, __b); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_s8 (int8x16_t __a, int8x16_t __b) +vpminnmq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 27, 11, 29, 13, 31, 15}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); -#endif + return __builtin_aarch64_sminpv8hf (__a, __b); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_s16 (int16x8_t __a, int16x8_t __b) +vrecps_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); -#endif + return __builtin_aarch64_frecpsv4hf (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_s32 (int32x4_t __a, int32x4_t __b) +vrecpsq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); -#endif + return __builtin_aarch64_frecpsv8hf (__a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_s64 (int64x2_t __a, int64x2_t __b) +vrsqrts_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_rsqrtsv4hf (__a, __b); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_u8 (uint8x16_t __a, uint8x16_t __b) +vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {17, 1, 19, 3, 21, 5, 23, 7, 25, 9, 
27, 11, 29, 13, 31, 15}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30}); -#endif + return __builtin_aarch64_rsqrtsv8hf (__a, __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_u16 (uint16x8_t __a, uint16x8_t __b) +vsub_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14}); -#endif + return __a - __b; } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b) +vsubq_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 1, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 2, 6}); -#endif + return __a - __b; } -__extension__ extern __inline uint64x2_t +/* ARMv8.2-A FP16 three operands vector intrinsics. */ + +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b) +vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_fmav4hf (__b, __c, __a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_f16 (float16x4_t __a, float16x4_t __b) +vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); -#endif + return __builtin_aarch64_fmav8hf (__b, __c, __a); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_f32 (float32x2_t __a, float32x2_t __b) +vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return __builtin_aarch64_fnmav4hf (__b, __c, __a); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_p8 (poly8x8_t __a, poly8x8_t __b) +vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); -#endif + return __builtin_aarch64_fnmav8hf (__b, __c, __a); } -__extension__ extern __inline poly16x4_t +/* ARMv8.2-A FP16 lane vector intrinsics. 
*/ + +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_p16 (poly16x4_t __a, poly16x4_t __b) +vfmah_lane_f16 (float16_t __a, float16_t __b, + float16x4_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); -#endif + return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_s8 (int8x8_t __a, int8x8_t __b) +vfmah_laneq_f16 (float16_t __a, float16_t __b, + float16x8_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); -#endif + return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_s16 (int16x4_t __a, int16x4_t __b) +vfma_lane_f16 (float16x4_t __a, float16x4_t __b, + float16x4_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); -#endif + return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane)); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_s32 (int32x2_t __a, int32x2_t __b) +vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b, + float16x4_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane)); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_u8 (uint8x8_t __a, uint8x8_t __b) +vfma_laneq_f16 (float16x4_t __a, float16x4_t __b, + float16x8_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); -#endif + return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane)); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_u16 (uint16x4_t __a, uint16x4_t __b) +vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b, + float16x8_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7}); -#endif + return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane)); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2_u32 (uint32x2_t __a, uint32x2_t __b) +vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 
3}); -#endif + return vfma_f16 (__a, __b, vdup_n_f16 (__c)); } __extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_f16 (float16x8_t __a, float16x8_t __b) +vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); -#endif + return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c)); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_f32 (float32x4_t __a, float32x4_t __b) +vfmsh_lane_f16 (float16_t __a, float16_t __b, + float16x4_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); -#endif + return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_f64 (float64x2_t __a, float64x2_t __b) +vfmsh_laneq_f16 (float16_t __a, float16_t __b, + float16x8_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_p8 (poly8x16_t __a, poly8x16_t __b) +vfms_lane_f16 (float16x4_t __a, float16x4_t __b, + float16x4_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); -#endif + return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane)); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_p16 (poly16x8_t __a, poly16x8_t __b) +vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b, + float16x4_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); -#endif + return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane)); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_s8 (int8x16_t __a, int8x16_t __b) +vfms_laneq_f16 (float16x4_t __a, float16x4_t __b, + float16x8_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); -#endif + return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane)); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_s16 
(int16x8_t __a, int16x8_t __b) +vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b, + float16x8_t __c, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); -#endif + return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane)); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_s32 (int32x4_t __a, int32x4_t __b) +vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); -#endif + return vfms_f16 (__a, __b, vdup_n_f16 (__c)); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_s64 (int64x2_t __a, int64x2_t __b) +vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c)); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_u8 (uint8x16_t __a, uint8x16_t __b) +vmulh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {16, 0, 18, 2, 20, 4, 22, 6, 24, 8, 26, 10, 28, 12, 30, 14}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31}); -#endif + return __a * __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_u16 (uint16x8_t __a, uint16x8_t __b) +vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15}); -#endif + return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane))); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_u32 (uint32x4_t __a, uint32x4_t __b) +vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 6, 2}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 5, 3, 7}); -#endif + return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane))); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b) +vmulh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __a * __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline float16x4x2_t +__extension__ extern __inline 
float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_f16 (float16x4_t __a, float16x4_t __b) +vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane) { - return (float16x4x2_t) {vtrn1_f16 (__a, __b), vtrn2_f16 (__a, __b)}; + return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane))); } -__extension__ extern __inline float32x2x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_f32 (float32x2_t a, float32x2_t b) +vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane) { - return (float32x2x2_t) {vtrn1_f32 (a, b), vtrn2_f32 (a, b)}; + return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane))); } -__extension__ extern __inline poly8x8x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_p8 (poly8x8_t a, poly8x8_t b) +vmul_n_f16 (float16x4_t __a, float16_t __b) { - return (poly8x8x2_t) {vtrn1_p8 (a, b), vtrn2_p8 (a, b)}; + return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0); } -__extension__ extern __inline poly16x4x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_p16 (poly16x4_t a, poly16x4_t b) +vmulq_n_f16 (float16x8_t __a, float16_t __b) { - return (poly16x4x2_t) {vtrn1_p16 (a, b), vtrn2_p16 (a, b)}; + return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0); } -__extension__ extern __inline int8x8x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_s8 (int8x8_t a, int8x8_t b) +vmulxh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane) { - return (int8x8x2_t) {vtrn1_s8 (a, b), vtrn2_s8 (a, b)}; + return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane)); } -__extension__ extern __inline int16x4x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_s16 (int16x4_t a, int16x4_t b) +vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane) { - return (int16x4x2_t) {vtrn1_s16 (a, b), vtrn2_s16 (a, b)}; + return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane)); } -__extension__ extern __inline int32x2x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_s32 (int32x2_t a, int32x2_t b) +vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane) { - return (int32x2x2_t) {vtrn1_s32 (a, b), vtrn2_s32 (a, b)}; + return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane)); } -__extension__ extern __inline uint8x8x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_u8 (uint8x8_t a, uint8x8_t b) +vmulxh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane) { - return (uint8x8x2_t) {vtrn1_u8 (a, b), vtrn2_u8 (a, b)}; + return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane)); } -__extension__ extern __inline uint16x4x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrn_u16 (uint16x4_t a, uint16x4_t b) +vmulx_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane) { - return (uint16x4x2_t) {vtrn1_u16 (a, b), vtrn2_u16 (a, b)}; + return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane)); } -__extension__ extern __inline uint32x2x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vtrn_u32 (uint32x2_t a, uint32x2_t b) +vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane) { - return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)}; + return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane)); } -__extension__ extern __inline float16x8x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_f16 (float16x8_t __a, float16x8_t __b) +vmulx_n_f16 (float16x4_t __a, float16_t __b) { - return (float16x8x2_t) {vtrn1q_f16 (__a, __b), vtrn2q_f16 (__a, __b)}; + return vmulx_f16 (__a, vdup_n_f16 (__b)); } -__extension__ extern __inline float32x4x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_f32 (float32x4_t a, float32x4_t b) +vmulxq_n_f16 (float16x8_t __a, float16_t __b) { - return (float32x4x2_t) {vtrn1q_f32 (a, b), vtrn2q_f32 (a, b)}; + return vmulxq_f16 (__a, vdupq_n_f16 (__b)); } -__extension__ extern __inline poly8x16x2_t +/* ARMv8.2-A FP16 reduction vector intrinsics. */ + +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_p8 (poly8x16_t a, poly8x16_t b) +vmaxv_f16 (float16x4_t __a) { - return (poly8x16x2_t) {vtrn1q_p8 (a, b), vtrn2q_p8 (a, b)}; + return __builtin_aarch64_reduc_smax_nan_scal_v4hf (__a); } -__extension__ extern __inline poly16x8x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_p16 (poly16x8_t a, poly16x8_t b) +vmaxvq_f16 (float16x8_t __a) { - return (poly16x8x2_t) {vtrn1q_p16 (a, b), vtrn2q_p16 (a, b)}; + return __builtin_aarch64_reduc_smax_nan_scal_v8hf (__a); } -__extension__ extern __inline int8x16x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_s8 (int8x16_t a, int8x16_t b) +vminv_f16 (float16x4_t __a) { - return (int8x16x2_t) {vtrn1q_s8 (a, b), vtrn2q_s8 (a, b)}; + return __builtin_aarch64_reduc_smin_nan_scal_v4hf (__a); } -__extension__ extern __inline int16x8x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_s16 (int16x8_t a, int16x8_t b) +vminvq_f16 (float16x8_t __a) { - return (int16x8x2_t) {vtrn1q_s16 (a, b), vtrn2q_s16 (a, b)}; + return __builtin_aarch64_reduc_smin_nan_scal_v8hf (__a); } -__extension__ extern __inline int32x4x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_s32 (int32x4_t a, int32x4_t b) +vmaxnmv_f16 (float16x4_t __a) { - return (int32x4x2_t) {vtrn1q_s32 (a, b), vtrn2q_s32 (a, b)}; + return __builtin_aarch64_reduc_smax_scal_v4hf (__a); } -__extension__ extern __inline uint8x16x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_u8 (uint8x16_t a, uint8x16_t b) +vmaxnmvq_f16 (float16x8_t __a) { - return (uint8x16x2_t) {vtrn1q_u8 (a, b), vtrn2q_u8 (a, b)}; + return __builtin_aarch64_reduc_smax_scal_v8hf (__a); } -__extension__ extern __inline uint16x8x2_t +__extension__ extern __inline float16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_u16 (uint16x8_t a, uint16x8_t b) +vminnmv_f16 (float16x4_t __a) { - return (uint16x8x2_t) {vtrn1q_u16 (a, b), vtrn2q_u16 (a, b)}; + return __builtin_aarch64_reduc_smin_scal_v4hf (__a); } -__extension__ extern __inline uint32x4x2_t +__extension__ extern __inline float16_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtrnq_u32 (uint32x4_t a, uint32x4_t b) +vminnmvq_f16 (float16x8_t __a) { - return (uint32x4x2_t) {vtrn1q_u32 (a, b), vtrn2q_u32 (a, b)}; + return __builtin_aarch64_reduc_smin_scal_v8hf (__a); } -/* vtst */ +#pragma GCC pop_options -__extension__ extern __inline uint8x8_t +/* AdvSIMD Dot Product intrinsics. */ + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+dotprod") + +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_s8 (int8x8_t __a, int8x8_t __b) +vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) { - return (uint8x8_t) ((__a & __b) != 0); + return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_s16 (int16x4_t __a, int16x4_t __b) +vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return (uint16x4_t) ((__a & __b) != 0); + return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_s32 (int32x2_t __a, int32x2_t __b) +vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) { - return (uint32x2_t) ((__a & __b) != 0); + return __builtin_aarch64_sdotv8qi (__r, __a, __b); } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_s64 (int64x1_t __a, int64x1_t __b) +vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return (uint64x1_t) ((__a & __b) != __AARCH64_INT64_C (0)); + return __builtin_aarch64_sdotv16qi (__r, __a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_u8 (uint8x8_t __a, uint8x8_t __b) +vdot_lane_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b, const int __index) { - return ((__a & __b) != 0); + return __builtin_aarch64_udot_lanev8qi_uuuus (__r, __a, __b, __index); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_u16 (uint16x4_t __a, uint16x4_t __b) +vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, + const int __index) { - return ((__a & __b) != 0); + return __builtin_aarch64_udot_laneqv8qi_uuuus (__r, __a, __b, __index); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_u32 (uint32x2_t __a, uint32x2_t __b) +vdotq_lane_u32 (uint32x4_t __r, uint8x16_t __a, uint8x8_t __b, + const int __index) { - return ((__a & __b) != 0); + return __builtin_aarch64_udot_lanev16qi_uuuus (__r, __a, __b, __index); } -__extension__ extern __inline uint64x1_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtst_u64 (uint64x1_t __a, uint64x1_t __b) +vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b, + const int __index) { - return ((__a & __b) != __AARCH64_UINT64_C (0)); + return __builtin_aarch64_udot_laneqv16qi_uuuus (__r, __a, __b, __index); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
-vtstq_s8 (int8x16_t __a, int8x16_t __b) +vdot_lane_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b, const int __index) { - return (uint8x16_t) ((__a & __b) != 0); + return __builtin_aarch64_sdot_lanev8qi (__r, __a, __b, __index); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_s16 (int16x8_t __a, int16x8_t __b) +vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index) { - return (uint16x8_t) ((__a & __b) != 0); + return __builtin_aarch64_sdot_laneqv8qi (__r, __a, __b, __index); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_s32 (int32x4_t __a, int32x4_t __b) +vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index) { - return (uint32x4_t) ((__a & __b) != 0); + return __builtin_aarch64_sdot_lanev16qi (__r, __a, __b, __index); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_s64 (int64x2_t __a, int64x2_t __b) +vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) { - return (uint64x2_t) ((__a & __b) != __AARCH64_INT64_C (0)); + return __builtin_aarch64_sdot_laneqv16qi (__r, __a, __b, __index); } +#pragma GCC pop_options -__extension__ extern __inline uint8x16_t +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+sm4") + +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_u8 (uint8x16_t __a, uint8x16_t __b) +vsm3ss1q_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - return ((__a & __b) != 0); + return __builtin_aarch64_sm3ss1qv4si_uuuu (__a, __b, __c); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_u16 (uint16x8_t __a, uint16x8_t __b) +vsm3tt1aq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __imm2) { - return ((__a & __b) != 0); + return __builtin_aarch64_sm3tt1aqv4si_uuuus (__a, __b, __c, __imm2); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_u32 (uint32x4_t __a, uint32x4_t __b) +vsm3tt1bq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __imm2) { - return ((__a & __b) != 0); + return __builtin_aarch64_sm3tt1bqv4si_uuuus (__a, __b, __c, __imm2); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstq_u64 (uint64x2_t __a, uint64x2_t __b) +vsm3tt2aq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __imm2) { - return ((__a & __b) != __AARCH64_UINT64_C (0)); + return __builtin_aarch64_sm3tt2aqv4si_uuuus (__a, __b, __c, __imm2); } -__extension__ extern __inline uint64_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstd_s64 (int64_t __a, int64_t __b) +vsm3tt2bq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c, const int __imm2) { - return (__a & __b) ? 
-1ll : 0ll; + return __builtin_aarch64_sm3tt2bqv4si_uuuus (__a, __b, __c, __imm2); } -__extension__ extern __inline uint64_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vtstd_u64 (uint64_t __a, uint64_t __b) +vsm3partw1q_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - return (__a & __b) ? -1ll : 0ll; + return __builtin_aarch64_sm3partw1qv4si_uuuu (__a, __b, __c); } - -/* vuqadd */ - -__extension__ extern __inline int8x8_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqadd_s8 (int8x8_t __a, uint8x8_t __b) +vsm3partw2q_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - return __builtin_aarch64_suqaddv8qi_ssu (__a, __b); + return __builtin_aarch64_sm3partw2qv4si_uuuu (__a, __b, __c); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqadd_s16 (int16x4_t __a, uint16x4_t __b) +vsm4eq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_suqaddv4hi_ssu (__a, __b); + return __builtin_aarch64_sm4eqv4si_uuu (__a, __b); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqadd_s32 (int32x2_t __a, uint32x2_t __b) +vsm4ekeyq_u32 (uint32x4_t __a, uint32x4_t __b) { - return __builtin_aarch64_suqaddv2si_ssu (__a, __b); + return __builtin_aarch64_sm4ekeyqv4si_uuu (__a, __b); } -__extension__ extern __inline int64x1_t +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+sha3") + +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqadd_s64 (int64x1_t __a, uint64x1_t __b) +vsha512hq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { - return (int64x1_t) {__builtin_aarch64_suqadddi_ssu (__a[0], __b[0])}; + return __builtin_aarch64_crypto_sha512hqv2di_uuuu (__a, __b, __c); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqaddq_s8 (int8x16_t __a, uint8x16_t __b) +vsha512h2q_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { - return __builtin_aarch64_suqaddv16qi_ssu (__a, __b); + return __builtin_aarch64_crypto_sha512h2qv2di_uuuu (__a, __b, __c); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqaddq_s16 (int16x8_t __a, uint16x8_t __b) +vsha512su0q_u64 (uint64x2_t __a, uint64x2_t __b) { - return __builtin_aarch64_suqaddv8hi_ssu (__a, __b); + return __builtin_aarch64_crypto_sha512su0qv2di_uuu (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqaddq_s32 (int32x4_t __a, uint32x4_t __b) +vsha512su1q_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { - return __builtin_aarch64_suqaddv4si_ssu (__a, __b); + return __builtin_aarch64_crypto_sha512su1qv2di_uuuu (__a, __b, __c); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuqaddq_s64 (int64x2_t __a, uint64x2_t __b) +veor3q_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - return __builtin_aarch64_suqaddv2di_ssu (__a, __b); + return 
__builtin_aarch64_eor3qv16qi_uuuu (__a, __b, __c);
 }
 
-__extension__ extern __inline int8_t
+__extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuqaddb_s8 (int8_t __a, uint8_t __b)
+veor3q_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
 {
-  return __builtin_aarch64_suqaddqi_ssu (__a, __b);
+  return __builtin_aarch64_eor3qv8hi_uuuu (__a, __b, __c);
 }
 
-__extension__ extern __inline int16_t
+__extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuqaddh_s16 (int16_t __a, uint16_t __b)
+veor3q_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
 {
-  return __builtin_aarch64_suqaddhi_ssu (__a, __b);
+  return __builtin_aarch64_eor3qv4si_uuuu (__a, __b, __c);
 }
 
-__extension__ extern __inline int32_t
+__extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuqadds_s32 (int32_t __a, uint32_t __b)
+veor3q_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
 {
-  return __builtin_aarch64_suqaddsi_ssu (__a, __b);
+  return __builtin_aarch64_eor3qv2di_uuuu (__a, __b, __c);
 }
 
-__extension__ extern __inline int64_t
+
+__extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuqaddd_s64 (int64_t __a, uint64_t __b)
+veor3q_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
 {
-  return __builtin_aarch64_suqadddi_ssu (__a, __b);
+  return __builtin_aarch64_eor3qv16qi (__a, __b, __c);
 }
 
-#define __DEFINTERLEAVE(op, rettype, intype, funcsuffix, Q) \
-  __extension__ extern __inline rettype \
-  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-  v ## op ## Q ## _ ## funcsuffix (intype a, intype b) \
-  { \
-    return (rettype) {v ## op ## 1 ## Q ## _ ## funcsuffix (a, b), \
-		      v ## op ## 2 ## Q ## _ ## funcsuffix (a, b)}; \
-  }
-
-#define __INTERLEAVE_LIST(op) \
-  __DEFINTERLEAVE (op, float16x4x2_t, float16x4_t, f16,) \
-  __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \
-  __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \
-  __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \
-  __DEFINTERLEAVE (op, int8x8x2_t, int8x8_t, s8,) \
-  __DEFINTERLEAVE (op, int16x4x2_t, int16x4_t, s16,) \
-  __DEFINTERLEAVE (op, int32x2x2_t, int32x2_t, s32,) \
-  __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \
-  __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \
-  __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \
-  __DEFINTERLEAVE (op, float16x8x2_t, float16x8_t, f16, q) \
-  __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \
-  __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \
-  __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \
-  __DEFINTERLEAVE (op, int8x16x2_t, int8x16_t, s8, q) \
-  __DEFINTERLEAVE (op, int16x8x2_t, int16x8_t, s16, q) \
-  __DEFINTERLEAVE (op, int32x4x2_t, int32x4_t, s32, q) \
-  __DEFINTERLEAVE (op, uint8x16x2_t, uint8x16_t, u8, q) \
-  __DEFINTERLEAVE (op, uint16x8x2_t, uint16x8_t, u16, q) \
-  __DEFINTERLEAVE (op, uint32x4x2_t, uint32x4_t, u32, q)
-
-/* vuzp */
-
-__extension__ extern __inline float16x4_t
+__extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_f16 (float16x4_t __a, float16x4_t __b)
+veor3q_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_eor3qv8hi (__a, __b, __c);
 }
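
A quick usage sketch for the ARMv8.2-A FP16 arithmetic, FMA and reduction intrinsics added earlier in this hunk. Illustrative only: `f16_demo` is a made-up name, and the `target` attribute assumes a GCC that accepts `arch=armv8.2-a+fp16`.

#include <arm_neon.h>

/* Fused multiply-accumulate on four half-precision lanes, then reduce
   with the maxNum semantics of FMAXNMV.  */
__attribute__ ((target ("arch=armv8.2-a+fp16")))
float16_t
f16_demo (float16x4_t a, float16x4_t b, float16x4_t acc)
{
  acc = vfma_f16 (acc, a, b);	/* acc + a * b, single rounding */
  return vmaxnmv_f16 (acc);	/* largest lane; prefers numbers
				   over quiet NaNs */
}
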
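The `vcvt_n_*` pairs in the same hunk convert between half precision and 16-bit fixed point: `vcvt_n_f16_s16 (x, n)` scales by 2^-n and `vcvt_n_s16_f16 (x, n)` by 2^n, with `n` limited to 1..16. A sketch for Q8.8 samples (the helper names are hypothetical):

/* Q8.8 fixed point <-> float16, four lanes at a time.  */
__attribute__ ((target ("arch=armv8.2-a+fp16")))
float16x4_t q88_to_f16 (int16x4_t q)   { return vcvt_n_f16_s16 (q, 8); }

__attribute__ ((target ("arch=armv8.2-a+fp16")))
int16x4_t   f16_to_q88 (float16x4_t f) { return vcvt_n_s16_f16 (f, 8); }
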
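The dot-product intrinsics fold four 8-bit multiplies into each 32-bit accumulator lane. A 16-byte signed dot product might look like this (`dot16` is illustrative; needs a toolchain accepting `arch=armv8.2-a+dotprod`):

__attribute__ ((target ("arch=armv8.2-a+dotprod")))
int32_t
dot16 (int8x16_t a, int8x16_t b)
{
  int32x4_t acc = vdupq_n_s32 (0);
  acc = vdotq_s32 (acc, a, b);	/* four partial sums of four products each */
  return vaddvq_s32 (acc);	/* horizontal add across the vector        */
}
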
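For SM4, `vsm4ekeyq_u32` derives four round keys at a time and `vsm4eq_u32` runs four encryption rounds; a schematic step, not a complete cipher, with illustrative names:

__attribute__ ((target ("arch=armv8.2-a+sm4")))
uint32x4_t
sm4_step (uint32x4_t state, uint32x4_t key, uint32x4_t ck)
{
  uint32x4_t rk = vsm4ekeyq_u32 (key, ck);	/* next four round keys   */
  return vsm4eq_u32 (state, rk);		/* four rounds under them */
}
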
-__extension__ extern __inline float32x2_t
+__extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_f32 (float32x2_t __a, float32x2_t __b)
+veor3q_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_eor3qv4si (__a, __b, __c);
 }
 
-__extension__ extern __inline poly8x8_t
+__extension__ extern __inline int64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_p8 (poly8x8_t __a, poly8x8_t __b)
+veor3q_s64 (int64x2_t __a, int64x2_t __b, int64x2_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_eor3qv2di (__a, __b, __c);
 }
 
-__extension__ extern __inline poly16x4_t
+__extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_p16 (poly16x4_t __a, poly16x4_t __b)
+vrax1q_u64 (uint64x2_t __a, uint64x2_t __b)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_rax1qv2di_uuu (__a, __b);
 }
 
-__extension__ extern __inline int8x8_t
+__extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_s8 (int8x8_t __a, int8x8_t __b)
+vxarq_u64 (uint64x2_t __a, uint64x2_t __b, const int __imm6)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_xarqv2di_uuus (__a, __b, __imm6);
 }
 
-__extension__ extern __inline int16x4_t
+__extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_s16 (int16x4_t __a, int16x4_t __b)
+vbcaxq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
-#else
-  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
-#endif
+  return __builtin_aarch64_bcaxqv16qi_uuuu (__a, __b, __c);
 }
 
-__extension__ extern __inline int32x2_t
+__extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_s32 (int32x2_t __a, int32x2_t __b)
+vbcaxq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
-#else
-  return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
-#endif
+  return __builtin_aarch64_bcaxqv8hi_uuuu (__a, __b, __c);
 }
 
-__extension__ extern __inline uint8x8_t
+__extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vuzp1_u8 (uint8x8_t __a, uint8x8_t __b)
+vbcaxq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
 {
-#ifdef __AARCH64EB__
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
-#else
-  return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
-#endif
+  return __builtin_aarch64_bcaxqv4si_uuuu (__a, __b, __c);
 }
 
-__extension__ extern __inline uint16x4_t
+__extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__,
__gnu_inline__, __artificial__)) -vuzp1_u16 (uint16x4_t __a, uint16x4_t __b) +vbcaxq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6}); -#endif + return __builtin_aarch64_bcaxqv2di_uuuu (__a, __b, __c); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1_u32 (uint32x2_t __a, uint32x2_t __b) +vbcaxq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); -#endif + return __builtin_aarch64_bcaxqv16qi (__a, __b, __c); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_f16 (float16x8_t __a, float16x8_t __b) +vbcaxq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); -#endif + return __builtin_aarch64_bcaxqv8hi (__a, __b, __c); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_f32 (float32x4_t __a, float32x4_t __b) +vbcaxq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); -#endif + return __builtin_aarch64_bcaxqv4si (__a, __b, __c); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_f64 (float64x2_t __a, float64x2_t __b) +vbcaxq_s64 (int64x2_t __a, int64x2_t __b, int64x2_t __c) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_bcaxqv2di (__a, __b, __c); } -__extension__ extern __inline poly8x16_t +#pragma GCC pop_options + +/* AdvSIMD Complex numbers intrinsics. 
*/ + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.3-a") + +#pragma GCC push_options +#pragma GCC target ("+fp16") +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b) +vcadd_rot90_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); -#endif + return __builtin_aarch64_fcadd90v4hf (__a, __b); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b) +vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); -#endif + return __builtin_aarch64_fcadd90v8hf (__a, __b); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_s8 (int8x16_t __a, int8x16_t __b) +vcadd_rot270_f16 (float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); -#endif + return __builtin_aarch64_fcadd270v4hf (__a, __b); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_s16 (int16x8_t __a, int16x8_t __b) +vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); -#endif + return __builtin_aarch64_fcadd270v8hf (__a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_s32 (int32x4_t __a, int32x4_t __b) +vcmla_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); -#endif + return __builtin_aarch64_fcmla0v4hf (__r, __a, __b); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_s64 (int64x2_t __a, int64x2_t __b) +vcmlaq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmla0v8hf (__r, __a, __b); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b) +vcmla_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - 
(uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}); -#endif + return __builtin_aarch64_fcmla_lane0v4hf (__r, __a, __b, __index); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b) +vcmla_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14}); -#endif + return __builtin_aarch64_fcmla_laneq0v4hf (__r, __a, __b, __index); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b) +vcmlaq_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6}); -#endif + return __builtin_aarch64_fcmlaq_lane0v8hf (__r, __a, __b, __index); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) +vcmlaq_rot90_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmlaq_lane90v8hf (__r, __a, __b, __index); } __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_f16 (float16x4_t __a, float16x4_t __b) +vcmla_rot90_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); -#endif + return __builtin_aarch64_fcmla_laneq90v4hf (__r, __a, __b, __index); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_f32 (float32x2_t __a, float32x2_t __b) +vcmla_rot90_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return __builtin_aarch64_fcmla_lane90v4hf (__r, __a, __b, __index); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_p8 (poly8x8_t __a, poly8x8_t __b) +vcmlaq_rot90_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); -#endif + return __builtin_aarch64_fcmla90v8hf (__r, __a, __b); } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline float16x4_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_p16 (poly16x4_t __a, poly16x4_t __b) +vcmla_rot90_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); -#endif + return __builtin_aarch64_fcmla90v4hf (__r, __a, __b); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_s8 (int8x8_t __a, int8x8_t __b) +vcmlaq_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); -#endif + return __builtin_aarch64_fcmla_lane0v8hf (__r, __a, __b, __index); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_s16 (int16x4_t __a, int16x4_t __b) +vcmla_rot180_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); -#endif + return __builtin_aarch64_fcmla_laneq180v4hf (__r, __a, __b, __index); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_s32 (int32x2_t __a, int32x2_t __b) +vcmla_rot180_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return __builtin_aarch64_fcmla_lane180v4hf (__r, __a, __b, __index); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_u8 (uint8x8_t __a, uint8x8_t __b) +vcmlaq_rot180_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); -#endif + return __builtin_aarch64_fcmla180v8hf (__r, __a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_u16 (uint16x4_t __a, uint16x4_t __b) +vcmla_rot180_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7}); -#endif + return __builtin_aarch64_fcmla180v4hf (__r, __a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2_u32 (uint32x2_t __a, uint32x2_t __b) +vcmlaq_rot90_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return __builtin_aarch64_fcmla_lane90v8hf (__r, __a, __b, __index); } 
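/* Editor's note: the FCADD/FCMLA intrinsics introduced above operate on
   vectors of interleaved complex numbers (even lanes = real parts, odd
   lanes = imaginary parts).  vcadd_rot90 computes a + i*b and vcadd_rot270
   computes a - i*b; a full complex multiply-accumulate takes two FCMLA
   steps, where the plain vcmla form accumulates the partial products
   involving one half of each complex operand and the rot90 form adds the
   remaining partials.  A minimal usage sketch, not part of this patch
   (the helper name is ours):

   #include <arm_neon.h>

   // r += a * b for four half-precision complex numbers packed as
   // {re0, im0, re1, im1, re2, im2, re3, im3}.
   float16x8_t cmla_f16 (float16x8_t r, float16x8_t a, float16x8_t b)
   {
     r = vcmlaq_f16 (r, a, b);        // partial products, rotation 0
     r = vcmlaq_rot90_f16 (r, a, b);  // remaining partials, rotation 90
     return r;                        // full complex r + a*b
   }

   This assumes a target with the Armv8.3-A complex-arithmetic extension
   plus FP16, e.g. -march=armv8.3-a+fp16, matching the pragmas above.  */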
__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_f16 (float16x8_t __a, float16x8_t __b) +vcmlaq_rot270_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); -#endif + return __builtin_aarch64_fcmla_lane270v8hf (__r, __a, __b, __index); } -__extension__ extern __inline float32x4_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_f32 (float32x4_t __a, float32x4_t __b) +vcmlaq_rot270_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); -#endif + return __builtin_aarch64_fcmlaq_lane270v8hf (__r, __a, __b, __index); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_f64 (float64x2_t __a, float64x2_t __b) +vcmla_rot270_laneq_f16 (float16x4_t __r, float16x4_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_fcmla_laneq270v4hf (__r, __a, __b, __index); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b) +vcmlaq_rot270_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); -#endif + return __builtin_aarch64_fcmla270v8hf (__r, __a, __b); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b) +vcmla_rot270_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); -#endif + return __builtin_aarch64_fcmla270v4hf (__r, __a, __b); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_s8 (int8x16_t __a, int8x16_t __b) +vcmlaq_rot180_laneq_f16 (float16x8_t __r, float16x8_t __a, float16x8_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); -#else - return __builtin_shuffle (__a, __b, - (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); -#endif + return __builtin_aarch64_fcmla_lane180v8hf (__r, __a, __b, __index); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_s16 (int16x8_t __a, int16x8_t 
__b) +vcmlaq_rot180_lane_f16 (float16x8_t __r, float16x8_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); -#endif + return __builtin_aarch64_fcmlaq_lane180v8hf (__r, __a, __b, __index); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_s32 (int32x4_t __a, int32x4_t __b) +vcmla_rot270_lane_f16 (float16x4_t __r, float16x4_t __a, float16x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); -#endif + return __builtin_aarch64_fcmla_lane270v4hf (__r, __a, __b, __index); } +#pragma GCC pop_options -__extension__ extern __inline int64x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_s64 (int64x2_t __a, int64x2_t __b) +vcadd_rot90_f32 (float32x2_t __a, float32x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_fcadd90v2sf (__a, __b); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b) +vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}); -#endif + return __builtin_aarch64_fcadd90v4sf (__a, __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b) +vcaddq_rot90_f64 (float64x2_t __a, float64x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15}); -#endif + return __builtin_aarch64_fcadd90v2df (__a, __b); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b) +vcadd_rot270_f32 (float32x2_t __a, float32x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7}); -#endif + return __builtin_aarch64_fcadd270v2sf (__a, __b); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b) +vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_fcadd270v4sf (__a, __b); } -__INTERLEAVE_LIST (uzp) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vcaddq_rot270_f64 (float64x2_t __a, float64x2_t __b) +{ + return __builtin_aarch64_fcadd270v2df (__a, __b); +} -/* vzip */ +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcmla_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) +{ + return __builtin_aarch64_fcmla0v2sf (__r, __a, __b); +} -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_f16 (float16x4_t __a, float16x4_t __b) +vcmlaq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); -#endif + return __builtin_aarch64_fcmla0v4sf (__r, __a, __b); } -__extension__ extern __inline float32x2_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_f32 (float32x2_t __a, float32x2_t __b) +vcmlaq_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmla0v2df (__r, __a, __b); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_p8 (poly8x8_t __a, poly8x8_t __b) +vcmla_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); -#endif + return __builtin_aarch64_fcmla_lane0v2sf (__r, __a, __b, __index); } -__extension__ extern __inline poly16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_p16 (poly16x4_t __a, poly16x4_t __b) +vcmla_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); -#endif + return __builtin_aarch64_fcmla_laneq0v2sf (__r, __a, __b, __index); } -__extension__ extern __inline int8x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_s8 (int8x8_t __a, int8x8_t __b) +vcmlaq_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); -#endif + return __builtin_aarch64_fcmlaq_lane0v4sf (__r, __a, __b, __index); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_s16 (int16x4_t __a, int16x4_t __b) +vcmlaq_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); -#endif + return __builtin_aarch64_fcmla_lane0v4sf (__r, __a, __b, __index); } -__extension__ extern __inline int32x2_t +__extension__ extern 
__inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_s32 (int32x2_t __a, int32x2_t __b) +vcmla_rot90_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmla90v2sf (__r, __a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_u8 (uint8x8_t __a, uint8x8_t __b) +vcmlaq_rot90_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); -#endif + return __builtin_aarch64_fcmla90v4sf (__r, __a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_u16 (uint16x4_t __a, uint16x4_t __b) +vcmlaq_rot90_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5}); -#endif + return __builtin_aarch64_fcmla90v2df (__r, __a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1_u32 (uint32x2_t __a, uint32x2_t __b) +vcmla_rot90_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmla_lane90v2sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_f16 (float16x8_t __a, float16x8_t __b) +vcmla_rot90_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint16x8_t) {12, 4, 13, 5, 14, 6, 15, 7}); -#else - return __builtin_shuffle (__a, __b, - (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); -#endif + return __builtin_aarch64_fcmla_laneq90v2sf (__r, __a, __b, __index); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_f32 (float32x4_t __a, float32x4_t __b) +vcmlaq_rot90_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); -#endif + return __builtin_aarch64_fcmlaq_lane90v4sf (__r, __a, __b, __index); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_f64 (float64x2_t __a, float64x2_t __b) +vcmlaq_rot90_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmla_lane90v4sf (__r, __a, __b, __index); 
} -__extension__ extern __inline poly8x16_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_p8 (poly8x16_t __a, poly8x16_t __b) +vcmla_rot180_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); -#endif + return __builtin_aarch64_fcmla180v2sf (__r, __a, __b); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_p16 (poly16x8_t __a, poly16x8_t __b) +vcmlaq_rot180_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) - {12, 4, 13, 5, 14, 6, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); -#endif + return __builtin_aarch64_fcmla180v4sf (__r, __a, __b); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_s8 (int8x16_t __a, int8x16_t __b) +vcmlaq_rot180_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); -#endif + return __builtin_aarch64_fcmla180v2df (__r, __a, __b); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_s16 (int16x8_t __a, int16x8_t __b) +vcmla_rot180_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) - {12, 4, 13, 5, 14, 6, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); -#endif + return __builtin_aarch64_fcmla_lane180v2sf (__r, __a, __b, __index); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_s32 (int32x4_t __a, int32x4_t __b) +vcmla_rot180_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); -#endif + return __builtin_aarch64_fcmla_laneq180v2sf (__r, __a, __b, __index); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_s64 (int64x2_t __a, int64x2_t __b) +vcmlaq_rot180_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmlaq_lane180v4sf (__r, __a, __b, __index); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_u8 (uint8x16_t __a, uint8x16_t __b) 
+vcmlaq_rot180_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}); -#endif + return __builtin_aarch64_fcmla_lane180v4sf (__r, __a, __b, __index); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_u16 (uint16x8_t __a, uint16x8_t __b) +vcmla_rot270_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) - {12, 4, 13, 5, 14, 6, 15, 7}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11}); -#endif + return __builtin_aarch64_fcmla270v2sf (__r, __a, __b); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_u32 (uint32x4_t __a, uint32x4_t __b) +vcmlaq_rot270_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5}); -#endif + return __builtin_aarch64_fcmla270v4sf (__r, __a, __b); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip1q_u64 (uint64x2_t __a, uint64x2_t __b) +vcmlaq_rot270_f64 (float64x2_t __r, float64x2_t __a, float64x2_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2}); -#endif + return __builtin_aarch64_fcmla270v2df (__r, __a, __b); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_f16 (float16x4_t __a, float16x4_t __b) +vcmla_rot270_lane_f32 (float32x2_t __r, float32x2_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_fcmla_lane270v2sf (__r, __a, __b, __index); } __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_f32 (float32x2_t __a, float32x2_t __b) +vcmla_rot270_laneq_f32 (float32x2_t __r, float32x2_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return __builtin_aarch64_fcmla_laneq270v2sf (__r, __a, __b, __index); } -__extension__ extern __inline poly8x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_p8 (poly8x8_t __a, poly8x8_t __b) +vcmlaq_rot270_lane_f32 (float32x4_t __r, float32x4_t __a, float32x2_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_fcmlaq_lane270v4sf (__r, __a, __b, __index); } 
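/* Editor's note: the _lane/_laneq variants broadcast a single complex
   number (one real/imaginary pair, selected by __index) from __b to every
   complex element of __a.  This covers the common case of scaling a vector
   of complex values by one constant, e.g. an FFT twiddle factor.  An
   illustrative sketch, not part of this patch (the helper name is ours):

   #include <arm_neon.h>

   // acc += v * w for two single-precision complex numbers in v, where
   // w = {w_re, w_im} and lane 0 selects that one complex pair.
   float32x4_t cmla_by_twiddle (float32x4_t acc, float32x4_t v,
                                float32x2_t w)
   {
     acc = vcmlaq_lane_f32 (acc, v, w, 0);
     acc = vcmlaq_rot90_lane_f32 (acc, v, w, 0);
     return acc;
   }

   Note that the index counts complex pairs, not scalar lanes: for a
   float32x2_t __b the only valid index is 0, while the laneq forms taking
   float32x4_t accept 0 or 1.  Compile with -march=armv8.3-a.  */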
-__extension__ extern __inline poly16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_p16 (poly16x4_t __a, poly16x4_t __b) +vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, + const int __index) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_fcmla_lane270v4sf (__r, __a, __b, __index); } -__extension__ extern __inline int8x8_t +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+fp16fml") + +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_s8 (int8x8_t __a, int8x8_t __b) +vfmlal_low_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_fmlal_lowv2sf (__r, __a, __b); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_s16 (int16x4_t __a, int16x4_t __b) +vfmlsl_low_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_fmlsl_lowv2sf (__r, __a, __b); } -__extension__ extern __inline int32x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_s32 (int32x2_t __a, int32x2_t __b) +vfmlalq_low_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3}); -#endif + return __builtin_aarch64_fmlalq_lowv4sf (__r, __a, __b); } -__extension__ extern __inline uint8x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_u8 (uint8x8_t __a, uint8x8_t __b) +vfmlslq_low_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_fmlslq_lowv4sf (__r, __a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_u16 (uint16x4_t __a, uint16x4_t __b) +vfmlal_high_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_fmlal_highv2sf (__r, __a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2_u32 (uint32x2_t __a, uint32x2_t __b) +vfmlsl_high_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, 
(uint32x2_t) {1, 3}); -#endif + return __builtin_aarch64_fmlsl_highv2sf (__r, __a, __b); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_f16 (float16x8_t __a, float16x8_t __b) +vfmlalq_high_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, - (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, - (uint16x8_t) {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_fmlalq_highv4sf (__r, __a, __b); } __extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_f32 (float32x4_t __a, float32x4_t __b) +vfmlslq_high_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_fmlslq_highv4sf (__r, __a, __b); } -__extension__ extern __inline float64x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_f64 (float64x2_t __a, float64x2_t __b) +vfmlal_lane_low_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_fmlal_lane_lowv2sf (__r, __a, __b, __lane); } -__extension__ extern __inline poly8x16_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_p8 (poly8x16_t __a, poly8x16_t __b) +vfmlsl_lane_low_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); -#endif + return __builtin_aarch64_fmlsl_lane_lowv2sf (__r, __a, __b, __lane); } -__extension__ extern __inline poly16x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_p16 (poly16x8_t __a, poly16x8_t __b) +vfmlal_laneq_low_f16 (float32x2_t __r, float16x4_t __a, float16x8_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) - {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_fmlal_laneq_lowv2sf (__r, __a, __b, __lane); } -__extension__ extern __inline int8x16_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_s8 (int8x16_t __a, int8x16_t __b) +vfmlsl_laneq_low_f16 (float32x2_t __r, float16x4_t __a, float16x8_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); -#endif + return __builtin_aarch64_fmlsl_laneq_lowv2sf (__r, __a, __b, __lane); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float32x4_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_s16 (int16x8_t __a, int16x8_t __b) +vfmlalq_lane_low_f16 (float32x4_t __r, float16x8_t __a, float16x4_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) - {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_fmlalq_lane_lowv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_s32 (int32x4_t __a, int32x4_t __b) +vfmlslq_lane_low_f16 (float32x4_t __r, float16x8_t __a, float16x4_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_fmlslq_lane_lowv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline int64x2_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_s64 (int64x2_t __a, int64x2_t __b) +vfmlalq_laneq_low_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_fmlalq_laneq_lowv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline uint8x16_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_u8 (uint8x16_t __a, uint8x16_t __b) +vfmlslq_laneq_low_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint8x16_t) - {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7}); -#else - return __builtin_shuffle (__a, __b, (uint8x16_t) - {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31}); -#endif + return __builtin_aarch64_fmlslq_laneq_lowv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_u16 (uint16x8_t __a, uint16x8_t __b) +vfmlal_lane_high_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3}); -#else - return __builtin_shuffle (__a, __b, (uint16x8_t) - {4, 12, 5, 13, 6, 14, 7, 15}); -#endif + return __builtin_aarch64_fmlal_lane_highv2sf (__r, __a, __b, __lane); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_u32 (uint32x4_t __a, uint32x4_t __b) +vfmlsl_lane_high_f16 (float32x2_t __r, float16x4_t __a, float16x4_t __b, + const int __lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1}); -#else - return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7}); -#endif + return __builtin_aarch64_fmlsl_lane_highv2sf (__r, __a, __b, __lane); } -__extension__ extern __inline uint64x2_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) +vfmlal_laneq_high_f16 (float32x2_t __r, float16x4_t __a, float16x8_t __b, + const int 
__lane) { -#ifdef __AARCH64EB__ - return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0}); -#else - return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3}); -#endif + return __builtin_aarch64_fmlal_laneq_highv2sf (__r, __a, __b, __lane); } -__INTERLEAVE_LIST (zip) - -#undef __INTERLEAVE_LIST -#undef __DEFINTERLEAVE - -/* End of optimal implementations in approved order. */ - -#pragma GCC pop_options - -/* ARMv8.2-A FP16 intrinsics. */ - -#include "arm_fp16.h" - -#pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+fp16") - -/* ARMv8.2-A FP16 one operand vector intrinsics. */ +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vfmlsl_laneq_high_f16 (float32x2_t __r, float16x4_t __a, float16x8_t __b, + const int __lane) +{ + return __builtin_aarch64_fmlsl_laneq_highv2sf (__r, __a, __b, __lane); +} -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabs_f16 (float16x4_t __a) +vfmlalq_lane_high_f16 (float32x4_t __r, float16x8_t __a, float16x4_t __b, + const int __lane) { - return __builtin_aarch64_absv4hf (__a); + return __builtin_aarch64_fmlalq_lane_highv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabsq_f16 (float16x8_t __a) +vfmlslq_lane_high_f16 (float32x4_t __r, float16x8_t __a, float16x4_t __b, + const int __lane) { - return __builtin_aarch64_absv8hf (__a); + return __builtin_aarch64_fmlslq_lane_highv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vceqz_f16 (float16x4_t __a) +vfmlalq_laneq_high_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b, + const int __lane) { - return __builtin_aarch64_cmeqv4hf_uss (__a, vdup_n_f16 (0.0f)); + return __builtin_aarch64_fmlalq_laneq_highv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vceqzq_f16 (float16x8_t __a) +vfmlslq_laneq_high_f16 (float32x4_t __r, float16x8_t __a, float16x8_t __b, + const int __lane) { - return __builtin_aarch64_cmeqv8hf_uss (__a, vdupq_n_f16 (0.0f)); + return __builtin_aarch64_fmlslq_laneq_highv4sf (__r, __a, __b, __lane); } -__extension__ extern __inline uint16x4_t +#pragma GCC pop_options + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.5-a") + +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcgez_f16 (float16x4_t __a) +vrnd32z_f32 (float32x2_t __a) { - return __builtin_aarch64_cmgev4hf_uss (__a, vdup_n_f16 (0.0f)); + return __builtin_aarch64_frint32zv2sf (__a); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcgezq_f16 (float16x8_t __a) +vrnd32zq_f32 (float32x4_t __a) { - return __builtin_aarch64_cmgev8hf_uss (__a, vdupq_n_f16 (0.0f)); + return __builtin_aarch64_frint32zv4sf (__a); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcgtz_f16 (float16x4_t __a) +vrnd32z_f64 (float64x1_t __a) { - return __builtin_aarch64_cmgtv4hf_uss (__a, vdup_n_f16 
(0.0f)); + return (float64x1_t) + {__builtin_aarch64_frint32zdf (vget_lane_f64 (__a, 0))}; } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcgtzq_f16 (float16x8_t __a) +vrnd32zq_f64 (float64x2_t __a) { - return __builtin_aarch64_cmgtv8hf_uss (__a, vdupq_n_f16 (0.0f)); + return __builtin_aarch64_frint32zv2df (__a); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclez_f16 (float16x4_t __a) +vrnd32x_f32 (float32x2_t __a) { - return __builtin_aarch64_cmlev4hf_uss (__a, vdup_n_f16 (0.0f)); + return __builtin_aarch64_frint32xv2sf (__a); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclezq_f16 (float16x8_t __a) +vrnd32xq_f32 (float32x4_t __a) { - return __builtin_aarch64_cmlev8hf_uss (__a, vdupq_n_f16 (0.0f)); + return __builtin_aarch64_frint32xv4sf (__a); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcltz_f16 (float16x4_t __a) +vrnd32x_f64 (float64x1_t __a) { - return __builtin_aarch64_cmltv4hf_uss (__a, vdup_n_f16 (0.0f)); + return (float64x1_t) {__builtin_aarch64_frint32xdf (vget_lane_f64 (__a, 0))}; } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcltzq_f16 (float16x8_t __a) +vrnd32xq_f64 (float64x2_t __a) { - return __builtin_aarch64_cmltv8hf_uss (__a, vdupq_n_f16 (0.0f)); + return __builtin_aarch64_frint32xv2df (__a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_f16_s16 (int16x4_t __a) +vrnd64z_f32 (float32x2_t __a) { - return __builtin_aarch64_floatv4hiv4hf (__a); + return __builtin_aarch64_frint64zv2sf (__a); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtq_f16_s16 (int16x8_t __a) +vrnd64zq_f32 (float32x4_t __a) { - return __builtin_aarch64_floatv8hiv8hf (__a); + return __builtin_aarch64_frint64zv4sf (__a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_f16_u16 (uint16x4_t __a) +vrnd64z_f64 (float64x1_t __a) { - return __builtin_aarch64_floatunsv4hiv4hf ((int16x4_t) __a); + return (float64x1_t) {__builtin_aarch64_frint64zdf (vget_lane_f64 (__a, 0))}; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtq_f16_u16 (uint16x8_t __a) +vrnd64zq_f64 (float64x2_t __a) { - return __builtin_aarch64_floatunsv8hiv8hf ((int16x8_t) __a); + return __builtin_aarch64_frint64zv2df (__a); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_s16_f16 (float16x4_t __a) +vrnd64x_f32 (float32x2_t __a) { - return __builtin_aarch64_lbtruncv4hfv4hi (__a); + return __builtin_aarch64_frint64xv2sf (__a); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) -vcvtq_s16_f16 (float16x8_t __a) +vrnd64xq_f32 (float32x4_t __a) { - return __builtin_aarch64_lbtruncv8hfv8hi (__a); + return __builtin_aarch64_frint64xv4sf (__a); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline float64x1_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_u16_f16 (float16x4_t __a) +vrnd64x_f64 (float64x1_t __a) { - return __builtin_aarch64_lbtruncuv4hfv4hi_us (__a); + return (float64x1_t) {__builtin_aarch64_frint64xdf (vget_lane_f64 (__a, 0))}; } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline float64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtq_u16_f16 (float16x8_t __a) +vrnd64xq_f64 (float64x2_t __a) { - return __builtin_aarch64_lbtruncuv8hfv8hi_us (__a); + return __builtin_aarch64_frint64xv2df (__a); } -__extension__ extern __inline int16x4_t +#pragma GCC pop_options + +#include "arm_bf16.h" + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvta_s16_f16 (float16x4_t __a) +vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index) { - return __builtin_aarch64_lroundv4hfv4hi (__a); + return __aarch64_vset_lane_any (__elem, __vec, __index); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtaq_s16_f16 (float16x8_t __a) +vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index) { - return __builtin_aarch64_lroundv8hfv8hi (__a); + return __aarch64_vset_lane_any (__elem, __vec, __index); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline bfloat16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvta_u16_f16 (float16x4_t __a) +vget_lane_bf16 (bfloat16x4_t __a, const int __b) { - return __builtin_aarch64_lrounduv4hfv4hi_us (__a); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline bfloat16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtaq_u16_f16 (float16x8_t __a) +vgetq_lane_bf16 (bfloat16x8_t __a, const int __b) { - return __builtin_aarch64_lrounduv8hfv8hi_us (__a); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtm_s16_f16 (float16x4_t __a) +vcreate_bf16 (uint64_t __a) { - return __builtin_aarch64_lfloorv4hfv4hi (__a); + return (bfloat16x4_t) __a; } -__extension__ extern __inline int16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtmq_s16_f16 (float16x8_t __a) +vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b) { - return __builtin_aarch64_lfloorv8hfv8hi (__a); + return (bfloat16x8_t)__builtin_aarch64_combinev4bf (__a, __b); } -__extension__ extern __inline uint16x4_t +/* vdup */ + +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtm_u16_f16 (float16x4_t __a) +vdup_n_bf16 (bfloat16_t __a) { - return __builtin_aarch64_lflooruv4hfv4hi_us (__a); + return (bfloat16x4_t) {__a, __a, __a, __a}; } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vcvtmq_u16_f16 (float16x8_t __a) +vdupq_n_bf16 (bfloat16_t __a) { - return __builtin_aarch64_lflooruv8hfv8hi_us (__a); + return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } -__extension__ extern __inline int16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtn_s16_f16 (float16x4_t __a) +vdup_lane_bf16 (bfloat16x4_t __a, const int __b) { - return __builtin_aarch64_lfrintnv4hfv4hi (__a); + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtnq_s16_f16 (float16x8_t __a) +vdup_laneq_bf16 (bfloat16x8_t __a, const int __b) { - return __builtin_aarch64_lfrintnv8hfv8hi (__a); + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtn_u16_f16 (float16x4_t __a) +vdupq_lane_bf16 (bfloat16x4_t __a, const int __b) { - return __builtin_aarch64_lfrintnuv4hfv4hi_us (__a); + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtnq_u16_f16 (float16x8_t __a) +vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b) { - return __builtin_aarch64_lfrintnuv8hfv8hi_us (__a); + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); } -__extension__ extern __inline int16x4_t +__extension__ extern __inline bfloat16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtp_s16_f16 (float16x4_t __a) +vduph_lane_bf16 (bfloat16x4_t __a, const int __b) { - return __builtin_aarch64_lceilv4hfv4hi (__a); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ extern __inline int16x8_t +__extension__ extern __inline bfloat16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtpq_s16_f16 (float16x8_t __a) +vduph_laneq_bf16 (bfloat16x8_t __a, const int __b) { - return __builtin_aarch64_lceilv8hfv8hi (__a); + return __aarch64_vget_lane_any (__a, __b); } -__extension__ extern __inline uint16x4_t +/* vld */ + +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtp_u16_f16 (float16x4_t __a) +vld1_bf16 (const bfloat16_t *__a) { - return __builtin_aarch64_lceiluv4hfv4hi_us (__a); + return (bfloat16x4_t) __builtin_aarch64_ld1v4bf (__a); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtpq_u16_f16 (float16x8_t __a) +vld1q_bf16 (const bfloat16_t *__a) { - return __builtin_aarch64_lceiluv8hfv8hi_us (__a); + return __builtin_aarch64_ld1v8bf (__a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vneg_f16 (float16x4_t __a) +vld1_bf16_x2 (const bfloat16_t *__a) { - return -__a; + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; } -__extension__ extern __inline float16x8_t +__extension__ 
extern __inline bfloat16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vnegq_f16 (float16x8_t __a) +vld1q_bf16_x2 (const bfloat16_t *__a) { - return -__a; + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpe_f16 (float16x4_t __a) +vld1_bf16_x3 (const bfloat16_t *__a) { - return __builtin_aarch64_frecpev4hf (__a); + bfloat16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + __i.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + __i.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return __i; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpeq_f16 (float16x8_t __a) +vld1q_bf16_x3 (const bfloat16_t *__a) { - return __builtin_aarch64_frecpev8hf (__a); + bfloat16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + __i.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + __i.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return __i; } - -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrnd_f16 (float16x4_t __a) +vld1_bf16_x4 (const bfloat16_t *__a) { - return __builtin_aarch64_btruncv4hf (__a); + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndq_f16 (float16x8_t __a) +vld1q_bf16_x4 (const bfloat16_t *__a) { - return __builtin_aarch64_btruncv8hf (__a); + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrnda_f16 (float16x4_t __a) +vld1_lane_bf16 (const bfloat16_t *__src, bfloat16x4_t __vec, const int __lane) { - return __builtin_aarch64_roundv4hf (__a); + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndaq_f16 (float16x8_t __a) +vld1q_lane_bf16 (const bfloat16_t *__src, bfloat16x8_t __vec, const int __lane) { - return __builtin_aarch64_roundv8hf (__a); + return __aarch64_vset_lane_any (*__src, __vec, __lane); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vrndi_f16 (float16x4_t __a) +vld1_dup_bf16 (const bfloat16_t* __a) { - return __builtin_aarch64_nearbyintv4hf (__a); + return vdup_n_bf16 (*__a); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndiq_f16 (float16x8_t __a) +vld1q_dup_bf16 (const bfloat16_t* __a) { - return __builtin_aarch64_nearbyintv8hf (__a); + return vdupq_n_bf16 (*__a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndm_f16 (float16x4_t __a) +vld2_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_floorv4hf (__a); + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4bf (__a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndmq_f16 (float16x8_t __a) +vld2q_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_floorv8hf (__a); + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndn_f16 (float16x4_t __a) +vld2_dup_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_frintnv4hf (__a); + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndnq_f16 (float16x8_t __a) +vld2q_dup_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_frintnv8hf (__a); + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndp_f16 (float16x4_t __a) +vld3_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_ceilv4hf (__a); + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndpq_f16 (float16x8_t __a) +vld3q_bf16 (const bfloat16_t * 
__a) { - return __builtin_aarch64_ceilv8hf (__a); + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndx_f16 (float16x4_t __a) +vld3_dup_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_rintv4hf (__a); + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x3_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrndxq_f16 (float16x8_t __a) +vld3q_dup_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_rintv8hf (__a); + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrte_f16 (float16x4_t a) +vld4_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_rsqrtev4hf (a); + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrteq_f16 (float16x8_t a) +vld4q_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_rsqrtev8hf (a); + bfloat16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrt_f16 (float16x4_t a) +vld4_dup_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_sqrtv4hf (a); + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) 
__builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsqrtq_f16 (float16x8_t a) +vld4q_dup_bf16 (const bfloat16_t * __a) { - return __builtin_aarch64_sqrtv8hf (a); + bfloat16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; } -/* ARMv8.2-A FP16 two operands vector intrinsics. */ +/* vst */ -__extension__ extern __inline float16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vadd_f16 (float16x4_t __a, float16x4_t __b) +vst1_bf16 (bfloat16_t *__a, bfloat16x4_t __b) { - return __a + __b; + __builtin_aarch64_st1v4bf (__a, __b); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vaddq_f16 (float16x8_t __a, float16x8_t __b) +vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) { - return __a + __b; + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st1x2v4bf (__a, __o); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabd_f16 (float16x4_t a, float16x4_t b) +vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) { - return __builtin_aarch64_fabdv4hf (a, b); + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st1x2v8bf (__a, __o); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vabdq_f16 (float16x8_t a, float16x8_t b) +vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) { - return __builtin_aarch64_fabdv8hf (a, b); + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vcage_f16 (float16x4_t __a, float16x4_t __b) +vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) { - return __builtin_aarch64_facgev4hf_uss (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcageq_f16 (float16x8_t __a, float16x8_t __b) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) { - return __builtin_aarch64_facgev8hf_uss (__a, __b); + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcagt_f16 (float16x4_t __a, float16x4_t __b) +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) { - return __builtin_aarch64_facgtv4hf_uss (__a, __b); + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcagtq_f16 (float16x8_t __a, float16x8_t __b) +vst1q_bf16 (bfloat16_t *__a, bfloat16x8_t __b) { - return __builtin_aarch64_facgtv8hf_uss (__a, __b); + __builtin_aarch64_st1v8bf (__a, __b); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcale_f16 (float16x4_t __a, float16x4_t __b) +vst1_lane_bf16 (bfloat16_t *__a, bfloat16x4_t __b, const int __lane) { - return __builtin_aarch64_faclev4hf_uss (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcaleq_f16 (float16x8_t __a, float16x8_t __b) +vst1q_lane_bf16 (bfloat16_t *__a, bfloat16x8_t __b, const int __lane) { - return __builtin_aarch64_faclev8hf_uss (__a, __b); + *__a = __aarch64_vget_lane_any (__b, __lane); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcalt_f16 (float16x4_t __a, float16x4_t __b) +vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) { - return __builtin_aarch64_facltv4hf_uss (__a, __b); + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st2v4bf (__a, __o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcaltq_f16 (float16x8_t __a, float16x8_t __b) +vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) { - return __builtin_aarch64_facltv8hf_uss (__a, __b); + 
__builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st2v8bf (__a, __o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vceq_f16 (float16x4_t __a, float16x4_t __b) +vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) { - return __builtin_aarch64_cmeqv4hf_uss (__a, __b); + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vceqq_f16 (float16x8_t __a, float16x8_t __b) +vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) { - return __builtin_aarch64_cmeqv8hf_uss (__a, __b); + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } -__extension__ extern __inline uint16x4_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcge_f16 (float16x4_t __a, float16x4_t __b) +vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) { - return __builtin_aarch64_cmgev4hf_uss (__a, __b); + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } -__extension__ extern __inline uint16x8_t +__extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcgeq_f16 (float16x8_t __a, float16x8_t __b) +vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) { - return __builtin_aarch64_cmgev8hf_uss (__a, __b); + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); + 
__builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcgt_f16 (float16x4_t __a, float16x4_t __b) +/* vreinterpret */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u8 (uint8x8_t __a) { - return __builtin_aarch64_cmgtv4hf_uss (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcgtq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u16 (uint16x4_t __a) { - return __builtin_aarch64_cmgtv8hf_uss (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcle_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u32 (uint32x2_t __a) { - return __builtin_aarch64_cmlev4hf_uss (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcleq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u64 (uint64x1_t __a) { - return __builtin_aarch64_cmlev8hf_uss (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vclt_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s8 (int8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s16 (int16x4_t __a) { - return __builtin_aarch64_cmltv4hf_uss (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcltq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s32 (int32x2_t __a) { - return __builtin_aarch64_cmltv8hf_uss (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_n_f16_s16 (int16x4_t __a, const int __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s64 (int64x1_t __a) { - return __builtin_aarch64_scvtfv4hi (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtq_n_f16_s16 (int16x8_t __a, const int __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p8 (poly8x8_t __a) { - return __builtin_aarch64_scvtfv8hi (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_n_f16_u16 
(uint16x4_t __a, const int __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p16 (poly16x4_t __a) { - return __builtin_aarch64_ucvtfv4hi_sus (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtq_n_f16_u16 (uint16x8_t __a, const int __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p64 (poly64x1_t __a) { - return __builtin_aarch64_ucvtfv8hi_sus (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline int16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_n_s16_f16 (float16x4_t __a, const int __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f16 (float16x4_t __a) { - return __builtin_aarch64_fcvtzsv4hf (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline int16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtq_n_s16_f16 (float16x8_t __a, const int __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f32 (float32x2_t __a) { - return __builtin_aarch64_fcvtzsv8hf (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline uint16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvt_n_u16_f16 (float16x4_t __a, const int __b) +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f64 (float64x1_t __a) { - return __builtin_aarch64_fcvtzuv4hf_uss (__a, __b); + return (bfloat16x4_t)__a; } -__extension__ extern __inline uint16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vcvtq_n_u16_f16 (float16x8_t __a, const int __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u8 (uint8x16_t __a) { - return __builtin_aarch64_fcvtzuv8hf_uss (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdiv_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u16 (uint16x8_t __a) { - return __a / __b; + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdivq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u32 (uint32x4_t __a) { - return __a / __b; + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmax_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u64 (uint64x2_t __a) { - return __builtin_aarch64_smax_nanv4hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ 
extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s8 (int8x16_t __a) { - return __builtin_aarch64_smax_nanv8hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxnm_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s16 (int16x8_t __a) { - return __builtin_aarch64_fmaxv4hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxnmq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s32 (int32x4_t __a) { - return __builtin_aarch64_fmaxv8hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmin_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s64 (int64x2_t __a) { - return __builtin_aarch64_smin_nanv4hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vminq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p8 (poly8x16_t __a) { - return __builtin_aarch64_smin_nanv8hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vminnm_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p16 (poly16x8_t __a) { - return __builtin_aarch64_fminv4hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vminnmq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p64 (poly64x2_t __a) { - return __builtin_aarch64_fminv8hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p128 (poly128_t __a) { - return __a * __b; + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f16 (float16x8_t __a) { - return __a * __b; + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) +vreinterpretq_bf16_f32 (float32x4_t __a) { - return __builtin_aarch64_fmulxv4hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f64 (float64x2_t __a) { - return __builtin_aarch64_fmulxv8hf (__a, __b); + return (bfloat16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpadd_f16 (float16x4_t a, float16x4_t b) +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s8_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_faddpv4hf (a, b); + return (int8x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpaddq_f16 (float16x8_t a, float16x8_t b) +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s16_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_faddpv8hf (a, b); + return (int16x4_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmax_f16 (float16x4_t a, float16x4_t b) +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s32_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_smax_nanpv4hf (a, b); + return (int32x2_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxq_f16 (float16x8_t a, float16x8_t b) +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s64_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_smax_nanpv8hf (a, b); + return (int64x1_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnm_f16 (float16x4_t a, float16x4_t b) +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u8_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_smaxpv4hf (a, b); + return (uint8x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmaxnmq_f16 (float16x8_t a, float16x8_t b) +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u16_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_smaxpv8hf (a, b); + return (uint16x4_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpmin_f16 (float16x4_t a, float16x4_t b) +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u32_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_smin_nanpv4hf (a, b); + return (uint32x2_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminq_f16 (float16x8_t a, float16x8_t b) +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u64_bf16 (bfloat16x4_t __a) { - return 
__builtin_aarch64_smin_nanpv8hf (a, b); + return (uint64x1_t)__a; } __extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnm_f16 (float16x4_t a, float16x4_t b) +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f16_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_sminpv4hf (a, b); + return (float16x4_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vpminnmq_f16 (float16x8_t a, float16x8_t b) +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f32_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_sminpv8hf (a, b); + return (float32x2_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecps_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f64_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_frecpsv4hf (__a, __b); + return (float64x1_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrecpsq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p8_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_frecpsv8hf (__a, __b); + return (poly8x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrts_f16 (float16x4_t a, float16x4_t b) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p16_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_rsqrtsv4hf (a, b); + return (poly16x4_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vrsqrtsq_f16 (float16x8_t a, float16x8_t b) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_bf16 (bfloat16x4_t __a) { - return __builtin_aarch64_rsqrtsv8hf (a, b); + return (poly64x1_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsub_f16 (float16x4_t __a, float16x4_t __b) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s8_bf16 (bfloat16x8_t __a) { - return __a - __b; + return (int8x16_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vsubq_f16 (float16x8_t __a, float16x8_t __b) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s16_bf16 (bfloat16x8_t __a) { - return __a - __b; + return (int16x8_t)__a; } -/* ARMv8.2-A FP16 three operands vector intrinsics. 
*/ - -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s32_bf16 (bfloat16x8_t __a) { - return __builtin_aarch64_fmav4hf (__b, __c, __a); + return (int32x4_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s64_bf16 (bfloat16x8_t __a) { - return __builtin_aarch64_fmav8hf (__b, __c, __a); + return (int64x2_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u8_bf16 (bfloat16x8_t __a) { - return __builtin_aarch64_fnmav4hf (__b, __c, __a); + return (uint8x16_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u16_bf16 (bfloat16x8_t __a) { - return __builtin_aarch64_fnmav8hf (__b, __c, __a); + return (uint16x8_t)__a; } -/* ARMv8.2-A FP16 lane vector intrinsics. */ +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u32_bf16 (bfloat16x8_t __a) +{ + return (uint32x4_t)__a; +} -__extension__ extern __inline float16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmah_lane_f16 (float16_t __a, float16_t __b, - float16x4_t __c, const int __lane) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u64_bf16 (bfloat16x8_t __a) { - return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); + return (uint64x2_t)__a; } -__extension__ extern __inline float16_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmah_laneq_f16 (float16_t __a, float16_t __b, - float16x8_t __c, const int __lane) +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f16_bf16 (bfloat16x8_t __a) { - return vfmah_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); + return (float16x8_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfma_lane_f16 (float16x4_t __a, float16x4_t __b, - float16x4_t __c, const int __lane) +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f32_bf16 (bfloat16x8_t __a) { - return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane)); + return (float32x4_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b, - float16x4_t __c, const int __lane) +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vreinterpretq_f64_bf16 (bfloat16x8_t __a) +{ + return (float64x2_t)__a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p8_bf16 (bfloat16x8_t __a) { - return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane)); + return (poly8x16_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfma_laneq_f16 (float16x4_t __a, float16x4_t __b, - float16x8_t __c, const int __lane) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p16_bf16 (bfloat16x8_t __a) { - return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane)); + return (poly16x8_t)__a; } -__extension__ extern __inline float16x8_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b, - float16x8_t __c, const int __lane) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_bf16 (bfloat16x8_t __a) { - return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane)); + return (poly64x2_t)__a; } -__extension__ extern __inline float16x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_bf16 (bfloat16x8_t __a) { - return vfma_f16 (__a, __b, vdup_n_f16 (__c)); + return (poly128_t)__a; } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c) +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) { - return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c)); + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); } -__extension__ extern __inline float16_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsh_lane_f16 (float16_t __a, float16_t __b, - float16x4_t __c, const int __lane) +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) { - return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); } -__extension__ extern __inline float16_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsh_laneq_f16 (float16_t __a, float16_t __b, - float16x8_t __c, const int __lane) +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, + const int __index) { - return vfmsh_f16 (__a, __b, __aarch64_vget_lane_any (__c, __lane)); + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfms_lane_f16 (float16x4_t __a, float16x4_t __b, - float16x4_t __c, const int __lane) +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) { - return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane)); + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, 
__artificial__)) -vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b, - float16x4_t __c, const int __lane) +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, + const int __index) { - return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane)); + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfms_laneq_f16 (float16x4_t __a, float16x4_t __b, - float16x8_t __c, const int __lane) +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) { - return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane)); + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b, - float16x8_t __c, const int __lane) +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) { - return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane)); + return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c) +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) { - return vfms_f16 (__a, __b, vdup_n_f16 (__c)); + return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c) +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) { - return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c)); + return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b); } -__extension__ extern __inline float16_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane) +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane) +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) { - return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane))); + return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane) +vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) { - return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane))); + return __builtin_aarch64_bfmlalb_lane_qv4sf (__r, __a, __b, __index); } -__extension__ extern __inline float16_t +__extension__ extern __inline float32x4_t __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) -vmulh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane) +vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) { - return __a * __aarch64_vget_lane_any (__b, __lane); + return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane) +vget_low_bf16 (bfloat16x8_t __a) { - return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane))); + return __builtin_aarch64_vget_lo_halfv8bf (__a); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane) +vget_high_bf16 (bfloat16x8_t __a) { - return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane))); + return __builtin_aarch64_vget_hi_halfv8bf (__a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmul_n_f16 (float16x4_t __a, float16_t __b) +vcvt_f32_bf16 (bfloat16x4_t __a) { - return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0); + return __builtin_aarch64_vbfcvtv4bf (__a); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulq_n_f16 (float16x8_t __a, float16_t __b) +vcvtq_low_f32_bf16 (bfloat16x8_t __a) { - return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0); + return __builtin_aarch64_vbfcvtv8bf (__a); } -__extension__ extern __inline float16_t +__extension__ extern __inline float32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxh_lane_f16 (float16_t __a, float16x4_t __b, const int __lane) +vcvtq_high_f32_bf16 (bfloat16x8_t __a) { - return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane)); + return __builtin_aarch64_vbfcvt_highv8bf (__a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane) +vcvt_bf16_f32 (float32x4_t __a) { - return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane)); + return __builtin_aarch64_bfcvtnv4bf (__a); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane) +vcvtq_low_bf16_f32 (float32x4_t __a) { - return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane)); + return __builtin_aarch64_bfcvtn_qv8bf (__a); } -__extension__ extern __inline float16_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxh_laneq_f16 (float16_t __a, float16x8_t __b, const int __lane) +vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a) { - return vmulxh_f16 (__a, __aarch64_vget_lane_any (__b, __lane)); + return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_laneq_f16 (float16x4_t __a, float16x8_t 
__b, const int __lane) +vcopy_lane_bf16 (bfloat16x4_t __a, const int __lane1, + bfloat16x4_t __b, const int __lane2) { - return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane)); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane) +vcopyq_lane_bf16 (bfloat16x8_t __a, const int __lane1, + bfloat16x4_t __b, const int __lane2) { - return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane)); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ extern __inline float16x4_t +__extension__ extern __inline bfloat16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulx_n_f16 (float16x4_t __a, float16_t __b) +vcopy_laneq_bf16 (bfloat16x4_t __a, const int __lane1, + bfloat16x8_t __b, const int __lane2) { - return vmulx_f16 (__a, vdup_n_f16 (__b)); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -__extension__ extern __inline float16x8_t +__extension__ extern __inline bfloat16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmulxq_n_f16 (float16x8_t __a, float16_t __b) +vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1, + bfloat16x8_t __b, const int __lane2) { - return vmulxq_f16 (__a, vdupq_n_f16 (__b)); + return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2), + __a, __lane1); } -/* ARMv8.2-A FP16 reduction vector intrinsics. */ +__LD2_LANE_FUNC (bfloat16x4x2_t, bfloat16x4_t, bfloat16x8x2_t, bfloat16_t, v4bf, + v8bf, bf, bf16, bfloat16x8_t) +__LD2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) +__LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf, + v8bf, bf, bf16, bfloat16x8_t) +__LD3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) +__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, + v8bf, bf, bf16, bfloat16x8_t) +__LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16) + +__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf, + bf16, bfloat16x8_t) +__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16) +__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf, + bf16, bfloat16x8_t) +__ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16) +__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf, + bf16, bfloat16x8_t) +__ST4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16) -__extension__ extern __inline float16_t +#pragma GCC pop_options + +/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. 
*/ + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+i8mm") + +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxv_f16 (float16x4_t __a) +vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b) { - return __builtin_aarch64_reduc_smax_nan_scal_v4hf (__a); + return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b); } -__extension__ extern __inline float16_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxvq_f16 (float16x8_t __a) +vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_reduc_smax_nan_scal_v8hf (__a); + return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b); } -__extension__ extern __inline float16_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vminv_f16 (float16x4_t __a) +vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __index) { - return __builtin_aarch64_reduc_smin_nan_scal_v4hf (__a); + return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index); } -__extension__ extern __inline float16_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vminvq_f16 (float16x8_t __a) +vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, int8x16_t __b, + const int __index) { - return __builtin_aarch64_reduc_smin_nan_scal_v8hf (__a); + return __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index); } -__extension__ extern __inline float16_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxnmv_f16 (float16x4_t __a) +vusdotq_lane_s32 (int32x4_t __r, uint8x16_t __a, int8x8_t __b, + const int __index) { - return __builtin_aarch64_reduc_smax_scal_v4hf (__a); + return __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index); } -__extension__ extern __inline float16_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vmaxnmvq_f16 (float16x8_t __a) +vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b, + const int __index) { - return __builtin_aarch64_reduc_smax_scal_v8hf (__a); + return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index); } -__extension__ extern __inline float16_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vminnmv_f16 (float16x4_t __a) +vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index) { - return __builtin_aarch64_reduc_smin_scal_v4hf (__a); + return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index); } -__extension__ extern __inline float16_t +__extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vminnmvq_f16 (float16x8_t __a) +vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b, + const int __index) { - return __builtin_aarch64_reduc_smin_scal_v8hf (__a); + return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index); } -#pragma GCC pop_options - -/* AdvSIMD Dot Product intrinsics. 
*/ - -#pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+dotprod") - -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdot_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b) +vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b, + const int __index) { - return __builtin_aarch64_udotv8qi_uuuu (__r, __a, __b); + return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index); } -__extension__ extern __inline uint32x4_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdotq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) +vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b, + const int __index) { - return __builtin_aarch64_udotv16qi_uuuu (__r, __a, __b); + return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index); } -__extension__ extern __inline int32x2_t +/* Matrix Multiply-Accumulate. */ + +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdot_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b) +vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_sdotv8qi (__r, __a, __b); + return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b); } -__extension__ extern __inline int32x4_t +__extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdotq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b) +vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b) { - return __builtin_aarch64_sdotv16qi (__r, __a, __b); + return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b); } -__extension__ extern __inline uint32x2_t +__extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdot_lane_u32 (uint32x2_t __r, uint8x8_t __a, uint8x8_t __b, const int __index) +vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b) { - return __builtin_aarch64_udot_lanev8qi_uuuus (__r, __a, __b, __index); + return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b); } -__extension__ extern __inline uint32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, - const int __index) +#pragma GCC pop_options + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p8 (poly8x8_t __a, poly8x8_t __b) { - return __builtin_aarch64_udot_laneqv8qi_uuuus (__r, __a, __b, __index); + return __a ^ __b; } -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdotq_lane_u32 (uint32x4_t __r, uint8x16_t __a, uint8x8_t __b, - const int __index) +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p16 (poly16x4_t __a, poly16x4_t __b) { - return __builtin_aarch64_udot_lanev16qi_uuuus (__r, __a, __b, __index); + return __a ^ __b; } -__extension__ extern __inline uint32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b, - const int __index) +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vadd_p64 (poly64x1_t __a, poly64x1_t __b) { - return __builtin_aarch64_udot_laneqv16qi_uuuus (__r, __a, __b, __index); + return __a ^ __b; } 
-__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdot_lane_s32 (int32x2_t __r, int8x8_t __a, int8x8_t __b, const int __index) +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p8 (poly8x16_t __a, poly8x16_t __b) { - return __builtin_aarch64_sdot_lanev8qi (__r, __a, __b, __index); + return __a ^ __b; } -__extension__ extern __inline int32x2_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index) +__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p16 (poly16x8_t __a, poly16x8_t __b) { - return __builtin_aarch64_sdot_laneqv8qi (__r, __a, __b, __index); + return __a ^ __b; } -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index) +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p64 (poly64x2_t __a, poly64x2_t __b) { - return __builtin_aarch64_sdot_lanev16qi (__r, __a, __b, __index); + return __a ^ __b; } -__extension__ extern __inline int32x4_t -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vaddq_p128 (poly128_t __a, poly128_t __b) { - return __builtin_aarch64_sdot_laneqv16qi (__r, __a, __b, __index); + return __a ^ __b; } -#pragma GCC pop_options #undef __aarch64_vget_lane_any @@ -31690,4 +34586,17 @@ vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) #undef __aarch64_vdupq_laneq_u32 #undef __aarch64_vdupq_laneq_u64 +#undef __LD2_LANE_FUNC +#undef __LD2Q_LANE_FUNC +#undef __LD3_LANE_FUNC +#undef __LD3Q_LANE_FUNC +#undef __LD4_LANE_FUNC +#undef __LD4Q_LANE_FUNC +#undef __ST2_LANE_FUNC +#undef __ST2Q_LANE_FUNC +#undef __ST3_LANE_FUNC +#undef __ST3Q_LANE_FUNC +#undef __ST4_LANE_FUNC +#undef __ST4Q_LANE_FUNC + #endif -- Gitee
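
For reference, a minimal usage sketch of the bf16 intrinsics ported above. It assumes a GCC 10+ toolchain invoked with -march=armv8.2-a+bf16; the function names bfdot_accum and bf16_round_trip are illustrative only, not part of the patch, and vld1q_bf16 is provided earlier in the same header.

#include <arm_neon.h>

/* Accumulate a 2-way bf16 dot product into each of the four f32 lanes:
   acc[i] += a[2i] * b[2i] + a[2i+1] * b[2i+1].  */
float32x4_t
bfdot_accum (float32x4_t acc, const bfloat16_t *x, const bfloat16_t *y)
{
  bfloat16x8_t a = vld1q_bf16 (x);	/* load eight bf16 lanes */
  bfloat16x8_t b = vld1q_bf16 (y);
  return vbfdotq_f32 (acc, a, b);
}

/* Widen the low four bf16 lanes to f32, then narrow back to bf16,
   using the conversion intrinsics added above.  */
bfloat16x4_t
bf16_round_trip (bfloat16x8_t v)
{
  float32x4_t w = vcvtq_low_f32_bf16 (v);
  return vcvt_bf16_f32 (w);
}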
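
Likewise, a sketch for the I8MM additions, assuming -march=armv8.2-a+i8mm; mmla_accum and usdot_accum are again illustrative names.

#include <arm_neon.h>

/* vmmlaq_s32 treats a and b as two rows of eight int8 values each and
   accumulates the resulting 2x2 int32 matrix product into acc.  */
int32x4_t
mmla_accum (int32x4_t acc, int8x16_t a, int8x16_t b)
{
  return vmmlaq_s32 (acc, a, b);
}

/* Mixed-sign dot product: in each 32-bit lane, acc[i] gains the sum of
   the four unsigned-by-signed byte products from a and b.  */
int32x2_t
usdot_accum (int32x2_t acc, uint8x8_t a, int8x8_t b)
{
  return vusdot_s32 (acc, a, b);
}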