#if HWY_COMPILER_GCC_ACTUAL
#include <avx2intrin.h>
#include <bmi2intrin.h>
#include <f16cintrin.h>
#include <sanitizer/msan_interface.h>

#include "hwy/ops/x86_128-inl.h"
  static constexpr size_t kPrivateN = 32 / sizeof(T);

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only float and double support
  // division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }
#if HWY_TARGET <= HWY_AVX3

template <size_t size>

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;

  return _mm256_castpd_si256(v);

template <typename T, typename FromT>

  return Vec256<T>{_mm256_setzero_si256()};

      _mm256_set1_epi64x(static_cast<long long>(t))};

      _mm256_set1_epi64x(static_cast<long long>(t))};

  return Vec256<T>{_mm256_undefined_si256()};

template <typename T>
HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
}

template <typename T>
HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
  return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
}

template <typename T>
HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
}

template <typename T>
HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
}
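// Usage sketch (illustrative, not from the original header): these ops
// compose like scalar bitwise operators. Assuming the usual Highway setup
// with namespace hn = hwy::HWY_NAMESPACE:
//   const hn::Full256<uint32_t> d;
//   const auto a = hn::Set(d, 0xF0F0F0F0u);
//   const auto b = hn::Set(d, 0x0F0FFFFFu);
//   const auto both = hn::And(a, b);       // lanes = 0x0000F0F0
//   const auto either = hn::Or(a, b);      // lanes = 0xFFFFFFFF
//   const auto b_only = hn::AndNot(a, b);  // ~a & b = 0x0F0F0F0F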
#if HWY_TARGET <= HWY_AVX3
                    Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
#else
  return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
#endif

template <typename T>
HWY_API Vec256<T> Xor3(Vec256<T> x1, Vec256<T> x2, Vec256<T> x3) {
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
      BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96);
  return BitCast(d, VU{ret});
#else
  return Xor(x1, Xor(x2, x3));
#endif
}

template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
  return BitCast(d, VU{ret});
#else
  return Or(o1, Or(o2, o3));
#endif
}

template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}

#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(du)>;

#if HWY_TARGET == HWY_AVX3_DL

#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

template <typename T>
HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
#if HWY_TARGET <= HWY_AVX3
  const Rebind<MakeUnsigned<T>, decltype(d)> du;
  const __m256i out = _mm256_ternarylogic_epi32(

#if HWY_TARGET <= HWY_AVX3

#if HWY_TARGET <= HWY_AVX3
  static_assert(IsSigned<T>(), "Only for float");

#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
#if HWY_COMPILER_HAS_MASK_INTRINSICS
HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {

HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {

HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {

  constexpr size_t N = 32 / sizeof(T);

  return Mask256<T>{v.raw};

  return Vec256<T>{v.raw};

  return Vec256<T>{v.raw};

                               const Vec256<T> no) {
  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
}

HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
                                 const Vec256<float> yes,
                                 const Vec256<float> no) {
  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
}

HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
                                  const Vec256<double> yes,
                                  const Vec256<double> no) {
  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
}
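// Usage sketch (illustrative, not from the original header): IfThenElse
// selects per lane, like the scalar ternary operator:
//   const hn::Full256<float> d;
//   const auto v = hn::Iota(d, 0.0f);  // 0, 1, 2, ...
//   const auto limit = hn::Set(d, 3.0f);
//   const auto clamped = hn::IfThenElse(hn::Lt(v, limit), v, limit);
//   // clamped = 0, 1, 2, 3, 3, 3, 3, 3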
  static_assert(IsSigned<T>(), "Only for float");
  const auto zero = Zero(Full256<T>());

HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {

HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {

HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {

#if HWY_TARGET <= HWY_AVX3

template <typename TFrom, typename TTo>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
  return Mask256<TTo>{m.raw};

  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");

template <typename T, HWY_IF_LANE_SIZE(T, 1)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 1)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};

template <typename T>
template <typename T>
template <typename T>
template <typename T>
template <typename T>

template <typename T, HWY_IF_LANE_SIZE(T, 1)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Vec256<T>{_mm256_movm_epi16(v.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_movm_epi32(v.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Vec256<T>{_mm256_movm_epi64(v.raw)};

  return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};

template <typename T>

template <typename TFrom, typename TTo>
  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");

template <typename T>
  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
  return (v & bit) == bit;

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};

HWY_API Mask256<float> operator==(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
}

HWY_API Mask256<double> operator==(const Vec256<double> a,
                                   const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
}

template <typename T>

HWY_API Mask256<float> operator!=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
}

HWY_API Mask256<double> operator!=(const Vec256<double> a,
                                   const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
}

#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
#else
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
#endif

#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
  using i8x32 = signed char __attribute__((__vector_size__(32)));
  return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
                                              reinterpret_cast<i8x32>(b.raw))};
#else
  return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
#endif

                            Vec256<int16_t> b) {
  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};

                            Vec256<int32_t> b) {
  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};

                            Vec256<int64_t> b) {
  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};

template <typename T>
  const Full256<T> du;
  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);

  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};

  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};

template <typename T>
HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
  return detail::Gt(hwy::TypeTag<T>(), a, b);
}

HWY_API Mask256<float> operator>=(const Vec256<float> a,
                                  const Vec256<float> b) {
  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
}

HWY_API Mask256<double> operator>=(const Vec256<double> a,
                                   const Vec256<double> b) {
  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
}

template <typename T>
template <typename T>

#if HWY_TARGET <= HWY_AVX3
  const auto msb = Set(du, 1ull << 63);

#if HWY_TARGET <= HWY_AVX3

#if HWY_TARGET <= HWY_AVX3
  const auto msb = Set(du, 1ull << 63);

#if HWY_TARGET <= HWY_AVX3

template <typename T>
#if HWY_TARGET <= HWY_AVX3
  constexpr size_t N = 32 / sizeof(T);
#if HWY_ARCH_X86_64
  const uint64_t all = (1ull << N) - 1;
#else
  const uint32_t all = static_cast<uint32_t>((1ull << N) - 1);
      (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n)));
#endif
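// Usage sketch (illustrative, not from the original header): FirstN builds a
// mask whose first n lanes are true, which pairs naturally with loops over
// arrays whose length is not a multiple of the vector size:
//   const hn::Full256<float> d;
//   const size_t N = hn::Lanes(d);
//   for (size_t i = 0; i < count; i += N) {
//     const auto m = hn::FirstN(d, count - i);  // all-true except last iter
//     const auto v = hn::MaskedLoad(m, d, in + i);
//     hn::BlendedStore(v, m, d, out + i);
//   }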
  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};

#if HWY_COMPILER_MSVC

HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
}
HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
  return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
}

template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));

  return shifted & Set(d8, 0xFF >> kBits);

  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
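// Note (added commentary): x86 lacks an 8-bit arithmetic shift, so the
// signed 8-bit ShiftRight above shifts the unsigned bits and then
// sign-extends manually. After an unsigned shift by kBits, the original sign
// bit lands at shifted_sign = 0x80 >> kBits. XOR-ing with shifted_sign flips
// that bit, and subtracting shifted_sign propagates it into the upper bits
// exactly when it was set. For example, with kBits = 2:
// int8_t(-128) = 0x80 -> unsigned shift gives 0x20; 0x20 ^ 0x20 = 0x00;
// 0x00 - 0x20 = 0xE0 = int8_t(-32), as desired.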
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  if (kBits == 0) return v;

  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  if (kBits == 0) return v;

  return ShiftRight<15>(v);

  return ShiftRight<31>(v);

#if HWY_TARGET == HWY_AVX2

#if HWY_TARGET <= HWY_AVX3
  return right | sign;

#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  static_assert(IsSigned<T>(), "Only works for signed/float");

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
  static_assert(IsSigned<T>(), "Only works for signed/float");

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));

  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));

#if HWY_TARGET <= HWY_AVX3
  return right | sign;

  const auto shifted_sign =
      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
  return (shifted ^ shifted_sign) - shifted_sign;

template <typename T>
template <typename T>
template <typename T>

  return Vec256<float>{_mm256_rcp_ps(v.raw)};

HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {

HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> add) {
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x + add;
#else
  return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
#endif
}

#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x + add;

HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> add) {
#ifdef HWY_DISABLE_BMI2_FMA
  return add - mul * x;
#else
  return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
#endif
}

#ifdef HWY_DISABLE_BMI2_FMA
  return add - mul * x;

HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
                             const Vec256<float> sub) {
#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x - sub;
#else
  return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
#endif
}

#ifdef HWY_DISABLE_BMI2_FMA
  return mul * x - sub;

HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
                                const Vec256<float> sub) {
#ifdef HWY_DISABLE_BMI2_FMA
  return Neg(mul * x) - sub;
#else
  return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
#endif
}

#ifdef HWY_DISABLE_BMI2_FMA
  return Neg(mul * x) - sub;
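// Usage sketch (illustrative, not from the original header): MulAdd and
// friends use fused multiply-add where available. A dot-product kernel,
// assuming count is a multiple of Lanes(d):
//   const hn::Full256<float> d;
//   auto sum = hn::Zero(d);
//   for (size_t i = 0; i < count; i += hn::Lanes(d)) {
//     sum = hn::MulAdd(hn::LoadU(d, a + i), hn::LoadU(d, b + i), sum);
//   }
//   const float dot = hn::GetLane(hn::SumOfLanes(d, sum));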
  return Vec256<float>{_mm256_rsqrt_ps(v.raw)};

  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};

  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};

  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};

  return Vec256<float>{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};

#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET <= HWY_AVX3

template <typename T>
  static_assert(IsFloat<T>(), "Only for float");
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));

template <typename T>
  static_assert(IsFloat<T>(), "Only for float");
  const VFromD<decltype(di)> exp =
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));

template <typename T>
      _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};

template <typename T>
  return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};

#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_LANE_SIZE(T, 1)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};

template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
                             const T* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  auto pi = reinterpret_cast<const int*>(p);
  return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  auto pi = reinterpret_cast<const long long*>(p);
  return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};

  const Vec256<int32_t> mi =
  return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};

  const Vec256<int64_t> mi =
  return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};

template <typename T>
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
  const __m128i v128 = LoadU(Full128<T>(), p).raw;
  return Vec256<T>{
      _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
#else
  return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
#endif

#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
      _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
#else
  return Vec256<float>{
      _mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
#endif

#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
      _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
#else
      _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
#endif

template <typename T>
  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);

  _mm256_store_ps(aligned, v.raw);

  _mm256_store_pd(aligned, v.raw);

template <typename T>
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);

  _mm256_storeu_ps(p, v.raw);

  _mm256_storeu_pd(p, v.raw);
#if HWY_TARGET <= HWY_AVX3

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
  _mm256_mask_storeu_epi8(p, m.raw, v.raw);

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
  _mm256_mask_storeu_epi16(p, m.raw, v.raw);

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  _mm256_mask_storeu_epi32(p, m.raw, v.raw);

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  _mm256_mask_storeu_epi64(p, m.raw, v.raw);

  _mm256_mask_storeu_ps(p, m.raw, v.raw);

  _mm256_mask_storeu_pd(p, m.raw, v.raw);

template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                          T* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  alignas(32) TU buf[32 / sizeof(T)];
  alignas(32) TU mask[32 / sizeof(T)];
  Store(BitCast(du, v), du, buf);
  Store(BitCast(du, VecFromMask(d, m)), du, mask);
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
    if (mask[i]) {
      CopySameSize(buf + i, p + i);
    }
  }
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  auto pi = reinterpret_cast<int*>(p);
  _mm256_maskstore_epi32(pi, m.raw, v.raw);

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  auto pi = reinterpret_cast<long long*>(p);
  _mm256_maskstore_epi64(pi, m.raw, v.raw);

  const Vec256<int32_t> mi =
  _mm256_maskstore_ps(p, mi.raw, v.raw);

  const Vec256<int64_t> mi =
  _mm256_maskstore_pd(p, mi.raw, v.raw);

template <typename T>
  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);

  _mm256_stream_ps(aligned, v.raw);

  _mm256_stream_pd(aligned, v.raw);

template <typename T>
                           const Vec256<int32_t> offset) {
  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);

template <typename T>
  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);

template <typename T>
  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);

template <typename T>
  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);

template <typename T, typename Offset>
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
}

template <typename T, typename Index>
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
}

  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);

  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);

  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);

  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
template <typename T, typename Offset>
                           const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);
  alignas(32) T lanes[N];
  alignas(32) Offset offset_lanes[N];
  Store(offset, Full256<Offset>(), offset_lanes);
  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {
    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
  }
}

template <typename T, typename Index>
                          const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  constexpr size_t N = 32 / sizeof(T);
  alignas(32) T lanes[N];
  alignas(32) Index index_lanes[N];
  Store(index, Full256<Index>(), index_lanes);
  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];
  }
}

template <typename T>
  return Vec256<T>{_mm256_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), offset.raw, 1)};

template <typename T>
  return Vec256<T>{_mm256_i32gather_epi32(
      reinterpret_cast<const int32_t*>(base), index.raw, 4)};

template <typename T>
  return Vec256<T>{_mm256_i64gather_epi64(

template <typename T>
  return Vec256<T>{_mm256_i64gather_epi64(

template <typename T, typename Offset>
                          const Vec256<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
}

template <typename T, typename Index>
                         const Vec256<Index> index) {
  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
}
template <typename T>
HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
  return Vec128<T>{_mm256_castsi256_si128(v.raw)};
}

template <typename T>
HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
  return LowerHalf(Full128<T>(), v);
}

template <typename T>
HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, Vec256<T> v) {
  return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};
}

template <typename T>
HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
  alignas(32) T lanes[32 / sizeof(T)];

template <typename T>
HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
  alignas(64) T lanes[64 / sizeof(T)];
  return Load(d, lanes);
}

template <typename T>

#if !defined(HWY_HAVE_ZEXT)
#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) || \
    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
#define HWY_HAVE_ZEXT 1
#else
#define HWY_HAVE_ZEXT 0
#endif

template <typename T>
HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
#if HWY_HAVE_ZEXT
  return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
#else
  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
#endif
}

  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};

template <typename T>
HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi, Vec128<T> lo) {
  const auto lo256 = ZeroExtendVector(d, lo);
  return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
}

template <int kBytes, typename T>
HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, const Vec256<T> v) {
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
}

template <int kBytes, typename T>
  return ShiftLeftBytes<kBytes>(Full256<T>(), v);

template <int kLanes, typename T>
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));

template <int kLanes, typename T>
  return ShiftLeftLanes<kLanes>(Full256<T>(), v);

template <int kBytes, typename T>
  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
  return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};

template <int kLanes, typename T>
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));

template <int kBytes, typename T, class V = Vec256<T>>
  return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(

  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
      _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

  static_assert(0 <= kLane && kLane < 4, "Invalid lane");

  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
      _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);

  static_assert(0 <= kLane && kLane < 4, "Invalid lane");

  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

  static_assert(0 <= kLane && kLane < 4, "Invalid lane");

  static_assert(0 <= kLane && kLane < 2, "Invalid lane");

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)};

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Shuffle2301(const Vec256<T> a, const Vec256<T> b) {
  const RebindToFloat<decltype(d)> df;
  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
                                                    BitCast(df, b).raw, m)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,

template <typename T>

template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 4)>
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  HWY_DASSERT(
      AllTrue(di, Lt(vec, Set(di, static_cast<TI>(32 / sizeof(T))))));
#endif

template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> idx64) {
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
  const Rebind<TI, decltype(d)> di;
#if HWY_IS_DEBUG_BUILD
  HWY_DASSERT(
      AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(32 / sizeof(T))))));
#endif

#if HWY_TARGET <= HWY_AVX3
  return Indices256<T>{idx64.raw};
#else
  const Repartition<float, decltype(d)> df;
  const Vec256<TI> dup =
      BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
  const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
  return Indices256<T>{idx32.raw};
#endif
}

template <typename T, typename TI>
  const Rebind<TI, decltype(d)> di;

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
#else
  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
#endif
}
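// Usage sketch (illustrative, not from the original header): arbitrary lane
// permutations via SetTableIndices + TableLookupLanes:
//   const hn::Full256<int32_t> d;
//   alignas(32) const int32_t kIdx[8] = {3, 2, 1, 0, 7, 6, 5, 4};
//   const auto v = hn::Iota(d, 0);  // 0..7
//   const auto shuffled =
//       hn::TableLookupLanes(v, hn::SetTableIndices(d, kIdx));
//   // shuffled = 3, 2, 1, 0, 7, 6, 5, 4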
#if HWY_TARGET <= HWY_AVX3

template <typename T>
HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
  return Vec256<T>{_mm256_permute2x128_si256(v.raw, v.raw, 0x01)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
  alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
                                                7,  6,  5,  4,  3,  2,  1, 0};
  const Vec256<int16_t> idx = Load(di, kReverse);
  return BitCast(d, Vec256<int16_t>{
                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
  return BitCast(d, RotateRight<16>(rev32));
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 2)>

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle2301(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle01(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr int16_t kReverse4[16] = {3,  2,  1, 0, 7,  6,  5,  4,
                                                 11, 10, 9, 8, 15, 14, 13, 12};
  const Vec256<int16_t> idx = Load(di, kReverse4);
  return BitCast(d, Vec256<int16_t>{
                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr int16_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                                 15, 14, 13, 12, 11, 10, 9, 8};
  const Vec256<int16_t> idx = Load(di, kReverse8);
  return BitCast(d, Vec256<int16_t>{
                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});

template <typename T, HWY_IF_LANE_SIZE(T, 4)>

template <typename T, HWY_IF_LANE_SIZE(T, 8)>

template <typename T, class V = Vec256<T>>
HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
  return detail::InterleaveUpper(a, b);
}

template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>
template <typename T, typename TW = MakeWide<T>>

template <typename T>
HWY_API Vec256<T> ConcatLowerLower(Full256<T> d, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  const Half<decltype(d)> d2;
  return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
}

  const Half<decltype(d)> d2;

  const Half<decltype(d)> d2;

template <typename T>
HWY_API Vec256<T> ConcatLowerUpper(Full256<T> /* tag */, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
}

template <typename T>
HWY_API Vec256<T> ConcatUpperLower(Full256<T> /* tag */, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
}

template <typename T>
HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
                                   const Vec256<T> lo) {
  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
  alignas(32) constexpr uint8_t kIdx[32] = {
      1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
      33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
#else
  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr uint16_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
                                             17, 19, 21, 23, 25, 27, 29, 31};
  return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi16(
                        BitCast(du, lo).raw, Load(du, kIdx).raw,
                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
#else
  const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
#else
  const Vec256<float> v3131{_mm256_shuffle_ps(
      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
                                            _MM_SHUFFLE(3, 1, 2, 0))};
#endif

#if HWY_TARGET <= HWY_AVX3
  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
                        __mmask8{0xFF}, hi.raw)};
#else
      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
                        BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
#else
  const Vec256<double> v31{
      _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
  return Vec256<T>{
      _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
                        __mmask8{0xFF}, hi.raw)};
#else
  return Vec256<double>{
      _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
#if HWY_TARGET == HWY_AVX3_DL
  alignas(64) constexpr uint8_t kIdx[32] = {
      0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
      32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
#else
  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint16_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                             16, 18, 20, 22, 24, 26, 28, 30};
  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi16(
                        BitCast(du, lo).raw, Load(du, kIdx).raw,
                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
#else
  const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
  const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
  const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
#else
  const Vec256<float> v2020{_mm256_shuffle_ps(
      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
                                            _MM_SHUFFLE(3, 1, 2, 0))};
#endif

#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
                        __mmask8{0xFF}, hi.raw)};
#else
      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
                        BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
  const RebindToUnsigned<decltype(d)> du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
#else
  const Vec256<double> v20{
  return Vec256<T>{
      _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

#if HWY_TARGET <= HWY_AVX3
  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
                        __mmask8{0xFF}, hi.raw)};
#else
  return Vec256<double>{
      _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};

      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupEven(const Vec256<T> v) {
  return InterleaveLower(Full256<T>(), v, v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};

      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
  return InterleaveUpper(Full256<T>(), v, v);
}

template <typename T>
  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};

template <typename T>
template <typename T>
template <typename T>

template <typename T>
HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
}

template <typename T>

template <typename T>
HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
  return ConcatLowerUpper(d, v, v);
}

template <typename T, typename TI>
HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
                                    const Vec256<TI> from) {
  return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
}

template <typename T, typename TI, size_t NI>
                                        const Vec128<TI, NI> from) {
  const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
  return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};

template <typename T, size_t N, typename TI>
                                    const Vec256<TI> from) {
  const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});

#if HWY_TARGET > HWY_AVX3 && !HWY_IDE

template <typename T>
  static_assert(sizeof(T) == 2, "Only for 16-bit");
  const Rebind<float, decltype(dw)> df;
  const auto zero = Zero(d);
  const auto upper = exp + Set(d, 0x3F80);
  const auto f0 = ZipLower(dw, zero, upper);
  const auto f1 = ZipUpper(dw, zero, upper);
  const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
  const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
  return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};

#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
  return v * Pow2(bits);

template <typename T>

template <typename T>
HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
  return detail::Shl(hwy::TypeTag<T>(), v, bits);
}

#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
  auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));

#if HWY_TARGET <= HWY_AVX3

#if HWY_TARGET <= HWY_AVX3

HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
                                    const Vec256<uint64_t> b) {
  const Full256<uint64_t> du64;
  const RepartitionToNarrow<decltype(du64)> du32;
  const auto maskL = Set(du64, 0xFFFFFFFFULL);
  const auto a32 = BitCast(du32, a);
  const auto b32 = BitCast(du32, b);
  const auto aH = Shuffle2301(a32);
  const auto bH = Shuffle2301(b32);
  const auto aLbL = MulEven(a32, b32);
  const auto w3 = aLbL & maskL;
  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
  const auto w2 = t2 & maskL;
  const auto w1 = ShiftRight<32>(t2);
  const auto t = MulEven(a32, bH) + w2;
  const auto k = ShiftRight<32>(t);
  const auto mulH = MulEven(aH, bH) + w1 + k;
  const auto mulL = ShiftLeft<32>(t) + w3;
  return InterleaveLower(mulL, mulH);
}
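// Note (added commentary): the 64x64->128-bit MulEven above is schoolbook
// long multiplication in base 2^32. With a = aH*2^32 + aL and
// b = bH*2^32 + bL, the full product is
//   a*b = aH*bH*2^64 + (aH*bL + aL*bH)*2^32 + aL*bL,
// where each 32x32->64 partial product comes from the 32-bit MulEven and the
// w/t/k temporaries carry the overflow between 32-bit "digits". mulL/mulH
// are the low/high 64-bit halves, interleaved so that adjacent lanes hold
// the 128-bit result.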
HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
                                   const Vec256<uint64_t> b) {
  const Full256<uint64_t> du64;
  const RepartitionToNarrow<decltype(du64)> du32;
  const auto maskL = Set(du64, 0xFFFFFFFFULL);
  const auto a32 = BitCast(du32, a);
  const auto b32 = BitCast(du32, b);
  const auto aH = Shuffle2301(a32);
  const auto bH = Shuffle2301(b32);
  const auto aLbL = MulEven(a32, b32);
  const auto w3 = aLbL & maskL;
  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
  const auto w2 = t2 & maskL;
  const auto w1 = ShiftRight<32>(t2);
  const auto t = MulEven(a32, bH) + w2;
  const auto k = ShiftRight<32>(t);
  const auto mulH = MulEven(aH, bH) + w1 + k;
  const auto mulL = ShiftLeft<32>(t) + w3;
  return InterleaveUpper(du64, mulL, mulH);
}

HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
                                  const Vec256<int32_t> v) {
  const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
  return Vec128<uint16_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
}

                                 const Vec256<int32_t> v) {
  const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
  return Vec128<int16_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};

                                 const Vec256<int32_t> v) {
  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
  const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
  const __m128i u16 = _mm256_castsi256_si128(u16_concat);
  const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
  return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};

                                 const Vec256<int16_t> v) {
  const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
  return Vec128<uint8_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};

                                 const Vec256<int32_t> v) {
  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
  const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
  const __m128i i16 = _mm256_castsi256_si128(i16_concat);
  return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};

                                const Vec256<int16_t> v) {
  const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
  return Vec128<int8_t>{
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};

                                    const Vec256<float> v) {
#ifdef HWY_DISABLE_F16C
  const Rebind<uint32_t, decltype(df16)> du;
  const auto bits32 = BitCast(du, v);
  const auto sign = ShiftRight<31>(bits32);
  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
  const auto k15 = Set(di, 15);
  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
  const auto is_tiny = exp < Set(di, -24);
  const auto is_subnormal = exp < Set(di, -14);
  const auto biased_exp16 =
  const auto sub_exp = BitCast(du, Set(di, -14) - exp);
  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
                     (mantissa32 >> (Set(du, 13) + sub_exp));
                                 ShiftRight<13>(mantissa32));
  const auto sign16 = ShiftLeft<15>(sign);
  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
#else
  return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
#endif

                                     const Vec256<float> v) {
  const Rebind<int32_t, decltype(dbf16)> di32;
  const Rebind<uint32_t, decltype(dbf16)> du32;
  const Rebind<uint16_t, decltype(dbf16)> du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));

    Vec256<float> a, Vec256<float> b) {
  const Repartition<uint32_t, decltype(dbf16)> du32;
  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));

    Vec256<int32_t> a, Vec256<int32_t> b) {
  return Vec256<int16_t>{_mm256_packs_epi32(a.raw, b.raw)};

HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
                                 const Vec256<double> v) {
  const auto clamped = detail::ClampF64ToI32Max(Full256<double>(), v);
  return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
}

  const Full256<uint32_t> d32;
  alignas(32) static constexpr uint32_t k8From32[8] = {
      0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
  return BitCast(Full64<uint8_t>(), pair);

template <uint32_t LO, uint32_t HI, typename T>
#if HWY_TARGET <= HWY_AVX3_DL
  alignas(32) constexpr uint32_t kMap[8] = {
      LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
#else
  alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u,
  const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
#endif

template <uint16_t LO, uint16_t HI, typename T>
#if HWY_TARGET <= HWY_AVX3_DL
  alignas(32) constexpr uint16_t kMap[16] = {
      LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d16, kMap).raw);
#else
  constexpr uint16_t ff = static_cast<uint16_t>(~0u);
  alignas(32) static constexpr uint16_t kMap[16] = {
      LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
  const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
  const auto half = _mm256_castsi256_si128(mixed);
#endif

#if HWY_TARGET <= HWY_AVX3_DL
  alignas(32) constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0};
  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
#else
  alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
                                                   0x0800FFFFu, ~0u, ~0u, ~0u};
  const auto result = lo | hi;
#endif

  const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v);

  alignas(32) constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};

  const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v);

  const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v);

  const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v);

#if HWY_TARGET <= HWY_AVX3
  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
  const auto k52 = Set(d32, 0x43300000);
  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
  return (v_upper - k84_63_52) + v_lower;

#if HWY_TARGET <= HWY_AVX3
  const auto msk_lo = Set(du32, 0xFFFF);
  const auto cnst2_16_flt = Set(df, 65536.0f);
  const auto v_hi = BitCast(d32, ShiftRight<16>(v));

#if HWY_TARGET <= HWY_AVX3
  using VU = VFromD<decltype(d64)>;
  const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
  const auto cnst2_32_dbl = Set(dd, 4294967296.0);
  const VU v_lo = And(v, msk_lo);
  const VU v_hi = ShiftRight<32>(v);
        detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
    return BitCast(dd, w) - Set(dd, 0x0010000000000000);
  const auto v_lo_dbl = uint64_to_double256_fast(v_lo);
  return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl);

  return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));

#if HWY_TARGET <= HWY_AVX3
  return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw));
#else
  using VI = decltype(Zero(di));
  const VI k0 = Zero(di);
  const VI k1 = Set(di, 1);
  const VI k51 = Set(di, 51);
  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
  const VI exp = biased_exp - Set(di, 0x3FF);
  const auto in_range = exp < Set(di, 63);
  const VI shift_mnt = Max(k51 - exp, k0);
  const VI shift_int = Max(exp - k51, k0);
  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
  const VI shifted = int52 << shift_int;
  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
  const VI magnitude = IfThenElse(in_range, restored, limit);
  return (magnitude ^ sign_mask) - sign_mask;
#endif
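// Note (added commentary): the AVX2 fallback above converts double to int64
// without a native instruction by decoding the IEEE-754 fields directly: it
// extracts the biased exponent and mantissa, re-attaches the implicit
// leading 1 (mantissa | 2^52), then shifts the 53-bit significand left or
// right depending on whether the exponent exceeds 52. Out-of-range inputs
// saturate to LimitsMax<int64_t>(), and the final xor/subtract with
// sign_mask negates the magnitude for negative inputs (two's complement).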
HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
  const Full256<int32_t> di;
  return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw));
}

#ifdef HWY_DISABLE_F16C
  const auto sign = ShiftRight<15>(bits16);
  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
  const auto mantissa = bits16 & Set(du32, 0x3FF);
  const auto subnormal =
                Set(df32, 1.0f / 16384 / 1024));
  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
  return BitCast(df32, ShiftLeft<31>(sign) | bits32);

  const Rebind<uint16_t, decltype(df32)> du16;

#if !defined(HWY_DISABLE_PCLMUL_AES)

#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;

#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;

#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;

#if HWY_TARGET == HWY_AVX3_DL
  const Half<decltype(d)> d2;

template <typename T, typename T2>
  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
  return Load(d, lanes);

#if HWY_TARGET <= HWY_AVX3

template <typename T>
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;
  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);
  mask_bits &= (1ull << N) - 1;

template <typename T>
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes<kNumBytes>(&mask.raw, bits);
  const int mask_bits = static_cast<int>((1ull << N) - 1);
  bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);

template <typename T>
HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
  return PopCount(static_cast<uint64_t>(mask.raw));
}

template <typename T>
HWY_API size_t FindKnownFirstTrue(const Full256<T> /* tag */,
                                  const Mask256<T> mask) {

template <typename T>

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestz_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0;
#endif

template <typename T>
  return (uint64_t{mask.raw} & 0xF) == 0;

template <typename T>
HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
}

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask32_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFFFFFu;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask16_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFFFu;
#endif

template <typename T>
#if HWY_COMPILER_HAS_MASK_INTRINSICS
  return _kortestc_mask8_u8(mask.raw, mask.raw);
#else
  return mask.raw == 0xFFu;
#endif

template <typename T>
  return mask.raw == 0xFu;

template <typename T>
HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
  alignas(16) constexpr uint64_t packed_array[16] = {
      0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
      0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
      0x00001032, 0x00001320, 0x00000321, 0x00003210};
  const RebindToUnsigned<decltype(d)> du64;
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
  alignas(16) constexpr uint64_t packed_array[16] = {
      0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
      0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
      0x00003210, 0x00003201, 0x00003210, 0x00003210};
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(32) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
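// Note (added commentary): each packed_array entry packs four lane indices,
// one per nibble, least-significant nibble first. Shifting the broadcast
// entry right by {0, 4, 8, 12} and reading the low nibble of each lane
// yields the permutation that moves the selected lanes to the front. For
// example, in Compress, mask.raw = 0b0101 selects lanes 0 and 2: entry
// 0x00003120 decodes to indices {0, 2, 1, 3}, i.e. the kept lanes first and
// the rest after.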
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  detail::MaybeUnpoison(unaligned, count);

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /*tag*/,
  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
  detail::MaybeUnpoison(unaligned, count);

  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw});
  detail::MaybeUnpoison(unaligned, count);

  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
  detail::MaybeUnpoison(unaligned, count);

template <typename T>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
    return CompressStore(v, m, d, unaligned);
  const size_t count = CountTrue(d, m);
  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
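// Usage sketch (illustrative, not from the original header): CompressStore
// writes only the lanes whose mask bit is set, packed contiguously, and
// returns how many were written - the core of a vectorized filter:
//   const hn::Full256<float> d;
//   size_t num_out = 0;
//   for (size_t i = 0; i < count; i += hn::Lanes(d)) {
//     const auto v = hn::LoadU(d, in + i);
//     const auto keep = v > hn::Zero(d);  // e.g. keep positive values
//     num_out += hn::CompressStore(v, keep, d, out + num_out);
//   }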
4731template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
4744template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4745HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4746 const RebindToUnsigned<
decltype(d)> du;
4747 const Repartition<uint32_t,
decltype(d)> du32;
4748 const auto vbits = BitCast(du, Set(du32,
static_cast<uint32_t
>(mask_bits)));
4751 const Repartition<uint64_t,
decltype(d)> du64;
4752 alignas(32)
constexpr uint64_t kRep8[4] = {
4753 0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
4754 0x0303030303030303ull};
4755 const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
4757 alignas(32)
constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
4758 1, 2, 4, 8, 16, 32, 64, 128};
4762template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4763HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4765 alignas(32)
constexpr uint16_t kBit[16] = {
4766 1, 2, 4, 8, 16, 32, 64, 128,
4767 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
4768 const auto vmask_bits =
Set(du,
static_cast<uint16_t
>(mask_bits));
4772template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4773HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4775 alignas(32)
constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
4776 const auto vmask_bits =
Set(du,
static_cast<uint32_t
>(mask_bits));
4780template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4781HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
4783 alignas(32)
constexpr uint64_t kBit[8] = {1, 2, 4, 8};
4790template <
typename T>
4793 constexpr size_t N = 32 /
sizeof(T);
4794 constexpr size_t kNumBytes = (
N + 7) / 8;
4796 uint64_t mask_bits = 0;
4797 CopyBytes<kNumBytes>(bits, &mask_bits);
4800 mask_bits &= (1ull <<
N) - 1;
4803 return detail::LoadMaskBits256(d, mask_bits);
4810template <
typename T, HWY_IF_LANE_SIZE(T, 1)>
4813 const Full256<uint8_t> d8;
4816 return static_cast<uint32_t
>(_mm256_movemask_epi8(sign_bits));
4819template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4823 const Full256<uint8_t> d8;
4828 return _pext_u64(sign_bits8, 0xAAAAAAAAull);
4833 const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
4835 const auto compressed =
4836 _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
4837 return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
4841template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4844 const Full256<float> df;
4846 return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
4849template <
typename T, HWY_IF_LANE_SIZE(T, 8)>
4852 const Full256<double> df;
4854 return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
4860template <
typename T>
4863 constexpr size_t N = 32 /
sizeof(T);
4864 constexpr size_t kNumBytes = (
N + 7) / 8;
4866 const uint64_t mask_bits = detail::BitsFromMask(mask);
4867 CopyBytes<kNumBytes>(&mask_bits, bits);
4875template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4879 return detail::BitsFromMask(mask8) == 0;
4882template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4885 return detail::BitsFromMask(mask) == 0;
4888template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4892 return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
4894template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4896 constexpr uint64_t kAllBits = (1ull << (32 /
sizeof(T))) - 1;
4897 return detail::BitsFromMask(mask) == kAllBits;
4900template <
typename T, HWY_IF_LANE_SIZE(T, 2)>
4904 return PopCount(detail::BitsFromMask(mask8)) >> 1;
4906template <
typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
4908 return PopCount(detail::BitsFromMask(mask));
4911template <
typename T>
4913 const Mask256<T> mask) {
4914 const uint64_t mask_bits = detail::BitsFromMask(mask);
4918template <
typename T>
4920 const Mask256<T> mask) {
4921 const uint64_t mask_bits = detail::BitsFromMask(mask);
4929template <
typename T, HWY_IF_LANE_SIZE(T, 4)>
4930HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
4937 alignas(16)
constexpr uint32_t packed_array[256] = {
      0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8,
      0x765430a9, 0x76543a98, 0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98,
      0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98, 0x7653210c, 0x765321c8,
      0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
      0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8,
      0x7650cba9, 0x765cba98, 0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98,
      0x764310da, 0x76431da8, 0x76430da9, 0x7643da98, 0x764210db, 0x76421db8,
      0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
      0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8,
      0x7630dca9, 0x763dca98, 0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98,
      0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98, 0x7543210e, 0x754321e8,
      0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
      0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8,
      0x7540eba9, 0x754eba98, 0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98,
      0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98, 0x75210ecb, 0x7521ecb8,
      0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
      0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8,
      0x7430eda9, 0x743eda98, 0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98,
      0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98, 0x73210edc, 0x7321edc8,
      0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
      0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8,
      0x70edcba9, 0x7edcba98, 0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98,
      0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98, 0x654210fb, 0x65421fb8,
      0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
      0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8,
      0x6530fca9, 0x653fca98, 0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98,
      0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98, 0x643210fd, 0x64321fd8,
      0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
      0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8,
      0x640fdba9, 0x64fdba98, 0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98,
      0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98, 0x6210fdcb, 0x621fdcb8,
      0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
      0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8,
      0x5430fea9, 0x543fea98, 0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98,
      0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98, 0x53210fec, 0x5321fec8,
      0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
      0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8,
      0x50fecba9, 0x5fecba98, 0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98,
      0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98, 0x4210fedb, 0x421fedb8,
      0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
      0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8,
      0x30fedca9, 0x3fedca98, 0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98,
      0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};
  // _mm256_permutevar8x32_epi32 ignores bits 3..31, so no masking is needed:
  // broadcast the packed LUT entry, then shift each lane's copy so that its
  // 4-bit field lands in the low bits.
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}
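// Worked example (editorial note, not in the original source): for
// mask_bits == 0b0000'0110, lanes 1 and 2 are selected and
// packed_array[6] == 0x765430a9. Reading nibbles from the LSB gives
// 9, a, 0, 3, 4, 5, 6, 7, i.e. lane order {1, 2, 0, 3, 4, 5, 6, 7} with
// bit 3 (the +8) marking the first PopCount(mask_bits) compressed
// positions. _mm256_permutevar8x32_epi32 ignores that bit, while
// CompressBlendedStore (below) shifts it into the sign bit to recover the
// store mask.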
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Vec256<uint32_t> IndicesFromBits(Full256<T> d, uint64_t mask_bits) {
  const Repartition<uint32_t, decltype(d)> d32;
  // For 64-bit lanes there are only 16 possible masks, so a LUT of the
  // 8 x u32 indices (128 entries) is affordable.
  alignas(32) constexpr uint32_t u32_indices[128] = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7,
      10, 11, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 4, 5, 6, 7,
      12, 13, 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 2, 3, 6, 7,
      10, 11, 12, 13, 0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 6, 7,
      14, 15, 0, 1, 2, 3, 4, 5, 8, 9, 14, 15, 2, 3, 4, 5,
      10, 11, 14, 15, 0, 1, 4, 5, 8, 9, 10, 11, 14, 15, 4, 5,
      12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 12, 13, 14, 15, 2, 3,
      10, 11, 12, 13, 14, 15, 0, 1, 8, 9, 10, 11, 12, 13, 14, 15};
  return Load(d32, u32_indices + 8 * mask_bits);
}
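// Worked example (editorial note): for mask_bits == 0b0101 (u64 lanes 0 and
// 2 selected), row 5 of u32_indices is {8, 9, 12, 13, 2, 3, 6, 7}: the u32
// pairs of lanes 0 and 2 come first, with +8 flagging compressed positions
// as in the 4-byte case above, followed by the pairs of lanes 1 and 3.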
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
                                               uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> d32;
  // As above, but with the roles of mask=0 and mask=1 lanes swapped.
  alignas(16) constexpr uint32_t packed_array[256] = {
      0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9,
      0xa9fedcb8, 0xa98fedcb, 0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca,
      0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc, 0xcfedba98, 0xc8fedba9,
      0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
      0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9,
      0xcba9fed8, 0xcba98fed, 0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba,
      0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb, 0xdbfeca98, 0xdb8feca9,
      0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
      0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9,
      0xdca9feb8, 0xdca98feb, 0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea,
      0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe, 0xefdcba98, 0xe8fdcba9,
      0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
      0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9,
      0xeba9fdc8, 0xeba98fdc, 0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba,
      0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb, 0xecbfda98, 0xecb8fda9,
      0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
      0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9,
      0xeda9fcb8, 0xeda98fcb, 0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca,
      0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc, 0xedcfba98, 0xedc8fba9,
      0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
      0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9,
      0xedcba9f8, 0xedcba98f, 0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba,
      0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb, 0xfbedca98, 0xfb8edca9,
      0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
      0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9,
      0xfca9edb8, 0xfca98edb, 0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda,
      0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed, 0xfdecba98, 0xfd8ecba9,
      0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
      0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9,
      0xfdba9ec8, 0xfdba98ec, 0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba,
      0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb, 0xfdcbea98, 0xfdcb8ea9,
      0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
      0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9,
      0xfea9dcb8, 0xfea98dcb, 0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca,
      0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc, 0xfecdba98, 0xfec8dba9,
      0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
      0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9,
      0xfecba9d8, 0xfecba98d, 0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba,
      0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb, 0xfedbca98, 0xfedb8ca9,
      0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
      0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9,
      0xfedca9b8, 0xfedca98b, 0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a,
      0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};
  // Unpack the 4-bit index fields as above.
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Vec256<uint32_t> IndicesFromNotBits(Full256<T> d,
                                               uint64_t mask_bits) {
  const Repartition<uint32_t, decltype(d)> d32;
  // As above, but the mask=0 lanes come first, in their original order.
  alignas(32) constexpr uint32_t u32_indices[128] = {
      8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9,
      8, 9, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11,
      8, 9, 10, 11, 14, 15, 12, 13, 10, 11, 14, 15, 8, 9, 12, 13,
      8, 9, 14, 15, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13,
      8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 8, 9, 14, 15,
      8, 9, 12, 13, 10, 11, 14, 15, 12, 13, 8, 9, 10, 11, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 8, 9, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15};
  return Load(d32, u32_indices + 8 * mask_bits);
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const Full256<T> d;
  const Repartition<uint32_t, decltype(d)> du32;

  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
  const Indices256<uint32_t> indices{IndicesFromBits(d, mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
// LUTs are infeasible for the 2^16 possible masks of 16-bit lanes, so
// splice together two half-vector Compress.
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
  const Half<decltype(du)> duh;
  const auto half0 = LowerHalf(duh, vu16);
  const auto half1 = UpperHalf(duh, vu16);

  const uint64_t mask_bits0 = mask_bits & 0xFF;
  const uint64_t mask_bits1 = mask_bits >> 8;
  const auto compressed0 = detail::CompressBits(half0, mask_bits0);
  const auto compressed1 = detail::CompressBits(half1, mask_bits1);

  alignas(32) uint16_t all_true[16] = {};
  // Store mask=true lanes, left to right.
  const size_t num_true0 = PopCount(mask_bits0);
  Store(compressed0, duh, all_true);
  StoreU(compressed1, duh, all_true + num_true0);

  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
    // Store mask=false lanes, right to left. The second vector fills the
    // upper half with right-aligned false lanes; the first is shifted
    // rightwards to overwrite the true lanes of the second.
    alignas(32) uint16_t all_false[16] = {};
    const size_t num_true1 = PopCount(mask_bits1);
    Store(compressed1, duh, all_false + 8);
    StoreU(compressed0, duh, all_false + num_true1);

    const auto mask = FirstN(du, num_true0 + num_true1);
    return BitCast(d,
                   IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
  } else {
    // Only care about the mask=true lanes.
    return BitCast(d, Load(du, all_true));
  }
}
template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>  // 4 or 8 bytes
HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
  const Full256<T> d;
  const Repartition<uint32_t, decltype(d)> du32;

  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
  const Indices256<uint32_t> indices{IndicesFromNotBits(d, mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
  // Flip the mask and compress; all 16 lane bits are in use here.
  return Compress(v, mask_bits ^ 0xFFFF);
}

}  // namespace detail
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
  return detail::Compress(v, detail::BitsFromMask(m));
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
  return detail::CompressNot(v, detail::BitsFromMask(m));
}

HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  return CompressNot(v, mask);
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::Compress(v, mask_bits);
}
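// Usage sketch (illustrative only; PackNegative is hypothetical): move all
// lanes matching a predicate to the front, preserving their order.
HWY_MAYBE_UNUSED Vec256<float> PackNegative(Vec256<float> v) {
  const Full256<float> d;
  // E.g. {-1, 2, -3, 4, ...} -> {-1, -3, ...}.
  return Compress(v, v < Zero(d));
}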
// ------------------------------ CompressStore, CompressBitsStore

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                             T* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}
template <typename T, HWY_IF_LANE_SIZE_ONE_OF(T, 0x110)>  // 4 or 8 bytes
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);

  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
  const Repartition<uint32_t, decltype(d)> du32;
  // Derive the store mask from the index LUT: the 4-bit indices of the
  // first `count` (compressed) positions have their MSB (the +8) set.
  const Vec256<uint32_t> idx_and_mask = detail::IndicesFromBits(d, mask_bits);
  // Shift the nibble MSB into the lane MSB.
  const Mask256<uint32_t> mask32 = MaskFromVec(ShiftLeft<28>(idx_and_mask));
  // First cast to unsigned (RebindMask cannot change the lane size).
  const Mask256<MakeUnsigned<T>> mask_u{mask32.raw};
  const Mask256<T> mask = RebindMask(d, mask_u);
  const Vec256<T> compressed =
      BitCast(d, TableLookupLanes(BitCast(du32, v),
                                  Indices256<uint32_t>{idx_and_mask.raw}));

  BlendedStore(compressed, mask, d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  const Vec256<T> compressed = detail::Compress(v, mask_bits);

#if HWY_MEM_OPS_MIGHT_FAULT
  // BlendedStore touches the full vector, which could fault or trip
  // sanitizers on the lanes beyond `count`; copy via a buffer instead.
  alignas(32) T buf[16];
  Store(compressed, d, buf);
  memcpy(unaligned, buf, count * sizeof(T));
#else
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
#endif
  return count;
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
                                 Full256<T> d, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  uint64_t mask_bits = 0;
  CopyBytes<kNumBytes>(bits, &mask_bits);

  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }
  const size_t count = PopCount(mask_bits);

  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}
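// Usage sketch (illustrative only; CopyIfPositive is hypothetical, and for
// brevity `size` is assumed to be a multiple of the vector length): stream
// through an array and append only the lanes satisfying a predicate,
// returning the number of elements written.
HWY_MAYBE_UNUSED size_t CopyIfPositive(const float* HWY_RESTRICT in,
                                       size_t size,
                                       float* HWY_RESTRICT out) {
  const Full256<float> d;
  constexpr size_t N = 32 / sizeof(float);  // 8 lanes
  size_t num_out = 0;
  for (size_t i = 0; i < size; i += N) {
    const Vec256<float> v = LoadU(d, in + i);
    // Unlike CompressStore, CompressBlendedStore writes only `count` lanes.
    num_out += CompressBlendedStore(v, Zero(d) < v, d, out + num_out);
  }
  return num_out;
}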
// ------------------------------ LoadInterleaved3/4

// Implemented in generic_ops; here we just overload LoadTransposedBlocks3/4.

namespace detail {
// Input (128-bit blocks):
// 1 0 (<- first block of unaligned)
// 3 2
// 5 4
template <typename T>
HWY_API void LoadTransposedBlocks3(Full256<T> d,
                                   const T* HWY_RESTRICT unaligned,
                                   Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
  constexpr size_t N = 32 / sizeof(T);
  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);  // 1 0
  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);  // 3 2
  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);  // 5 4

  A = ConcatUpperLower(d, v32, v10);  // 3 0
  B = ConcatLowerUpper(d, v54, v10);  // 4 1
  C = ConcatUpperLower(d, v54, v32);  // 5 2
}
// Input (128-bit blocks):
// 1 0 (<- first block of unaligned)
// 3 2
// 5 4
// 7 6
template <typename T>
HWY_API void LoadTransposedBlocks4(Full256<T> d,
                                   const T* HWY_RESTRICT unaligned,
                                   Vec256<T>& A, Vec256<T>& B, Vec256<T>& C,
                                   Vec256<T>& D) {
  constexpr size_t N = 32 / sizeof(T);
  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);
  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
  const Vec256<T> v76 = LoadU(d, unaligned + 3 * N);

  A = ConcatLowerLower(d, v54, v10);  // 4 0
  B = ConcatUpperUpper(d, v54, v10);  // 5 1
  C = ConcatLowerLower(d, v76, v32);  // 6 2
  D = ConcatUpperUpper(d, v76, v32);  // 7 3
}
}  // namespace detail
// ------------------------------ StoreInterleaved2/3/4

// Implemented in generic_ops; here we just overload StoreTransposedBlocks2/3/4.

namespace detail {
// Input (128-bit blocks):
// 2 0 (LSB of i)
// 3 1
// Output:
// 1 0
// 3 2
template <typename T>
HWY_API void StoreTransposedBlocks2(const Vec256<T> i, const Vec256<T> j,
                                    const Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  const auto out0 = ConcatLowerLower(d, j, i);  // 1 0
  const auto out1 = ConcatUpperUpper(d, j, i);  // 3 2
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
}
// Input (128-bit blocks):
// 3 0 (LSB of i)
// 4 1
// 5 2
// Output:
// 1 0
// 3 2
// 5 4
template <typename T>
HWY_API void StoreTransposedBlocks3(const Vec256<T> i, const Vec256<T> j,
                                    const Vec256<T> k, Full256<T> d,
                                    T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  const auto out0 = ConcatLowerLower(d, j, i);  // 1 0
  const auto out1 = ConcatUpperLower(d, i, k);  // 3 2
  const auto out2 = ConcatUpperUpper(d, k, j);  // 5 4
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  StoreU(out2, d, unaligned + 2 * N);
}
// Input (128-bit blocks):
// 4 0 (LSB of i)
// 5 1
// 6 2
// 7 3
// Output:
// 1 0
// 3 2
// 5 4
// 7 6
template <typename T>
HWY_API void StoreTransposedBlocks4(const Vec256<T> i, const Vec256<T> j,
                                    const Vec256<T> k, const Vec256<T> l,
                                    Full256<T> d, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 32 / sizeof(T);
  // Write the lower halves, then the upper ones.
  const auto out0 = ConcatLowerLower(d, j, i);  // 1 0
  const auto out1 = ConcatLowerLower(d, l, k);  // 3 2
  StoreU(out0, d, unaligned + 0 * N);
  StoreU(out1, d, unaligned + 1 * N);
  const auto out2 = ConcatUpperUpper(d, j, i);  // 5 4
  const auto out3 = ConcatUpperUpper(d, l, k);  // 7 6
  StoreU(out2, d, unaligned + 2 * N);
  StoreU(out3, d, unaligned + 3 * N);
}
}  // namespace detail
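// Usage sketch (illustrative only; InterleaveRGB is hypothetical): the
// generic StoreInterleaved3 from generic_ops-inl.h dispatches to the
// StoreTransposedBlocks3 overload above, converting one vector each of
// R, G and B into RGBRGB... byte triples.
HWY_MAYBE_UNUSED void InterleaveRGB(Vec256<uint8_t> r, Vec256<uint8_t> g,
                                    Vec256<uint8_t> b,
                                    uint8_t* HWY_RESTRICT out96) {
  const Full256<uint8_t> d;
  StoreInterleaved3(r, g, b, d, out96);  // writes 3 * 32 = 96 bytes
}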
// ------------------------------ Reductions

namespace detail {

// Per 128-bit block; the public functions below first combine both halves.
template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = v3210 + v1032;
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return v20_31_20_31 + v31_20_31_20;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Min(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Min(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
                                const Vec256<T> v3210) {
  const auto v1032 = Shuffle1032(v3210);
  const auto v31_20_31_20 = Max(v3210, v1032);
  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
  return Max(v20_31_20_31, v31_20_31_20);
}
template <typename T>
HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return v10 + v01;
}
template <typename T>
HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Min(v10, v01);
}
template <typename T>
HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
                                const Vec256<T> v10) {
  const auto v01 = Shuffle01(v10);
  return Max(v10, v01);
}
// u16/i16: widen even lanes by masking (unsigned) or sign-extension
// (signed), widen odd lanes by shifting, reduce as 32-bit, then broadcast
// the result back into both halves of each 32-bit lane.
HWY_API Vec256<uint16_t> SumOfLanes(hwy::SizeTag<2>, Vec256<uint16_t> v) {
  const Full256<uint16_t> d;
  const RepartitionToWide<decltype(d)> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
  return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec256<int16_t> SumOfLanes(hwy::SizeTag<2>, Vec256<int16_t> v) {
  const Full256<int16_t> d;
  const RepartitionToWide<decltype(d)> d32;
  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto sum = SumOfLanes(hwy::SizeTag<4>(), even + odd);
  return OddEven(BitCast(d, ShiftLeft<16>(sum)), BitCast(d, sum));
}
HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2>, Vec256<uint16_t> v) {
  const Full256<uint16_t> d;
  const RepartitionToWide<decltype(d)> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2>, Vec256<int16_t> v) {
  const Full256<int16_t> d;
  const RepartitionToWide<decltype(d)> d32;
  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
}
HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2>, Vec256<uint16_t> v) {
  const Full256<uint16_t> d;
  const RepartitionToWide<decltype(d)> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
  return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
}
HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2>, Vec256<int16_t> v) {
  const Full256<int16_t> d;
  const RepartitionToWide<decltype(d)> d32;
  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto max = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
  return OddEven(BitCast(d, ShiftLeft<16>(max)), BitCast(d, max));
}
}  // namespace detail

// Supported for {uif}{32,64} and {ui}16. Returns the broadcast result.
template <typename T>
HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
}
template <typename T>
HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
}
template <typename T>
HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
}
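// Usage sketch (illustrative only; HorizontalSum is hypothetical): the
// reductions return their result broadcast to all lanes; extract a scalar
// with GetLane.
HWY_MAYBE_UNUSED float HorizontalSum(Vec256<float> v) {
  const Full256<float> d;
  return GetLane(SumOfLanes(d, v));  // sum of all 8 lanes
}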