Vector Optimized Library of Kernels 3.1.1
Architecture-tuned implementations of math kernels
 
sse2neon.h
1#ifndef SSE2NEON_H
2#define SSE2NEON_H
3
4// This header file provides a simple API translation layer
5// from SSE intrinsics to their corresponding Arm/AArch64 NEON versions
6//
7// Contributors to this work are:
8// John W. Ratcliff <jratcliffscarab@gmail.com>
9// Brandon Rowlett <browlett@nvidia.com>
10// Ken Fast <kfast@gdeb.com>
11// Eric van Beurden <evanbeurden@nvidia.com>
12// Alexander Potylitsin <apotylitsin@nvidia.com>
13// Hasindu Gamaarachchi <hasindu2008@gmail.com>
14// Jim Huang <jserv@ccns.ncku.edu.tw>
15// Mark Cheng <marktwtn@gmail.com>
16// Malcolm James MacLeod <malcolm@gulden.com>
17// Devin Hussey (easyaspi314) <husseydevin@gmail.com>
18// Sebastian Pop <spop@amazon.com>
19// Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
20// Danila Kutenin <danilak@google.com>
21// François Turban (JishinMaster) <francois.turban@gmail.com>
22// Pei-Hsuan Hung <afcidk@gmail.com>
23// Yang-Hao Yuan <yuanyanghau@gmail.com>
24// Syoyo Fujita <syoyo@lighttransport.com>
25// Brecht Van Lommel <brecht@blender.org>
26// Jonathan Hue <jhue@adobe.com>
27// Cuda Chen <clh960524@gmail.com>
28// Aymen Qader <aymen.qader@arm.com>
29// Anthony Roberts <anthony.roberts@linaro.org>
30
31/*
32 * sse2neon is freely redistributable under the MIT License.
33 *
34 * Permission is hereby granted, free of charge, to any person obtaining a copy
35 * of this software and associated documentation files (the "Software"), to deal
36 * in the Software without restriction, including without limitation the rights
37 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
38 * copies of the Software, and to permit persons to whom the Software is
39 * furnished to do so, subject to the following conditions:
40 *
41 * The above copyright notice and this permission notice shall be included in
42 * all copies or substantial portions of the Software.
43 *
44 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
49 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
50 * SOFTWARE.
51 */
52
53/* Tunable configurations */
54
55/* Enable precise implementations of math operations.
56 * This slows down computation a bit, but gives results consistent with
57 * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
58 */
59/* _mm_min|max_ps|ss|pd|sd */
60#ifndef SSE2NEON_PRECISE_MINMAX
61#define SSE2NEON_PRECISE_MINMAX (0)
62#endif
63/* _mm_rcp_ps and _mm_div_ps */
64#ifndef SSE2NEON_PRECISE_DIV
65#define SSE2NEON_PRECISE_DIV (0)
66#endif
67/* _mm_sqrt_ps and _mm_rsqrt_ps */
68#ifndef SSE2NEON_PRECISE_SQRT
69#define SSE2NEON_PRECISE_SQRT (0)
70#endif
71/* _mm_dp_pd */
72#ifndef SSE2NEON_PRECISE_DP
73#define SSE2NEON_PRECISE_DP (0)
74#endif
75
76/* Enable inclusion of windows.h on MSVC platforms.
77 * This makes _mm_clflush functional on Windows, as there is no builtin.
78 */
79#ifndef SSE2NEON_INCLUDE_WINDOWS_H
80#define SSE2NEON_INCLUDE_WINDOWS_H (0)
81#endif
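/* Illustrative usage sketch (not part of the upstream header): a consumer can
 * opt into the precise code paths by defining the corresponding macros before
 * including sse2neon.h, e.g.
 *
 *   #define SSE2NEON_PRECISE_MINMAX 1
 *   #define SSE2NEON_PRECISE_SQRT 1
 *   #include "sse2neon.h"
 *
 * Leaving the macros at 0 keeps the faster, less bit-exact NEON sequences.
 */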
82
83/* compiler specific definitions */
84#if defined(__GNUC__) || defined(__clang__)
85#pragma push_macro("FORCE_INLINE")
86#pragma push_macro("ALIGN_STRUCT")
87#define FORCE_INLINE static inline __attribute__((always_inline))
88#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
89#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
90#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
91#elif defined(_MSC_VER)
92#if _MSVC_TRADITIONAL
93#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
94#endif
95#ifndef FORCE_INLINE
96#define FORCE_INLINE static inline
97#endif
98#ifndef ALIGN_STRUCT
99#define ALIGN_STRUCT(x) __declspec(align(x))
100#endif
101#define _sse2neon_likely(x) (x)
102#define _sse2neon_unlikely(x) (x)
103#else
104#pragma message("Macro name collisions may happen with unsupported compilers.")
105#endif
106
107#if defined(__GNUC__) && __GNUC__ < 10
108#warning "GCC versions earlier than 10 are not supported."
109#endif
110
111/* C language does not allow initializing a variable with a function call. */
112#ifdef __cplusplus
113#define _sse2neon_const static const
114#else
115#define _sse2neon_const const
116#endif
117
118#include <stdint.h>
119#include <stdlib.h>
120
121#if defined(_WIN32)
122/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
123 * from both MinGW-w64 and MSVC.
124 */
125#define SSE2NEON_ALLOC_DEFINED
126#endif
127
128/* If using MSVC */
129#ifdef _MSC_VER
130#include <intrin.h>
131#if SSE2NEON_INCLUDE_WINDOWS_H
132#include <processthreadsapi.h>
133#include <windows.h>
134#endif
135
136#if !defined(__cplusplus)
137#error SSE2NEON only supports C++ compilation with this compiler
138#endif
139
140#ifdef SSE2NEON_ALLOC_DEFINED
141#include <malloc.h>
142#endif
143
144#if (defined(_M_AMD64) || defined(__x86_64__)) || \
145 (defined(_M_ARM64) || defined(__arm64__))
146#define SSE2NEON_HAS_BITSCAN64
147#endif
148#endif
149
150#if defined(__GNUC__) || defined(__clang__)
151#define _sse2neon_define0(type, s, body) \
152 __extension__({ \
153 type _a = (s); \
154 body \
155 })
156#define _sse2neon_define1(type, s, body) \
157 __extension__({ \
158 type _a = (s); \
159 body \
160 })
161#define _sse2neon_define2(type, a, b, body) \
162 __extension__({ \
163 type _a = (a), _b = (b); \
164 body \
165 })
166#define _sse2neon_return(ret) (ret)
167#else
168#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
169#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
170#define _sse2neon_define2(type, a, b, body) \
171 [](type _a, type _b) { body }((a), (b))
172#define _sse2neon_return(ret) return ret
173#endif
174
175#define _sse2neon_init(...) \
176 { \
177 __VA_ARGS__ \
178 }
179
180/* Compiler barrier */
181#if defined(_MSC_VER)
182#define SSE2NEON_BARRIER() _ReadWriteBarrier()
183#else
184#define SSE2NEON_BARRIER() \
185 do { \
186 __asm__ __volatile__("" ::: "memory"); \
187 (void) 0; \
188 } while (0)
189#endif
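/* A compiler barrier only stops the compiler from reordering memory accesses
 * across it; it emits no instruction. Illustrative sketch (hypothetical
 * variables, not part of the upstream header):
 *
 *   data = *src;            // must be read before the flag is published
 *   SSE2NEON_BARRIER();     // keep the compiler from sinking the load
 *   ready = 1;
 *
 * For ordering that must also hold against other cores, use the full memory
 * barrier _sse2neon_smp_mb() defined below.
 */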
190
191/* Memory barriers
192 * __atomic_thread_fence does not include a compiler barrier; instead,
193 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
194 * semantics.
195 */
196#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
197#include <stdatomic.h>
198#endif
199
200FORCE_INLINE void _sse2neon_smp_mb(void)
201{
202 SSE2NEON_BARRIER();
203#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
204 !defined(__STDC_NO_ATOMICS__)
205 atomic_thread_fence(memory_order_seq_cst);
206#elif defined(__GNUC__) || defined(__clang__)
207 __atomic_thread_fence(__ATOMIC_SEQ_CST);
208#else /* MSVC */
209 __dmb(_ARM64_BARRIER_ISH);
210#endif
211}
212
213/* Architecture-specific build options */
214/* FIXME: #pragma GCC push_options is only available on GCC */
215#if defined(__GNUC__)
216#if defined(__arm__) && __ARM_ARCH == 7
217/* According to ARM C Language Extensions Architecture specification,
218 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
219 * architecture supported.
220 */
221#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
222#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
223#endif
224#if !defined(__clang__)
225#pragma GCC push_options
226#pragma GCC target("fpu=neon")
227#endif
228#elif defined(__aarch64__) || defined(_M_ARM64)
229#if !defined(__clang__) && !defined(_MSC_VER)
230#pragma GCC push_options
231#pragma GCC target("+simd")
232#endif
233#elif __ARM_ARCH == 8
234#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
235#error \
236 "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
237#endif
238#if !defined(__clang__) && !defined(_MSC_VER)
239#pragma GCC push_options
240#endif
241#else
242#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
243#endif
244#endif
245
246#include <arm_neon.h>
247#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
248#if defined __has_include && __has_include(<arm_acle.h>)
249#include <arm_acle.h>
250#endif
251#endif
252
253/* Apple Silicon cache lines are twice the size commonly used by Intel, AMD,
254 * and other Arm microarchitectures.
255 * From sysctl -a on an Apple M1:
256 * hw.cachelinesize: 128
257 */
258#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
259#define SSE2NEON_CACHELINE_SIZE 128
260#else
261#define SSE2NEON_CACHELINE_SIZE 64
262#endif
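/* Illustrative sketch (not part of the upstream header): the constant can be
 * used to pad or align data that should not share a cache line, e.g.
 *
 *   ALIGN_STRUCT(SSE2NEON_CACHELINE_SIZE)
 *   static uint8_t scratch[4 * SSE2NEON_CACHELINE_SIZE];
 */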
263
264/* Rounding functions require either Aarch64 instructions or libm fallback */
265#if !defined(__aarch64__) && !defined(_M_ARM64)
266#include <math.h>
267#endif
268
269/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
270 * or not accessible at all in user mode.
271 * To read or write these registers from user mode,
272 * we have to go through a system call instead.
273 */
274#if (!defined(__aarch64__) && !defined(_M_ARM64))
275#include <sys/time.h>
276#endif
277
278/* "__has_builtin" can be used to query support for built-in functions
279 * provided by gcc/clang and other compilers that support it.
280 */
281#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
282/* Compatibility with gcc <= 9 */
283#if defined(__GNUC__) && (__GNUC__ <= 9)
284#define __has_builtin(x) HAS##x
285#define HAS__builtin_popcount 1
286#define HAS__builtin_popcountll 1
287
288// __builtin_shuffle introduced in GCC 4.7.0
289#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
290#define HAS__builtin_shuffle 1
291#else
292#define HAS__builtin_shuffle 0
293#endif
294
295#define HAS__builtin_shufflevector 0
296#define HAS__builtin_nontemporal_store 0
297#else
298#define __has_builtin(x) 0
299#endif
300#endif
301
310#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
311 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
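/* Worked example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) packs the four 2-bit
 * selectors as (3 << 6) | (2 << 4) | (1 << 2) | 0 = 0xE4, the identity
 * shuffle, while _MM_SHUFFLE(0, 1, 2, 3) = 0x1B reverses the element order.
 */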
312
313#if __has_builtin(__builtin_shufflevector)
314#define _sse2neon_shuffle(type, a, b, ...) \
315 __builtin_shufflevector(a, b, __VA_ARGS__)
316#elif __has_builtin(__builtin_shuffle)
317#define _sse2neon_shuffle(type, a, b, ...) \
318 __extension__({ \
319 type tmp = {__VA_ARGS__}; \
320 __builtin_shuffle(a, b, tmp); \
321 })
322#endif
323
324#ifdef _sse2neon_shuffle
325#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
326#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
327#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
328#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
329#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
330#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
331#endif
332
333/* Rounding mode macros. */
334#define _MM_FROUND_TO_NEAREST_INT 0x00
335#define _MM_FROUND_TO_NEG_INF 0x01
336#define _MM_FROUND_TO_POS_INF 0x02
337#define _MM_FROUND_TO_ZERO 0x03
338#define _MM_FROUND_CUR_DIRECTION 0x04
339#define _MM_FROUND_NO_EXC 0x08
340#define _MM_FROUND_RAISE_EXC 0x00
341#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
342#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
343#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
344#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
345#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
346#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
347#define _MM_ROUND_NEAREST 0x0000
348#define _MM_ROUND_DOWN 0x2000
349#define _MM_ROUND_UP 0x4000
350#define _MM_ROUND_TOWARD_ZERO 0x6000
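/* Illustrative sketch (not part of the upstream header): the _MM_FROUND_*
 * flags are OR-ed together when calling the SSE4.1 rounding intrinsics
 * declared later in this header, e.g. for some __m128 x:
 *
 *   __m128 r = _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 *
 * while the _MM_ROUND_* values are the MXCSR-style codes consumed by
 * _MM_SET_ROUNDING_MODE().
 */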
351/* Flush zero mode macros. */
352#define _MM_FLUSH_ZERO_MASK 0x8000
353#define _MM_FLUSH_ZERO_ON 0x8000
354#define _MM_FLUSH_ZERO_OFF 0x0000
355/* Denormals are zeros mode macros. */
356#define _MM_DENORMALS_ZERO_MASK 0x0040
357#define _MM_DENORMALS_ZERO_ON 0x0040
358#define _MM_DENORMALS_ZERO_OFF 0x0000
359
360/* indicate immediate constant argument in a given range */
361#define __constrange(a, b) const
362
363/* A few intrinsics accept traditional data types like ints or floats, but
364 * most operate on data types that are specific to SSE.
365 * If a vector type ends in d, it contains doubles, and if it does not have
366 * a suffix, it contains floats. An integer vector type can contain any type
367 * of integer, from chars to shorts to unsigned long longs.
368 */
369typedef int64x1_t __m64;
370typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
371// On the 32-bit Arm architecture, float64x2_t is not supported.
372// The data type __m128d therefore has to be represented differently for the
373// related conversion intrinsics.
374#if defined(__aarch64__) || defined(_M_ARM64)
375typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
376#else
377typedef float32x4_t __m128d;
378#endif
379typedef int64x2_t __m128i; /* 128-bit vector containing integers */
380
381// __int64 is defined in the Intrinsics Guide and maps to a different data type
382// depending on the data model
383#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
384#if (defined(__x86_64__) || defined(__i386__))
385#define __int64 long long
386#else
387#define __int64 int64_t
388#endif
389#endif
390
391/* type-safe casting between types */
392
393#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
394#define vreinterpretq_m128_f32(x) (x)
395#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
396
397#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
398#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
399#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
400#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
401
402#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
403#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
404#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
405#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
406
407#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
408#define vreinterpretq_f32_m128(x) (x)
409#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
410
411#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
412#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
413#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
414#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
415
416#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
417#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
418#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
419#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
420
421#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
422#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
423#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
424#define vreinterpretq_m128i_s64(x) (x)
425
426#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
427#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
428#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
429#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
430
431#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
432#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)
433
434#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
435#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
436#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
437#define vreinterpretq_s64_m128i(x) (x)
438
439#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
440#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
441#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
442#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
443
444#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
445#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
446#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
447#define vreinterpret_m64_s64(x) (x)
448
449#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
450#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
451#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
452#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
453
454#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
455#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
456#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
457
458#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
459#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
460#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
461#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
462
463#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
464#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
465#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
466#define vreinterpret_s64_m64(x) (x)
467
468#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
469
470#if defined(__aarch64__) || defined(_M_ARM64)
471#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
472#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
473
474#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)
475
476#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
477#define vreinterpretq_m128d_f64(x) (x)
478
479#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
480
481#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
482#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)
483
484#define vreinterpretq_f64_m128d(x) (x)
485#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
486#else
487#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
488#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
489
490#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
491#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
492
493#define vreinterpretq_m128d_f32(x) (x)
494
495#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
496
497#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
498#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
499
500#define vreinterpretq_f32_m128d(x) (x)
501#endif
502
503// This header defines a union called 'SIMDVec' which can be used by
504// applications that attempt to access the contents of an __m128 struct
505// directly. Note that accessing the __m128 members directly is considered
506// bad coding practice by Microsoft: @see:
507// https://learn.microsoft.com/en-us/cpp/cpp/m128
508//
509// However, some legacy source code may try to access the contents of an __m128
510// struct directly so the developer can use the SIMDVec as an alias for it. Any
511// casting must be done manually by the developer, as you cannot cast or
512// otherwise alias the base NEON data type for intrinsic operations.
513//
514// The union is intended to allow direct access to an __m128 variable using the
515// member names that the MSVC compiler provides. It should really only be used
516// when accessing the members of the vector as integer values. GCC/clang allow
517// native access to the float members through a simple array access operator
518// (in C since 4.6, in C++ since 4.8).
519//
520// Ideally, direct accesses to SIMD vectors should be avoided since they can
521// cause a performance hit. If such access really is needed, however, the
522// original __m128 variable can be aliased with a pointer to this union and used
523// to access individual components. The use of this union should be hidden
524// behind a macro that is used throughout the codebase to access the members
525// instead of always declaring this type of variable.
526typedef union ALIGN_STRUCT(16) SIMDVec {
527 float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
528 int8_t m128_i8[16]; // as signed 8-bit integers.
529 int16_t m128_i16[8]; // as signed 16-bit integers.
530 int32_t m128_i32[4]; // as signed 32-bit integers.
531 int64_t m128_i64[2]; // as signed 64-bit integers.
532 uint8_t m128_u8[16]; // as unsigned 8-bit integers.
533 uint16_t m128_u16[8]; // as unsigned 16-bit integers.
534 uint32_t m128_u32[4]; // as unsigned 32-bit integers.
535 uint64_t m128_u64[2]; // as unsigned 64-bit integers.
536} SIMDVec;
537
538// casting using SIMDVec
539#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
540#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
541#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
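/* Usage sketch (illustrative, not part of the upstream header): reading an
 * individual 32-bit lane of an __m128i through the SIMDVec alias.
 *
 *   __m128i v = _mm_set_epi32(4, 3, 2, 1);
 *   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);   // lane0 == 1
 *
 * Prefer the regular extract/convert intrinsics where possible; this union
 * exists mainly for legacy code that pokes at __m128 members directly.
 */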
542
543/* SSE macros */
544#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
545#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
546#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
547#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
548
549// Function declaration
550// SSE
551FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
552FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
553FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
554FORCE_INLINE __m128 _mm_set_ps1(float);
555FORCE_INLINE __m128 _mm_setzero_ps(void);
556// SSE2
557FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
558FORCE_INLINE __m128i _mm_castps_si128(__m128);
560FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
561FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
562FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
563FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
564FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
565FORCE_INLINE __m128d _mm_set_pd(double, double);
566FORCE_INLINE __m128i _mm_set1_epi32(int);
567FORCE_INLINE __m128i _mm_setzero_si128(void);
568// SSE4.1
569FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
570FORCE_INLINE __m128 _mm_ceil_ps(__m128);
571FORCE_INLINE __m128d _mm_floor_pd(__m128d);
572FORCE_INLINE __m128 _mm_floor_ps(__m128);
573FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
574FORCE_INLINE __m128 _mm_round_ps(__m128, int);
575// SSE4.2
576FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
577
578/* Backwards compatibility for compilers that lack support for specific types */
579
580// Older GCC does not provide the vld1q_u8_x4 intrinsic
581#if defined(__GNUC__) && !defined(__clang__) && \
582 ((__GNUC__ <= 13 && defined(__arm__)) || \
583 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
584 (__GNUC__ <= 9 && defined(__aarch64__)))
585FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
586{
587 uint8x16x4_t ret;
588 ret.val[0] = vld1q_u8(p + 0);
589 ret.val[1] = vld1q_u8(p + 16);
590 ret.val[2] = vld1q_u8(p + 32);
591 ret.val[3] = vld1q_u8(p + 48);
592 return ret;
593}
594#else
595// Wraps vld1q_u8_x4
596FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
597{
598 return vld1q_u8_x4(p);
599}
600#endif
601
602#if !defined(__aarch64__) && !defined(_M_ARM64)
603/* emulate vaddv u8 variant */
604FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
605{
606 const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
607 return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
608}
609#else
610// Wraps vaddv_u8
611FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
612{
613 return vaddv_u8(v8);
614}
615#endif
616
617#if !defined(__aarch64__) && !defined(_M_ARM64)
618/* emulate vaddvq u8 variant */
619FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
620{
621 uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
622 uint8_t res = 0;
623 for (int i = 0; i < 8; ++i)
624 res += tmp[i];
625 return res;
626}
627#else
628// Wraps vaddvq_u8
629FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
630{
631 return vaddvq_u8(a);
632}
633#endif
634
635#if !defined(__aarch64__) && !defined(_M_ARM64)
636/* emulate vaddvq u16 variant */
637FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
638{
639 uint32x4_t m = vpaddlq_u16(a);
640 uint64x2_t n = vpaddlq_u32(m);
641 uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
642
643 return vget_lane_u32((uint32x2_t) o, 0);
644}
645#else
646// Wraps vaddvq_u16
647FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
648{
649 return vaddvq_u16(a);
650}
651#endif
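/* These wrappers give a uniform horizontal-add helper on both ARMv7 and
 * AArch64. Illustrative sketch (not part of the upstream header):
 *
 *   uint16x8_t v = vdupq_n_u16(3);
 *   uint16_t sum = _sse2neon_vaddvq_u16(v);   // 8 lanes * 3 == 24
 */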
652
653/* Function Naming Conventions
654 * The naming convention of SSE intrinsics is straightforward. A generic SSE
655 * intrinsic function is given as follows:
656 * _mm_<name>_<data_type>
657 *
658 * The parts of this format are given as follows:
659 * 1. <name> describes the operation performed by the intrinsic
660 * 2. <data_type> identifies the data type of the function's primary arguments
661 *
662 * This last part, <data_type>, is a little complicated. It identifies the
663 * content of the input values, and can be set to any of the following values:
664 * + ps - vectors contain floats (ps stands for packed single-precision)
665 * + pd - vectors contain doubles (pd stands for packed double-precision)
666 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
667 * signed integers
668 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
669 * unsigned integers
670 * + si128 - unspecified 128-bit integer vector
671 * + m128/m128i/m128d - identifies input vector types when they are different
672 * than the type of the returned vector
673 *
674 * For example, _mm_setzero_ps. The _mm implies that the function returns
675 * a 128-bit vector. The _ps at the end implies that the argument vectors
676 * contain floats.
677 *
678 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
679 * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
680 * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
681 * // Set packed 8-bit integers
682 * // 128 bits, 16 chars, per 8 bits
683 * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
684 * 4, 5, 12, 13, 6, 7, 14, 15);
685 * // Shuffle packed 8-bit integers
686 * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
687 */
688
689/* Constants for use with _mm_prefetch. */
690enum _mm_hint {
691 _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
692 _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
693 _MM_HINT_T1 = 2, /* load data to L2 cache only */
694 _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
695};
696
697// The bit field mapping to the FPCR (floating-point control register)
698typedef struct {
699 uint16_t res0;
700 uint8_t res1 : 6;
701 uint8_t bit22 : 1;
702 uint8_t bit23 : 1;
703 uint8_t bit24 : 1;
704 uint8_t res2 : 7;
705#if defined(__aarch64__) || defined(_M_ARM64)
706 uint32_t res3;
707#endif
708} fpcr_bitfield;
709
710// Takes the upper 64 bits of a and places it in the low end of the result
711// Takes the lower 64 bits of b and places it into the high end of the result.
713{
714 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
715 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
716 return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
717}
718
719// Takes the lower two 32-bit values from a, swaps them, and places them in the
720// low end of the result; takes the upper two 32-bit values from b, swaps them,
721// and places them in the high end of the result.
723{
724 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
725 float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
726 return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
727}
728
730{
731 float32x2_t a21 = vget_high_f32(
733 float32x2_t b03 = vget_low_f32(
735 return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
736}
737
739{
740 float32x2_t a03 = vget_low_f32(
742 float32x2_t b21 = vget_high_f32(
744 return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
745}
746
748{
749 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
750 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
751 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
752}
753
755{
756 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
757 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
758 return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
759}
760
762{
763 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
764 float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
765 return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
766}
767
768// Keeps the low 64 bits of a in the low half and puts the high 64 bits of b in
769// the high half of the result.
771{
772 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
773 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
774 return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
775}
776
778{
779 float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
780 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
781 return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
782}
783
785{
786 float32x2_t a22 =
787 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
788 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
789 return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
790}
791
793{
794 float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
795 float32x2_t b22 =
796 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
797 return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
798}
799
801{
802 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
803 float32x2_t a22 =
804 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
805 float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
806 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
807 return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
808}
809
811{
812 float32x2_t a33 =
813 vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
814 float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
815 return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
816}
817
819{
820 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
821 float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
822 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
823 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
824 return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
825}
826
828{
829 float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
830 float32_t b2 = vgetq_lane_f32(b, 2);
831 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
832 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
833 return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
834}
835
837{
838 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
839 float32_t b2 = vgetq_lane_f32(b, 2);
840 float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
841 float32x2_t b20 = vset_lane_f32(b2, b00, 1);
842 return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
843}
844
845// For MSVC, we check only if it is ARM64, as every single ARM64 processor
846// supported by WoA has crypto extensions. If this changes in the future,
847// this can be verified via the runtime-only method of:
848// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
849#if (defined(_M_ARM64) && !defined(__clang__)) || \
850 (defined(__ARM_FEATURE_CRYPTO) && \
851 (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
852// Wraps vmull_p64
853FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
854{
855 poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
856 poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
857#if defined(_MSC_VER)
858 __n64 a1 = {a}, b1 = {b};
859 return vreinterpretq_u64_p128(vmull_p64(a1, b1));
860#else
861 return vreinterpretq_u64_p128(vmull_p64(a, b));
862#endif
863}
864#else // ARMv7 polyfill
865// ARMv7 and some AArch64 cores lack vmull_p64, but they do have vmull_p8.
866//
867// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
868// 64-bit->128-bit polynomial multiply.
869//
870// It needs some work and is somewhat slow, but it is still faster than all
871// known scalar methods.
872//
873// Algorithm adapted to C from
874// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
875// from "Fast Software Polynomial Multiplication on ARM Processors Using the
876// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
877// (https://hal.inria.fr/hal-01506572)
878static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
879{
880 poly8x8_t a = vreinterpret_p8_u64(_a);
881 poly8x8_t b = vreinterpret_p8_u64(_b);
882
883 // Masks
884 uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
885 vcreate_u8(0x00000000ffffffff));
886 uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
887 vcreate_u8(0x0000000000000000));
888
889 // Do the multiplies, rotating with vext to get all combinations
890 uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
891 uint8x16_t e =
892 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
893 uint8x16_t f =
894 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
895 uint8x16_t g =
896 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
897 uint8x16_t h =
898 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
899 uint8x16_t i =
900 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
901 uint8x16_t j =
902 vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
903 uint8x16_t k =
904 vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
905
906 // Add cross products
907 uint8x16_t l = veorq_u8(e, f); // L = E + F
908 uint8x16_t m = veorq_u8(g, h); // M = G + H
909 uint8x16_t n = veorq_u8(i, j); // N = I + J
910
911 // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
912 // instructions.
913#if defined(__aarch64__)
914 uint8x16_t lm_p0 = vreinterpretq_u8_u64(
915 vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
916 uint8x16_t lm_p1 = vreinterpretq_u8_u64(
917 vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
918 uint8x16_t nk_p0 = vreinterpretq_u8_u64(
919 vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
920 uint8x16_t nk_p1 = vreinterpretq_u8_u64(
921 vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
922#else
923 uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
924 uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
925 uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
926 uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
927#endif
928 // t0 = (L) (P0 + P1) << 8
929 // t1 = (M) (P2 + P3) << 16
930 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
931 uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
932 uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
933
934 // t2 = (N) (P4 + P5) << 24
935 // t3 = (K) (P6 + P7) << 32
936 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
937 uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
938 uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
939
940 // De-interleave
941#if defined(__aarch64__)
942 uint8x16_t t0 = vreinterpretq_u8_u64(
943 vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
944 uint8x16_t t1 = vreinterpretq_u8_u64(
945 vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
946 uint8x16_t t2 = vreinterpretq_u8_u64(
947 vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
948 uint8x16_t t3 = vreinterpretq_u8_u64(
949 vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
950#else
951 uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
952 uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
953 uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
954 uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
955#endif
956 // Shift the cross products
957 uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
958 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
959 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
960 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
961
962 // Accumulate the products
963 uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
964 uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
965 uint8x16_t mix = veorq_u8(d, cross1);
966 uint8x16_t r = veorq_u8(mix, cross2);
967 return vreinterpretq_u64_u8(r);
968}
969#endif // ARMv7 polyfill
970
971// C equivalent:
972// __m128i _mm_shuffle_epi32_default(__m128i a,
973// __constrange(0, 255) int imm) {
974// __m128i ret;
975// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
976// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
977// return ret;
978// }
979#define _mm_shuffle_epi32_default(a, imm) \
980 vreinterpretq_m128i_s32(vsetq_lane_s32( \
981 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
982 vsetq_lane_s32( \
983 vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
984 vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \
985 ((imm) >> 2) & 0x3), \
986 vmovq_n_s32(vgetq_lane_s32( \
987 vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
988 1), \
989 2), \
990 3))
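/* Usage sketch (illustrative, not part of the upstream header): a broadcast is
 * simply a shuffle whose four selectors are equal, e.g.
 *
 *   __m128i v = _mm_set_epi32(7, 6, 5, 4);
 *   __m128i r = _mm_shuffle_epi32_default(v, _MM_SHUFFLE(0, 0, 0, 0));
 *   // every 32-bit lane of r is now 4 (element 0 of v)
 */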
991
992// Takes the upper 64 bits of a and places it in the low end of the result
993// Takes the lower 64 bits of a and places it into the high end of the result.
995{
996 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
997 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
998 return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
999}
1000
1001// Takes the lower two 32-bit values from a, swaps them, and places them in the
1002// low end of the result; takes the upper two 32-bit values from a, swaps them,
1003// and places them in the high end of the result.
1005{
1006 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1007 int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
1008 return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
1009}
1010
1011// rotates the least significant 32 bits into the most significant 32 bits, and
1012// shifts the rest down
1014{
1017}
1018
1019// rotates the most significant 32 bits into the least significant 32 bits, and
1020// shifts the rest up
1022{
1025}
1026
1027// gets the lower 64 bits of a, and places it in the upper 64 bits
1028// gets the lower 64 bits of a and places it in the lower 64 bits
1030{
1031 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1032 return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
1033}
1034
1035// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places them in
1036// the low half; places the unswapped lower 64 bits of a in the high half.
1038{
1039 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1040 int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
1041 return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
1042}
1043
1044// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
1045// swapped pair in the upper 64 bits; the same swapped pair is also placed in
1046// the lower 64 bits of the result.
1048{
1049 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1050 return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
1051}
1052
1054{
1055 int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
1056 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1057 return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
1058}
1059
1061{
1062 int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
1063 int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
1064 return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
1065}
1066
1068{
1069 int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
1070 int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
1071 return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
1072}
1073
1074#if defined(__aarch64__) || defined(_M_ARM64)
1075#define _mm_shuffle_epi32_splat(a, imm) \
1076 vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
1077#else
1078#define _mm_shuffle_epi32_splat(a, imm) \
1079 vreinterpretq_m128i_s32( \
1080 vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
1081#endif
1082
1083// NEON does not support a general purpose permute intrinsic.
1084// Shuffle single-precision (32-bit) floating-point elements in a using the
1085// control in imm8, and store the results in dst.
1086//
1087// C equivalent:
1088// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
1089// __constrange(0, 255) int imm) {
1090// __m128 ret;
1091// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
1092// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
1093// return ret;
1094// }
1095//
1096// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
1097#define _mm_shuffle_ps_default(a, b, imm) \
1098 vreinterpretq_m128_f32(vsetq_lane_f32( \
1099 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
1100 vsetq_lane_f32( \
1101 vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
1102 vsetq_lane_f32( \
1103 vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
1104 vmovq_n_f32( \
1105 vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
1106 1), \
1107 2), \
1108 3))
1109
1110// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
1111// Store the results in the low 64 bits of dst, with the high 64 bits being
1112// copied from a to dst.
1113// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
1114#define _mm_shufflelo_epi16_function(a, imm) \
1115 _sse2neon_define1( \
1116 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1117 int16x4_t lowBits = vget_low_s16(ret); \
1118 ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
1119 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
1120 1); \
1121 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
1122 2); \
1123 ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
1124 3); \
1125 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1126
1127// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
1128// Store the results in the high 64 bits of dst, with the low 64 bits being
1129// copied from a to dst.
1130// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
1131#define _mm_shufflehi_epi16_function(a, imm) \
1132 _sse2neon_define1( \
1133 __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
1134 int16x4_t highBits = vget_high_s16(ret); \
1135 ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
1136 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
1137 5); \
1138 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
1139 6); \
1140 ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
1141 7); \
1142 _sse2neon_return(vreinterpretq_m128i_s16(ret));)
1143
1144/* MMX */
1145
1146// _mm_empty is a no-op on Arm
1147FORCE_INLINE void _mm_empty(void) {}
1148
1149/* SSE */
1150
1151// Add packed single-precision (32-bit) floating-point elements in a and b, and
1152// store the results in dst.
1153// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
1154FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
1155{
1158}
1159
1160// Add the lower single-precision (32-bit) floating-point element in a and b,
1161// store the result in the lower element of dst, and copy the upper 3 packed
1162// elements from a to the upper elements of dst.
1163// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
1164FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
1165{
1166 float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
1167 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
1168 // the upper values in the result must be the remnants of <a>.
1169 return vreinterpretq_m128_f32(vaddq_f32(a, value));
1170}
1171
1172// Compute the bitwise AND of packed single-precision (32-bit) floating-point
1173// elements in a and b, and store the results in dst.
1174// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
1175FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
1176{
1179}
1180
1181// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
1182// elements in a and then AND with b, and store the results in dst.
1183// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
1185{
1187 vbicq_s32(vreinterpretq_s32_m128(b),
1188 vreinterpretq_s32_m128(a))); // *NOTE* argument swap
1189}
1190
1191// Average packed unsigned 16-bit integers in a and b, and store the results in
1192// dst.
1193// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
1194FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
1195{
1196 return vreinterpret_m64_u16(
1197 vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
1198}
1199
1200// Average packed unsigned 8-bit integers in a and b, and store the results in
1201// dst.
1202// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
1203FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
1204{
1205 return vreinterpret_m64_u8(
1206 vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
1207}
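/* NEON's vrhadd computes (a + b + 1) >> 1 per lane, which matches the rounding
 * of SSE's PAVGB/PAVGW exactly. Illustrative check (assumes the MMX set1
 * helpers provided elsewhere in this header):
 *
 *   __m64 r = _mm_avg_pu16(_mm_set1_pi16(3), _mm_set1_pi16(4));
 *   // every 16-bit lane of r is (3 + 4 + 1) >> 1 == 4
 */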
1208
1209// Compare packed single-precision (32-bit) floating-point elements in a and b
1210// for equality, and store the results in dst.
1211// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
1213{
1216}
1217
1218// Compare the lower single-precision (32-bit) floating-point elements in a and
1219// b for equality, store the result in the lower element of dst, and copy the
1220// upper 3 packed elements from a to the upper elements of dst.
1221// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
1223{
1224 return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
1225}
1226
1227// Compare packed single-precision (32-bit) floating-point elements in a and b
1228// for greater-than-or-equal, and store the results in dst.
1229// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
1231{
1234}
1235
1236// Compare the lower single-precision (32-bit) floating-point elements in a and
1237// b for greater-than-or-equal, store the result in the lower element of dst,
1238// and copy the upper 3 packed elements from a to the upper elements of dst.
1239// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
1241{
1242 return _mm_move_ss(a, _mm_cmpge_ps(a, b));
1243}
1244
1245// Compare packed single-precision (32-bit) floating-point elements in a and b
1246// for greater-than, and store the results in dst.
1247// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
1249{
1252}
1253
1254// Compare the lower single-precision (32-bit) floating-point elements in a and
1255// b for greater-than, store the result in the lower element of dst, and copy
1256// the upper 3 packed elements from a to the upper elements of dst.
1257// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
1259{
1260 return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
1261}
1262
1263// Compare packed single-precision (32-bit) floating-point elements in a and b
1264// for less-than-or-equal, and store the results in dst.
1265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
1267{
1270}
1271
1272// Compare the lower single-precision (32-bit) floating-point elements in a and
1273// b for less-than-or-equal, store the result in the lower element of dst, and
1274// copy the upper 3 packed elements from a to the upper elements of dst.
1275// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
1277{
1278 return _mm_move_ss(a, _mm_cmple_ps(a, b));
1279}
1280
1281// Compare packed single-precision (32-bit) floating-point elements in a and b
1282// for less-than, and store the results in dst.
1283// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
1285{
1288}
1289
1290// Compare the lower single-precision (32-bit) floating-point elements in a and
1291// b for less-than, store the result in the lower element of dst, and copy the
1292// upper 3 packed elements from a to the upper elements of dst.
1293// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
1295{
1296 return _mm_move_ss(a, _mm_cmplt_ps(a, b));
1297}
1298
1299// Compare packed single-precision (32-bit) floating-point elements in a and b
1300// for not-equal, and store the results in dst.
1301// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
1303{
1304 return vreinterpretq_m128_u32(vmvnq_u32(
1306}
1307
1308// Compare the lower single-precision (32-bit) floating-point elements in a and
1309// b for not-equal, store the result in the lower element of dst, and copy the
1310// upper 3 packed elements from a to the upper elements of dst.
1311// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
1313{
1314 return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
1315}
1316
1317// Compare packed single-precision (32-bit) floating-point elements in a and b
1318// for not-greater-than-or-equal, and store the results in dst.
1319// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
1321{
1322 return vreinterpretq_m128_u32(vmvnq_u32(
1324}
1325
1326// Compare the lower single-precision (32-bit) floating-point elements in a and
1327// b for not-greater-than-or-equal, store the result in the lower element of
1328// dst, and copy the upper 3 packed elements from a to the upper elements of
1329// dst.
1330// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
1332{
1333 return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
1334}
1335
1336// Compare packed single-precision (32-bit) floating-point elements in a and b
1337// for not-greater-than, and store the results in dst.
1338// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
1340{
1341 return vreinterpretq_m128_u32(vmvnq_u32(
1343}
1344
1345// Compare the lower single-precision (32-bit) floating-point elements in a and
1346// b for not-greater-than, store the result in the lower element of dst, and
1347// copy the upper 3 packed elements from a to the upper elements of dst.
1348// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
1350{
1351 return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
1352}
1353
1354// Compare packed single-precision (32-bit) floating-point elements in a and b
1355// for not-less-than-or-equal, and store the results in dst.
1356// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
1358{
1359 return vreinterpretq_m128_u32(vmvnq_u32(
1361}
1362
1363// Compare the lower single-precision (32-bit) floating-point elements in a and
1364// b for not-less-than-or-equal, store the result in the lower element of dst,
1365// and copy the upper 3 packed elements from a to the upper elements of dst.
1366// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
1368{
1369 return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
1370}
1371
1372// Compare packed single-precision (32-bit) floating-point elements in a and b
1373// for not-less-than, and store the results in dst.
1374// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
1376{
1377 return vreinterpretq_m128_u32(vmvnq_u32(
1379}
1380
1381// Compare the lower single-precision (32-bit) floating-point elements in a and
1382// b for not-less-than, store the result in the lower element of dst, and copy
1383// the upper 3 packed elements from a to the upper elements of dst.
1384// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
1386{
1387 return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
1388}
1389
1390// Compare packed single-precision (32-bit) floating-point elements in a and b
1391// to see if neither is NaN, and store the results in dst.
1392// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
1393//
1394// See also:
1395// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1396// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1398{
1399 // Note: NEON does not have ordered compare builtin
1400 // Need to compare a eq a and b eq b to check for NaN
1401 // Do AND of results to get final
1402 uint32x4_t ceqaa =
1404 uint32x4_t ceqbb =
1406 return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1407}
1408
1409// Compare the lower single-precision (32-bit) floating-point elements in a and
1410// b to see if neither is NaN, store the result in the lower element of dst, and
1411// copy the upper 3 packed elements from a to the upper elements of dst.
1412// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
1414{
1415 return _mm_move_ss(a, _mm_cmpord_ps(a, b));
1416}
1417
1418// Compare packed single-precision (32-bit) floating-point elements in a and b
1419// to see if either is NaN, and store the results in dst.
1420// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
1422{
1423 uint32x4_t f32a =
1425 uint32x4_t f32b =
1427 return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
1428}
1429
1430// Compare the lower single-precision (32-bit) floating-point elements in a and
1431// b to see if either is NaN, store the result in the lower element of dst, and
1432// copy the upper 3 packed elements from a to the upper elements of dst.
1433// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
1435{
1436 return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
1437}
1438
1439// Compare the lower single-precision (32-bit) floating-point element in a and b
1440// for equality, and return the boolean result (0 or 1).
1441// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
1442FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
1443{
1444 uint32x4_t a_eq_b =
1446 return vgetq_lane_u32(a_eq_b, 0) & 0x1;
1447}
1448
1449// Compare the lower single-precision (32-bit) floating-point element in a and b
1450// for greater-than-or-equal, and return the boolean result (0 or 1).
1451// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
1452FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
1453{
1454 uint32x4_t a_ge_b =
1456 return vgetq_lane_u32(a_ge_b, 0) & 0x1;
1457}
1458
1459// Compare the lower single-precision (32-bit) floating-point element in a and b
1460// for greater-than, and return the boolean result (0 or 1).
1461// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
1462FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
1463{
1464 uint32x4_t a_gt_b =
1466 return vgetq_lane_u32(a_gt_b, 0) & 0x1;
1467}
1468
1469// Compare the lower single-precision (32-bit) floating-point element in a and b
1470// for less-than-or-equal, and return the boolean result (0 or 1).
1471// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
1472FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
1473{
1474 uint32x4_t a_le_b =
1476 return vgetq_lane_u32(a_le_b, 0) & 0x1;
1477}
1478
1479// Compare the lower single-precision (32-bit) floating-point element in a and b
1480// for less-than, and return the boolean result (0 or 1).
1481// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
1482FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
1483{
1484 uint32x4_t a_lt_b =
1486 return vgetq_lane_u32(a_lt_b, 0) & 0x1;
1487}
1488
1489// Compare the lower single-precision (32-bit) floating-point element in a and b
1490// for not-equal, and return the boolean result (0 or 1).
1491// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
1492FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
1493{
1494 return !_mm_comieq_ss(a, b);
1495}
1496
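// Editor's note: the _mm_comi*_ss helpers above reduce a full-width NEON
// comparison to a scalar 0/1 result taken from the lowest lane. A minimal
// usage sketch follows (illustrative only, not part of sse2neon; the helper
// name is hypothetical):
/*
    #include <stdio.h>
    static void comi_demo(void)
    {
        __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // lowest lane is 1.0f
        __m128 y = _mm_set_ss(1.0f);                   // lowest lane is 1.0f
        printf("%d\n", _mm_comieq_ss(x, y));           // prints 1
    }
*/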
1497// Convert packed signed 32-bit integers in b to packed single-precision
1498// (32-bit) floating-point elements, store the results in the lower 2 elements
1499// of dst, and copy the upper 2 packed elements from a to the upper elements of
1500// dst.
1501// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
1502FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
1503{
1504 return vreinterpretq_m128_f32(
1505 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1506 vget_high_f32(vreinterpretq_f32_m128(a))));
1507}
1508
1509// Convert packed single-precision (32-bit) floating-point elements in a to
1510// packed 32-bit integers, and store the results in dst.
1511// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
1512FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
1513{
1514#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1515 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1516 return vreinterpret_m64_s32(
1517 vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
1518#else
1519 return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
1520 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
1521#endif
1522}
1523
1524// Convert the signed 32-bit integer b to a single-precision (32-bit)
1525// floating-point element, store the result in the lower element of dst, and
1526// copy the upper 3 packed elements from a to the upper elements of dst.
1527// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
1528FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
1529{
1530 return vreinterpretq_m128_f32(
1531 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1532}
1533
1534// Convert the lower single-precision (32-bit) floating-point element in a to a
1535// 32-bit integer, and store the result in dst.
1536// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
1537FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
1538{
1539#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1540 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1541 return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
1542 0);
1543#else
1544 float32_t data = vgetq_lane_f32(
1545 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1546 return (int32_t) data;
1547#endif
1548}
1549
1550// Convert packed 16-bit integers in a to packed single-precision (32-bit)
1551// floating-point elements, and store the results in dst.
1552// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
1553FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
1554{
1555 return vreinterpretq_m128_f32(
1556 vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
1557}
1558
1559// Convert packed 32-bit integers in b to packed single-precision (32-bit)
1560// floating-point elements, store the results in the lower 2 elements of dst,
1561// and copy the upper 2 packed elements from a to the upper elements of dst.
1562// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
1563FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
1564{
1565 return vreinterpretq_m128_f32(
1566 vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
1567 vget_high_f32(vreinterpretq_f32_m128(a))));
1568}
1569
1570// Convert packed signed 32-bit integers in a to packed single-precision
1571// (32-bit) floating-point elements, store the results in the lower 2 elements
1572// of dst, then convert the packed signed 32-bit integers in b to
1573// single-precision (32-bit) floating-point elements, and store the results in
1574// the upper 2 elements of dst.
1575// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
1576FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
1577{
1578 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1579 vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
1580}
1581
1582// Convert the lower packed 8-bit integers in a to packed single-precision
1583// (32-bit) floating-point elements, and store the results in dst.
1584// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
1585FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
1586{
1587 return vreinterpretq_m128_f32(vcvtq_f32_s32(
1588 vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
1589}
1590
1591// Convert packed single-precision (32-bit) floating-point elements in a to
1592// packed 16-bit integers, and store the results in dst. Note: this intrinsic
1593// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
1594// 0x7FFFFFFF.
1595// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
1596FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
1597{
1598 return vreinterpret_m64_s16(
1599 vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
1600}
1601
1602// Convert packed single-precision (32-bit) floating-point elements in a to
1603// packed 32-bit integers, and store the results in dst.
1604// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
1605#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
1606
1607// Convert packed single-precision (32-bit) floating-point elements in a to
1608// packed 8-bit integers, and store the results in lower 4 elements of dst.
1609// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
1610// between 0x7F and 0x7FFFFFFF.
1611// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
1612FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
1613{
1614 return vreinterpret_m64_s8(vqmovn_s16(
1615 vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
1616}
1617
1618// Convert packed unsigned 16-bit integers in a to packed single-precision
1619// (32-bit) floating-point elements, and store the results in dst.
1620// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
1621FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
1622{
1623 return vreinterpretq_m128_f32(
1624 vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
1625}
1626
1627// Convert the lower packed unsigned 8-bit integers in a to packed
1628// single-precision (32-bit) floating-point elements, and store the results in
1629// dst.
1630// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
1631FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
1632{
1633 return vreinterpretq_m128_f32(vcvtq_f32_u32(
1634 vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
1635}
1636
1637// Convert the signed 32-bit integer b to a single-precision (32-bit)
1638// floating-point element, store the result in the lower element of dst, and
1639// copy the upper 3 packed elements from a to the upper elements of dst.
1640// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
1641#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
1642
1643// Convert the signed 64-bit integer b to a single-precision (32-bit)
1644// floating-point element, store the result in the lower element of dst, and
1645// copy the upper 3 packed elements from a to the upper elements of dst.
1646// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
1647FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
1648{
1649 return vreinterpretq_m128_f32(
1650 vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
1651}
1652
1653// Copy the lower single-precision (32-bit) floating-point element of a to dst.
1654// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
1655FORCE_INLINE float _mm_cvtss_f32(__m128 a)
1656{
1657 return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1658}
1659
1660// Convert the lower single-precision (32-bit) floating-point element in a to a
1661// 32-bit integer, and store the result in dst.
1662// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
1663#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
1664
1665// Convert the lower single-precision (32-bit) floating-point element in a to a
1666// 64-bit integer, and store the result in dst.
1667// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
1668FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
1669{
1670#if (defined(__aarch64__) || defined(_M_ARM64)) || \
1671 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
1672 return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
1673#else
1674 float32_t data = vgetq_lane_f32(
1675 vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
1676 return (int64_t) data;
1677#endif
1678}
1679
1680// Convert packed single-precision (32-bit) floating-point elements in a to
1681// packed 32-bit integers with truncation, and store the results in dst.
1682// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
1683FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
1684{
1685 return vreinterpret_m64_s32(
1686 vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
1687}
1688
1689// Convert the lower single-precision (32-bit) floating-point element in a to a
1690// 32-bit integer with truncation, and store the result in dst.
1691// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
1692FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
1693{
1694 return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
1695}
1696
1697// Convert packed single-precision (32-bit) floating-point elements in a to
1698// packed 32-bit integers with truncation, and store the results in dst.
1699// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
1700#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
1701
1702// Convert the lower single-precision (32-bit) floating-point element in a to a
1703// 32-bit integer with truncation, and store the result in dst.
1704// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
1705#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
1706
1707// Convert the lower single-precision (32-bit) floating-point element in a to a
1708// 64-bit integer with truncation, and store the result in dst.
1709// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
1710FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
1711{
1712 return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
1713}
1714
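// Editor's note: the _mm_cvt*/_mm_cvtt* pairs above differ only in rounding:
// the former honour the current rounding mode (round-to-nearest by default),
// the latter always truncate toward zero. Illustrative sketch (the helper
// name is hypothetical):
/*
    static void cvt_demo(void)
    {
        __m128 v = _mm_set_ss(2.7f);
        int rounded   = _mm_cvt_ss2si(v);  // 3 under the default rounding mode
        int truncated = _mm_cvtt_ss2si(v); // 2, the fraction is discarded
        (void) rounded;
        (void) truncated;
    }
*/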
1715// Divide packed single-precision (32-bit) floating-point elements in a by
1716// packed elements in b, and store the results in dst.
1717// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
1718// division by refining an estimate of b's reciprocal with the Newton-Raphson
1719// method and then multiplying a by that reciprocal.
1720// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
1721FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
1722{
1723#if defined(__aarch64__) || defined(_M_ARM64)
1724 return vreinterpretq_m128_f32(
1725 vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1726#else
1727 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
1728 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1729 // Additional Newton-Raphson iteration for accuracy
1730 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
1731 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
1732#endif
1733}
1734
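// Editor's note: on the ARMv7-A path above, vrecpeq_f32 gives a rough
// reciprocal estimate and each vrecpsq_f32/vmulq_f32 pair applies one
// Newton-Raphson step, x' = x * (2 - b * x). A scalar sketch of that
// recurrence (illustrative only, not part of sse2neon):
/*
    static float recip_nr(float b, float x0) // x0: rough estimate of 1/b
    {
        float x = x0;
        x = x * (2.0f - b * x); // first Newton-Raphson step
        x = x * (2.0f - b * x); // second step, as in _mm_div_ps above
        return x;
    }
*/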
1735// Divide the lower single-precision (32-bit) floating-point element in a by the
1736// lower single-precision (32-bit) floating-point element in b, store the result
1737// in the lower element of dst, and copy the upper 3 packed elements from a to
1738// the upper elements of dst.
1739// Warning: on ARMv7-A the result is not bit-exact with Intel's and is not
1740// IEEE-compliant.
1741// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
1742FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
1743{
1744 float32_t value =
1745 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
1746 return vreinterpretq_m128_f32(
1747 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
1748}
1749
1750// Extract a 16-bit integer from a, selected with imm8, and store the result in
1751// the lower element of dst.
1752// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
1753#define _mm_extract_pi16(a, imm) \
1754 (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
1755
1756// Free aligned memory that was allocated with _mm_malloc.
1757// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
1758#if !defined(SSE2NEON_ALLOC_DEFINED)
1759FORCE_INLINE void _mm_free(void *addr)
1760{
1761 free(addr);
1762}
1763#endif
1764
1765FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
1766{
1767 uint64_t value;
1768#if defined(_MSC_VER)
1769 value = _ReadStatusReg(ARM64_FPCR);
1770#else
1771 __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
1772#endif
1773 return value;
1774}
1775
1776FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
1777{
1778#if defined(_MSC_VER)
1779 _WriteStatusReg(ARM64_FPCR, value);
1780#else
1781 __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
1782#endif
1783}
1784
1785// Macro: Get the flush zero bits from the MXCSR control and status register.
1786// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
1787// _MM_FLUSH_ZERO_OFF
1788// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
1789FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
1790{
1791 union {
1792 fpcr_bitfield field;
1793#if defined(__aarch64__) || defined(_M_ARM64)
1794 uint64_t value;
1795#else
1796 uint32_t value;
1797#endif
1798 } r;
1799
1800#if defined(__aarch64__) || defined(_M_ARM64)
1801 r.value = _sse2neon_get_fpcr();
1802#else
1803 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1804#endif
1805
1806 return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
1807}
1808
1809// Macro: Get the rounding mode bits from the MXCSR control and status register.
1810// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
1811// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
1812// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
1813FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
1814{
1815 union {
1816 fpcr_bitfield field;
1817#if defined(__aarch64__) || defined(_M_ARM64)
1818 uint64_t value;
1819#else
1820 uint32_t value;
1821#endif
1822 } r;
1823
1824#if defined(__aarch64__) || defined(_M_ARM64)
1825 r.value = _sse2neon_get_fpcr();
1826#else
1827 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
1828#endif
1829
1830 if (r.field.bit22) {
1831 return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
1832 } else {
1833 return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
1834 }
1835}
1836
1837// Copy a to dst, and insert the 16-bit integer i into dst at the location
1838// specified by imm8.
1839// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
1840#define _mm_insert_pi16(a, b, imm) \
1841 vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
1842
1843// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1844// elements) from memory into dst. mem_addr must be aligned on a 16-byte
1845// boundary or a general-protection exception may be generated.
1846// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
1847FORCE_INLINE __m128 _mm_load_ps(const float *p)
1848{
1849 return vreinterpretq_m128_f32(vld1q_f32(p));
1850}
1851
1852// Load a single-precision (32-bit) floating-point element from memory into all
1853// elements of dst.
1854//
1855// dst[31:0] := MEM[mem_addr+31:mem_addr]
1856// dst[63:32] := MEM[mem_addr+31:mem_addr]
1857// dst[95:64] := MEM[mem_addr+31:mem_addr]
1858// dst[127:96] := MEM[mem_addr+31:mem_addr]
1859//
1860// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
1861#define _mm_load_ps1 _mm_load1_ps
1862
1863// Load a single-precision (32-bit) floating-point element from memory into the
1864// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
1865// aligned on any particular boundary.
1866// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
1867FORCE_INLINE __m128 _mm_load_ss(const float *p)
1868{
1869 return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
1870}
1871
1872// Load a single-precision (32-bit) floating-point element from memory into all
1873// elements of dst.
1874// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
1875FORCE_INLINE __m128 _mm_load1_ps(const float *p)
1876{
1877 return vreinterpretq_m128_f32(vld1q_dup_f32(p));
1878}
1879
1880// Load 2 single-precision (32-bit) floating-point elements from memory into the
1881// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
1882// mem_addr does not need to be aligned on any particular boundary.
1883// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
1884FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
1885{
1886 return vreinterpretq_m128_f32(
1887 vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
1888}
1889
1890// Load 2 single-precision (32-bit) floating-point elements from memory into the
1891// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
1892// mem_addr does not need to be aligned on any particular boundary.
1893// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
1894FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
1895{
1896 return vreinterpretq_m128_f32(
1897 vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
1898}
1899
1900// Load 4 single-precision (32-bit) floating-point elements from memory into dst
1901// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
1902// general-protection exception may be generated.
1903// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
1904FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
1905{
1906 float32x4_t v = vrev64q_f32(vld1q_f32(p));
1907 return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
1908}
1909
1910// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
1911// elements) from memory into dst. mem_addr does not need to be aligned on any
1912// particular boundary.
1913// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
1914FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
1915{
1916 // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
1917 // are equivalent.
1918 return vreinterpretq_m128_f32(vld1q_f32(p));
1919}
1920
1921// Load unaligned 16-bit integer from memory into the first element of dst.
1922// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
1923FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
1924{
1925 return vreinterpretq_m128i_s16(
1926 vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
1927}
1928
1929// Load unaligned 64-bit integer from memory into the first element of dst.
1930// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
1931FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
1932{
1933 return vreinterpretq_m128i_s64(
1934 vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
1935}
1936
1937// Allocate size bytes of memory, aligned to the alignment specified in align,
1938// and return a pointer to the allocated memory. _mm_free should be used to free
1939// memory that is allocated with _mm_malloc.
1940// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
1941#if !defined(SSE2NEON_ALLOC_DEFINED)
1942FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
1943{
1944 void *ptr;
1945 if (align == 1)
1946 return malloc(size);
1947 if (align == 2 || (sizeof(void *) == 8 && align == 4))
1948 align = sizeof(void *);
1949 if (!posix_memalign(&ptr, align, size))
1950 return ptr;
1951 return NULL;
1952}
1953#endif
1954
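// Editor's note: _mm_malloc/_mm_free pair up to provide storage aligned for
// the 16-byte-aligned load/store intrinsics. Illustrative usage (the helper
// name is hypothetical):
/*
    static void malloc_demo(void)
    {
        float *buf = (float *) _mm_malloc(4 * sizeof(float), 16);
        if (buf) {
            _mm_store_ps(buf, _mm_set1_ps(0.0f)); // aligned store is safe here
            _mm_free(buf);
        }
    }
*/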
1955// Conditionally store 8-bit integer elements from a into memory using mask
1956// (elements are not stored when the highest bit is not set in the corresponding
1957// element) and a non-temporal memory hint.
1958// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
1959FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
1960{
1961 int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
1962 __m128 b = _mm_load_ps((const float *) mem_addr);
1963 int8x8_t masked =
1964 vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
1965 vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
1966 vst1_s8((int8_t *) mem_addr, masked);
1967}
1968
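// Editor's note: a scalar model of the masked store above; byte i of a is
// written only when the top bit of byte i of the mask is set (illustrative
// only, not part of sse2neon):
/*
    static void maskmove_ref(const unsigned char a[8],
                             const unsigned char mask[8],
                             unsigned char *mem)
    {
        for (int i = 0; i < 8; i++)
            if (mask[i] & 0x80)
                mem[i] = a[i];
    }
*/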
1969// Conditionally store 8-bit integer elements from a into memory using mask
1970// (elements are not stored when the highest bit is not set in the corresponding
1971// element) and a non-temporal memory hint.
1972// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
1973#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)
1974
1975// Compare packed signed 16-bit integers in a and b, and store packed maximum
1976// values in dst.
1977// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
1978FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
1979{
1980 return vreinterpret_m64_s16(
1981 vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
1982}
1983
1984// Compare packed single-precision (32-bit) floating-point elements in a and b,
1985// and store packed maximum values in dst. dst does not follow the IEEE Standard
1986// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
1987// signed-zero values.
1988// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
1989FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
1990{
1991#if SSE2NEON_PRECISE_MINMAX
1992 float32x4_t _a = vreinterpretq_f32_m128(a);
1993 float32x4_t _b = vreinterpretq_f32_m128(b);
1994 return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
1995#else
1996 return vreinterpretq_m128_f32(
1997 vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
1998#endif
1999}
2000
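// Editor's note: with SSE2NEON_PRECISE_MINMAX enabled, the bit-select above
// matches x86 semantics, which return the second operand when either input is
// NaN; plain vmaxq_f32 would typically yield NaN for such a lane. Illustrative
// sketch (the helper name is hypothetical):
/*
    #include <math.h>
    static void max_nan_demo(void)
    {
        __m128 a = _mm_set1_ps(nanf(""));
        __m128 b = _mm_set1_ps(1.0f);
        __m128 m = _mm_max_ps(a, b); // every lane is 1.0f in precise mode
        (void) m;
    }
*/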
2001// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2002// values in dst.
2003// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
2004FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
2005{
2006 return vreinterpret_m64_u8(
2007 vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2008}
2009
2010// Compare the lower single-precision (32-bit) floating-point elements in a and
2011// b, store the maximum value in the lower element of dst, and copy the upper 3
2012// packed elements from a to the upper element of dst. dst does not follow the
2013// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
2014// inputs are NaN or signed-zero values.
2015// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
2016FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
2017{
2018 float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
2019 return vreinterpretq_m128_f32(
2020 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2021}
2022
2023// Compare packed signed 16-bit integers in a and b, and store packed minimum
2024// values in dst.
2025// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
2026FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
2027{
2028 return vreinterpret_m64_s16(
2029 vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
2030}
2031
2032// Compare packed single-precision (32-bit) floating-point elements in a and b,
2033// and store packed minimum values in dst. dst does not follow the IEEE Standard
2034// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
2035// signed-zero values.
2036// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
2037FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
2038{
2039#if SSE2NEON_PRECISE_MINMAX
2040 float32x4_t _a = vreinterpretq_f32_m128(a);
2041 float32x4_t _b = vreinterpretq_f32_m128(b);
2042 return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
2043#else
2044 return vreinterpretq_m128_f32(
2045 vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2046#endif
2047}
2048
2049// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2050// values in dst.
2051// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
2052FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
2053{
2054 return vreinterpret_m64_u8(
2055 vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
2056}
2057
2058// Compare the lower single-precision (32-bit) floating-point elements in a and
2059// b, store the minimum value in the lower element of dst, and copy the upper 3
2060// packed elements from a to the upper element of dst. dst does not follow the
2061// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
2062// inputs are NaN or signed-zero values.
2063// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
2064FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
2065{
2066 float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
2067 return vreinterpretq_m128_f32(
2068 vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
2069}
2070
2071// Move the lower single-precision (32-bit) floating-point element from b to the
2072// lower element of dst, and copy the upper 3 packed elements from a to the
2073// upper elements of dst.
2074// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
2075FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
2076{
2077 return vreinterpretq_m128_f32(
2078 vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
2079 vreinterpretq_f32_m128(a), 0));
2080}
2081
2082// Move the upper 2 single-precision (32-bit) floating-point elements from b to
2083// the lower 2 elements of dst, and copy the upper 2 elements from a to the
2084// upper 2 elements of dst.
2085// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
2086FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
2087{
2088#if defined(__aarch64__) || defined(_M_ARM64)
2089 return vreinterpretq_m128_u64(
2090 vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
2091#else
2092 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
2093 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
2094 return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
2095#endif
2096}
2097
2098// Move the lower 2 single-precision (32-bit) floating-point elements from b to
2099// the upper 2 elements of dst, and copy the lower 2 elements from a to the
2100// lower 2 elements of dst.
2101// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
2102FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
2103{
2104 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
2105 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
2106 return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
2107}
2108
2109// Create mask from the most significant bit of each 8-bit element in a, and
2110// store the result in dst.
2111// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
2112FORCE_INLINE int _mm_movemask_pi8(__m64 a)
2113{
2114 uint8x8_t input = vreinterpret_u8_m64(a);
2115#if defined(__aarch64__) || defined(_M_ARM64)
2116 static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
2117 uint8x8_t tmp = vshr_n_u8(input, 7);
2118 return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
2119#else
2120 // Refer to the implementation of `_mm_movemask_epi8`
2121 uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
2122 uint32x2_t paired16 =
2123 vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
2124 uint8x8_t paired32 =
2125 vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
2126 return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
2127#endif
2128}
2129
2130// Set each bit of mask dst based on the most significant bit of the
2131// corresponding packed single-precision (32-bit) floating-point element in a.
2132// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
2133FORCE_INLINE int _mm_movemask_ps(__m128 a)
2134{
2135 uint32x4_t input = vreinterpretq_u32_m128(a);
2136#if defined(__aarch64__) || defined(_M_ARM64)
2137 static const int32_t shift[4] = {0, 1, 2, 3};
2138 uint32x4_t tmp = vshrq_n_u32(input, 31);
2139 return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
2140#else
2141 // Uses the exact same method as _mm_movemask_epi8, see that for details.
2142 // Shift out everything but the sign bits with a 32-bit unsigned shift
2143 // right.
2144 uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
2145 // Merge the two pairs together with a 64-bit unsigned shift right + add.
2146 uint8x16_t paired =
2147 vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
2148 // Extract the result.
2149 return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
2150#endif
2151}
2152
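// Editor's note: a scalar model of _mm_movemask_ps; bit i of the result is
// the sign bit of float lane i (illustrative only, not part of sse2neon):
/*
    #include <stdint.h>
    #include <string.h>
    static int movemask_ps_ref(const float v[4])
    {
        int mask = 0;
        for (int i = 0; i < 4; i++) {
            uint32_t bits;
            memcpy(&bits, &v[i], sizeof(bits));
            mask |= (int) (bits >> 31) << i;
        }
        return mask;
    }
*/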
2153// Multiply packed single-precision (32-bit) floating-point elements in a and b,
2154// and store the results in dst.
2155// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
2156FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
2157{
2158 return vreinterpretq_m128_f32(
2159 vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2160}
2161
2162// Multiply the lower single-precision (32-bit) floating-point element in a and
2163// b, store the result in the lower element of dst, and copy the upper 3 packed
2164// elements from a to the upper elements of dst.
2165// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
2166FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
2167{
2168 return _mm_move_ss(a, _mm_mul_ps(a, b));
2169}
2170
2171// Multiply the packed unsigned 16-bit integers in a and b, producing
2172// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2173// integers in dst.
2174// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
2175FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
2176{
2177 return vreinterpret_m64_u16(vshrn_n_u32(
2178 vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
2179}
2180
2181// Compute the bitwise OR of packed single-precision (32-bit) floating-point
2182// elements in a and b, and store the results in dst.
2183// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
2184FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
2185{
2186 return vreinterpretq_m128_s32(
2187 vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2188}
2189
2190// Average packed unsigned 8-bit integers in a and b, and store the results in
2191// dst.
2192// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
2193#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
2194
2195// Average packed unsigned 16-bit integers in a and b, and store the results in
2196// dst.
2197// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
2198#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
2199
2200// Extract a 16-bit integer from a, selected with imm8, and store the result in
2201// the lower element of dst.
2202// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
2203#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
2204
2205// Copy a to dst, and insert the 16-bit integer i into dst at the location
2206// specified by imm8.
2207// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
2208#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
2209
2210// Compare packed signed 16-bit integers in a and b, and store packed maximum
2211// values in dst.
2212// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
2213#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
2214
2215// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
2216// values in dst.
2217// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
2218#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
2219
2220// Compare packed signed 16-bit integers in a and b, and store packed minimum
2221// values in dst.
2222// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
2223#define _m_pminsw(a, b) _mm_min_pi16(a, b)
2224
2225// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
2226// values in dst.
2227// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
2228#define _m_pminub(a, b) _mm_min_pu8(a, b)
2229
2230// Create mask from the most significant bit of each 8-bit element in a, and
2231// store the result in dst.
2232// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
2233#define _m_pmovmskb(a) _mm_movemask_pi8(a)
2234
2235// Multiply the packed unsigned 16-bit integers in a and b, producing
2236// intermediate 32-bit integers, and store the high 16 bits of the intermediate
2237// integers in dst.
2238// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
2239#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
2240
2241// Fetch the line of data from memory that contains address p to a location in
2242// the cache hierarchy specified by the locality hint i.
2243// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
2244FORCE_INLINE void _mm_prefetch(char const *p, int i)
2245{
2246 (void) i;
2247#if defined(_MSC_VER)
2248 switch (i) {
2249 case _MM_HINT_NTA:
2250 __prefetch2(p, 1);
2251 break;
2252 case _MM_HINT_T0:
2253 __prefetch2(p, 0);
2254 break;
2255 case _MM_HINT_T1:
2256 __prefetch2(p, 2);
2257 break;
2258 case _MM_HINT_T2:
2259 __prefetch2(p, 4);
2260 break;
2261 }
2262#else
2263 switch (i) {
2264 case _MM_HINT_NTA:
2265 __builtin_prefetch(p, 0, 0);
2266 break;
2267 case _MM_HINT_T0:
2268 __builtin_prefetch(p, 0, 3);
2269 break;
2270 case _MM_HINT_T1:
2271 __builtin_prefetch(p, 0, 2);
2272 break;
2273 case _MM_HINT_T2:
2274 __builtin_prefetch(p, 0, 1);
2275 break;
2276 }
2277#endif
2278}
2279
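// Editor's note: _mm_prefetch is typically used to pull data into cache a
// fixed distance ahead of the current position. Illustrative usage (the
// helper name and the distance of 64 elements are arbitrary):
/*
    static void prefetch_demo(const float *src, float *dst, int n)
    {
        for (int i = 0; i < n; i++) {
            if (i + 64 < n)
                _mm_prefetch((const char *) &src[i + 64], _MM_HINT_T0);
            dst[i] = src[i] * 2.0f;
        }
    }
*/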
2280// Compute the absolute differences of packed unsigned 8-bit integers in a and
2281// b, then horizontally sum each consecutive 8 differences to produce four
2282// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2283// 16 bits of dst.
2284// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
2285#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
2286
2287// Shuffle 16-bit integers in a using the control in imm8, and store the results
2288// in dst.
2289// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
2290#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
2291
2292// Compute the approximate reciprocal of packed single-precision (32-bit)
2293// floating-point elements in a, and store the results in dst. The maximum
2294// relative error for this approximation is less than 1.5*2^-12.
2295// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
2296FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
2297{
2298 float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
2299 recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
2300 return vreinterpretq_m128_f32(recip);
2301}
2302
2303// Compute the approximate reciprocal of the lower single-precision (32-bit)
2304// floating-point element in a, store the result in the lower element of dst,
2305// and copy the upper 3 packed elements from a to the upper elements of dst. The
2306// maximum relative error for this approximation is less than 1.5*2^-12.
2307// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
2308FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
2309{
2310 return _mm_move_ss(a, _mm_rcp_ps(a));
2311}
2312
2313// Compute the approximate reciprocal square root of packed single-precision
2314// (32-bit) floating-point elements in a, and store the results in dst. The
2315// maximum relative error for this approximation is less than 1.5*2^-12.
2316// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
2317FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
2318{
2319 float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2320
2321 // Generate masks for detecting whether input has any 0.0f/-0.0f
2322 // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
2323 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2324 const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
2325 const uint32x4_t has_pos_zero =
2326 vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
2327 const uint32x4_t has_neg_zero =
2328 vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
2329
2330 out = vmulq_f32(
2331 out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
2332
2333 // Set output vector element to infinity/negative-infinity if
2334 // the corresponding input vector element is 0.0f/-0.0f.
2335 out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
2336 out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
2337
2338 return vreinterpretq_m128_f32(out);
2339}
2340
2341// Compute the approximate reciprocal square root of the lower single-precision
2342// (32-bit) floating-point element in a, store the result in the lower element
2343// of dst, and copy the upper 3 packed elements from a to the upper elements of
2344// dst.
2345// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
2346FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
2347{
2348 return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
2349}
2350
2351// Compute the absolute differences of packed unsigned 8-bit integers in a and
2352// b, then horizontally sum each consecutive 8 differences to produce four
2353// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
2354// 16 bits of dst.
2355// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
2356FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
2357{
2358 uint64x1_t t = vpaddl_u32(vpaddl_u16(
2359 vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
2360 return vreinterpret_m64_u16(
2361 vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
2362}
2363
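// Editor's note: a scalar model of _mm_sad_pu8; the sum of |a[i] - b[i]| over
// the 8 bytes lands in the low 16 bits of the result and the remaining lanes
// are zero (illustrative only, not part of sse2neon):
/*
    static unsigned short sad_pu8_ref(const unsigned char a[8],
                                      const unsigned char b[8])
    {
        unsigned int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += (a[i] > b[i]) ? (unsigned) (a[i] - b[i])
                                 : (unsigned) (b[i] - a[i]);
        return (unsigned short) sum;
    }
*/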
2364// Macro: Set the flush zero bits of the MXCSR control and status register to
2365// the value in unsigned 32-bit integer a. The flush zero may contain any of the
2366// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
2367// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
2368FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
2369{
2370 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
2371 // regardless of the value of the FZ bit.
2372 union {
2373 fpcr_bitfield field;
2374#if defined(__aarch64__) || defined(_M_ARM64)
2375 uint64_t value;
2376#else
2377 uint32_t value;
2378#endif
2379 } r;
2380
2381#if defined(__aarch64__) || defined(_M_ARM64)
2382 r.value = _sse2neon_get_fpcr();
2383#else
2384 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2385#endif
2386
2387 r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
2388
2389#if defined(__aarch64__) || defined(_M_ARM64)
2390 _sse2neon_set_fpcr(r.value);
2391#else
2392 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2393#endif
2394}
2395
2396// Set packed single-precision (32-bit) floating-point elements in dst with the
2397// supplied values.
2398// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
2399FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
2400{
2401 float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
2402 return vreinterpretq_m128_f32(vld1q_f32(data));
2403}
2404
2405// Broadcast single-precision (32-bit) floating-point value a to all elements of
2406// dst.
2407// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
2408FORCE_INLINE __m128 _mm_set_ps1(float _w)
2409{
2410 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2411}
2412
2413// Macro: Set the rounding mode bits of the MXCSR control and status register to
2414// the value in unsigned 32-bit integer a. The rounding mode may contain any of
2415// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
2416// _MM_ROUND_TOWARD_ZERO
2417// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
2418FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
2419{
2420 union {
2421 fpcr_bitfield field;
2422#if defined(__aarch64__) || defined(_M_ARM64)
2423 uint64_t value;
2424#else
2425 uint32_t value;
2426#endif
2427 } r;
2428
2429#if defined(__aarch64__) || defined(_M_ARM64)
2430 r.value = _sse2neon_get_fpcr();
2431#else
2432 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
2433#endif
2434
2435 switch (rounding) {
2436 case _MM_ROUND_TOWARD_ZERO:
2437 r.field.bit22 = 1;
2438 r.field.bit23 = 1;
2439 break;
2440 case _MM_ROUND_DOWN:
2441 r.field.bit22 = 0;
2442 r.field.bit23 = 1;
2443 break;
2444 case _MM_ROUND_UP:
2445 r.field.bit22 = 1;
2446 r.field.bit23 = 0;
2447 break;
2448 default: //_MM_ROUND_NEAREST
2449 r.field.bit22 = 0;
2450 r.field.bit23 = 0;
2451 }
2452
2453#if defined(__aarch64__) || defined(_M_ARM64)
2454 _sse2neon_set_fpcr(r.value);
2455#else
2456 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
2457#endif
2458}
2459
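// Editor's note: _MM_SET_ROUNDING_MODE drives FPCR/FPSCR bits 22-23, which the
// conversion intrinsics above (e.g. _mm_cvt_ss2si) honour. Illustrative usage
// (the helper name is hypothetical):
/*
    static void rounding_demo(void)
    {
        unsigned int saved = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
        // conversions that follow the dynamic rounding mode now truncate
        _MM_SET_ROUNDING_MODE((int) saved);
    }
*/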
2460// Copy single-precision (32-bit) floating-point element a to the lower element
2461// of dst, and zero the upper 3 elements.
2462// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
2463FORCE_INLINE __m128 _mm_set_ss(float a)
2464{
2465 return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
2466}
2467
2468// Broadcast single-precision (32-bit) floating-point value a to all elements of
2469// dst.
2470// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
2471FORCE_INLINE __m128 _mm_set1_ps(float _w)
2472{
2473 return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2474}
2475
2476// Set the MXCSR control and status register with the value in unsigned 32-bit
2477// integer a.
2478// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
2479// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2480FORCE_INLINE void _mm_setcsr(unsigned int a)
2481{
2482 _MM_SET_ROUNDING_MODE(a);
2483}
2484
2485// Get the unsigned 32-bit value of the MXCSR control and status register.
2486// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
2487// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2488FORCE_INLINE unsigned int _mm_getcsr(void)
2489{
2490 return _MM_GET_ROUNDING_MODE();
2491}
2492
2493// Set packed single-precision (32-bit) floating-point elements in dst with the
2494// supplied values in reverse order.
2495// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
2496FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2497{
2498 float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2499 return vreinterpretq_m128_f32(vld1q_f32(data));
2500}
2501
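// Editor's note: _mm_set_ps lists lanes from highest to lowest while
// _mm_setr_ps lists them from lowest to highest. Illustrative sketch (the
// helper name is hypothetical):
/*
    static void set_order_demo(void)
    {
        float out[4];
        _mm_storeu_ps(out, _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f));
        // out is {0, 1, 2, 3}
        _mm_storeu_ps(out, _mm_setr_ps(3.0f, 2.0f, 1.0f, 0.0f));
        // out is {3, 2, 1, 0}
    }
*/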
2502// Return vector of type __m128 with all elements set to zero.
2503// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
2504FORCE_INLINE __m128 _mm_setzero_ps(void)
2505{
2506 return vreinterpretq_m128_f32(vdupq_n_f32(0));
2507}
2508
2509// Shuffle 16-bit integers in a using the control in imm8, and store the results
2510// in dst.
2511// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
2512#ifdef _sse2neon_shuffle
2513#define _mm_shuffle_pi16(a, imm) \
2514 vreinterpret_m64_s16(vshuffle_s16( \
2515 vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2516 ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
2517#else
2518#define _mm_shuffle_pi16(a, imm) \
2519 _sse2neon_define1( \
2520 __m64, a, int16x4_t ret; \
2521 ret = vmov_n_s16( \
2522 vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
2523 ret = vset_lane_s16( \
2524 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
2525 1); \
2526 ret = vset_lane_s16( \
2527 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
2528 2); \
2529 ret = vset_lane_s16( \
2530 vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
2531 3); \
2532 _sse2neon_return(vreinterpret_m64_s16(ret));)
2533#endif
2534
2535// Perform a serializing operation on all store-to-memory instructions that were
2536// issued prior to this instruction. Guarantees that every store instruction
2537// that precedes, in program order, is globally visible before any store
2538// instruction which follows the fence in program order.
2539// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
2540FORCE_INLINE void _mm_sfence(void)
2541{
2542 _sse2neon_smp_mb();
2543}
2544
2545// Perform a serializing operation on all load-from-memory and store-to-memory
2546// instructions that were issued prior to this instruction. Guarantees that
2547// every memory access that precedes, in program order, the memory fence
2548// instruction is globally visible before any memory instruction which follows
2549// the fence in program order.
2550// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
2551FORCE_INLINE void _mm_mfence(void)
2552{
2553 _sse2neon_smp_mb();
2554}
2555
2556// Perform a serializing operation on all load-from-memory instructions that
2557// were issued prior to this instruction. Guarantees that every load instruction
2558// that precedes, in program order, is globally visible before any load
2559// instruction which follows the fence in program order.
2560// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
2561FORCE_INLINE void _mm_lfence(void)
2562{
2563 _sse2neon_smp_mb();
2564}
2565
2566// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
2567// int imm)
2568#ifdef _sse2neon_shuffle
2569#define _mm_shuffle_ps(a, b, imm) \
2570 __extension__({ \
2571 float32x4_t _input1 = vreinterpretq_f32_m128(a); \
2572 float32x4_t _input2 = vreinterpretq_f32_m128(b); \
2573 float32x4_t _shuf = \
2574 vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
2575 (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
2576 vreinterpretq_m128_f32(_shuf); \
2577 })
2578#else // generic
2579#define _mm_shuffle_ps(a, b, imm) \
2580 _sse2neon_define2( \
2581 __m128, a, b, __m128 ret; switch (imm) { \
2582 case _MM_SHUFFLE(1, 0, 3, 2): \
2583 ret = _mm_shuffle_ps_1032(_a, _b); \
2584 break; \
2585 case _MM_SHUFFLE(2, 3, 0, 1): \
2586 ret = _mm_shuffle_ps_2301(_a, _b); \
2587 break; \
2588 case _MM_SHUFFLE(0, 3, 2, 1): \
2589 ret = _mm_shuffle_ps_0321(_a, _b); \
2590 break; \
2591 case _MM_SHUFFLE(2, 1, 0, 3): \
2592 ret = _mm_shuffle_ps_2103(_a, _b); \
2593 break; \
2594 case _MM_SHUFFLE(1, 0, 1, 0): \
2595 ret = _mm_movelh_ps(_a, _b); \
2596 break; \
2597 case _MM_SHUFFLE(1, 0, 0, 1): \
2598 ret = _mm_shuffle_ps_1001(_a, _b); \
2599 break; \
2600 case _MM_SHUFFLE(0, 1, 0, 1): \
2601 ret = _mm_shuffle_ps_0101(_a, _b); \
2602 break; \
2603 case _MM_SHUFFLE(3, 2, 1, 0): \
2604 ret = _mm_shuffle_ps_3210(_a, _b); \
2605 break; \
2606 case _MM_SHUFFLE(0, 0, 1, 1): \
2607 ret = _mm_shuffle_ps_0011(_a, _b); \
2608 break; \
2609 case _MM_SHUFFLE(0, 0, 2, 2): \
2610 ret = _mm_shuffle_ps_0022(_a, _b); \
2611 break; \
2612 case _MM_SHUFFLE(2, 2, 0, 0): \
2613 ret = _mm_shuffle_ps_2200(_a, _b); \
2614 break; \
2615 case _MM_SHUFFLE(3, 2, 0, 2): \
2616 ret = _mm_shuffle_ps_3202(_a, _b); \
2617 break; \
2618 case _MM_SHUFFLE(3, 2, 3, 2): \
2619 ret = _mm_movehl_ps(_b, _a); \
2620 break; \
2621 case _MM_SHUFFLE(1, 1, 3, 3): \
2622 ret = _mm_shuffle_ps_1133(_a, _b); \
2623 break; \
2624 case _MM_SHUFFLE(2, 0, 1, 0): \
2625 ret = _mm_shuffle_ps_2010(_a, _b); \
2626 break; \
2627 case _MM_SHUFFLE(2, 0, 0, 1): \
2628 ret = _mm_shuffle_ps_2001(_a, _b); \
2629 break; \
2630 case _MM_SHUFFLE(2, 0, 3, 2): \
2631 ret = _mm_shuffle_ps_2032(_a, _b); \
2632 break; \
2633 default: \
2634 ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
2635 break; \
2636 } _sse2neon_return(ret);)
2637#endif
2638
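// Editor's note: the imm8 for _mm_shuffle_ps is usually built with
// _MM_SHUFFLE(d3, d2, d1, d0); the two low selectors pick lanes of a and the
// two high selectors pick lanes of b. Illustrative usage (the helper name is
// hypothetical):
/*
    static void shuffle_demo(void)
    {
        __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
        __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
        // result lanes: a[1], a[0], b[3], b[2] -> {1, 0, 7, 6}
        __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 3, 0, 1));
        (void) r;
    }
*/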
2639// Compute the square root of packed single-precision (32-bit) floating-point
2640// elements in a, and store the results in dst.
2641// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
2642// square root by refining an estimate of the reciprocal square root with the
2643// Newton-Raphson method and then multiplying the input by it.
2644// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
2645FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
2646{
2647#if defined(__aarch64__) || defined(_M_ARM64)
2648 return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
2649#else
2650 float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
2651
2652 // Test for vrsqrteq_f32(0) -> positive infinity case.
2653 // Change to zero, so that s * 1/sqrt(s) result is zero too.
2654 const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
2655 const uint32x4_t div_by_zero =
2656 vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
2657 recip = vreinterpretq_f32_u32(
2658 vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
2659
2660 recip = vmulq_f32(
2661 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2662 recip);
2663 // Additional Newton-Raphson iteration for accuracy
2664 recip = vmulq_f32(
2665 vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
2666 recip);
2667
2668 // sqrt(s) = s * 1/sqrt(s)
2669 return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
2670#endif
2671}
2672
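// Editor's note: on the ARMv7-A path above, each vrsqrtsq_f32/vmulq_f32 pair
// applies one Newton-Raphson step to the reciprocal square root estimate,
// r' = r * (3 - s * r * r) / 2, and the final result is s * r. Scalar sketch
// (illustrative only, not part of sse2neon):
/*
    static float sqrt_nr(float s, float r0) // r0: rough estimate of 1/sqrt(s)
    {
        float r = r0;
        r = r * (3.0f - s * r * r) * 0.5f;
        r = r * (3.0f - s * r * r) * 0.5f; // second step, as above
        return s * r; // sqrt(s) = s * 1/sqrt(s)
    }
*/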
2673// Compute the square root of the lower single-precision (32-bit) floating-point
2674// element in a, store the result in the lower element of dst, and copy the
2675// upper 3 packed elements from a to the upper elements of dst.
2676// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
2677FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
2678{
2679 float32_t value =
2680 vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
2681 return vreinterpretq_m128_f32(
2682 vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
2683}
2684
2685// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
2686// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
2687// or a general-protection exception may be generated.
2688// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
2689FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
2690{
2691 vst1q_f32(p, vreinterpretq_f32_m128(a));
2692}
2693
2694// Store the lower single-precision (32-bit) floating-point element from a into
2695// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2696// boundary or a general-protection exception may be generated.
2697// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
2698FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
2699{
2700 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
2701 vst1q_f32(p, vdupq_n_f32(a0));
2702}
2703
2704// Store the lower single-precision (32-bit) floating-point element from a into
2705// memory. mem_addr does not need to be aligned on any particular boundary.
2706// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
2707FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
2708{
2709 vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
2710}
2711
2712// Store the lower single-precision (32-bit) floating-point element from a into
2713// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
2714// boundary or a general-protection exception may be generated.
2715// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
2716#define _mm_store1_ps _mm_store_ps1
2717
2718// Store the upper 2 single-precision (32-bit) floating-point elements from a
2719// into memory.
2720// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
2721FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
2722{
2723 *p = vreinterpret_m64_f32(vget_high_f32(a));
2724}
2725
2726// Store the lower 2 single-precision (32-bit) floating-point elements from a
2727// into memory.
2728// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
2729FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
2730{
2731 *p = vreinterpret_m64_f32(vget_low_f32(a));
2732}
2733
2734// Store 4 single-precision (32-bit) floating-point elements from a into memory
2735// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
2736// general-protection exception may be generated.
2737// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
2738FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
2739{
2740 float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
2741 float32x4_t rev = vextq_f32(tmp, tmp, 2);
2742 vst1q_f32(p, rev);
2743}
2744
2745// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
2746// elements) from a into memory. mem_addr does not need to be aligned on any
2747// particular boundary.
2748// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
2749FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
2750{
2751 vst1q_f32(p, vreinterpretq_f32_m128(a));
2752}
2753
2754// Stores 16-bits of integer data a at the address p.
2755// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
2756FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
2757{
2758 vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
2759}
2760
2761// Stores 64-bits of integer data a at the address p.
2762// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
2763FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
2764{
2765 vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
2766}
2767
2768// Store 64-bits of integer data from a into memory using a non-temporal memory
2769// hint.
2770// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
2771FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
2772{
2773 vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
2774}
2775
2776// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
2777// point elements) from a into memory using a non-temporal memory hint.
2778// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
2779FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
2780{
2781#if __has_builtin(__builtin_nontemporal_store)
2782 __builtin_nontemporal_store(a, (float32x4_t *) p);
2783#else
2784 vst1q_f32(p, vreinterpretq_f32_m128(a));
2785#endif
2786}
2787
2788// Subtract packed single-precision (32-bit) floating-point elements in b from
2789// packed single-precision (32-bit) floating-point elements in a, and store the
2790// results in dst.
2791// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
2792FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
2793{
2794 return vreinterpretq_m128_f32(
2795 vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2796}
2797
2798// Subtract the lower single-precision (32-bit) floating-point element in b from
2799// the lower single-precision (32-bit) floating-point element in a, store the
2800// result in the lower element of dst, and copy the upper 3 packed elements from
2801// a to the upper elements of dst.
2802// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
2803FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
2804{
2805 return _mm_move_ss(a, _mm_sub_ps(a, b));
2806}
2807
2808// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
2809// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
2810// transposed matrix in these vectors (row0 now contains column 0, etc.).
2811// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
2812#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2813 do { \
2814 float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
2815 float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
2816 row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
2817 vget_low_f32(ROW23.val[0])); \
2818 row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
2819 vget_low_f32(ROW23.val[1])); \
2820 row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
2821 vget_high_f32(ROW23.val[0])); \
2822 row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
2823 vget_high_f32(ROW23.val[1])); \
2824 } while (0)
2825
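// Editor's note: _MM_TRANSPOSE4_PS transposes the four rows in place.
// Illustrative usage (the helper name is hypothetical):
/*
    static void transpose_demo(void)
    {
        __m128 r0 = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
        __m128 r1 = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
        __m128 r2 = _mm_setr_ps(9.0f, 10.0f, 11.0f, 12.0f);
        __m128 r3 = _mm_setr_ps(13.0f, 14.0f, 15.0f, 16.0f);
        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
        // r0 is now {1, 5, 9, 13}, r1 is {2, 6, 10, 14}, and so on
    }
*/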
2826// According to the documentation, these intrinsics behave the same as the
2827// non-'u' versions, so we simply alias them here.
2828#define _mm_ucomieq_ss _mm_comieq_ss
2829#define _mm_ucomige_ss _mm_comige_ss
2830#define _mm_ucomigt_ss _mm_comigt_ss
2831#define _mm_ucomile_ss _mm_comile_ss
2832#define _mm_ucomilt_ss _mm_comilt_ss
2833#define _mm_ucomineq_ss _mm_comineq_ss
2834
2835// Return vector of type __m128i with undefined elements.
2836// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
2837FORCE_INLINE __m128i _mm_undefined_si128(void)
2838{
2839#if defined(__GNUC__) || defined(__clang__)
2840#pragma GCC diagnostic push
2841#pragma GCC diagnostic ignored "-Wuninitialized"
2842#endif
2843 __m128i a;
2844#if defined(_MSC_VER)
2845 a = _mm_setzero_si128();
2846#endif
2847 return a;
2848#if defined(__GNUC__) || defined(__clang__)
2849#pragma GCC diagnostic pop
2850#endif
2851}
2852
2853// Return vector of type __m128 with undefined elements.
2854// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
2855FORCE_INLINE __m128 _mm_undefined_ps(void)
2856{
2857#if defined(__GNUC__) || defined(__clang__)
2858#pragma GCC diagnostic push
2859#pragma GCC diagnostic ignored "-Wuninitialized"
2860#endif
2861 __m128 a;
2862#if defined(_MSC_VER)
2863 a = _mm_setzero_ps();
2864#endif
2865 return a;
2866#if defined(__GNUC__) || defined(__clang__)
2867#pragma GCC diagnostic pop
2868#endif
2869}
2870
2871// Unpack and interleave single-precision (32-bit) floating-point elements from
2872// the high half a and b, and store the results in dst.
2873// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
2874FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
2875{
2876#if defined(__aarch64__) || defined(_M_ARM64)
2877 return vreinterpretq_m128_f32(
2878 vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2879#else
2880 float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
2881 float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
2882 float32x2x2_t result = vzip_f32(a1, b1);
2883 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2884#endif
2885}
2886
2887// Unpack and interleave single-precision (32-bit) floating-point elements from
2888// the low half of a and b, and store the results in dst.
2889// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
2890FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
2891{
2892#if defined(__aarch64__) || defined(_M_ARM64)
2893 return vreinterpretq_m128_f32(
2894 vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
2895#else
2896 float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
2897 float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
2898 float32x2x2_t result = vzip_f32(a1, b1);
2899 return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
2900#endif
2901}
2902
2903// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
2904// elements in a and b, and store the results in dst.
2905// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
2906FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
2907{
2908 return vreinterpretq_m128_s32(
2909 veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
2910}
2911
2912/* SSE2 */
2913
2914// Add packed 16-bit integers in a and b, and store the results in dst.
2915// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
2916FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
2917{
2918 return vreinterpretq_m128i_s16(
2919 vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2920}
2921
2922// Add packed 32-bit integers in a and b, and store the results in dst.
2923// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
2924FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
2925{
2926 return vreinterpretq_m128i_s32(
2927 vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
2928}
2929
2930// Add packed 64-bit integers in a and b, and store the results in dst.
2931// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
2932FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
2933{
2934 return vreinterpretq_m128i_s64(
2935 vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
2936}
2937
2938// Add packed 8-bit integers in a and b, and store the results in dst.
2939// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
2940FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
2941{
2942 return vreinterpretq_m128i_s8(
2943 vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
2944}
2945
2946// Add packed double-precision (64-bit) floating-point elements in a and b, and
2947// store the results in dst.
2948// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
2949FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
2950{
2951#if defined(__aarch64__) || defined(_M_ARM64)
2952 return vreinterpretq_m128d_f64(
2953 vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
2954#else
2955 double *da = (double *) &a;
2956 double *db = (double *) &b;
2957 double c[2];
2958 c[0] = da[0] + db[0];
2959 c[1] = da[1] + db[1];
2960 return vld1q_f32((float32_t *) c);
2961#endif
2962}
2963
2964// Add the lower double-precision (64-bit) floating-point element in a and b,
2965// store the result in the lower element of dst, and copy the upper element from
2966// a to the upper element of dst.
2967// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
2968FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
2969{
2970#if defined(__aarch64__) || defined(_M_ARM64)
2971 return _mm_move_sd(a, _mm_add_pd(a, b));
2972#else
2973 double *da = (double *) &a;
2974 double *db = (double *) &b;
2975 double c[2];
2976 c[0] = da[0] + db[0];
2977 c[1] = da[1];
2978 return vld1q_f32((float32_t *) c);
2979#endif
2980}
2981
2982// Add 64-bit integers a and b, and store the result in dst.
2983// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
2984FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
2985{
2986 return vreinterpret_m64_s64(
2987 vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
2988}
2989
2990// Add packed signed 16-bit integers in a and b using saturation, and store the
2991// results in dst.
2992// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
2993FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
2994{
2995 return vreinterpretq_m128i_s16(
2996 vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
2997}
2998
2999// Add packed signed 8-bit integers in a and b using saturation, and store the
3000// results in dst.
3001// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
3002FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
3003{
3004 return vreinterpretq_m128i_s8(
3005 vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3006}
3007
3008// Add packed unsigned 16-bit integers in a and b using saturation, and store
3009// the results in dst.
3010// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
3011FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
3012{
3013 return vreinterpretq_m128i_u16(
3014 vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
3015}
3016
3017// Add packed unsigned 8-bit integers in a and b using saturation, and store the
3018// results in dst.
3019// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
3020FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
3021{
3022 return vreinterpretq_m128i_u8(
3023 vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3024}
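// Illustrative sketch (added note, not part of the upstream sse2neon sources):
// the saturating adds clamp instead of wrapping, e.g. 250 + 10 saturates to
// 255 in every byte lane below.
//
//     __m128i x = _mm_set1_epi8((char) 250);
//     __m128i y = _mm_set1_epi8(10);
//     __m128i s = _mm_adds_epu8(x, y); // all sixteen bytes are 0xFF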
3025
3026// Compute the bitwise AND of packed double-precision (64-bit) floating-point
3027// elements in a and b, and store the results in dst.
3028// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
3029FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
3030{
3031 return vreinterpretq_m128d_s64(
3032 vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
3033}
3034
3035// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
3036// and store the result in dst.
3037// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
3038FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
3039{
3040 return vreinterpretq_m128i_s32(
3041 vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3042}
3043
3044// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
3045// elements in a and then AND with b, and store the results in dst.
3046// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
3047FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
3048{
3049 // *NOTE* argument swap
3050 return vreinterpretq_m128d_s64(
3051 vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
3052}
3053
3054// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
3055// AND with b, and store the result in dst.
3056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
3057FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
3058{
3059 return vreinterpretq_m128i_s32(
3060 vbicq_s32(vreinterpretq_s32_m128i(b),
3061 vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
3062}
3063
3064// Average packed unsigned 16-bit integers in a and b, and store the results in
3065// dst.
3066// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
3067FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
3068{
3069 return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
3070 vreinterpretq_u16_m128i(b));
3071}
3072
3073// Average packed unsigned 8-bit integers in a and b, and store the results in
3074// dst.
3075// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
3076FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
3077{
3078 return vreinterpretq_m128i_u8(
3079 vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
3080}
3081
3082// Shift a left by imm8 bytes while shifting in zeros, and store the results in
3083// dst.
3084// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
3085#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)
3086
3087// Shift a right by imm8 bytes while shifting in zeros, and store the results in
3088// dst.
3089// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
3090#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
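// Illustrative sketch (added note, not part of the upstream sse2neon sources):
// the byte-shift aliases move whole bytes, so shifting right by 4 drops one
// 32-bit lane and shifts zeros in at the top.
//
//     __m128i v = _mm_set_epi32(4, 3, 2, 1); // lanes 0..3 = 1,2,3,4
//     __m128i w = _mm_bsrli_si128(v, 4);     // lanes 0..3 = 2,3,4,0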
3091
3092// Cast vector of type __m128d to type __m128. This intrinsic is only used for
3093// compilation and does not generate any instructions, thus it has zero latency.
3094// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
3095FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
3096{
3097 return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
3098}
3099
3100// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
3101// compilation and does not generate any instructions, thus it has zero latency.
3102// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
3103FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
3104{
3105 return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
3106}
3107
3108// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
3109// compilation and does not generate any instructions, thus it has zero latency.
3110// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
3111FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
3112{
3113 return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
3114}
3115
3116// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
3117// compilation and does not generate any instructions, thus it has zero latency.
3118// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
3119FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
3120{
3121 return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
3122}
3123
3124// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
3125// compilation and does not generate any instructions, thus it has zero latency.
3126// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
3127FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
3128{
3129#if defined(__aarch64__) || defined(_M_ARM64)
3130 return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
3131#else
3132 return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
3133#endif
3134}
3135
3136// Cast vector of type __m128i to type __m128. This intrinsic is only used for
3137// compilation and does not generate any instructions, thus it has zero latency.
3138// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
3139FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
3140{
3141 return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
3142}
3143
3144// Invalidate and flush the cache line that contains p from all levels of the
3145// cache hierarchy.
3146// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
3147#if defined(__APPLE__)
3148#include <libkern/OSCacheControl.h>
3149#endif
3150FORCE_INLINE void _mm_clflush(void const *p)
3151{
3152 (void) p;
3153
3154 /* sys_icache_invalidate is supported since macOS 10.5.
3155 * However, it does not work on non-jailbroken iOS devices, although the
3156 * compilation is successful.
3157 */
3158#if defined(__APPLE__)
3159 sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
3160#elif defined(__GNUC__) || defined(__clang__)
3161 uintptr_t ptr = (uintptr_t) p;
3162 __builtin___clear_cache((char *) ptr,
3163 (char *) ptr + SSE2NEON_CACHELINE_SIZE);
3164#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
3165 FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
3166#endif
3167}
3168
3169// Compare packed 16-bit integers in a and b for equality, and store the results
3170// in dst.
3171// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
3172FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
3173{
3174 return vreinterpretq_m128i_u16(
3175 vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3176}
3177
3178// Compare packed 32-bit integers in a and b for equality, and store the results
3179// in dst.
3180// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
3181FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
3182{
3183 return vreinterpretq_m128i_u32(
3184 vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3185}
3186
3187// Compare packed 8-bit integers in a and b for equality, and store the results
3188// in dst.
3189// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
3190FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
3191{
3192 return vreinterpretq_m128i_u8(
3193 vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3194}
3195
3196// Compare packed double-precision (64-bit) floating-point elements in a and b
3197// for equality, and store the results in dst.
3198// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
3199FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
3200{
3201#if defined(__aarch64__) || defined(_M_ARM64)
3202 return vreinterpretq_m128d_u64(
3203 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3204#else
3205 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3206 uint32x4_t cmp =
3207 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3208 uint32x4_t swapped = vrev64q_u32(cmp);
3209 return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
3210#endif
3211}
3212
3213// Compare the lower double-precision (64-bit) floating-point elements in a and
3214// b for equality, store the result in the lower element of dst, and copy the
3215// upper element from a to the upper element of dst.
3216// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
3217FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
3218{
3219 return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
3220}
3221
3222// Compare packed double-precision (64-bit) floating-point elements in a and b
3223// for greater-than-or-equal, and store the results in dst.
3224// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
3225FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
3226{
3227#if defined(__aarch64__) || defined(_M_ARM64)
3228 return vreinterpretq_m128d_u64(
3229 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3230#else
3231 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3232 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3233 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3234 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3235 uint64_t d[2];
3236 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3237 d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3238
3239 return vreinterpretq_m128d_u64(vld1q_u64(d));
3240#endif
3241}
3242
3243// Compare the lower double-precision (64-bit) floating-point elements in a and
3244// b for greater-than-or-equal, store the result in the lower element of dst,
3245// and copy the upper element from a to the upper element of dst.
3246// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
3247FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
3248{
3249#if defined(__aarch64__) || defined(_M_ARM64)
3250 return _mm_move_sd(a, _mm_cmpge_pd(a, b));
3251#else
3252 // expand "_mm_cmpge_pd()" to reduce unnecessary operations
3253 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3254 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3255 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3256 uint64_t d[2];
3257 d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3258 d[1] = a1;
3259
3260 return vreinterpretq_m128d_u64(vld1q_u64(d));
3261#endif
3262}
3263
3264// Compare packed signed 16-bit integers in a and b for greater-than, and store
3265// the results in dst.
3266// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
3267FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
3268{
3269 return vreinterpretq_m128i_u16(
3270 vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3271}
3272
3273// Compare packed signed 32-bit integers in a and b for greater-than, and store
3274// the results in dst.
3275// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
3276FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
3277{
3278 return vreinterpretq_m128i_u32(
3279 vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3280}
3281
3282// Compare packed signed 8-bit integers in a and b for greater-than, and store
3283// the results in dst.
3284// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
3285FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
3286{
3287 return vreinterpretq_m128i_u8(
3288 vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3289}
3290
3291// Compare packed double-precision (64-bit) floating-point elements in a and b
3292// for greater-than, and store the results in dst.
3293// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
3294FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
3295{
3296#if defined(__aarch64__) || defined(_M_ARM64)
3297 return vreinterpretq_m128d_u64(
3298 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3299#else
3300 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3301 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3302 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3303 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3304 uint64_t d[2];
3305 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3306 d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3307
3308 return vreinterpretq_m128d_u64(vld1q_u64(d));
3309#endif
3310}
3311
3312// Compare the lower double-precision (64-bit) floating-point elements in a and
3313// b for greater-than, store the result in the lower element of dst, and copy
3314// the upper element from a to the upper element of dst.
3315// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
3316FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
3317{
3318#if defined(__aarch64__) || defined(_M_ARM64)
3319 return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
3320#else
3321 // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
3322 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3323 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3324 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3325 uint64_t d[2];
3326 d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3327 d[1] = a1;
3328
3329 return vreinterpretq_m128d_u64(vld1q_u64(d));
3330#endif
3331}
3332
3333// Compare packed double-precision (64-bit) floating-point elements in a and b
3334// for less-than-or-equal, and store the results in dst.
3335// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
3336FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
3337{
3338#if defined(__aarch64__) || defined(_M_ARM64)
3339 return vreinterpretq_m128d_u64(
3340 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3341#else
3342 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3343 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3344 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3345 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3346 uint64_t d[2];
3347 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3348 d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3349
3350 return vreinterpretq_m128d_u64(vld1q_u64(d));
3351#endif
3352}
3353
3354// Compare the lower double-precision (64-bit) floating-point elements in a and
3355// b for less-than-or-equal, store the result in the lower element of dst, and
3356// copy the upper element from a to the upper element of dst.
3357// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
3358FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
3359{
3360#if defined(__aarch64__) || defined(_M_ARM64)
3361 return _mm_move_sd(a, _mm_cmple_pd(a, b));
3362#else
3363 // expand "_mm_cmple_pd()" to reduce unnecessary operations
3364 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3365 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3366 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3367 uint64_t d[2];
3368 d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3369 d[1] = a1;
3370
3371 return vreinterpretq_m128d_u64(vld1q_u64(d));
3372#endif
3373}
3374
3375// Compare packed signed 16-bit integers in a and b for less-than, and store the
3376// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
3377// order of the operands switched.
3378// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
3379FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
3380{
3381 return vreinterpretq_m128i_u16(
3382 vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
3383}
3384
3385// Compare packed signed 32-bit integers in a and b for less-than, and store the
3386// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
3387// order of the operands switched.
3388// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
3389FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
3390{
3391 return vreinterpretq_m128i_u32(
3392 vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
3393}
3394
3395// Compare packed signed 8-bit integers in a and b for less-than, and store the
3396// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
3397// order of the operands switched.
3398// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
3399FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
3400{
3401 return vreinterpretq_m128i_u8(
3402 vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
3403}
3404
3405// Compare packed double-precision (64-bit) floating-point elements in a and b
3406// for less-than, and store the results in dst.
3407// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
3408FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
3409{
3410#if defined(__aarch64__) || defined(_M_ARM64)
3411 return vreinterpretq_m128d_u64(
3412 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
3413#else
3414 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3415 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3416 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3417 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3418 uint64_t d[2];
3419 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3420 d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
3421
3422 return vreinterpretq_m128d_u64(vld1q_u64(d));
3423#endif
3424}
3425
3426// Compare the lower double-precision (64-bit) floating-point elements in a and
3427// b for less-than, store the result in the lower element of dst, and copy the
3428// upper element from a to the upper element of dst.
3429// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
3430FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
3431{
3432#if defined(__aarch64__) || defined(_M_ARM64)
3433 return _mm_move_sd(a, _mm_cmplt_pd(a, b));
3434#else
3435 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3436 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3437 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3438 uint64_t d[2];
3439 d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
3440 d[1] = a1;
3441
3442 return vreinterpretq_m128d_u64(vld1q_u64(d));
3443#endif
3444}
3445
3446// Compare packed double-precision (64-bit) floating-point elements in a and b
3447// for not-equal, and store the results in dst.
3448// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
3449FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
3450{
3451#if defined(__aarch64__) || defined(_M_ARM64)
3452 return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
3453 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
3454#else
3455 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
3456 uint32x4_t cmp =
3457 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3458 uint32x4_t swapped = vrev64q_u32(cmp);
3459 return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
3460#endif
3461}
3462
3463// Compare the lower double-precision (64-bit) floating-point elements in a and
3464// b for not-equal, store the result in the lower element of dst, and copy the
3465// upper element from a to the upper element of dst.
3466// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
3467FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
3468{
3469 return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
3470}
3471
3472// Compare packed double-precision (64-bit) floating-point elements in a and b
3473// for not-greater-than-or-equal, and store the results in dst.
3474// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
3475FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
3476{
3477#if defined(__aarch64__) || defined(_M_ARM64)
3478 return vreinterpretq_m128d_u64(veorq_u64(
3479 vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3480 vdupq_n_u64(UINT64_MAX)));
3481#else
3482 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3483 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3484 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3485 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3486 uint64_t d[2];
3487 d[0] =
3488 !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3489 d[1] =
3490 !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3491
3492 return vreinterpretq_m128d_u64(vld1q_u64(d));
3493#endif
3494}
3495
3496// Compare the lower double-precision (64-bit) floating-point elements in a and
3497// b for not-greater-than-or-equal, store the result in the lower element of
3498// dst, and copy the upper element from a to the upper element of dst.
3499// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
3500FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
3501{
3502 return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
3503}
3504
3505// Compare packed double-precision (64-bit) floating-point elements in a and b
3506// for not-greater-than, and store the results in dst.
3507// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
3508FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
3509{
3510#if defined(__aarch64__) || defined(_M_ARM64)
3511 return vreinterpretq_m128d_u64(veorq_u64(
3512 vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3513 vdupq_n_u64(UINT64_MAX)));
3514#else
3515 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3516 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3517 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3518 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3519 uint64_t d[2];
3520 d[0] =
3521 !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3522 d[1] =
3523 !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3524
3525 return vreinterpretq_m128d_u64(vld1q_u64(d));
3526#endif
3527}
3528
3529// Compare the lower double-precision (64-bit) floating-point elements in a and
3530// b for not-greater-than, store the result in the lower element of dst, and
3531// copy the upper element from a to the upper element of dst.
3532// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
3533FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
3534{
3535 return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
3536}
3537
3538// Compare packed double-precision (64-bit) floating-point elements in a and b
3539// for not-less-than-or-equal, and store the results in dst.
3540// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
3541FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
3542{
3543#if defined(__aarch64__) || defined(_M_ARM64)
3544 return vreinterpretq_m128d_u64(veorq_u64(
3545 vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3546 vdupq_n_u64(UINT64_MAX)));
3547#else
3548 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3549 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3550 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3551 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3552 uint64_t d[2];
3553 d[0] =
3554 !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3555 d[1] =
3556 !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3557
3558 return vreinterpretq_m128d_u64(vld1q_u64(d));
3559#endif
3560}
3561
3562// Compare the lower double-precision (64-bit) floating-point elements in a and
3563// b for not-less-than-or-equal, store the result in the lower element of dst,
3564// and copy the upper element from a to the upper element of dst.
3565// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
3566FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
3567{
3568 return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
3569}
3570
3571// Compare packed double-precision (64-bit) floating-point elements in a and b
3572// for not-less-than, and store the results in dst.
3573// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
3574FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
3575{
3576#if defined(__aarch64__) || defined(_M_ARM64)
3577 return vreinterpretq_m128d_u64(veorq_u64(
3578 vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
3579 vdupq_n_u64(UINT64_MAX)));
3580#else
3581 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3582 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3583 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3584 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3585 uint64_t d[2];
3586 d[0] =
3587 !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
3588 d[1] =
3589 !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);
3590
3591 return vreinterpretq_m128d_u64(vld1q_u64(d));
3592#endif
3593}
3594
3595// Compare the lower double-precision (64-bit) floating-point elements in a and
3596// b for not-less-than, store the result in the lower element of dst, and copy
3597// the upper element from a to the upper element of dst.
3598// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
3599FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
3600{
3601 return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
3602}
3603
3604// Compare packed double-precision (64-bit) floating-point elements in a and b
3605// to see if neither is NaN, and store the results in dst.
3606// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
3607FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
3608{
3609#if defined(__aarch64__) || defined(_M_ARM64)
3610 // Excluding NaNs, any two floating point numbers can be compared.
3611 uint64x2_t not_nan_a =
3612 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3613 uint64x2_t not_nan_b =
3614 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3615 return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
3616#else
3617 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3618 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3619 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3620 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3621 uint64_t d[2];
3622 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3623 (*(double *) &b0) == (*(double *) &b0))
3624 ? ~UINT64_C(0)
3625 : UINT64_C(0);
3626 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3627 (*(double *) &b1) == (*(double *) &b1))
3628 ? ~UINT64_C(0)
3629 : UINT64_C(0);
3630
3631 return vreinterpretq_m128d_u64(vld1q_u64(d));
3632#endif
3633}
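// Illustrative sketch (added note, not part of the upstream sse2neon sources):
// the ordered compare of a value with itself yields an all-ones mask exactly
// where the lane is not NaN, which can be used to zero out NaN lanes. The
// variable v is hypothetical.
//
//     __m128d mask = _mm_cmpord_pd(v, v);  // ~0 where v is not NaN
//     __m128d clean = _mm_and_pd(v, mask); // NaN lanes become +0.0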
3634
3635// Compare the lower double-precision (64-bit) floating-point elements in a and
3636// b to see if neither is NaN, store the result in the lower element of dst, and
3637// copy the upper element from a to the upper element of dst.
3638// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
3639FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
3640{
3641#if defined(__aarch64__) || defined(_M_ARM64)
3642 return _mm_move_sd(a, _mm_cmpord_pd(a, b));
3643#else
3644 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3645 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3646 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3647 uint64_t d[2];
3648 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3649 (*(double *) &b0) == (*(double *) &b0))
3650 ? ~UINT64_C(0)
3651 : UINT64_C(0);
3652 d[1] = a1;
3653
3654 return vreinterpretq_m128d_u64(vld1q_u64(d));
3655#endif
3656}
3657
3658// Compare packed double-precision (64-bit) floating-point elements in a and b
3659// to see if either is NaN, and store the results in dst.
3660// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
3661FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
3662{
3663#if defined(__aarch64__) || defined(_M_ARM64)
3664 // Two NaNs are not equal in comparison operation.
3665 uint64x2_t not_nan_a =
3666 vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
3667 uint64x2_t not_nan_b =
3668 vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
3669 return vreinterpretq_m128d_s32(
3670 vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
3671#else
3672 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3673 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3674 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3675 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
3676 uint64_t d[2];
3677 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3678 (*(double *) &b0) == (*(double *) &b0))
3679 ? UINT64_C(0)
3680 : ~UINT64_C(0);
3681 d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
3682 (*(double *) &b1) == (*(double *) &b1))
3683 ? UINT64_C(0)
3684 : ~UINT64_C(0);
3685
3686 return vreinterpretq_m128d_u64(vld1q_u64(d));
3687#endif
3688}
3689
3690// Compare the lower double-precision (64-bit) floating-point elements in a and
3691// b to see if either is NaN, store the result in the lower element of dst, and
3692// copy the upper element from a to the upper element of dst.
3693// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
3694FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
3695{
3696#if defined(__aarch64__) || defined(_M_ARM64)
3697 return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
3698#else
3699 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3700 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3701 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
3702 uint64_t d[2];
3703 d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
3704 (*(double *) &b0) == (*(double *) &b0))
3705 ? UINT64_C(0)
3706 : ~UINT64_C(0);
3707 d[1] = a1;
3708
3709 return vreinterpretq_m128d_u64(vld1q_u64(d));
3710#endif
3711}
3712
3713// Compare the lower double-precision (64-bit) floating-point element in a and b
3714// for greater-than-or-equal, and return the boolean result (0 or 1).
3715// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
3716FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
3717{
3718#if defined(__aarch64__) || defined(_M_ARM64)
3719 return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
3720#else
3721 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3722 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3723
3724 return (*(double *) &a0 >= *(double *) &b0);
3725#endif
3726}
3727
3728// Compare the lower double-precision (64-bit) floating-point element in a and b
3729// for greater-than, and return the boolean result (0 or 1).
3730// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
3731FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
3732{
3733#if defined(__aarch64__) || defined(_M_ARM64)
3734 return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
3735#else
3736 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3737 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3738
3739 return (*(double *) &a0 > *(double *) &b0);
3740#endif
3741}
3742
3743// Compare the lower double-precision (64-bit) floating-point element in a and b
3744// for less-than-or-equal, and return the boolean result (0 or 1).
3745// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
3746FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
3747{
3748#if defined(__aarch64__) || defined(_M_ARM64)
3749 return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
3750#else
3751 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3752 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3753
3754 return (*(double *) &a0 <= *(double *) &b0);
3755#endif
3756}
3757
3758// Compare the lower double-precision (64-bit) floating-point element in a and b
3759// for less-than, and return the boolean result (0 or 1).
3760// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
3761FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
3762{
3763#if defined(__aarch64__) || defined(_M_ARM64)
3764 return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
3765#else
3766 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
3767 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
3768
3769 return (*(double *) &a0 < *(double *) &b0);
3770#endif
3771}
3772
3773// Compare the lower double-precision (64-bit) floating-point element in a and b
3774// for equality, and return the boolean result (0 or 1).
3775// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
3776FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
3777{
3778#if defined(__aarch64__) || defined(_M_ARM64)
3779 return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
3780#else
3781 uint32x4_t a_not_nan =
3782 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
3783 uint32x4_t b_not_nan =
3784 vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
3785 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
3786 uint32x4_t a_eq_b =
3787 vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
3788 uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
3789 vreinterpretq_u64_u32(a_eq_b));
3790 return vgetq_lane_u64(and_results, 0) & 0x1;
3791#endif
3792}
3793
3794// Compare the lower double-precision (64-bit) floating-point element in a and b
3795// for not-equal, and return the boolean result (0 or 1).
3796// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
3797FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
3798{
3799 return !_mm_comieq_sd(a, b);
3800}
3801
3802// Convert packed signed 32-bit integers in a to packed double-precision
3803// (64-bit) floating-point elements, and store the results in dst.
3804// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
3805FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
3806{
3807#if defined(__aarch64__) || defined(_M_ARM64)
3808 return vreinterpretq_m128d_f64(
3809 vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
3810#else
3811 double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
3812 double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
3813 return _mm_set_pd(a1, a0);
3814#endif
3815}
3816
3817// Convert packed signed 32-bit integers in a to packed single-precision
3818// (32-bit) floating-point elements, and store the results in dst.
3819// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
3820FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
3821{
3822 return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
3823}
3824
3825// Convert packed double-precision (64-bit) floating-point elements in a to
3826// packed 32-bit integers, and store the results in dst.
3827// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
3828FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
3829{
3830// vrnd32xq_f64 not supported on clang
3831#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
3832 float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
3833 int64x2_t integers = vcvtq_s64_f64(rounded);
3834 return vreinterpretq_m128i_s32(
3835 vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
3836#else
3837 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3838 double d0 = ((double *) &rnd)[0];
3839 double d1 = ((double *) &rnd)[1];
3840 return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
3841#endif
3842}
3843
3844// Convert packed double-precision (64-bit) floating-point elements in a to
3845// packed 32-bit integers, and store the results in dst.
3846// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
3847FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
3848{
3849 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3850 double d0 = ((double *) &rnd)[0];
3851 double d1 = ((double *) &rnd)[1];
3852 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
3853 return vreinterpret_m64_s32(vld1_s32(data));
3854}
3855
3856// Convert packed double-precision (64-bit) floating-point elements in a to
3857// packed single-precision (32-bit) floating-point elements, and store the
3858// results in dst.
3859// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
3860FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
3861{
3862#if defined(__aarch64__) || defined(_M_ARM64)
3863 float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
3864 return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
3865#else
3866 float a0 = (float) ((double *) &a)[0];
3867 float a1 = (float) ((double *) &a)[1];
3868 return _mm_set_ps(0, 0, a1, a0);
3869#endif
3870}
3871
3872// Convert packed signed 32-bit integers in a to packed double-precision
3873// (64-bit) floating-point elements, and store the results in dst.
3874// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
3875FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
3876{
3877#if defined(__aarch64__) || defined(_M_ARM64)
3878 return vreinterpretq_m128d_f64(
3879 vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
3880#else
3881 double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
3882 double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
3883 return _mm_set_pd(a1, a0);
3884#endif
3885}
3886
3887// Convert packed single-precision (32-bit) floating-point elements in a to
3888// packed 32-bit integers, and store the results in dst.
3889// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
3890// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3891// does not support! It is supported on ARMv8-A however.
3892FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3893{
3894#if defined(__ARM_FEATURE_FRINT)
3895 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
3896#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
3897 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3898 switch (_MM_GET_ROUNDING_MODE()) {
3899 case _MM_ROUND_NEAREST:
3900 return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3901 case _MM_ROUND_DOWN:
3902 return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3903 case _MM_ROUND_UP:
3904 return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3905 default: // _MM_ROUND_TOWARD_ZERO
3906 return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3907 }
3908#else
3909 float *f = (float *) &a;
3910 switch (_MM_GET_ROUNDING_MODE()) {
3911 case _MM_ROUND_NEAREST: {
3912 uint32x4_t signmask = vdupq_n_u32(0x80000000);
3913 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3914 vdupq_n_f32(0.5f)); /* +/- 0.5 */
3915 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3916 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3917 int32x4_t r_trunc = vcvtq_s32_f32(
3918 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3919 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3920 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3921 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3922 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3923 float32x4_t delta = vsubq_f32(
3924 vreinterpretq_f32_m128(a),
3925 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3926 uint32x4_t is_delta_half =
3927 vceqq_f32(delta, half); /* delta == +/- 0.5 */
3928 return vreinterpretq_m128i_s32(
3929 vbslq_s32(is_delta_half, r_even, r_normal));
3930 }
3931 case _MM_ROUND_DOWN:
3932 return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3933 floorf(f[0]));
3934 case _MM_ROUND_UP:
3935 return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3936 ceilf(f[0]));
3937 default: // _MM_ROUND_TOWARD_ZERO
3938 return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3939 (int32_t) f[0]);
3940 }
3941#endif
3942}
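// Illustrative sketch (added note, not part of the upstream sse2neon sources):
// the conversion honours the current rounding mode, so ties round to even
// under the default _MM_ROUND_NEAREST setting.
//
//     _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
//     __m128i r = _mm_cvtps_epi32(_mm_set_ps(2.5f, 1.5f, -0.5f, 0.5f));
//     // lanes 0..3 of r: 0, 0, 2, 2 (0.5 and -0.5 round to 0; 1.5 and 2.5 to 2)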
3943
3944// Convert packed single-precision (32-bit) floating-point elements in a to
3945// packed double-precision (64-bit) floating-point elements, and store the
3946// results in dst.
3947// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
3948FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
3949{
3950#if defined(__aarch64__) || defined(_M_ARM64)
3951 return vreinterpretq_m128d_f64(
3952 vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
3953#else
3954 double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
3955 double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
3956 return _mm_set_pd(a1, a0);
3957#endif
3958}
3959
3960// Copy the lower double-precision (64-bit) floating-point element of a to dst.
3961// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
3962FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
3963{
3964#if defined(__aarch64__) || defined(_M_ARM64)
3965 return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3966#else
3967 return ((double *) &a)[0];
3968#endif
3969}
3970
3971// Convert the lower double-precision (64-bit) floating-point element in a to a
3972// 32-bit integer, and store the result in dst.
3973// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
3974FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
3975{
3976#if defined(__aarch64__) || defined(_M_ARM64)
3977 return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3978#else
3979 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3980 double ret = ((double *) &rnd)[0];
3981 return (int32_t) ret;
3982#endif
3983}
3984
3985// Convert the lower double-precision (64-bit) floating-point element in a to a
3986// 64-bit integer, and store the result in dst.
3987// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
3988FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
3989{
3990#if defined(__aarch64__) || defined(_M_ARM64)
3991 return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3992#else
3993 __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3994 double ret = ((double *) &rnd)[0];
3995 return (int64_t) ret;
3996#endif
3997}
3998
3999// Convert the lower double-precision (64-bit) floating-point element in a to a
4000// 64-bit integer, and store the result in dst.
4001// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
4002#define _mm_cvtsd_si64x _mm_cvtsd_si64
4003
4004// Convert the lower double-precision (64-bit) floating-point element in b to a
4005// single-precision (32-bit) floating-point element, store the result in the
4006// lower element of dst, and copy the upper 3 packed elements from a to the
4007// upper elements of dst.
4008// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
4009FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4010{
4011#if defined(__aarch64__) || defined(_M_ARM64)
4012 return vreinterpretq_m128_f32(vsetq_lane_f32(
4013 vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4014 vreinterpretq_f32_m128(a), 0));
4015#else
4016 return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4017 vreinterpretq_f32_m128(a), 0));
4018#endif
4019}
4020
4021// Copy the lower 32-bit integer in a to dst.
4022// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
4023FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4024{
4025 return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4026}
4027
4028// Copy the lower 64-bit integer in a to dst.
4029// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
4030FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4031{
4032 return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4033}
4034
4035// Copy the lower 64-bit integer in a to dst.
4036// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4037#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4038
4039// Convert the signed 32-bit integer b to a double-precision (64-bit)
4040// floating-point element, store the result in the lower element of dst, and
4041// copy the upper element from a to the upper element of dst.
4042// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
4043FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4044{
4045#if defined(__aarch64__) || defined(_M_ARM64)
4046 return vreinterpretq_m128d_f64(
4047 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4048#else
4049 double bf = (double) b;
4050 return vreinterpretq_m128d_s64(
4051 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4052#endif
4053}
4054
4055// Copy the lower 64-bit integer in a to dst.
4056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4057#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4058
4059// Copy 32-bit integer a to the lower elements of dst, and zero the upper
4060// elements of dst.
4061// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
4062FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4063{
4064 return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4065}
4066
4067// Convert the signed 64-bit integer b to a double-precision (64-bit)
4068// floating-point element, store the result in the lower element of dst, and
4069// copy the upper element from a to the upper element of dst.
4070// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
4071FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4072{
4073#if defined(__aarch64__) || defined(_M_ARM64)
4074 return vreinterpretq_m128d_f64(
4075 vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4076#else
4077 double bf = (double) b;
4078 return vreinterpretq_m128d_s64(
4079 vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4080#endif
4081}
4082
4083// Copy 64-bit integer a to the lower element of dst, and zero the upper
4084// element.
4085// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
4086FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4087{
4088 return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4089}
4090
4091// Copy 64-bit integer a to the lower element of dst, and zero the upper
4092// element.
4093// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
4094#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4095
4096// Convert the signed 64-bit integer b to a double-precision (64-bit)
4097// floating-point element, store the result in the lower element of dst, and
4098// copy the upper element from a to the upper element of dst.
4099// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
4100#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4101
4102// Convert the lower single-precision (32-bit) floating-point element in b to a
4103// double-precision (64-bit) floating-point element, store the result in the
4104// lower element of dst, and copy the upper element from a to the upper element
4105// of dst.
4106// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
4107FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4108{
4109 double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4110#if defined(__aarch64__) || defined(_M_ARM64)
4111 return vreinterpretq_m128d_f64(
4112 vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4113#else
4114 return vreinterpretq_m128d_s64(
4115 vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4116#endif
4117}
4118
4119// Convert packed double-precision (64-bit) floating-point elements in a to
4120// packed 32-bit integers with truncation, and store the results in dst.
4121// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
4122FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4123{
4124 double a0 = ((double *) &a)[0];
4125 double a1 = ((double *) &a)[1];
4126 return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4127}
4128
4129// Convert packed double-precision (64-bit) floating-point elements in a to
4130// packed 32-bit integers with truncation, and store the results in dst.
4131// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
4132FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4133{
4134 double a0 = ((double *) &a)[0];
4135 double a1 = ((double *) &a)[1];
4136 int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4137 return vreinterpret_m64_s32(vld1_s32(data));
4138}
4139
4140// Convert packed single-precision (32-bit) floating-point elements in a to
4141// packed 32-bit integers with truncation, and store the results in dst.
4142// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
4143FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4144{
4145 return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4146}
4147
4148// Convert the lower double-precision (64-bit) floating-point element in a to a
4149// 32-bit integer with truncation, and store the result in dst.
4150// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
4151FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4152{
4153 double ret = *((double *) &a);
4154 return (int32_t) ret;
4155}
4156
4157// Convert the lower double-precision (64-bit) floating-point element in a to a
4158// 64-bit integer with truncation, and store the result in dst.
4159// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
4160FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4161{
4162#if defined(__aarch64__) || defined(_M_ARM64)
4163 return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4164#else
4165 double ret = *((double *) &a);
4166 return (int64_t) ret;
4167#endif
4168}
4169
4170// Convert the lower double-precision (64-bit) floating-point element in a to a
4171// 64-bit integer with truncation, and store the result in dst.
4172// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
4173#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4174
4175// Divide packed double-precision (64-bit) floating-point elements in a by
4176// packed elements in b, and store the results in dst.
4177// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
4178FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4179{
4180#if defined(__aarch64__) || defined(_M_ARM64)
4181 return vreinterpretq_m128d_f64(
4182 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4183#else
4184 double *da = (double *) &a;
4185 double *db = (double *) &b;
4186 double c[2];
4187 c[0] = da[0] / db[0];
4188 c[1] = da[1] / db[1];
4189 return vld1q_f32((float32_t *) c);
4190#endif
4191}
4192
4193// Divide the lower double-precision (64-bit) floating-point element in a by the
4194// lower double-precision (64-bit) floating-point element in b, store the result
4195// in the lower element of dst, and copy the upper element from a to the upper
4196// element of dst.
4197// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
4198FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4199{
4200#if defined(__aarch64__) || defined(_M_ARM64)
4201 float64x2_t tmp =
4202 vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4203 return vreinterpretq_m128d_f64(
4204 vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4205#else
4206 return _mm_move_sd(a, _mm_div_pd(a, b));
4207#endif
4208}
4209
4210// Extract a 16-bit integer from a, selected with imm8, and store the result in
4211// the lower element of dst.
4212// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
4213// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4214#define _mm_extract_epi16(a, imm) \
4215 vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4216
4217// Copy a to dst, and insert the 16-bit integer i into dst at the location
4218// specified by imm8.
4219// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
4220// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4221// __constrange(0,8) int imm)
4222#define _mm_insert_epi16(a, b, imm) \
4223 vreinterpretq_m128i_s16( \
4224 vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4225
4226// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
4227// elements) from memory into dst. mem_addr must be aligned on a 16-byte
4228// boundary or a general-protection exception may be generated.
4229// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
4230FORCE_INLINE __m128d _mm_load_pd(const double *p)
4231{
4232#if defined(__aarch64__) || defined(_M_ARM64)
4233 return vreinterpretq_m128d_f64(vld1q_f64(p));
4234#else
4235 const float *fp = (const float *) p;
4236 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4237 return vreinterpretq_m128d_f32(vld1q_f32(data));
4238#endif
4239}
4240
4241// Load a double-precision (64-bit) floating-point element from memory into both
4242// elements of dst.
4243// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
4244#define _mm_load_pd1 _mm_load1_pd
4245
4246// Load a double-precision (64-bit) floating-point element from memory into the
4247// lower of dst, and zero the upper element. mem_addr does not need to be
4248// aligned on any particular boundary.
4249// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
4250FORCE_INLINE __m128d _mm_load_sd(const double *p)
4251{
4252#if defined(__aarch64__) || defined(_M_ARM64)
4253 return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4254#else
4255 const float *fp = (const float *) p;
4256 float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4257 return vreinterpretq_m128d_f32(vld1q_f32(data));
4258#endif
4259}
4260
4261// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
4262// on a 16-byte boundary or a general-protection exception may be generated.
4263// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
4264FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4265{
4266 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4267}
4268
4269// Load a double-precision (64-bit) floating-point element from memory into both
4270// elements of dst.
4271// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
4272FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4273{
4274#if defined(__aarch64__) || defined(_M_ARM64)
4275 return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4276#else
4277 return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4278#endif
4279}
4280
4281// Load a double-precision (64-bit) floating-point element from memory into the
4282// upper element of dst, and copy the lower element from a to dst. mem_addr does
4283// not need to be aligned on any particular boundary.
4284// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
4285FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4286{
4287#if defined(__aarch64__) || defined(_M_ARM64)
4288 return vreinterpretq_m128d_f64(
4289 vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4290#else
4291 return vreinterpretq_m128d_f32(vcombine_f32(
4292 vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4293#endif
4294}
4295
4296// Load 64-bit integer from memory into the first element of dst.
4297// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
4298FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4299{
4300 /* Load the lower 64 bits of the value pointed to by p into the
4301 * lower 64 bits of the result, zeroing the upper 64 bits of the result.
4302 */
4303 return vreinterpretq_m128i_s32(
4304 vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4305}
4306
4307// Load a double-precision (64-bit) floating-point element from memory into the
4308// lower element of dst, and copy the upper element from a to dst. mem_addr does
4309// not need to be aligned on any particular boundary.
4310// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
4311FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4312{
4313#if defined(__aarch64__) || defined(_M_ARM64)
4314 return vreinterpretq_m128d_f64(
4315 vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4316#else
4317 return vreinterpretq_m128d_f32(
4318 vcombine_f32(vld1_f32((const float *) p),
4319 vget_high_f32(vreinterpretq_f32_m128d(a))));
4320#endif
4321}
4322
4323// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4324// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4325// general-protection exception may be generated.
4326// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
4327FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4328{
4329#if defined(__aarch64__) || defined(_M_ARM64)
4330 float64x2_t v = vld1q_f64(p);
4331 return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4332#else
4333 int64x2_t v = vld1q_s64((const int64_t *) p);
4334 return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4335#endif
4336}
4337
4338// Load 2 double-precision (64-bit) floating-point elements from memory into dst. mem_addr does not need to be aligned on any particular boundary.
4339// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
4340FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4341{
4342 return _mm_load_pd(p);
4343}
4344
4345// Load 128-bits of integer data from memory into dst. mem_addr does not need to
4346// be aligned on any particular boundary.
4347// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
4348FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4349{
4350 return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4351}
4352
4353// Load unaligned 32-bit integer from memory into the first element of dst.
4354// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
4355FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4356{
4357 return vreinterpretq_m128i_s32(
4358 vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4359}
4360
4361// Multiply packed signed 16-bit integers in a and b, producing intermediate
4362// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
4363// 32-bit integers, and pack the results in dst.
4364// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
4365FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4366{
4367 int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4368 vget_low_s16(vreinterpretq_s16_m128i(b)));
4369#if defined(__aarch64__) || defined(_M_ARM64)
4370 int32x4_t high =
4371 vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
4372
4373 return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
4374#else
4375 int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4376 vget_high_s16(vreinterpretq_s16_m128i(b)));
4377
4378 int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4379 int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4380
4381 return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4382#endif
4383}
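
// Illustrative usage sketch (not part of sse2neon): one multiply-accumulate
// step of a signed 16-bit fixed-point dot product built on _mm_madd_epi16.
// The helper name is hypothetical; only intrinsics defined earlier in this
// file are used.
FORCE_INLINE __m128i example_madd_accumulate(__m128i acc, __m128i a, __m128i b)
{
    // each 32-bit lane of acc gains a[2i]*b[2i] + a[2i+1]*b[2i+1]
    return _mm_add_epi32(acc, _mm_madd_epi16(a, b));
}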
4384
4385// Conditionally store 8-bit integer elements from a into memory using mask
4386// (elements are not stored when the highest bit is not set in the corresponding
4387// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4388// on any particular boundary.
4389// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
4390FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4391{
4392 int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4393 __m128 b = _mm_load_ps((const float *) mem_addr);
4394 int8x16_t masked =
4395 vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4396 vreinterpretq_s8_m128(b));
4397 vst1q_s8((int8_t *) mem_addr, masked);
4398}
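
// Illustrative usage sketch (not part of sse2neon): overwrite only the bytes
// of dst whose corresponding byte in data is greater than the threshold, by
// building a per-byte mask with _mm_cmpgt_epi8. The helper name is
// hypothetical.
FORCE_INLINE void example_store_if_greater(__m128i data, __m128i threshold, char *dst)
{
    // 0xFF where data > threshold (signed compare), 0x00 elsewhere
    __m128i mask = _mm_cmpgt_epi8(data, threshold);
    // only the bytes selected by the mask's sign bits are written to dst
    _mm_maskmoveu_si128(data, mask, dst);
}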
4399
4400// Compare packed signed 16-bit integers in a and b, and store packed maximum
4401// values in dst.
4402// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
4403FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4404{
4405 return vreinterpretq_m128i_s16(
4406 vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4407}
4408
4409// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4410// values in dst.
4411// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
4412FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4413{
4414 return vreinterpretq_m128i_u8(
4415 vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4416}
4417
4418// Compare packed double-precision (64-bit) floating-point elements in a and b,
4419// and store packed maximum values in dst.
4420// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
4421FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4422{
4423#if defined(__aarch64__) || defined(_M_ARM64)
4424#if SSE2NEON_PRECISE_MINMAX
4425 float64x2_t _a = vreinterpretq_f64_m128d(a);
4426 float64x2_t _b = vreinterpretq_f64_m128d(b);
4427 return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4428#else
4429 return vreinterpretq_m128d_f64(
4430 vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4431#endif
4432#else
4433 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4434 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4435 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4436 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4437 uint64_t d[2];
4438 d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4439 d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4440
4441 return vreinterpretq_m128d_u64(vld1q_u64(d));
4442#endif
4443}
4444
4445// Compare the lower double-precision (64-bit) floating-point elements in a and
4446// b, store the maximum value in the lower element of dst, and copy the upper
4447// element from a to the upper element of dst.
4448// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
4449FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4450{
4451#if defined(__aarch64__) || defined(_M_ARM64)
4452 return _mm_move_sd(a, _mm_max_pd(a, b));
4453#else
4454 double *da = (double *) &a;
4455 double *db = (double *) &b;
4456 double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4457 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4458#endif
4459}
4460
4461// Compare packed signed 16-bit integers in a and b, and store packed minimum
4462// values in dst.
4463// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
4464FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4465{
4466 return vreinterpretq_m128i_s16(
4467 vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4468}
4469
4470// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4471// values in dst.
4472// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
4473FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4474{
4475 return vreinterpretq_m128i_u8(
4476 vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4477}
4478
4479// Compare packed double-precision (64-bit) floating-point elements in a and b,
4480// and store packed minimum values in dst.
4481// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
4482FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4483{
4484#if defined(__aarch64__) || defined(_M_ARM64)
4485#if SSE2NEON_PRECISE_MINMAX
4486 float64x2_t _a = vreinterpretq_f64_m128d(a);
4487 float64x2_t _b = vreinterpretq_f64_m128d(b);
4488 return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4489#else
4490 return vreinterpretq_m128d_f64(
4491 vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4492#endif
4493#else
4494 uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4495 uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4496 uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4497 uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4498 uint64_t d[2];
4499 d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4500 d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4501 return vreinterpretq_m128d_u64(vld1q_u64(d));
4502#endif
4503}
4504
4505// Compare the lower double-precision (64-bit) floating-point elements in a and
4506// b, store the minimum value in the lower element of dst, and copy the upper
4507// element from a to the upper element of dst.
4508// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
4509FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4510{
4511#if defined(__aarch64__) || defined(_M_ARM64)
4512 return _mm_move_sd(a, _mm_min_pd(a, b));
4513#else
4514 double *da = (double *) &a;
4515 double *db = (double *) &b;
4516 double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4517 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4518#endif
4519}
4520
4521// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4522// upper element.
4523// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
4524FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4525{
4526 return vreinterpretq_m128i_s64(
4527 vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4528}
4529
4530// Move the lower double-precision (64-bit) floating-point element from b to the
4531// lower element of dst, and copy the upper element from a to the upper element
4532// of dst.
4533// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
4534FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4535{
4536 return vreinterpretq_m128d_f32(
4537 vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4538 vget_high_f32(vreinterpretq_f32_m128d(a))));
4539}
4540
4541// Create mask from the most significant bit of each 8-bit element in a, and
4542// store the result in dst.
4543// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
4544FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4545{
4546 // Use increasingly wide shifts+adds to collect the sign bits
4547 // together.
4548 // Since the widening shifts would be rather confusing to follow in little
4549 // endian, everything will be illustrated in big endian order instead. This
4550 // has a different result - the bits would actually be reversed on a big
4551 // endian machine.
4552
4553 // Starting input (only half the elements are shown):
4554 // 89 ff 1d c0 00 10 99 33
4555 uint8x16_t input = vreinterpretq_u8_m128i(a);
4556
4557 // Shift out everything but the sign bits with an unsigned shift right.
4558 //
4559 // Bytes of the vector:
4560 // 89 ff 1d c0 00 10 99 33
4561 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4562 // | | | | | | | |
4563 // 01 01 00 01 00 00 01 00
4564 //
4565 // Bits of first important lane(s):
4566 // 10001001 (89)
4567 // \______
4568 // |
4569 // 00000001 (01)
4570 uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4571
4572 // Merge the even lanes together with a 16-bit unsigned shift right + add.
4573 // 'xx' represents garbage data which will be ignored in the final result.
4574 // In the important bytes, the add functions like a binary OR.
4575 //
4576 // 01 01 00 01 00 00 01 00
4577 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4578 // \| \| \| \|
4579 // xx 03 xx 01 xx 00 xx 02
4580 //
4581 // 00000001 00000001 (01 01)
4582 // \_______ |
4583 // \|
4584 // xxxxxxxx xxxxxx11 (xx 03)
4585 uint32x4_t paired16 =
4586 vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4587
4588 // Repeat with a wider 32-bit shift + add.
4589 // xx 03 xx 01 xx 00 xx 02
4590 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4591 // 14))
4592 // \| \|
4593 // xx xx xx 0d xx xx xx 02
4594 //
4595 // 00000011 00000001 (03 01)
4596 // \\_____ ||
4597 // '----.\||
4598 // xxxxxxxx xxxx1101 (xx 0d)
4599 uint64x2_t paired32 =
4600 vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4601
4602 // Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4603 // lanes. xx xx xx 0d xx xx xx 02
4604 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4605 // 28))
4606 // \|
4607 // xx xx xx xx xx xx xx d2
4608 //
4609 // 00001101 00000010 (0d 02)
4610 // \ \___ | |
4611 // '---. \| |
4612 // xxxxxxxx 11010010 (xx d2)
4613 uint8x16_t paired64 =
4614 vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4615
4616 // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4617 // xx xx xx xx xx xx xx d2
4618 // || return paired64[0]
4619 // d2
4620 // Note: Little endian would return the correct value 4b (01001011) instead.
4621 return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4622}
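
// Scalar reference sketch (not part of sse2neon) for the mapping implemented
// above: bit i of the result is the most significant bit of byte i of the
// input. The helper name is hypothetical and exists only to document the
// intended little-endian result.
FORCE_INLINE int example_movemask_epi8_scalar(const uint8_t bytes[16])
{
    int mask = 0;
    for (int i = 0; i < 16; i++)
        mask |= (bytes[i] >> 7) << i;
    return mask;
}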
4623
4624// Set each bit of mask dst based on the most significant bit of the
4625// corresponding packed double-precision (64-bit) floating-point element in a.
4626// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
4627FORCE_INLINE int _mm_movemask_pd(__m128d a)
4628{
4629 uint64x2_t input = vreinterpretq_u64_m128d(a);
4630 uint64x2_t high_bits = vshrq_n_u64(input, 63);
4631 return (int) (vgetq_lane_u64(high_bits, 0) |
4632 (vgetq_lane_u64(high_bits, 1) << 1));
4633}
4634
4635// Copy the lower 64-bit integer in a to dst.
4636// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
4637FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4638{
4639 return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4640}
4641
4642// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4643// element.
4644// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
4645FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4646{
4647 return vreinterpretq_m128i_s64(
4648 vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4649}
4650
4651// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4652// a and b, and store the unsigned 64-bit results in dst.
4653// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
4654FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4655{
4656 // vmull_u32 upcasts instead of masking, so we downcast.
4657 uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4658 uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4659 return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4660}
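
// Illustrative usage sketch (not part of sse2neon): the widening 32x32->64
// multiply of lane 0, extracted as a scalar. The helper name is hypothetical.
FORCE_INLINE uint64_t example_mul_u32_widening(__m128i a, __m128i b)
{
    // lane 0 of _mm_mul_epu32 holds (uint64_t) a[0] * (uint64_t) b[0]
    return (uint64_t) _mm_cvtsi128_si64(_mm_mul_epu32(a, b));
}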
4661
4662// Multiply packed double-precision (64-bit) floating-point elements in a and b,
4663// and store the results in dst.
4664// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
4665FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4666{
4667#if defined(__aarch64__) || defined(_M_ARM64)
4668 return vreinterpretq_m128d_f64(
4669 vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4670#else
4671 double *da = (double *) &a;
4672 double *db = (double *) &b;
4673 double c[2];
4674 c[0] = da[0] * db[0];
4675 c[1] = da[1] * db[1];
4676 return vld1q_f32((float32_t *) c);
4677#endif
4678}
4679
4680// Multiply the lower double-precision (64-bit) floating-point element in a and
4681// b, store the result in the lower element of dst, and copy the upper element
4682// from a to the upper element of dst.
4683// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
4684FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4685{
4686 return _mm_move_sd(a, _mm_mul_pd(a, b));
4687}
4688
4689// Multiply the low unsigned 32-bit integers from a and b, and store the
4690// unsigned 64-bit result in dst.
4691// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
4692FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4693{
4694 return vreinterpret_m64_u64(vget_low_u64(
4695 vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4696}
4697
4698// Multiply the packed signed 16-bit integers in a and b, producing intermediate
4699// 32-bit integers, and store the high 16 bits of the intermediate integers in
4700// dst.
4701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
4702FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4703{
4704 /* FIXME: issue with large values because of result saturation */
4705 // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4706 // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4707 // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4708 int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4709 int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4710 int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4711 int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4712 int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4713 int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4714 uint16x8x2_t r =
4715 vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4716 return vreinterpretq_m128i_u16(r.val[1]);
4717}
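
// Scalar reference sketch (not part of sse2neon) for the lane-wise result
// computed above: the high 16 bits of the full signed 32-bit product. The
// helper name is hypothetical.
FORCE_INLINE int16_t example_mulhi_epi16_scalar(int16_t a, int16_t b)
{
    return (int16_t) (((int32_t) a * (int32_t) b) >> 16);
}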
4718
4719// Multiply the packed unsigned 16-bit integers in a and b, producing
4720// intermediate 32-bit integers, and store the high 16 bits of the intermediate
4721// integers in dst.
4722// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
4723FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4724{
4725 uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4726 uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4727 uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4728#if defined(__aarch64__) || defined(_M_ARM64)
4729 uint32x4_t ab7654 =
4730 vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4731 uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4732 vreinterpretq_u16_u32(ab7654));
4733 return vreinterpretq_m128i_u16(r);
4734#else
4735 uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4736 uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4737 uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4738 uint16x8x2_t r =
4739 vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4740 return vreinterpretq_m128i_u16(r.val[1]);
4741#endif
4742}
4743
4744// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
4745// integers, and store the low 16 bits of the intermediate integers in dst.
4746// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
4747FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4748{
4749 return vreinterpretq_m128i_s16(
4750 vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4751}
4752
4753// Compute the bitwise OR of packed double-precision (64-bit) floating-point
4754// elements in a and b, and store the results in dst.
4755// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
4756FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4757{
4758 return vreinterpretq_m128d_s64(
4759 vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4760}
4761
4762// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
4763// and store the result in dst.
4764// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
4765FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4766{
4767 return vreinterpretq_m128i_s32(
4768 vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4769}
4770
4771// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4772// using signed saturation, and store the results in dst.
4773// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
4774FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4775{
4776 return vreinterpretq_m128i_s8(
4777 vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4778 vqmovn_s16(vreinterpretq_s16_m128i(b))));
4779}
4780
4781// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
4782// using signed saturation, and store the results in dst.
4783// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
4784FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4785{
4786 return vreinterpretq_m128i_s16(
4787 vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4788 vqmovn_s32(vreinterpretq_s32_m128i(b))));
4789}
4790
4791// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4792// using unsigned saturation, and store the results in dst.
4793// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
4794FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4795{
4796 return vreinterpretq_m128i_u8(
4797 vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4798 vqmovun_s16(vreinterpretq_s16_m128i(b))));
4799}
4800
4801// Pause the processor. This is typically used in spin-wait loops and depending
4802// on the x86 processor, typical delay values are in the 40-100 cycle range. The
4803// 'yield' instruction isn't a good fit because it's effectively a nop on most
4804// Arm cores. Experience with several databases has shown that an 'isb' is
4805// a reasonable approximation.
4806// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
4807FORCE_INLINE void _mm_pause(void)
4808{
4809#if defined(_MSC_VER)
4810 __isb(_ARM64_BARRIER_SY);
4811#else
4812 __asm__ __volatile__("isb\n");
4813#endif
4814}
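
// Illustrative spin-wait sketch (not part of sse2neon), following the note
// above: back off with _mm_pause while polling a flag. The helper name and
// flag type are hypothetical; production code would typically use C11
// atomics for the flag.
FORCE_INLINE void example_spin_wait(volatile int *flag)
{
    while (!*flag)
        _mm_pause();
}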
4815
4816// Compute the absolute differences of packed unsigned 8-bit integers in a and
4817// b, then horizontally sum each consecutive 8 differences to produce two
4818// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4819// 16 bits of 64-bit elements in dst.
4820// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
4821FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4822{
4823 uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4824 return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4825}
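
// Illustrative usage sketch (not part of sse2neon): total sum of absolute
// differences over 16 byte pairs, e.g. one block of a motion-estimation
// cost. The helper name is hypothetical. _mm_sad_epu8 leaves one partial sum
// in the low 16 bits of each 64-bit half, and each partial sum is at most
// 8 * 255, so two 16-bit extracts recover the full values.
FORCE_INLINE uint32_t example_sad16(__m128i a, __m128i b)
{
    __m128i sad = _mm_sad_epu8(a, b);
    return (uint32_t) _mm_extract_epi16(sad, 0) +
           (uint32_t) _mm_extract_epi16(sad, 4);
}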
4826
4827// Set packed 16-bit integers in dst with the supplied values.
4828// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
4829FORCE_INLINE __m128i _mm_set_epi16(short i7,
4830 short i6,
4831 short i5,
4832 short i4,
4833 short i3,
4834 short i2,
4835 short i1,
4836 short i0)
4837{
4838 int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4839 return vreinterpretq_m128i_s16(vld1q_s16(data));
4840}
4841
4842// Set packed 32-bit integers in dst with the supplied values.
4843// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
4844FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4845{
4846 int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4847 return vreinterpretq_m128i_s32(vld1q_s32(data));
4848}
4849
4850// Set packed 64-bit integers in dst with the supplied values.
4851// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
4852FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4853{
4854 return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4855}
4856
4857// Set packed 64-bit integers in dst with the supplied values.
4858// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
4859FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4860{
4861 return vreinterpretq_m128i_s64(
4862 vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4863}
4864
4865// Set packed 8-bit integers in dst with the supplied values.
4866// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
4867FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4868 signed char b14,
4869 signed char b13,
4870 signed char b12,
4871 signed char b11,
4872 signed char b10,
4873 signed char b9,
4874 signed char b8,
4875 signed char b7,
4876 signed char b6,
4877 signed char b5,
4878 signed char b4,
4879 signed char b3,
4880 signed char b2,
4881 signed char b1,
4882 signed char b0)
4883{
4884 int8_t ALIGN_STRUCT(16)
4885 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4886 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4887 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4888 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4889 return (__m128i) vld1q_s8(data);
4890}
4891
4892// Set packed double-precision (64-bit) floating-point elements in dst with the
4893// supplied values.
4894// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
4895FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4896{
4897 double ALIGN_STRUCT(16) data[2] = {e0, e1};
4898#if defined(__aarch64__) || defined(_M_ARM64)
4899 return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4900#else
4901 return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4902#endif
4903}
4904
4905// Broadcast double-precision (64-bit) floating-point value a to all elements of
4906// dst.
4907// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
4908#define _mm_set_pd1 _mm_set1_pd
4909
4910// Copy double-precision (64-bit) floating-point element a to the lower element
4911// of dst, and zero the upper element.
4912// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
4913FORCE_INLINE __m128d _mm_set_sd(double a)
4914{
4915#if defined(__aarch64__) || defined(_M_ARM64)
4916 return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
4917#else
4918 return _mm_set_pd(0, a);
4919#endif
4920}
4921
4922// Broadcast 16-bit integer a to all elements of dst.
4923// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
4924FORCE_INLINE __m128i _mm_set1_epi16(short w)
4925{
4926 return vreinterpretq_m128i_s16(vdupq_n_s16(w));
4927}
4928
4929// Broadcast 32-bit integer a to all elements of dst.
4930// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
4931FORCE_INLINE __m128i _mm_set1_epi32(int _i)
4932{
4933 return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
4934}
4935
4936// Broadcast 64-bit integer a to all elements of dst.
4937// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
4938FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
4939{
4940 return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0));
4941}
4942
4943// Broadcast 64-bit integer a to all elements of dst.
4944// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
4945FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
4946{
4947 return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
4948}
4949
4950// Broadcast 8-bit integer a to all elements of dst.
4951// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
4952FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
4953{
4954 return vreinterpretq_m128i_s8(vdupq_n_s8(w));
4955}
4956
4957// Broadcast double-precision (64-bit) floating-point value a to all elements of
4958// dst.
4959// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
4960FORCE_INLINE __m128d _mm_set1_pd(double d)
4961{
4962#if defined(__aarch64__) || defined(_M_ARM64)
4963 return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4964#else
4965 return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
4966#endif
4967}
4968
4969// Set packed 16-bit integers in dst with the supplied values in reverse order.
4970// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
4971FORCE_INLINE __m128i _mm_setr_epi16(short w0,
4972 short w1,
4973 short w2,
4974 short w3,
4975 short w4,
4976 short w5,
4977 short w6,
4978 short w7)
4979{
4980 int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
4981 return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
4982}
4983
4984// Set packed 32-bit integers in dst with the supplied values in reverse order.
4985// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
4986FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
4987{
4988 int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
4989 return vreinterpretq_m128i_s32(vld1q_s32(data));
4990}
4991
4992// Set packed 64-bit integers in dst with the supplied values in reverse order.
4993// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
4994FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
4995{
4996 return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
4997}
4998
4999// Set packed 8-bit integers in dst with the supplied values in reverse order.
5000// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
5001FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
5002 signed char b1,
5003 signed char b2,
5004 signed char b3,
5005 signed char b4,
5006 signed char b5,
5007 signed char b6,
5008 signed char b7,
5009 signed char b8,
5010 signed char b9,
5011 signed char b10,
5012 signed char b11,
5013 signed char b12,
5014 signed char b13,
5015 signed char b14,
5016 signed char b15)
5017{
5018 int8_t ALIGN_STRUCT(16)
5019 data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5020 (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5021 (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5022 (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5023 return (__m128i) vld1q_s8(data);
5024}
5025
5026// Set packed double-precision (64-bit) floating-point elements in dst with the
5027// supplied values in reverse order.
5028// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
5029FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5030{
5031 return _mm_set_pd(e0, e1);
5032}
5033
5034// Return vector of type __m128d with all elements set to zero.
5035// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
5036FORCE_INLINE __m128d _mm_setzero_pd(void)
5037{
5038#if defined(__aarch64__) || defined(_M_ARM64)
5039 return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5040#else
5041 return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5042#endif
5043}
5044
5045// Return vector of type __m128i with all elements set to zero.
5046// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
5047FORCE_INLINE __m128i _mm_setzero_si128(void)
5048{
5049 return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5050}
5051
5052// Shuffle 32-bit integers in a using the control in imm8, and store the results
5053// in dst.
5054// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
5055// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5056// __constrange(0,255) int imm)
5057#if defined(_sse2neon_shuffle)
5058#define _mm_shuffle_epi32(a, imm) \
5059 __extension__({ \
5060 int32x4_t _input = vreinterpretq_s32_m128i(a); \
5061 int32x4_t _shuf = \
5062 vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5063 ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5064 vreinterpretq_m128i_s32(_shuf); \
5065 })
5066#else // generic
5067#define _mm_shuffle_epi32(a, imm) \
5068 _sse2neon_define1( \
5069 __m128i, a, __m128i ret; switch (imm) { \
5070 case _MM_SHUFFLE(1, 0, 3, 2): \
5071 ret = _mm_shuffle_epi_1032(_a); \
5072 break; \
5073 case _MM_SHUFFLE(2, 3, 0, 1): \
5074 ret = _mm_shuffle_epi_2301(_a); \
5075 break; \
5076 case _MM_SHUFFLE(0, 3, 2, 1): \
5077 ret = _mm_shuffle_epi_0321(_a); \
5078 break; \
5079 case _MM_SHUFFLE(2, 1, 0, 3): \
5080 ret = _mm_shuffle_epi_2103(_a); \
5081 break; \
5082 case _MM_SHUFFLE(1, 0, 1, 0): \
5083 ret = _mm_shuffle_epi_1010(_a); \
5084 break; \
5085 case _MM_SHUFFLE(1, 0, 0, 1): \
5086 ret = _mm_shuffle_epi_1001(_a); \
5087 break; \
5088 case _MM_SHUFFLE(0, 1, 0, 1): \
5089 ret = _mm_shuffle_epi_0101(_a); \
5090 break; \
5091 case _MM_SHUFFLE(2, 2, 1, 1): \
5092 ret = _mm_shuffle_epi_2211(_a); \
5093 break; \
5094 case _MM_SHUFFLE(0, 1, 2, 2): \
5095 ret = _mm_shuffle_epi_0122(_a); \
5096 break; \
5097 case _MM_SHUFFLE(3, 3, 3, 2): \
5098 ret = _mm_shuffle_epi_3332(_a); \
5099 break; \
5100 case _MM_SHUFFLE(0, 0, 0, 0): \
5101 ret = _mm_shuffle_epi32_splat(_a, 0); \
5102 break; \
5103 case _MM_SHUFFLE(1, 1, 1, 1): \
5104 ret = _mm_shuffle_epi32_splat(_a, 1); \
5105 break; \
5106 case _MM_SHUFFLE(2, 2, 2, 2): \
5107 ret = _mm_shuffle_epi32_splat(_a, 2); \
5108 break; \
5109 case _MM_SHUFFLE(3, 3, 3, 3): \
5110 ret = _mm_shuffle_epi32_splat(_a, 3); \
5111 break; \
5112 default: \
5113 ret = _mm_shuffle_epi32_default(_a, (imm)); \
5114 break; \
5115 } _sse2neon_return(ret);)
5116#endif
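
// Illustrative usage sketches (not part of sse2neon): common shuffle
// patterns expressed with the _MM_SHUFFLE helper. The macro names are
// hypothetical.
// Broadcast 32-bit lane 0 to all four lanes.
#define EXAMPLE_BROADCAST_LANE0(a) _mm_shuffle_epi32((a), _MM_SHUFFLE(0, 0, 0, 0))
// Reverse the order of the four 32-bit lanes.
#define EXAMPLE_REVERSE_LANES(a) _mm_shuffle_epi32((a), _MM_SHUFFLE(0, 1, 2, 3))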
5117
5118// Shuffle double-precision (64-bit) floating-point elements using the control
5119// in imm8, and store the results in dst.
5120// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
5121#ifdef _sse2neon_shuffle
5122#define _mm_shuffle_pd(a, b, imm8) \
5123 vreinterpretq_m128d_s64( \
5124 vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5125 imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
5126#else
5127#define _mm_shuffle_pd(a, b, imm8) \
5128 _mm_castsi128_pd(_mm_set_epi64x( \
5129 vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5130 vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5131#endif
5132
5133// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5134// __constrange(0,255) int imm)
5135#if defined(_sse2neon_shuffle)
5136#define _mm_shufflehi_epi16(a, imm) \
5137 __extension__({ \
5138 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5139 int16x8_t _shuf = \
5140 vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5141 (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5142 (((imm) >> 6) & 0x3) + 4); \
5143 vreinterpretq_m128i_s16(_shuf); \
5144 })
5145#else // generic
5146#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5147#endif
5148
5149// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5150// __constrange(0,255) int imm)
5151#if defined(_sse2neon_shuffle)
5152#define _mm_shufflelo_epi16(a, imm) \
5153 __extension__({ \
5154 int16x8_t _input = vreinterpretq_s16_m128i(a); \
5155 int16x8_t _shuf = vshuffleq_s16( \
5156 _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5157 (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5158 vreinterpretq_m128i_s16(_shuf); \
5159 })
5160#else // generic
5161#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5162#endif
5163
5164// Shift packed 16-bit integers in a left by count while shifting in zeros, and
5165// store the results in dst.
5166// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
5167FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
5168{
5169 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5170 if (_sse2neon_unlikely(c & ~15))
5171 return _mm_setzero_si128();
5172
5173 int16x8_t vc = vdupq_n_s16((int16_t) c);
5174 return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
5175}
5176
5177// Shift packed 32-bit integers in a left by count while shifting in zeros, and
5178// store the results in dst.
5179// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
5180FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
5181{
5182 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5183 if (_sse2neon_unlikely(c & ~31))
5184 return _mm_setzero_si128();
5185
5186 int32x4_t vc = vdupq_n_s32((int32_t) c);
5187 return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
5188}
5189
5190// Shift packed 64-bit integers in a left by count while shifting in zeros, and
5191// store the results in dst.
5192// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
5193FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
5194{
5195 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5196 if (_sse2neon_unlikely(c & ~63))
5197 return _mm_setzero_si128();
5198
5199 int64x2_t vc = vdupq_n_s64((int64_t) c);
5200 return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
5201}
5202
5203// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
5204// store the results in dst.
5205// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
5206FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
5207{
5208 if (_sse2neon_unlikely(imm & ~15))
5209 return _mm_setzero_si128();
5210 return vreinterpretq_m128i_s16(
5211 vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
5212}
5213
5214// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
5215// store the results in dst.
5216// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
5217FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
5218{
5219 if (_sse2neon_unlikely(imm & ~31))
5220 return _mm_setzero_si128();
5221 return vreinterpretq_m128i_s32(
5222 vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
5223}
5224
5225// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
5226// store the results in dst.
5227// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
5228FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
5229{
5230 if (_sse2neon_unlikely(imm & ~63))
5231 return _mm_setzero_si128();
5232 return vreinterpretq_m128i_s64(
5233 vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
5234}
5235
5236// Shift a left by imm8 bytes while shifting in zeros, and store the results in
5237// dst.
5238// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
5239#define _mm_slli_si128(a, imm) \
5240 _sse2neon_define1( \
5241 __m128i, a, int8x16_t ret; \
5242 if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
5243 else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5244 else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
5245 ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
5246 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
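
// Illustrative usage sketch (not part of sse2neon): _mm_slli_si128 shifts by
// whole bytes, not bits, so shifting left by 4 bytes moves 32-bit lane i to
// lane i+1 and zero-fills lane 0. The macro name is hypothetical.
#define EXAMPLE_SHIFT_LANES_UP_BY_ONE(a) _mm_slli_si128((a), 4)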
5247
5248// Compute the square root of packed double-precision (64-bit) floating-point
5249// elements in a, and store the results in dst.
5250// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
5251FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5252{
5253#if defined(__aarch64__) || defined(_M_ARM64)
5254 return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5255#else
5256 double a0 = sqrt(((double *) &a)[0]);
5257 double a1 = sqrt(((double *) &a)[1]);
5258 return _mm_set_pd(a1, a0);
5259#endif
5260}
5261
5262// Compute the square root of the lower double-precision (64-bit) floating-point
5263// element in b, store the result in the lower element of dst, and copy the
5264// upper element from a to the upper element of dst.
5265// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
5266FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5267{
5268#if defined(__aarch64__) || defined(_M_ARM64)
5269 return _mm_move_sd(a, _mm_sqrt_pd(b));
5270#else
5271 return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5272#endif
5273}
5274
5275// Shift packed 16-bit integers in a right by count while shifting in sign bits,
5276// and store the results in dst.
5277// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
5278FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5279{
5280 int64_t c = vgetq_lane_s64(count, 0);
5281 if (_sse2neon_unlikely(c & ~15))
5282 return _mm_cmplt_epi16(a, _mm_setzero_si128());
5283 return vreinterpretq_m128i_s16(
5284 vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
5285}
5286
5287// Shift packed 32-bit integers in a right by count while shifting in sign bits,
5288// and store the results in dst.
5289// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
5290FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5291{
5292 int64_t c = vgetq_lane_s64(count, 0);
5293 if (_sse2neon_unlikely(c & ~31))
5294 return _mm_cmplt_epi32(a, _mm_setzero_si128());
5295 return vreinterpretq_m128i_s32(
5296 vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
5297}
5298
5299// Shift packed 16-bit integers in a right by imm8 while shifting in sign
5300// bits, and store the results in dst.
5301// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
5302FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5303{
5304 const int count = (imm & ~15) ? 15 : imm;
5305 return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5306}
5307
5308// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
5309// and store the results in dst.
5310// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
5311// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
5312#define _mm_srai_epi32(a, imm) \
5313 _sse2neon_define0( \
5314 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
5315 ret = _a; \
5316 } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
5317 ret = vreinterpretq_m128i_s32( \
5318 vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
5319 } else { \
5320 ret = vreinterpretq_m128i_s32( \
5321 vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
5322 } _sse2neon_return(ret);)
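
// Illustrative usage sketch (not part of sse2neon): because _mm_srai_epi32
// replicates the sign bit, shifting right arithmetically by 3 is a lane-wise
// floor division by 8 for signed inputs. The helper name is hypothetical.
FORCE_INLINE __m128i example_div_by_8_floor(__m128i a)
{
    // each signed 32-bit lane becomes floor(a[i] / 8.0)
    return _mm_srai_epi32(a, 3);
}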
5323
5324// Shift packed 16-bit integers in a right by count while shifting in zeros, and
5325// store the results in dst.
5326// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
5327FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5328{
5329 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5330 if (_sse2neon_unlikely(c & ~15))
5331 return _mm_setzero_si128();
5332
5333 int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5334 return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5335}
5336
5337// Shift packed 32-bit integers in a right by count while shifting in zeros, and
5338// store the results in dst.
5339// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
5340FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5341{
5342 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5343 if (_sse2neon_unlikely(c & ~31))
5344 return _mm_setzero_si128();
5345
5346 int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5347 return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5348}
5349
5350// Shift packed 64-bit integers in a right by count while shifting in zeros, and
5351// store the results in dst.
5352// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
5353FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5354{
5355 uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5356 if (_sse2neon_unlikely(c & ~63))
5357 return _mm_setzero_si128();
5358
5359 int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5360 return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5361}
5362
5363// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5364// store the results in dst.
5365// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
5366#define _mm_srli_epi16(a, imm) \
5367 _sse2neon_define0( \
5368 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
5369 ret = _mm_setzero_si128(); \
5370 } else { \
5371 ret = vreinterpretq_m128i_u16( \
5372 vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
5373 } _sse2neon_return(ret);)
5374
5375// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5376// store the results in dst.
5377// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
5378// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5379#define _mm_srli_epi32(a, imm) \
5380 _sse2neon_define0( \
5381 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
5382 ret = _mm_setzero_si128(); \
5383 } else { \
5384 ret = vreinterpretq_m128i_u32( \
5385 vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
5386 } _sse2neon_return(ret);)
5387
5388// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5389// store the results in dst.
5390// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
5391#define _mm_srli_epi64(a, imm) \
5392 _sse2neon_define0( \
5393 __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
5394 ret = _mm_setzero_si128(); \
5395 } else { \
5396 ret = vreinterpretq_m128i_u64( \
5397 vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
5398 } _sse2neon_return(ret);)
5399
5400// Shift a right by imm8 bytes while shifting in zeros, and store the results in
5401// dst.
5402// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
5403#define _mm_srli_si128(a, imm) \
5404 _sse2neon_define1( \
5405 __m128i, a, int8x16_t ret; \
5406 if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
5407 else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
5408 (imm > 15 ? 0 : imm)); \
5409 _sse2neon_return(vreinterpretq_m128i_s8(ret));)
5410
5411// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5412// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5413// or a general-protection exception may be generated.
5414// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
5415FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5416{
5417#if defined(__aarch64__) || defined(_M_ARM64)
5418 vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5419#else
5420 vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5421#endif
5422}
5423
5424// Store the lower double-precision (64-bit) floating-point element from a into
5425// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5426// boundary or a general-protection exception may be generated.
5427// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
5428FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5429{
5430#if defined(__aarch64__) || defined(_M_ARM64)
5431 float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5432 vst1q_f64((float64_t *) mem_addr,
5433 vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5434#else
5435 float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5436 vst1q_f32((float32_t *) mem_addr,
5437 vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5438#endif
5439}
5440
5441// Store the lower double-precision (64-bit) floating-point element from a into
5442// memory. mem_addr does not need to be aligned on any particular boundary.
5443// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
5444FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5445{
5446#if defined(__aarch64__) || defined(_M_ARM64)
5447 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5448#else
5449 vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5450#endif
5451}
5452
5453// Store 128-bits of integer data from a into memory. mem_addr must be aligned
5454// on a 16-byte boundary or a general-protection exception may be generated.
5455// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
5456FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5457{
5458 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5459}
5460
5461// Store the lower double-precision (64-bit) floating-point element from a into
5462// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5463// boundary or a general-protection exception may be generated.
5464// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
5465#define _mm_store1_pd _mm_store_pd1
5466
5467// Store the upper double-precision (64-bit) floating-point element from a into
5468// memory.
5469// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
5470FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5471{
5472#if defined(__aarch64__) || defined(_M_ARM64)
5473 vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5474#else
5475 vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5476#endif
5477}
5478
5479// Store 64-bit integer from the first element of a into memory.
5480// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
5481FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5482{
5483 vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
5484}
5485
5486// Store the lower double-precision (64-bit) floating-point element from a into
5487// memory.
5488// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
5489FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5490{
5491#if defined(__aarch64__) || defined(_M_ARM64)
5492 vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5493#else
5494 vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5495#endif
5496}
5497
5498// Store 2 double-precision (64-bit) floating-point elements from a into memory
5499// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5500// general-protection exception may be generated.
5501// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
5502FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5503{
5504 float32x4_t f = vreinterpretq_f32_m128d(a);
5505 _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5506}
5507
5508// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5509// elements) from a into memory. mem_addr does not need to be aligned on any
5510// particular boundary.
5511// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
5512FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5513{
5514 _mm_store_pd(mem_addr, a);
5515}
5516
5517// Store 128-bits of integer data from a into memory. mem_addr does not need to
5518// be aligned on any particular boundary.
5519// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
5520FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5521{
5522 vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5523}
5524
5525// Store 32-bit integer from the first element of a into memory. mem_addr does
5526// not need to be aligned on any particular boundary.
5527// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
5528FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5529{
5530 vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5531}
5532
5533// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5534// elements) from a into memory using a non-temporal memory hint. mem_addr must
5535// be aligned on a 16-byte boundary or a general-protection exception may be
5536// generated.
5537// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
5538FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5539{
5540#if __has_builtin(__builtin_nontemporal_store)
5541 __builtin_nontemporal_store(a, (__m128d *) p);
5542#elif defined(__aarch64__) || defined(_M_ARM64)
5543 vst1q_f64(p, vreinterpretq_f64_m128d(a));
5544#else
5545 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5546#endif
5547}
5548
5549// Store 128-bits of integer data from a into memory using a non-temporal memory
5550// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
5551// exception may be generated.
5552// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
5553FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
5554{
5555#if __has_builtin(__builtin_nontemporal_store)
5556 __builtin_nontemporal_store(a, p);
5557#else
5558 vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
5559#endif
5560}
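
// Illustrative usage sketch (not part of sse2neon): a streaming copy for
// data that will not be re-read soon, pairing aligned loads with the
// non-temporal store above. The helper name and the 16-byte alignment of
// both pointers are assumptions of this sketch.
FORCE_INLINE void example_stream_copy16(__m128i *dst, const __m128i *src, unsigned int n)
{
    // copy n 16-byte blocks
    for (unsigned int i = 0; i < n; i++)
        _mm_stream_si128(dst + i, _mm_load_si128(src + i));
}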
5561
5562// Store 32-bit integer a into memory using a non-temporal hint to minimize
5563// cache pollution. If the cache line containing address mem_addr is already in
5564// the cache, the cache will be updated.
5565// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
5566FORCE_INLINE void _mm_stream_si32(int *p, int a)
5567{
5568 vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5569}
5570
5571// Store 64-bit integer a into memory using a non-temporal hint to minimize
5572// cache pollution. If the cache line containing address mem_addr is already in
5573// the cache, the cache will be updated.
5574// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
5575FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
5576{
5577 vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
5578}
5579
5580// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5581// store the results in dst.
5582// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
5583FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5584{
5585 return vreinterpretq_m128i_s16(
5586 vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5587}
5588
5589// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
5590// store the results in dst.
5591// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
5592FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5593{
5594 return vreinterpretq_m128i_s32(
5595 vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5596}
5597
5598// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
5599// store the results in dst.
5600// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
5601FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5602{
5603 return vreinterpretq_m128i_s64(
5604 vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5605}
5606
5607// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5608// store the results in dst.
5609// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
5610FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5611{
5612 return vreinterpretq_m128i_s8(
5613 vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5614}
5615
5616// Subtract packed double-precision (64-bit) floating-point elements in b from
5617// packed double-precision (64-bit) floating-point elements in a, and store the
5618// results in dst.
5619// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
5620FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5621{
5622#if defined(__aarch64__) || defined(_M_ARM64)
5623 return vreinterpretq_m128d_f64(
5624 vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5625#else
5626 double *da = (double *) &a;
5627 double *db = (double *) &b;
5628 double c[2];
5629 c[0] = da[0] - db[0];
5630 c[1] = da[1] - db[1];
5631 return vld1q_f32((float32_t *) c);
5632#endif
5633}
5634
5635// Subtract the lower double-precision (64-bit) floating-point element in b from
5636// the lower double-precision (64-bit) floating-point element in a, store the
5637// result in the lower element of dst, and copy the upper element from a to the
5638// upper element of dst.
5639// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
5640FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5641{
5642 return _mm_move_sd(a, _mm_sub_pd(a, b));
5643}
5644
5645// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5646// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
5647FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5648{
5649 return vreinterpret_m64_s64(
5650 vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5651}
5652
5653// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
5654// using saturation, and store the results in dst.
5655// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
5656FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5657{
5658 return vreinterpretq_m128i_s16(
5659 vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5660}
5661
5662// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
5663// using saturation, and store the results in dst.
5664// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
5665FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5666{
5667 return vreinterpretq_m128i_s8(
5668 vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5669}
5670
5671// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
5672// integers in a using saturation, and store the results in dst.
5673// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
5674FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5675{
5676 return vreinterpretq_m128i_u16(
5677 vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5678}
5679
5680// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
5681// integers in a using saturation, and store the results in dst.
5682// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
5683FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
5684{
5685 return vreinterpretq_m128i_u8(
5686 vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
5687}
5688
5689#define _mm_ucomieq_sd _mm_comieq_sd
5690#define _mm_ucomige_sd _mm_comige_sd
5691#define _mm_ucomigt_sd _mm_comigt_sd
5692#define _mm_ucomile_sd _mm_comile_sd
5693#define _mm_ucomilt_sd _mm_comilt_sd
5694#define _mm_ucomineq_sd _mm_comineq_sd
5695
5696// Return vector of type __m128d with undefined elements.
5697// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
5698FORCE_INLINE __m128d _mm_undefined_pd(void)
5699{
5700#if defined(__GNUC__) || defined(__clang__)
5701#pragma GCC diagnostic push
5702#pragma GCC diagnostic ignored "-Wuninitialized"
5703#endif
5704 __m128d a;
5705#if defined(_MSC_VER)
5706 a = _mm_setzero_pd();
5707#endif
5708 return a;
5709#if defined(__GNUC__) || defined(__clang__)
5710#pragma GCC diagnostic pop
5711#endif
5712}
5713
5714// Unpack and interleave 16-bit integers from the high half of a and b, and
5715// store the results in dst.
5716// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
5717FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5718{
5719#if defined(__aarch64__) || defined(_M_ARM64)
5720 return vreinterpretq_m128i_s16(
5721 vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5722#else
5723 int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5724 int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5725 int16x4x2_t result = vzip_s16(a1, b1);
5726 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5727#endif
5728}
5729
5730// Unpack and interleave 32-bit integers from the high half of a and b, and
5731// store the results in dst.
5732// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
5733FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5734{
5735#if defined(__aarch64__) || defined(_M_ARM64)
5736 return vreinterpretq_m128i_s32(
5737 vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5738#else
5739 int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5740 int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5741 int32x2x2_t result = vzip_s32(a1, b1);
5742 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5743#endif
5744}
5745
5746// Unpack and interleave 64-bit integers from the high half of a and b, and
5747// store the results in dst.
5748// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
5749FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5750{
5751#if defined(__aarch64__) || defined(_M_ARM64)
5752 return vreinterpretq_m128i_s64(
5753 vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5754#else
5755 int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5756 int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5757 return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5758#endif
5759}
5760
5761// Unpack and interleave 8-bit integers from the high half of a and b, and store
5762// the results in dst.
5763// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
5764FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5765{
5766#if defined(__aarch64__) || defined(_M_ARM64)
5767 return vreinterpretq_m128i_s8(
5768 vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5769#else
5770 int8x8_t a1 =
5771 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5772 int8x8_t b1 =
5773 vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5774 int8x8x2_t result = vzip_s8(a1, b1);
5775 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5776#endif
5777}
5778
5779// Unpack and interleave double-precision (64-bit) floating-point elements from
5780// the high half of a and b, and store the results in dst.
5781// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
5782FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
5783{
5784#if defined(__aarch64__) || defined(_M_ARM64)
5785 return vreinterpretq_m128d_f64(
5786 vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5787#else
5788 return vreinterpretq_m128d_s64(
5789 vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
5790 vget_high_s64(vreinterpretq_s64_m128d(b))));
5791#endif
5792}
5793
5794// Unpack and interleave 16-bit integers from the low half of a and b, and store
5795// the results in dst.
5796// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
5797FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5798{
5799#if defined(__aarch64__) || defined(_M_ARM64)
5800 return vreinterpretq_m128i_s16(
5801 vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5802#else
5803 int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5804 int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5805 int16x4x2_t result = vzip_s16(a1, b1);
5806 return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5807#endif
5808}
5809
5810// Unpack and interleave 32-bit integers from the low half of a and b, and store
5811// the results in dst.
5812// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
5813FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5814{
5815#if defined(__aarch64__) || defined(_M_ARM64)
5816 return vreinterpretq_m128i_s32(
5817 vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5818#else
5819 int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5820 int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5821 int32x2x2_t result = vzip_s32(a1, b1);
5822 return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5823#endif
5824}
5825
5826// Unpack and interleave 64-bit integers from the low half of a and b, and store
5827// the results in dst.
5828// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
5829FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5830{
5831#if defined(__aarch64__) || defined(_M_ARM64)
5832 return vreinterpretq_m128i_s64(
5833 vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5834#else
5835 int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5836 int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5837 return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5838#endif
5839}
5840
5841// Unpack and interleave 8-bit integers from the low half of a and b, and store
5842// the results in dst.
5843// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
5844FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5845{
5846#if defined(__aarch64__) || defined(_M_ARM64)
5847 return vreinterpretq_m128i_s8(
5848 vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5849#else
5850 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5851 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5852 int8x8x2_t result = vzip_s8(a1, b1);
5853 return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5854#endif
5855}
5856
5857// Unpack and interleave double-precision (64-bit) floating-point elements from
5858// the low half of a and b, and store the results in dst.
5859// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
5860FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
5861{
5862#if defined(__aarch64__) || defined(_M_ARM64)
5863 return vreinterpretq_m128d_f64(
5864 vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5865#else
5866 return vreinterpretq_m128d_s64(
5867 vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
5868 vget_low_s64(vreinterpretq_s64_m128d(b))));
5869#endif
5870}
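/*
 * Usage sketch (illustrative only, not part of the header): the unpack
 * intrinsics above interleave lanes from the low or high halves of two
 * vectors. The helper name and lane values below are hypothetical.
 */
static inline void sse2neon_example_unpack16(void)
{
    __m128i a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    __m128i b = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
    __m128i lo = _mm_unpacklo_epi16(a, b); /* {0,10,1,11,2,12,3,13} */
    __m128i hi = _mm_unpackhi_epi16(a, b); /* {4,14,5,15,6,16,7,17} */
    (void) lo;
    (void) hi;
}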
5871
5872// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
5873// elements in a and b, and store the results in dst.
5874// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
5875FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
5876{
5877 return vreinterpretq_m128d_s64(
5878 veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
5879}
5880
5881// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
5882// and store the result in dst.
5883// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
5884FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
5885{
5886 return vreinterpretq_m128i_s32(
5887 veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5888}
5889
5890/* SSE3 */
5891
5892// Alternatively add and subtract packed double-precision (64-bit)
5893// floating-point elements in a to/from packed elements in b, and store the
5894// results in dst.
5895// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
5896FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
5897{
5898 _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
5899#if defined(__aarch64__) || defined(_M_ARM64)
5900 return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
5901 vreinterpretq_f64_m128d(b),
5902 vreinterpretq_f64_m128d(mask)));
5903#else
5904 return _mm_add_pd(_mm_mul_pd(b, mask), a);
5905#endif
5906}
5907
5908// Alternatively add and subtract packed single-precision (32-bit)
5909// floating-point elements in a to/from packed elements in b, and store the
5910// results in dst.
5911// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
5912FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
5913{
5914 _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
5915#if (defined(__aarch64__) || defined(_M_ARM64)) || \
5916 defined(__ARM_FEATURE_FMA) /* VFPv4+ */
5917 return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
5918 vreinterpretq_f32_m128(mask),
5919 vreinterpretq_f32_m128(b)));
5920#else
5921 return _mm_add_ps(_mm_mul_ps(b, mask), a);
5922#endif
5923}
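/*
 * Usage sketch (illustrative only, not part of the header): _mm_addsub_ps
 * subtracts in the even-indexed lanes and adds in the odd-indexed lanes,
 * which is the building block for SSE3-style complex multiplication.
 * Lane values below are hypothetical.
 */
static inline __m128 sse2neon_example_addsub(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    /* result = {1-10, 2+20, 3-30, 4+40} = {-9, 22, -27, 44} */
    return _mm_addsub_ps(a, b);
}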
5924
5925// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
5926// elements in a and b, and pack the results in dst.
5927// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
5928FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
5929{
5930#if defined(__aarch64__) || defined(_M_ARM64)
5931 return vreinterpretq_m128d_f64(
5932 vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5933#else
5934 double *da = (double *) &a;
5935 double *db = (double *) &b;
5936 double c[] = {da[0] + da[1], db[0] + db[1]};
5937 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5938#endif
5939}
5940
5941// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
5942// elements in a and b, and pack the results in dst.
5943// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
5944FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
5945{
5946#if defined(__aarch64__) || defined(_M_ARM64)
5947 return vreinterpretq_m128_f32(
5948 vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
5949#else
5950 float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
5951 float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
5952 float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
5953 float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
5954 return vreinterpretq_m128_f32(
5955 vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
5956#endif
5957}
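/*
 * Usage sketch (illustrative only, not part of the header): reducing a
 * vector to the sum of its four lanes with two horizontal adds. The helper
 * name is hypothetical.
 */
static inline float sse2neon_example_sum4(__m128 v)
{
    __m128 t = _mm_hadd_ps(v, v); /* {v0+v1, v2+v3, v0+v1, v2+v3} */
    t = _mm_hadd_ps(t, t);        /* every lane now holds v0+v1+v2+v3 */
    return _mm_cvtss_f32(t);
}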
5958
5959// Horizontally subtract adjacent pairs of double-precision (64-bit)
5960// floating-point elements in a and b, and pack the results in dst.
5961// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
5962FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
5963{
5964#if defined(__aarch64__) || defined(_M_ARM64)
5965 float64x2_t a = vreinterpretq_f64_m128d(_a);
5966 float64x2_t b = vreinterpretq_f64_m128d(_b);
5967 return vreinterpretq_m128d_f64(
5968 vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
5969#else
5970 double *da = (double *) &_a;
5971 double *db = (double *) &_b;
5972 double c[] = {da[0] - da[1], db[0] - db[1]};
5973 return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5974#endif
5975}
5976
5977// Horizontally subtract adjacent pairs of single-precision (32-bit)
5978// floating-point elements in a and b, and pack the results in dst.
5979// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
5980FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
5981{
5982 float32x4_t a = vreinterpretq_f32_m128(_a);
5983 float32x4_t b = vreinterpretq_f32_m128(_b);
5984#if defined(__aarch64__) || defined(_M_ARM64)
5985 return vreinterpretq_m128_f32(
5986 vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
5987#else
5988 float32x4x2_t c = vuzpq_f32(a, b);
5989 return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
5990#endif
5991}
5992
5993// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5994// may perform better than _mm_loadu_si128 when the data crosses a cache line
5995// boundary.
5996// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
5997#define _mm_lddqu_si128 _mm_loadu_si128
5998
5999// Load a double-precision (64-bit) floating-point element from memory into both
6000// elements of dst.
6001// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
6002#define _mm_loaddup_pd _mm_load1_pd
6003
6004// Duplicate the low double-precision (64-bit) floating-point element from a,
6005// and store the results in dst.
6006// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
6007FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6008{
6009#if defined(__aarch64__) || defined(_M_ARM64)
6010 return vreinterpretq_m128d_f64(
6011 vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6012#else
6013 return vreinterpretq_m128d_u64(
6014 vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6015#endif
6016}
6017
6018// Duplicate odd-indexed single-precision (32-bit) floating-point elements
6019// from a, and store the results in dst.
6020// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
6021FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6022{
6023#if defined(__aarch64__) || defined(_M_ARM64)
6024 return vreinterpretq_m128_f32(
6025 vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6026#elif defined(_sse2neon_shuffle)
6027 return vreinterpretq_m128_f32(vshuffleq_s32(
6028 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6029#else
6030 float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6031 float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6032 float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6033 return vreinterpretq_m128_f32(vld1q_f32(data));
6034#endif
6035}
6036
6037// Duplicate even-indexed single-precision (32-bit) floating-point elements
6038// from a, and store the results in dst.
6039// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
6040FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6041{
6042#if defined(__aarch64__) || defined(_M_ARM64)
6043 return vreinterpretq_m128_f32(
6044 vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6045#elif defined(_sse2neon_shuffle)
6046 return vreinterpretq_m128_f32(vshuffleq_s32(
6047 vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6048#else
6049 float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6050 float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6051 float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6052 return vreinterpretq_m128_f32(vld1q_f32(data));
6053#endif
6054}
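/*
 * Usage sketch (illustrative only, not part of the header): the SSE3
 * duplication intrinsics above broadcast the even- or odd-indexed lanes into
 * their neighbours, e.g. for splitting packed complex numbers into real and
 * imaginary parts. Lane values below are hypothetical.
 */
static inline void sse2neon_example_dup(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 even = _mm_moveldup_ps(a); /* {1, 1, 3, 3} */
    __m128 odd = _mm_movehdup_ps(a);  /* {2, 2, 4, 4} */
    (void) even;
    (void) odd;
}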
6055
6056/* SSSE3 */
6057
6058// Compute the absolute value of packed signed 16-bit integers in a, and store
6059// the unsigned results in dst.
6060// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
6061FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6062{
6063 return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6064}
6065
6066// Compute the absolute value of packed signed 32-bit integers in a, and store
6067// the unsigned results in dst.
6068// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
6069FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6070{
6071 return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6072}
6073
6074// Compute the absolute value of packed signed 8-bit integers in a, and store
6075// the unsigned results in dst.
6076// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
6077FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6078{
6079 return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6080}
6081
6082// Compute the absolute value of packed signed 16-bit integers in a, and store
6083// the unsigned results in dst.
6084// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
6085FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6086{
6087 return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6088}
6089
6090// Compute the absolute value of packed signed 32-bit integers in a, and store
6091// the unsigned results in dst.
6092// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
6093FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6094{
6095 return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6096}
6097
6098// Compute the absolute value of packed signed 8-bit integers in a, and store
6099// the unsigned results in dst.
6100// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
6101FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6102{
6103 return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6104}
6105
6106// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
6107// the result right by imm8 bytes, and store the low 16 bytes in dst.
6108// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
6109#if defined(__GNUC__) && !defined(__clang__)
6110#define _mm_alignr_epi8(a, b, imm) \
6111 __extension__({ \
6112 uint8x16_t _a = vreinterpretq_u8_m128i(a); \
6113 uint8x16_t _b = vreinterpretq_u8_m128i(b); \
6114 __m128i ret; \
6115 if (_sse2neon_unlikely((imm) & ~31)) \
6116 ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6117 else if (imm >= 16) \
6118 ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
6119 else \
6120 ret = \
6121 vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
6122 ret; \
6123 })
6124
6125#else
6126#define _mm_alignr_epi8(a, b, imm) \
6127 _sse2neon_define2( \
6128 __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
6129 uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
6130 if (_sse2neon_unlikely((imm) & ~31)) ret = \
6131 vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
6132 else if (imm >= 16) ret = \
6133 _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \
6134 else ret = \
6135 vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
6136 _sse2neon_return(ret);)
6137
6138#endif
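/*
 * Usage sketch (illustrative only, not part of the header): _mm_alignr_epi8
 * concatenates b (low) and a (high) and extracts 16 bytes starting at byte
 * offset imm8, which is a common way to slide a window across two adjacent
 * 16-byte loads. The pointer name is hypothetical.
 */
static inline __m128i sse2neon_example_sliding_window(const uint8_t *p)
{
    __m128i lo = _mm_loadu_si128((const __m128i *) p);
    __m128i hi = _mm_loadu_si128((const __m128i *) (p + 16));
    /* returns bytes p[3] .. p[18] */
    return _mm_alignr_epi8(hi, lo, 3);
}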
6139
6140// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6141// the result right by imm8 bytes, and store the low 8 bytes in dst.
6142// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
6143#define _mm_alignr_pi8(a, b, imm) \
6144 _sse2neon_define2( \
6145 __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
6146 ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6147 } else { \
6148 uint8x8_t tmp_low; \
6149 uint8x8_t tmp_high; \
6150 if ((imm) >= 8) { \
6151 const int idx = (imm) -8; \
6152 tmp_low = vreinterpret_u8_m64(_a); \
6153 tmp_high = vdup_n_u8(0); \
6154 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6155 } else { \
6156 const int idx = (imm); \
6157 tmp_low = vreinterpret_u8_m64(_b); \
6158 tmp_high = vreinterpret_u8_m64(_a); \
6159 ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6160 } \
6161 } _sse2neon_return(ret);)
6162
6163// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6164// signed 16-bit results in dst.
6165// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
6166FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6167{
6168 int16x8_t a = vreinterpretq_s16_m128i(_a);
6169 int16x8_t b = vreinterpretq_s16_m128i(_b);
6170#if defined(__aarch64__) || defined(_M_ARM64)
6171 return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6172#else
6173 return vreinterpretq_m128i_s16(
6174 vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6175 vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6176#endif
6177}
6178
6179// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6180// signed 32-bit results in dst.
6181// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
6182FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6183{
6184 int32x4_t a = vreinterpretq_s32_m128i(_a);
6185 int32x4_t b = vreinterpretq_s32_m128i(_b);
6186#if defined(__aarch64__) || defined(_M_ARM64)
6187 return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
6188#else
6189 return vreinterpretq_m128i_s32(
6190 vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6191 vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6192#endif
6193}
6194
6195// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6196// signed 16-bit results in dst.
6197// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
6198FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6199{
6200 return vreinterpret_m64_s16(
6201 vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6202}
6203
6204// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6205// signed 32-bit results in dst.
6206// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
6207FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6208{
6209 return vreinterpret_m64_s32(
6210 vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6211}
6212
6213// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6214// saturation, and pack the signed 16-bit results in dst.
6215// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
6216FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6217{
6218#if defined(__aarch64__) || defined(_M_ARM64)
6219 int16x8_t a = vreinterpretq_s16_m128i(_a);
6220 int16x8_t b = vreinterpretq_s16_m128i(_b);
6221 return vreinterpretq_s64_s16(
6222 vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6223#else
6224 int32x4_t a = vreinterpretq_s32_m128i(_a);
6225 int32x4_t b = vreinterpretq_s32_m128i(_b);
6226 // Interleave using vshrn/vmovn
6227 // [a0|a2|a4|a6|b0|b2|b4|b6]
6228 // [a1|a3|a5|a7|b1|b3|b5|b7]
6229 int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6230 int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6231 // Saturated add
6232 return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6233#endif
6234}
6235
6236// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6237// saturation, and pack the signed 16-bit results in dst.
6238// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
6239FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6240{
6241 int16x4_t a = vreinterpret_s16_m64(_a);
6242 int16x4_t b = vreinterpret_s16_m64(_b);
6243#if defined(__aarch64__) || defined(_M_ARM64)
6244 return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6245#else
6246 int16x4x2_t res = vuzp_s16(a, b);
6247 return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6248#endif
6249}
6250
6251// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6252// the signed 16-bit results in dst.
6253// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
6254FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6255{
6256 int16x8_t a = vreinterpretq_s16_m128i(_a);
6257 int16x8_t b = vreinterpretq_s16_m128i(_b);
6258#if defined(__aarch64__) || defined(_M_ARM64)
6259 return vreinterpretq_m128i_s16(
6260 vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6261#else
6262 int16x8x2_t c = vuzpq_s16(a, b);
6263 return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6264#endif
6265}
6266
6267// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6268// the signed 32-bit results in dst.
6269// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
6270FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6271{
6272 int32x4_t a = vreinterpretq_s32_m128i(_a);
6273 int32x4_t b = vreinterpretq_s32_m128i(_b);
6274#if defined(__aarch64__) || defined(_M_ARM64)
6275 return vreinterpretq_m128i_s32(
6276 vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6277#else
6278 int32x4x2_t c = vuzpq_s32(a, b);
6279 return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6280#endif
6281}
6282
6283// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6284// the signed 16-bit results in dst.
6285// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
6286FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6287{
6288 int16x4_t a = vreinterpret_s16_m64(_a);
6289 int16x4_t b = vreinterpret_s16_m64(_b);
6290#if defined(__aarch64__) || defined(_M_ARM64)
6291 return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6292#else
6293 int16x4x2_t c = vuzp_s16(a, b);
6294 return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6295#endif
6296}
6297
6298// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6299// the signed 32-bit results in dst.
6300// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
6301FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6302{
6303 int32x2_t a = vreinterpret_s32_m64(_a);
6304 int32x2_t b = vreinterpret_s32_m64(_b);
6305#if defined(__aarch64__) || defined(_M_ARM64)
6306 return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6307#else
6308 int32x2x2_t c = vuzp_s32(a, b);
6309 return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6310#endif
6311}
6312
6313// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6314// using saturation, and pack the signed 16-bit results in dst.
6315// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
6316FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6317{
6318 int16x8_t a = vreinterpretq_s16_m128i(_a);
6319 int16x8_t b = vreinterpretq_s16_m128i(_b);
6320#if defined(__aarch64__) || defined(_M_ARM64)
6321 return vreinterpretq_m128i_s16(
6322 vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6323#else
6324 int16x8x2_t c = vuzpq_s16(a, b);
6325 return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6326#endif
6327}
6328
6329// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6330// using saturation, and pack the signed 16-bit results in dst.
6331// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
6332FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6333{
6334 int16x4_t a = vreinterpret_s16_m64(_a);
6335 int16x4_t b = vreinterpret_s16_m64(_b);
6336#if defined(__aarch64__) || defined(_M_ARM64)
6337 return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6338#else
6339 int16x4x2_t c = vuzp_s16(a, b);
6340 return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6341#endif
6342}
6343
6344// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6345// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6346// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
6347// and pack the saturated results in dst.
6348// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
6349FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
6350{
6351#if defined(__aarch64__) || defined(_M_ARM64)
6352 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6353 int8x16_t b = vreinterpretq_s8_m128i(_b);
6354 int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
6355 vmovl_s8(vget_low_s8(b)));
6356 int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
6357 vmovl_s8(vget_high_s8(b)));
6358 return vreinterpretq_m128i_s16(
6359 vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
6360#else
6361 // This would be much simpler if x86 would choose to zero extend OR sign
6362 // extend, not both. This could probably be optimized better.
6363 uint16x8_t a = vreinterpretq_u16_m128i(_a);
6364 int16x8_t b = vreinterpretq_s16_m128i(_b);
6365
6366 // Zero extend a
6367 int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
6368 int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
6369
6370 // Sign extend by shifting left then shifting right.
6371 int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
6372 int16x8_t b_odd = vshrq_n_s16(b, 8);
6373
6374 // multiply
6375 int16x8_t prod1 = vmulq_s16(a_even, b_even);
6376 int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
6377
6378 // saturated add
6379 return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
6380#endif
6381}
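/*
 * Usage sketch (illustrative only, not part of the header): _mm_maddubs_epi16
 * multiplies unsigned bytes of a by signed bytes of b and adds neighbouring
 * products with signed saturation, e.g. applying small signed filter taps to
 * unsigned pixel data. The byte values below are hypothetical.
 */
static inline __m128i sse2neon_example_maddubs(void)
{
    __m128i pixels = _mm_set1_epi16(0x0201);         /* bytes {1, 2, ...}  */
    __m128i taps = _mm_set1_epi16((int16_t) 0xFF01); /* bytes {1, -1, ...} */
    /* each 16-bit lane = 1*1 + 2*(-1) = -1 */
    return _mm_maddubs_epi16(pixels, taps);
}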
6382
6383// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6384// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6385// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6386// pack the saturated results in dst.
6387// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
6388FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6389{
6390 uint16x4_t a = vreinterpret_u16_m64(_a);
6391 int16x4_t b = vreinterpret_s16_m64(_b);
6392
6393 // Zero extend a
6394 int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6395 int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6396
6397 // Sign extend by shifting left then shifting right.
6398 int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6399 int16x4_t b_odd = vshr_n_s16(b, 8);
6400
6401 // multiply
6402 int16x4_t prod1 = vmul_s16(a_even, b_even);
6403 int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6404
6405 // saturated add
6406 return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6407}
6408
6409// Multiply packed signed 16-bit integers in a and b, producing intermediate
6410// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
6411// the packed 16-bit integers in dst.
6412// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
6413FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
6414{
6415 // Has issues due to saturation
6416 // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
6417
6418 // Multiply
6419 int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
6420 vget_low_s16(vreinterpretq_s16_m128i(b)));
6421 int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
6422 vget_high_s16(vreinterpretq_s16_m128i(b)));
6423
6424 // Rounding narrowing shift right
6425 // narrow = (int16_t)((mul + 16384) >> 15);
6426 int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
6427 int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
6428
6429 // Join together
6430 return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
6431}
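/*
 * Usage sketch (illustrative only, not part of the header): _mm_mulhrs_epi16
 * is the usual Q15 fixed-point multiply, (a*b + 0x4000) >> 15, so multiplying
 * by 0x4000 (0.5 in Q15) halves each lane with rounding. The helper name is
 * hypothetical.
 */
static inline __m128i sse2neon_example_q15_halve(__m128i a)
{
    const __m128i half_q15 = _mm_set1_epi16(0x4000); /* 0.5 in Q15 */
    return _mm_mulhrs_epi16(a, half_q15);
}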
6432
6433// Multiply packed signed 16-bit integers in a and b, producing intermediate
6434// signed 32-bit integers. Truncate each intermediate integer to the 18 most
6435// significant bits, round by adding 1, and store bits [16:1] to dst.
6436// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
6437FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6438{
6439 int32x4_t mul_extend =
6440 vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6441
6442 // Rounding narrowing shift right
6443 return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6444}
6445
6446// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6447// corresponding 8-bit element of b, and store the results in dst.
6448// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
6449FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
6450{
6451 int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
6452 uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
6453 uint8x16_t idx_masked =
6454 vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
6455#if defined(__aarch64__) || defined(_M_ARM64)
6456 return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
6457#elif defined(__GNUC__)
6458 int8x16_t ret;
6459 // %e and %f represent the even and odd D registers
6460 // respectively.
6461 __asm__ __volatile__(
6462 "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
6463 "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
6464 : [ret] "=&w"(ret)
6465 : [tbl] "w"(tbl), [idx] "w"(idx_masked));
6466 return vreinterpretq_m128i_s8(ret);
6467#else
6468 // use this line if testing on aarch64
6469 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
6470 return vreinterpretq_m128i_s8(
6471 vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
6472 vtbl2_s8(a_split, vget_high_u8(idx_masked))));
6473#endif
6474}
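/*
 * Usage sketch (illustrative only, not part of the header): using
 * _mm_shuffle_epi8 as a byte permutation, here reversing the 16 bytes of a
 * register. An index with its most significant bit set would zero the
 * corresponding output byte instead.
 */
static inline __m128i sse2neon_example_reverse_bytes(__m128i v)
{
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                     13, 14, 15);
    return _mm_shuffle_epi8(v, rev);
}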
6475
6476// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6477// corresponding 8-bit element of b, and store the results in dst.
6478// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
6479FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6480{
6481 const int8x8_t controlMask =
6482 vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
6483 int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6484 return vreinterpret_m64_s8(res);
6485}
6486
6487// Negate packed 16-bit integers in a when the corresponding signed
6488// 16-bit integer in b is negative, and store the results in dst.
6489// Elements in dst are zeroed out when the corresponding element
6490// in b is zero.
6491// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
6492FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6493{
6494 int16x8_t a = vreinterpretq_s16_m128i(_a);
6495 int16x8_t b = vreinterpretq_s16_m128i(_b);
6496
6497 // signed shift right: faster than vclt
6498 // (b < 0) ? 0xFFFF : 0
6499 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6500 // (b == 0) ? 0xFFFF : 0
6501#if defined(__aarch64__) || defined(_M_ARM64)
6502 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6503#else
6504 int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6505#endif
6506
6507 // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative
6508 // 'a') based on ltMask
6509 int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6510 // res = masked & (~zeroMask)
6511 int16x8_t res = vbicq_s16(masked, zeroMask);
6512 return vreinterpretq_m128i_s16(res);
6513}
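/*
 * Usage sketch (illustrative only, not part of the header): _mm_sign_epi16
 * copies, negates or zeroes each lane of a according to the sign of the
 * matching lane in b. Lane values below are hypothetical.
 */
static inline __m128i sse2neon_example_sign(void)
{
    __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    __m128i b = _mm_setr_epi16(-1, 0, 2, -3, 1, -1, 0, 5);
    /* result = {-1, 0, 3, -4, 5, -6, 0, 8} */
    return _mm_sign_epi16(a, b);
}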
6514
6515// Negate packed 32-bit integers in a when the corresponding signed
6516// 32-bit integer in b is negative, and store the results in dst.
6517// Elements in dst are zeroed out when the corresponding element
6518// in b is zero.
6519// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
6520FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6521{
6522 int32x4_t a = vreinterpretq_s32_m128i(_a);
6523 int32x4_t b = vreinterpretq_s32_m128i(_b);
6524
6525 // signed shift right: faster than vclt
6526 // (b < 0) ? 0xFFFFFFFF : 0
6527 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6528
6529 // (b == 0) ? 0xFFFFFFFF : 0
6530#if defined(__aarch64__) || defined(_M_ARM64)
6531 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6532#else
6533 int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6534#endif
6535
6536 // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative
6537 // 'a') based on ltMask
6538 int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6539 // res = masked & (~zeroMask)
6540 int32x4_t res = vbicq_s32(masked, zeroMask);
6541 return vreinterpretq_m128i_s32(res);
6542}
6543
6544// Negate packed 8-bit integers in a when the corresponding signed
6545// 8-bit integer in b is negative, and store the results in dst.
6546// Elements in dst are zeroed out when the corresponding element
6547// in b is zero.
6548// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
6549FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6550{
6551 int8x16_t a = vreinterpretq_s8_m128i(_a);
6552 int8x16_t b = vreinterpretq_s8_m128i(_b);
6553
6554 // signed shift right: faster than vclt
6555 // (b < 0) ? 0xFF : 0
6556 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6557
6558 // (b == 0) ? 0xFF : 0
6559#if defined(__aarch64__) || defined(_M_ARM64)
6560 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6561#else
6562 int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6563#endif
6564
6565 // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a')
6566 // based on ltMask
6567 int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6568 // res = masked & (~zeroMask)
6569 int8x16_t res = vbicq_s8(masked, zeroMask);
6570
6571 return vreinterpretq_m128i_s8(res);
6572}
6573
6574// Negate packed 16-bit integers in a when the corresponding signed 16-bit
6575// integer in b is negative, and store the results in dst. Elements in dst are
6576// zeroed out when the corresponding element in b is zero.
6577// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
6578FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6579{
6580 int16x4_t a = vreinterpret_s16_m64(_a);
6581 int16x4_t b = vreinterpret_s16_m64(_b);
6582
6583 // signed shift right: faster than vclt
6584 // (b < 0) ? 0xFFFF : 0
6585 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6586
6587 // (b == 0) ? 0xFFFF : 0
6588#if defined(__aarch64__) || defined(_M_ARM64)
6589 int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6590#else
6591 int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6592#endif
6593
6594 // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a')
6595 // based on ltMask
6596 int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6597 // res = masked & (~zeroMask)
6598 int16x4_t res = vbic_s16(masked, zeroMask);
6599
6600 return vreinterpret_m64_s16(res);
6601}
6602
6603// Negate packed 32-bit integers in a when the corresponding signed 32-bit
6604// integer in b is negative, and store the results in dst. Elements in dst are
6605// zeroed out when the corresponding element in b is zero.
6606// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
6607FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6608{
6609 int32x2_t a = vreinterpret_s32_m64(_a);
6610 int32x2_t b = vreinterpret_s32_m64(_b);
6611
6612 // signed shift right: faster than vclt
6613 // (b < 0) ? 0xFFFFFFFF : 0
6614 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6615
6616 // (b == 0) ? 0xFFFFFFFF : 0
6617#if defined(__aarch64__) || defined(_M_ARM64)
6618 int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6619#else
6620 int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6621#endif
6622
6623 // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a')
6624 // based on ltMask
6625 int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6626 // res = masked & (~zeroMask)
6627 int32x2_t res = vbic_s32(masked, zeroMask);
6628
6629 return vreinterpret_m64_s32(res);
6630}
6631
6632// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
6633// in b is negative, and store the results in dst. Elements in dst are zeroed out
6634// when the corresponding element in b is zero.
6635// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
6636FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
6637{
6638 int8x8_t a = vreinterpret_s8_m64(_a);
6639 int8x8_t b = vreinterpret_s8_m64(_b);
6640
6641 // signed shift right: faster than vclt
6642 // (b < 0) ? 0xFF : 0
6643 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6644
6645 // (b == 0) ? 0xFF : 0
6646#if defined(__aarch64__) || defined(_M_ARM64)
6647 int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6648#else
6649 int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6650#endif
6651
6652 // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a')
6653 // based on ltMask
6654 int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6655 // res = masked & (~zeroMask)
6656 int8x8_t res = vbic_s8(masked, zeroMask);
6657
6658 return vreinterpret_m64_s8(res);
6659}
6660
6661/* SSE4.1 */
6662
6663// Blend packed 16-bit integers from a and b using control mask imm8, and store
6664// the results in dst.
6665// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
6666// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
6667// __constrange(0,255) int imm)
6668#define _mm_blend_epi16(a, b, imm) \
6669 _sse2neon_define2( \
6670 __m128i, a, b, \
6671 const uint16_t _mask[8] = \
6672 _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
6673 ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
6674 ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
6675 ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
6676 ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
6677 ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
6678 ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
6679 ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
6680 uint16x8_t _mask_vec = vld1q_u16(_mask); \
6681 uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6682 uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6683 vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
6684
6685// Blend packed double-precision (64-bit) floating-point elements from a and b
6686// using control mask imm8, and store the results in dst.
6687// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
6688#define _mm_blend_pd(a, b, imm) \
6689 _sse2neon_define2( \
6690 __m128d, a, b, \
6691 const uint64_t _mask[2] = \
6692 _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6693 ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6694 uint64x2_t _mask_vec = vld1q_u64(_mask); \
6695 uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6696 uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6697 vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6698
6699// Blend packed single-precision (32-bit) floating-point elements from a and b
6700// using mask, and store the results in dst.
6701// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
6702FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
6703{
6704 const uint32_t ALIGN_STRUCT(16)
6705 data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
6706 ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
6707 ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
6708 ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
6709 uint32x4_t mask = vld1q_u32(data);
6710 float32x4_t a = vreinterpretq_f32_m128(_a);
6711 float32x4_t b = vreinterpretq_f32_m128(_b);
6712 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6713}
6714
6715// Blend packed 8-bit integers from a and b using mask, and store the results in
6716// dst.
6717// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
6718FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
6719{
6720 // Use a signed shift right to create a mask with the sign bit
6721 uint8x16_t mask =
6722 vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
6723 uint8x16_t a = vreinterpretq_u8_m128i(_a);
6724 uint8x16_t b = vreinterpretq_u8_m128i(_b);
6725 return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
6726}
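/*
 * Usage sketch (illustrative only, not part of the header): _mm_blendv_epi8
 * gives a branch-free per-byte select driven by the sign bit of the mask,
 * here combined with a comparison to build a signed per-byte minimum. The
 * helper name is hypothetical.
 */
static inline __m128i sse2neon_example_min_s8(__m128i a, __m128i b)
{
    __m128i gt = _mm_cmpgt_epi8(a, b); /* 0xFF where a > b (signed) */
    return _mm_blendv_epi8(a, b, gt);  /* take b where a > b, i.e. the min */
}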
6727
6728// Blend packed double-precision (64-bit) floating-point elements from a and b
6729// using mask, and store the results in dst.
6730// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
6731FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
6732{
6733 uint64x2_t mask =
6734 vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
6735#if defined(__aarch64__) || defined(_M_ARM64)
6736 float64x2_t a = vreinterpretq_f64_m128d(_a);
6737 float64x2_t b = vreinterpretq_f64_m128d(_b);
6738 return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6739#else
6740 uint64x2_t a = vreinterpretq_u64_m128d(_a);
6741 uint64x2_t b = vreinterpretq_u64_m128d(_b);
6742 return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
6743#endif
6744}
6745
6746// Blend packed single-precision (32-bit) floating-point elements from a and b
6747// using mask, and store the results in dst.
6748// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
6749FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
6750{
6751 // Use a signed shift right to create a mask with the sign bit
6752 uint32x4_t mask =
6753 vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
6754 float32x4_t a = vreinterpretq_f32_m128(_a);
6755 float32x4_t b = vreinterpretq_f32_m128(_b);
6756 return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6757}
6758
6759// Round the packed double-precision (64-bit) floating-point elements in a up
6760// to an integer value, and store the results as packed double-precision
6761// floating-point elements in dst.
6762// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
6763FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
6764{
6765#if defined(__aarch64__) || defined(_M_ARM64)
6766 return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6767#else
6768 double *f = (double *) &a;
6769 return _mm_set_pd(ceil(f[1]), ceil(f[0]));
6770#endif
6771}
6772
6773// Round the packed single-precision (32-bit) floating-point elements in a up to
6774// an integer value, and store the results as packed single-precision
6775// floating-point elements in dst.
6776// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
6777FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
6778{
6779#if (defined(__aarch64__) || defined(_M_ARM64)) || \
6780 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6781 return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
6782#else
6783 float *f = (float *) &a;
6784 return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6785#endif
6786}
6787
6788// Round the lower double-precision (64-bit) floating-point element in b up to
6789// an integer value, store the result as a double-precision floating-point
6790// element in the lower element of dst, and copy the upper element from a to the
6791// upper element of dst.
6792// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
6793FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
6794{
6795 return _mm_move_sd(a, _mm_ceil_pd(b));
6796}
6797
6798// Round the lower single-precision (32-bit) floating-point element in b up to
6799// an integer value, store the result as a single-precision floating-point
6800// element in the lower element of dst, and copy the upper 3 packed elements
6801// from a to the upper elements of dst.
6802// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
6803FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
6804{
6805 return _mm_move_ss(a, _mm_ceil_ps(b));
6806}
6807
6808// Compare packed 64-bit integers in a and b for equality, and store the results
6809// in dst.
6810FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
6811{
6812#if defined(__aarch64__) || defined(_M_ARM64)
6813 return vreinterpretq_m128i_u64(
6814 vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
6815#else
6816 // ARMv7 lacks vceqq_u64
6817 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
6818 uint32x4_t cmp =
6819 vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
6820 uint32x4_t swapped = vrev64q_u32(cmp);
6821 return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
6822#endif
6823}
6824
6825// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
6826// the results in dst.
6827// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
6828FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
6829{
6830 return vreinterpretq_m128i_s32(
6831 vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
6832}
6833
6834// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
6835// the results in dst.
6836// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
6837FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
6838{
6839 int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6840 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6841 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6842 return vreinterpretq_m128i_s64(s64x2);
6843}
6844
6845// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
6846// the results in dst.
6847// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
6848FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
6849{
6850 return vreinterpretq_m128i_s64(
6851 vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
6852}
6853
6854// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
6855// the results in dst.
6856// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
6857FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
6858{
6859 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6860 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6861 return vreinterpretq_m128i_s16(s16x8);
6862}
6863
6864// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
6865// the results in dst.
6866// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
6867FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
6868{
6869 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6870 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6871 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
6872 return vreinterpretq_m128i_s32(s32x4);
6873}
6874
6875// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
6876// integers, and store the results in dst.
6877// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
6878FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
6879{
6880 int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
6881 int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6882 int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6883 int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6884 return vreinterpretq_m128i_s64(s64x2);
6885}
6886
6887// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
6888// and store the results in dst.
6889// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
6890FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
6891{
6892 return vreinterpretq_m128i_u32(
6893 vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
6894}
6895
6896// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
6897// and store the results in dst.
6898// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
6899FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
6900{
6901 uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6902 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6903 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6904 return vreinterpretq_m128i_u64(u64x2);
6905}
6906
6907// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
6908// and store the results in dst.
6909// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
6910FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
6911{
6912 return vreinterpretq_m128i_u64(
6913 vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
6914}
6915
6916// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
6917// and store the results in dst.
6918// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
6919FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
6920{
6921 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
6922 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
6923 return vreinterpretq_m128i_u16(u16x8);
6924}
6925
6926// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
6927// and store the results in dst.
6928// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
6929FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
6930{
6931 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
6932 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6933 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
6934 return vreinterpretq_m128i_u32(u32x4);
6935}
6936
6937// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
6938// 64-bit integers, and store the results in dst.
6939// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
6940FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
6941{
6942 uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
6943 uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6944 uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6945 uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6946 return vreinterpretq_m128i_u64(u64x2);
6947}
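/*
 * Usage sketch (illustrative only, not part of the header): widening 8-bit
 * data to 16-bit lanes with the zero-extension intrinsic above before doing
 * arithmetic that would otherwise overflow. The pointer name is hypothetical.
 */
static inline __m128i sse2neon_example_widen_u8(const uint8_t *src)
{
    __m128i bytes = _mm_loadu_si128((const __m128i *) src);
    return _mm_cvtepu8_epi16(bytes); /* low 8 bytes -> eight 16-bit lanes */
}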
6948
6949// Conditionally multiply the packed double-precision (64-bit) floating-point
6950// elements in a and b using the high 4 bits in imm8, sum the two products, and
6951// conditionally store the sum in dst using the low 4 bits of imm8.
6952// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
6953FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
6954{
6955 // Generate mask value from constant immediate bit value
6956 const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
6957 const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
6958#if !SSE2NEON_PRECISE_DP
6959 const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
6960 const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
6961#endif
6962 // Conditional multiplication
6963#if !SSE2NEON_PRECISE_DP
6964 __m128d mul = _mm_mul_pd(a, b);
6965 const __m128d mulMask =
6966 _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
6967 __m128d tmp = _mm_and_pd(mul, mulMask);
6968#else
6969#if defined(__aarch64__) || defined(_M_ARM64)
6970 double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
6971 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
6972 : 0;
6973 double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
6974 vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
6975 : 0;
6976#else
6977 double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
6978 double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
6979#endif
6980 __m128d tmp = _mm_set_pd(d1, d0);
6981#endif
6982 // Sum the products
6983#if defined(__aarch64__) || defined(_M_ARM64)
6984 double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
6985#else
6986 double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
6987#endif
6988 // Conditionally store the sum
6989 const __m128d sumMask =
6990 _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
6991 __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
6992 return res;
6993}
6994
6995// Conditionally multiply the packed single-precision (32-bit) floating-point
6996// elements in a and b using the high 4 bits in imm8, sum the four products,
6997// and conditionally store the sum in dst using the low 4 bits of imm.
6998// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
6999FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
7000{
7001 float32x4_t elementwise_prod = _mm_mul_ps(a, b);
7002
7003#if defined(__aarch64__) || defined(_M_ARM64)
7004 /* shortcuts */
7005 if (imm == 0xFF) {
7006 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7007 }
7008
7009 if ((imm & 0x0F) == 0x0F) {
7010 if (!(imm & (1 << 4)))
7011 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7012 if (!(imm & (1 << 5)))
7013 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7014 if (!(imm & (1 << 6)))
7015 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7016 if (!(imm & (1 << 7)))
7017 elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7018
7019 return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7020 }
7021#endif
7022
7023 float s = 0.0f;
7024
7025 if (imm & (1 << 4))
7026 s += vgetq_lane_f32(elementwise_prod, 0);
7027 if (imm & (1 << 5))
7028 s += vgetq_lane_f32(elementwise_prod, 1);
7029 if (imm & (1 << 6))
7030 s += vgetq_lane_f32(elementwise_prod, 2);
7031 if (imm & (1 << 7))
7032 s += vgetq_lane_f32(elementwise_prod, 3);
7033
7034 const float32_t res[4] = {
7035 (imm & 0x1) ? s : 0.0f,
7036 (imm & 0x2) ? s : 0.0f,
7037 (imm & 0x4) ? s : 0.0f,
7038 (imm & 0x8) ? s : 0.0f,
7039 };
7040 return vreinterpretq_m128_f32(vld1q_f32(res));
7041}
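/*
 * Usage sketch (illustrative only, not part of the header): a 4-element dot
 * product with _mm_dp_ps. Mask 0xF1 selects all four products (high nibble)
 * and writes the sum only to lane 0 (low nibble). The helper name is
 * hypothetical.
 */
static inline float sse2neon_example_dot4(__m128 a, __m128 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}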
7042
7043// Extract a 32-bit integer from a, selected with imm8, and store the result in
7044// dst.
7045// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
7046// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7047#define _mm_extract_epi32(a, imm) \
7048 vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7049
7050// Extract a 64-bit integer from a, selected with imm8, and store the result in
7051// dst.
7052// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
7053// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7054#define _mm_extract_epi64(a, imm) \
7055 vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7056
7057// Extract an 8-bit integer from a, selected with imm8, and store the result in
7058// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
7059// __constrange(0,16) int imm)
7060// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
7061#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7062
7063// Extract the selected single-precision (32-bit) floating-point element from a.
7064// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7065#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7066
7067// Round the packed double-precision (64-bit) floating-point elements in a down
7068// to an integer value, and store the results as packed double-precision
7069// floating-point elements in dst.
7070// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
7071FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7072{
7073#if defined(__aarch64__) || defined(_M_ARM64)
7074 return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7075#else
7076 double *f = (double *) &a;
7077 return _mm_set_pd(floor(f[1]), floor(f[0]));
7078#endif
7079}
7080
7081// Round the packed single-precision (32-bit) floating-point elements in a down
7082// to an integer value, and store the results as packed single-precision
7083// floating-point elements in dst.
7084// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
7085FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7086{
7087#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7088 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7089 return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7090#else
7091 float *f = (float *) &a;
7092 return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7093#endif
7094}
7095
7096// Round the lower double-precision (64-bit) floating-point element in b down to
7097// an integer value, store the result as a double-precision floating-point
7098// element in the lower element of dst, and copy the upper element from a to the
7099// upper element of dst.
7100// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
7101FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7102{
7103 return _mm_move_sd(a, _mm_floor_pd(b));
7104}
7105
7106// Round the lower single-precision (32-bit) floating-point element in b down to
7107// an integer value, store the result as a single-precision floating-point
7108// element in the lower element of dst, and copy the upper 3 packed elements
7109// from a to the upper elements of dst.
7110// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
7111FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7112{
7113 return _mm_move_ss(a, _mm_floor_ps(b));
7114}
7115
7116// Copy a to dst, and insert the 32-bit integer i into dst at the location
7117// specified by imm8.
7118// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
7119// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7120// __constrange(0,4) int imm)
7121#define _mm_insert_epi32(a, b, imm) \
7122 vreinterpretq_m128i_s32( \
7123 vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7124
7125// Copy a to dst, and insert the 64-bit integer i into dst at the location
7126// specified by imm8.
7127// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
7128// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7129// __constrange(0,2) int imm)
7130#define _mm_insert_epi64(a, b, imm) \
7131 vreinterpretq_m128i_s64( \
7132 vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7133
7134// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
7135// location specified by imm8.
7136// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
7137// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7138// __constrange(0,16) int imm)
7139#define _mm_insert_epi8(a, b, imm) \
7140 vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7141
7142// Copy a to tmp, then insert a single-precision (32-bit) floating-point
7143// element from b into tmp using the control in imm8. Store tmp to dst using
7144// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7145// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
7146#define _mm_insert_ps(a, b, imm8) \
7147 _sse2neon_define2( \
7148 __m128, a, b, \
7149 float32x4_t tmp1 = \
7150 vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \
7151 vreinterpretq_f32_m128(_a), 0); \
7152 float32x4_t tmp2 = \
7153 vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7154 vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
7155 const uint32_t data[4] = \
7156 _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7157 ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7158 ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7159 ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7160 uint32x4_t mask = vld1q_u32(data); \
7161 float32x4_t all_zeros = vdupq_n_f32(0); \
7162 \
7163 _sse2neon_return(vreinterpretq_m128_f32( \
7164 vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
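// Illustrative sketch (not part of the upstream header): with imm8 = 0x30,
// bits [7:6] = 0 select lane 0 of b, bits [5:4] = 3 write it into lane 3 of
// the copy of a, and the zero mask in bits [3:0] is empty, so the result is
// {a0, a1, a2, b0}. The helper name is hypothetical.
static inline __m128 _sse2neon_example_insert_ps(__m128 a, __m128 b)
{
    return _mm_insert_ps(a, b, 0x30);
}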
7165
7166// Compare packed signed 32-bit integers in a and b, and store packed maximum
7167// values in dst.
7168// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
7169FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7170{
7171 return vreinterpretq_m128i_s32(
7172 vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7173}
7174
7175// Compare packed signed 8-bit integers in a and b, and store packed maximum
7176// values in dst.
7177// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
7178FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7179{
7180 return vreinterpretq_m128i_s8(
7181 vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7182}
7183
7184// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7185// values in dst.
7186// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
7187FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7188{
7189 return vreinterpretq_m128i_u16(
7190 vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7191}
7192
7193// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7194// values in dst.
7195// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
7196FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7197{
7198 return vreinterpretq_m128i_u32(
7199 vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7200}
7201
7202// Compare packed signed 32-bit integers in a and b, and store packed minimum
7203// values in dst.
7204// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
7205FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7206{
7207 return vreinterpretq_m128i_s32(
7208 vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7209}
7210
7211// Compare packed signed 8-bit integers in a and b, and store packed minimum
7212// values in dst.
7213// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
7214FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7215{
7216 return vreinterpretq_m128i_s8(
7217 vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7218}
7219
7220// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7221// values in dst.
7222// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
7223FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7224{
7225 return vreinterpretq_m128i_u16(
7226 vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7227}
7228
7229// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7230// values in dst.
7231// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
7232FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7233{
7234 return vreinterpretq_m128i_u32(
7235 vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7236}
7237
7238// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7239// in a, store the minimum and index in dst, and zero the remaining bits in dst.
7240// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
7241FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7242{
7243 __m128i dst;
7244 uint16_t min, idx = 0;
7245#if defined(__aarch64__) || defined(_M_ARM64)
7246 // Find the minimum value
7247 min = vminvq_u16(vreinterpretq_u16_m128i(a));
7248
7249 // Get the index of the minimum value
7250 static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7251 uint16x8_t minv = vdupq_n_u16(min);
7252 uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
7253 idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7254#else
7255 // Find the minimum value
7256 __m64 tmp;
7257 tmp = vreinterpret_m64_u16(
7258 vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7259 vget_high_u16(vreinterpretq_u16_m128i(a))));
7260 tmp = vreinterpret_m64_u16(
7261 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7262 tmp = vreinterpret_m64_u16(
7263 vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7264 min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7265 // Get the index of the minimum value
7266 int i;
7267 for (i = 0; i < 8; i++) {
7268 if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7269 idx = (uint16_t) i;
7270 break;
7271 }
7272 a = _mm_srli_si128(a, 2);
7273 }
7274#endif
7275 // Generate result
7276 dst = _mm_setzero_si128();
7277 dst = vreinterpretq_m128i_u16(
7278 vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7279 dst = vreinterpretq_m128i_u16(
7280 vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7281 return dst;
7282}
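// Illustrative usage sketch (not part of the upstream header):
// _mm_minpos_epu16 packs the minimum into bits [15:0] of the result and its
// lane index into bits [18:16]; all higher bits are zero. The helper name is
// hypothetical.
static inline void _sse2neon_example_minpos(const uint16_t in[8],
                                            uint16_t *min_val,
                                            uint16_t *min_idx)
{
    __m128i v = _mm_loadu_si128((const __m128i *) in);
    uint32_t r = (uint32_t) _mm_cvtsi128_si32(_mm_minpos_epu16(v));
    *min_val = (uint16_t) (r & 0xffff);
    *min_idx = (uint16_t) ((r >> 16) & 0x7);
}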
7283
7284// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
7285// 8-bit integers in a compared to those in b, and store the 16-bit results in
7286// dst. Eight SADs are performed using one quadruplet from b and eight
7287// quadruplets from a. One quadruplet is selected from b starting at the
7288// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
7289// integers selected from a starting at the offset specified in imm8.
7290// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
7291FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7292{
7293 uint8x16_t _a, _b;
7294
7295 switch (imm & 0x4) {
7296 case 0:
7297 // do nothing
7298 _a = vreinterpretq_u8_m128i(a);
7299 break;
7300 case 4:
7301 _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7302 vreinterpretq_u32_m128i(a), 1));
7303 break;
7304 default:
7305#if defined(__GNUC__) || defined(__clang__)
7306 __builtin_unreachable();
7307#elif defined(_MSC_VER)
7308 __assume(0);
7309#endif
7310 break;
7311 }
7312
7313 switch (imm & 0x3) {
7314 case 0:
7315 _b = vreinterpretq_u8_u32(
7316 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
7317 break;
7318 case 1:
7319 _b = vreinterpretq_u8_u32(
7320 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
7321 break;
7322 case 2:
7323 _b = vreinterpretq_u8_u32(
7324 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
7325 break;
7326 case 3:
7327 _b = vreinterpretq_u8_u32(
7328 vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
7329 break;
7330 default:
7331#if defined(__GNUC__) || defined(__clang__)
7332 __builtin_unreachable();
7333#elif defined(_MSC_VER)
7334 __assume(0);
7335#endif
7336 break;
7337 }
7338
7339 int16x8_t c04, c15, c26, c37;
7340 uint8x8_t low_b = vget_low_u8(_b);
7341 c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7342 uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7343 c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7344 uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7345 c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7346 uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7347 c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7348#if defined(__aarch64__) || defined(_M_ARM64)
7349 // |0|4|2|6|
7350 c04 = vpaddq_s16(c04, c26);
7351 // |1|5|3|7|
7352 c15 = vpaddq_s16(c15, c37);
7353
7354 int32x4_t trn1_c =
7355 vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7356 int32x4_t trn2_c =
7357 vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7358 return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
7359 vreinterpretq_s16_s32(trn2_c)));
7360#else
7361 int16x4_t c01, c23, c45, c67;
7362 c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7363 c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7364 c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7365 c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7366
7367 return vreinterpretq_m128i_s16(
7368 vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
7369#endif
7370}
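// Scalar reference sketch (not part of the upstream header) of what
// _mm_mpsadbw_epu8 computes: each of the eight results sums the absolute
// differences between a 4-byte window of `a` starting at (imm8 & 4) + i and
// the fixed 4-byte group of `b` selected by imm8 & 3. The helper name is
// hypothetical.
static inline void _sse2neon_example_mpsadbw_ref(const uint8_t a[16],
                                                 const uint8_t b[16],
                                                 int imm8,
                                                 uint16_t out[8])
{
    const uint8_t *bq = b + (imm8 & 3) * 4;  // selected quadruplet of b
    for (int i = 0; i < 8; i++) {
        uint16_t sum = 0;
        for (int j = 0; j < 4; j++) {
            int d = (int) a[(imm8 & 4) + i + j] - (int) bq[j];
            sum = (uint16_t) (sum + (d < 0 ? -d : d));
        }
        out[i] = sum;
    }
}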
7371
7372// Multiply the low signed 32-bit integers from each packed 64-bit element in
7373// a and b, and store the signed 64-bit results in dst.
7374// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
7375FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7376{
7377 // vmull_s32 upcasts instead of masking, so we downcast.
7378 int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7379 int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7380 return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7381}
7382
7383// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
7384// integers, and store the low 32 bits of the intermediate integers in dst.
7385// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
7386FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7387{
7388 return vreinterpretq_m128i_s32(
7389 vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7390}
7391
7392// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
7393// using unsigned saturation, and store the results in dst.
7394// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
7395FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7396{
7397 return vreinterpretq_m128i_u16(
7398 vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7399 vqmovun_s32(vreinterpretq_s32_m128i(b))));
7400}
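// Illustrative note (not part of the upstream header): the pack saturates, so
// a 32-bit lane holding -1 becomes 0 and a lane holding 70000 becomes 65535 in
// the packed 16-bit result.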
7401
7402// Round the packed double-precision (64-bit) floating-point elements in a using
7403// the rounding parameter, and store the results as packed double-precision
7404// floating-point elements in dst.
7405// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
7406FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7407{
7408#if defined(__aarch64__) || defined(_M_ARM64)
7409 switch (rounding) {
7410 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7411 return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7412 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7413 return _mm_floor_pd(a);
7414 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7415 return _mm_ceil_pd(a);
7416 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7417 return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7418 default: //_MM_FROUND_CUR_DIRECTION
7419 return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7420 }
7421#else
7422 double *v_double = (double *) &a;
7423
7424 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7425 (rounding == _MM_FROUND_CUR_DIRECTION &&
7426 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7427 double res[2], tmp;
7428 for (int i = 0; i < 2; i++) {
7429 tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7430 double roundDown = floor(tmp); // Round down value
7431 double roundUp = ceil(tmp); // Round up value
7432 double diffDown = tmp - roundDown;
7433 double diffUp = roundUp - tmp;
7434 if (diffDown < diffUp) {
7435 /* If it's closer to the round down value, then use it */
7436 res[i] = roundDown;
7437 } else if (diffDown > diffUp) {
7438 /* If it's closer to the round up value, then use it */
7439 res[i] = roundUp;
7440 } else {
7441 /* If it's equidistant between round up and round down value,
7442 * pick the one which is an even number */
7443 double half = roundDown / 2;
7444 if (half != floor(half)) {
7445 /* If the round down value is odd, return the round up value
7446 */
7447 res[i] = roundUp;
7448 } else {
7449 /* If the round up value is odd, return the round down value
7450 */
7451 res[i] = roundDown;
7452 }
7453 }
7454 res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7455 }
7456 return _mm_set_pd(res[1], res[0]);
7457 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7458 (rounding == _MM_FROUND_CUR_DIRECTION &&
7459 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7460 return _mm_floor_pd(a);
7461 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7462 (rounding == _MM_FROUND_CUR_DIRECTION &&
7463 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7464 return _mm_ceil_pd(a);
7465 }
7466 return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7467 v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7468#endif
7469}
7470
7471// Round the packed single-precision (32-bit) floating-point elements in a using
7472// the rounding parameter, and store the results as packed single-precision
7473// floating-point elements in dst.
7474// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
7475FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7476{
7477#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7478 defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7479 switch (rounding) {
7480 case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7481 return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7482 case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7483 return _mm_floor_ps(a);
7484 case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7485 return _mm_ceil_ps(a);
7486 case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7487 return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7488 default: //_MM_FROUND_CUR_DIRECTION
7489 return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7490 }
7491#else
7492 float *v_float = (float *) &a;
7493
7494 if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7495 (rounding == _MM_FROUND_CUR_DIRECTION &&
7496 _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7497 uint32x4_t signmask = vdupq_n_u32(0x80000000);
7498 float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7499 vdupq_n_f32(0.5f)); /* +/- 0.5 */
7500 int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7501 vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
7502 int32x4_t r_trunc = vcvtq_s32_f32(
7503 vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
7504 int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7505 vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
7506 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7507 vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
7508 float32x4_t delta = vsubq_f32(
7509 vreinterpretq_f32_m128(a),
7510 vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
7511 uint32x4_t is_delta_half =
7512 vceqq_f32(delta, half); /* delta == +/- 0.5 */
7513 return vreinterpretq_m128_f32(
7514 vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7515 } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7516 (rounding == _MM_FROUND_CUR_DIRECTION &&
7517 _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7518 return _mm_floor_ps(a);
7519 } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7520 (rounding == _MM_FROUND_CUR_DIRECTION &&
7521 _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7522 return _mm_ceil_ps(a);
7523 }
7524 return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7525 v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7526 v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7527 v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7528#endif
7529}
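// Illustrative usage sketch (not part of the upstream header): 2.5f rounds to
// 2.0f under round-to-nearest-even, 2.0f when rounding down, 3.0f when
// rounding up, and 2.0f when truncating toward zero. The helper name is
// hypothetical.
static inline __m128 _sse2neon_example_round_nearest(__m128 v)
{
    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}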
7530
7531// Round the lower double-precision (64-bit) floating-point element in b using
7532// the rounding parameter, store the result as a double-precision floating-point
7533// element in the lower element of dst, and copy the upper element from a to the
7534// upper element of dst.
7535// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
7536FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7537{
7538 return _mm_move_sd(a, _mm_round_pd(b, rounding));
7539}
7540
7541// Round the lower single-precision (32-bit) floating-point element in b using
7542// the rounding parameter, store the result as a single-precision floating-point
7543// element in the lower element of dst, and copy the upper 3 packed elements
7544// from a to the upper elements of dst. Rounding is done according to the
7545// rounding[3:0] parameter, which can be one of:
7546// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
7547// suppress exceptions
7548// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
7549// suppress exceptions
7550// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
7551// exceptions
7552// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
7553// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
7554// _MM_SET_ROUNDING_MODE
7555// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
7556FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7557{
7558 return _mm_move_ss(a, _mm_round_ps(b, rounding));
7559}
7560
7561// Load 128-bits of integer data from memory into dst using a non-temporal
7562// memory hint. mem_addr must be aligned on a 16-byte boundary or a
7563// general-protection exception may be generated.
7564// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
7565FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7566{
7567#if __has_builtin(__builtin_nontemporal_store)
7568 return __builtin_nontemporal_load(p);
7569#else
7570 return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7571#endif
7572}
7573
7574// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7575// all 1's, and return 1 if the result is zero, otherwise return 0.
7576// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
7577FORCE_INLINE int _mm_test_all_ones(__m128i a)
7578{
7579 return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7580 ~(uint64_t) 0;
7581}
7582
7583// Compute the bitwise AND of 128 bits (representing integer data) in a and
7584// mask, and return 1 if the result is zero, otherwise return 0.
7585// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
7586FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7587{
7588 int64x2_t a_and_mask =
7589 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7590 return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7591}
7592
7593// Compute the bitwise AND of 128 bits (representing integer data) in a and
7594// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7595// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7596// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7597// otherwise return 0.
7598// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero
7599// Note: Argument names may be wrong in the Intel intrinsics guide.
7600FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7601{
7602 uint64x2_t v = vreinterpretq_u64_m128i(a);
7603 uint64x2_t m = vreinterpretq_u64_m128i(mask);
7604
7605 // find ones (set-bits) and zeros (clear-bits) under clip mask
7606 uint64x2_t ones = vandq_u64(m, v);
7607 uint64x2_t zeros = vbicq_u64(m, v);
7608
7609 // If both 128-bit variables are populated (non-zero) then return 1.
7610 // For comparison purposes, first compact each var down to 32-bits.
7611 uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros));
7612
7613 // if folding minimum is non-zero then both vars must be non-zero
7614 return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0);
7615}
7616
7617// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7618// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7619// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7620// otherwise set CF to 0. Return the CF value.
7621// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
7622FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7623{
7624 int64x2_t s64 =
7625 vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
7626 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7627}
7628
7629// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7630// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7631// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7632// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7633// otherwise return 0.
7634// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
7635#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7636
7637// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7638// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7639// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7640// otherwise set CF to 0. Return the ZF value.
7641// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
7642FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7643{
7644 int64x2_t s64 =
7645 vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
7646 return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7647}
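// Illustrative sketch (not part of the upstream header): in the SSE4.1 test
// family, _mm_testz_si128 returns ZF ((a & b) == 0), _mm_testc_si128 returns
// CF ((~a & b) == 0), and _mm_test_mix_ones_zeros returns 1 only when neither
// flag would be set. The helper name is hypothetical.
static inline int _sse2neon_example_mask_subset(__m128i bits, __m128i mask)
{
    // Every bit selected by `mask` is set in `bits` exactly when ~bits & mask
    // is zero, which is what CF reports.
    return _mm_testc_si128(bits, mask);
}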
7648
7649/* SSE4.2 */
7650
7651static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7652 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7653};
7654static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7655 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7656 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7657};
7658
7659/* specify the source data format */
7660#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
7661#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
7662#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
7663#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
7664
7665/* specify the comparison operation */
7666#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */
7667#define _SIDD_CMP_RANGES 0x04 /* compare ranges */
7668#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */
7669#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
7670
7671/* specify the polarity */
7672#define _SIDD_POSITIVE_POLARITY 0x00
7673#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7674#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
7675#define _SIDD_MASKED_NEGATIVE_POLARITY \
7676 0x30 /* negate results only before end of string */
7677
7678/* specify the output selection in _mm_cmpXstri */
7679#define _SIDD_LEAST_SIGNIFICANT 0x00
7680#define _SIDD_MOST_SIGNIFICANT 0x40
7681
7682/* specify the output selection in _mm_cmpXstrm */
7683#define _SIDD_BIT_MASK 0x00
7684#define _SIDD_UNIT_MASK 0x40
7685
7686/* Pattern Matching for C macros.
7687 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
7688 */
7689
7690/* catenate */
7691#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
7692#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)
7693
7694#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
7695/* run the 2nd parameter */
7696#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
7697/* run the 1st parameter */
7698#define SSE2NEON_IIF_1(t, ...) t
7699
7700#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
7701#define SSE2NEON_COMPL_0 1
7702#define SSE2NEON_COMPL_1 0
7703
7704#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
7705#define SSE2NEON_DEC_1 0
7706#define SSE2NEON_DEC_2 1
7707#define SSE2NEON_DEC_3 2
7708#define SSE2NEON_DEC_4 3
7709#define SSE2NEON_DEC_5 4
7710#define SSE2NEON_DEC_6 5
7711#define SSE2NEON_DEC_7 6
7712#define SSE2NEON_DEC_8 7
7713#define SSE2NEON_DEC_9 8
7714#define SSE2NEON_DEC_10 9
7715#define SSE2NEON_DEC_11 10
7716#define SSE2NEON_DEC_12 11
7717#define SSE2NEON_DEC_13 12
7718#define SSE2NEON_DEC_14 13
7719#define SSE2NEON_DEC_15 14
7720#define SSE2NEON_DEC_16 15
7721
7722/* detection */
7723#define SSE2NEON_CHECK_N(x, n, ...) n
7724#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
7725#define SSE2NEON_PROBE(x) x, 1,
7726
7727#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
7728#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)
7729
7730#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
7731#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))
7732
7733#define SSE2NEON_EAT(...)
7734#define SSE2NEON_EXPAND(...) __VA_ARGS__
7735#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)
7736
7737/* recursion */
7738/* deferred expression */
7739#define SSE2NEON_EMPTY()
7740#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
7741#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
7742#define SSE2NEON_EXPAND(...) __VA_ARGS__
7743
7744#define SSE2NEON_EVAL(...) \
7745 SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
7746#define SSE2NEON_EVAL1(...) \
7747 SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
7748#define SSE2NEON_EVAL2(...) \
7749 SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
7750#define SSE2NEON_EVAL3(...) __VA_ARGS__
7751
7752#define SSE2NEON_REPEAT(count, macro, ...) \
7753 SSE2NEON_WHEN(count) \
7754 (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
7755 SSE2NEON_DEC(count), macro, \
7756 __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
7757 __VA_ARGS__))
7758#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
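// Expansion sketch (not part of the upstream header): SSE2NEON_REPEAT(count,
// macro, ...) applies `macro(i, ...)` for i = 0 .. count-1 once it is forced
// through SSE2NEON_EVAL. For a hypothetical macro M,
//   SSE2NEON_EVAL(SSE2NEON_REPEAT(2, M, arg))
// expands (roughly) to
//   M(0, arg) M(1, arg)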
7759
7760#define SSE2NEON_SIZE_OF_byte 8
7761#define SSE2NEON_NUMBER_OF_LANES_byte 16
7762#define SSE2NEON_SIZE_OF_word 16
7763#define SSE2NEON_NUMBER_OF_LANES_word 8
7764
7765#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
7766 mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
7767 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
7768 vreinterpretq_##type##_m128i(a)));
7769
7770#define SSE2NEON_FILL_LANE(i, type) \
7771 vec_b[i] = \
7772 vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));
7773
7774#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
7775 number_of_lanes, byte_or_word) \
7776 do { \
7777 SSE2NEON_CAT( \
7778 data_type_prefix, \
7779 SSE2NEON_CAT(size, \
7780 SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
7781 vec_b[number_of_lanes]; \
7782 __m128i mask = SSE2NEON_IIF(byte_or_word)( \
7783 vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
7784 vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
7785 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
7786 SSE2NEON_CAT(type_prefix, size))) \
7787 for (int i = 0; i < number_of_lanes; i++) { \
7788 mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
7789 size)(SSE2NEON_CAT(vbslq_u, size)( \
7790 SSE2NEON_CAT(vreinterpretq_u, \
7791 SSE2NEON_CAT(size, _m128i))(mask), \
7792 SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
7793 vec_b[i], \
7794 SSE2NEON_CAT( \
7795 vreinterpretq_, \
7796 SSE2NEON_CAT(type_prefix, \
7797 SSE2NEON_CAT(size, _m128i(a))))), \
7798 SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
7799 vec_b[i], \
7800 SSE2NEON_CAT( \
7801 vreinterpretq_, \
7802 SSE2NEON_CAT(type_prefix, \
7803 SSE2NEON_CAT(size, _m128i(a))))))); \
7804 } \
7805 } while (0)
7806
7807#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
7808 do { \
7809 SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
7810 SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
7811 SSE2NEON_CAT(u, size))) \
7812 } while (0)
7813
7814#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
7815 static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
7816 int lb) \
7817 { \
7818 __m128i mtx[16]; \
7819 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7820 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7821 return SSE2NEON_CAT( \
7822 _sse2neon_aggregate_equal_any_, \
7823 SSE2NEON_CAT( \
7824 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7825 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7826 type))))(la, lb, mtx); \
7827 }
7828
7829#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
7830 static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
7831 int lb) \
7832 { \
7833 __m128i mtx[16]; \
7834 PCMPSTR_RANGES( \
7835 a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7836 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
7837 return SSE2NEON_CAT( \
7838 _sse2neon_aggregate_ranges_, \
7839 SSE2NEON_CAT( \
7840 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7841 SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
7842 type))))(la, lb, mtx); \
7843 }
7844
7845#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
7846 static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
7847 __m128i b, int lb) \
7848 { \
7849 __m128i mtx[16]; \
7850 PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7851 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
7852 return SSE2NEON_CAT( \
7853 _sse2neon_aggregate_equal_ordered_, \
7854 SSE2NEON_CAT( \
7855 SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
7856 SSE2NEON_CAT(x, \
7857 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
7858 SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
7859 }
7860
7861static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
7862{
7863 int res = 0;
7864 int m = (1 << la) - 1;
7865 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7866 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7867 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7868 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7869 for (int j = 0; j < lb; j++) {
7870 mtx[j] = vreinterpretq_m128i_u8(
7871 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
7872 mtx[j] = vreinterpretq_m128i_u8(
7873 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
7874 int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
7875 res |= (tmp << j);
7876 }
7877 return res;
7878}
7879
7880static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
7881{
7882 int res = 0;
7883 int m = (1 << la) - 1;
7884 uint16x8_t vec =
7885 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7886 for (int j = 0; j < lb; j++) {
7887 mtx[j] = vreinterpretq_m128i_u16(
7888 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
7889 mtx[j] = vreinterpretq_m128i_u16(
7890 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
7891 int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
7892 res |= (tmp << j);
7893 }
7894 return res;
7895}
7896
7897/* clang-format off */
7898#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
7899 prefix##IMPL(byte) \
7900 prefix##IMPL(word)
7901/* clang-format on */
7902
7903SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
7904
7905static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
7906{
7907 int res = 0;
7908 int m = (1 << la) - 1;
7909 uint16x8_t vec =
7910 vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
7911 for (int j = 0; j < lb; j++) {
7912 mtx[j] = vreinterpretq_m128i_u16(
7913 vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
7914 mtx[j] = vreinterpretq_m128i_u16(
7915 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
7916 __m128i tmp = vreinterpretq_m128i_u32(
7917 vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
7918 uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
7919 vreinterpretq_u32_m128i(tmp));
7920#if defined(__aarch64__) || defined(_M_ARM64)
7921 int t = vaddvq_u32(vec_res) ? 1 : 0;
7922#else
7923 uint64x2_t sumh = vpaddlq_u32(vec_res);
7924 int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
7925#endif
7926 res |= (t << j);
7927 }
7928 return res;
7929}
7930
7931static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
7932{
7933 int res = 0;
7934 int m = (1 << la) - 1;
7935 uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7936 uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
7937 uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
7938 uint8x16_t vec = vcombine_u8(t_lo, t_hi);
7939 for (int j = 0; j < lb; j++) {
7940 mtx[j] = vreinterpretq_m128i_u8(
7941 vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
7942 mtx[j] = vreinterpretq_m128i_u8(
7943 vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
7944 __m128i tmp = vreinterpretq_m128i_u16(
7945 vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
7946 uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
7947 vreinterpretq_u16_m128i(tmp));
7948 int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
7949 res |= (t << j);
7950 }
7951 return res;
7952}
7953
7954#define SSE2NEON_CMP_RANGES_IS_BYTE 1
7955#define SSE2NEON_CMP_RANGES_IS_WORD 0
7956
7957/* clang-format off */
7958#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
7959 prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
7960 prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
7961 prefix##IMPL(word, uint, u, prefix##IS_WORD) \
7962 prefix##IMPL(word, int, s, prefix##IS_WORD)
7963/* clang-format on */
7964
7965SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
7966
7967#undef SSE2NEON_CMP_RANGES_IS_BYTE
7968#undef SSE2NEON_CMP_RANGES_IS_WORD
7969
7970static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
7971{
7972 uint8x16_t mtx =
7973 vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
7974 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
7975 int m1 = 0x10000 - (1 << la);
7976 int tb = 0x10000 - (1 << lb);
7977 uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
7978 uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
7979 vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
7980 vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
7981 vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
7982 vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
7983 vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
7984 tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
7985 tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);
7986
7987 res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
7988 res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
7989 res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
7990 res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
7991 res_lo = vand_u8(res_lo, vec_mask);
7992 res_hi = vand_u8(res_hi, vec_mask);
7993
7994 int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
7995 return res;
7996}
7997
7998static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
7999{
8000 uint16x8_t mtx =
8001 vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
8002 int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
8003 int m1 = 0x100 - (1 << la);
8004 int tb = 0x100 - (1 << lb);
8005 uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
8006 uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
8007 uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
8008 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
8009 mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
8010 mtx = vbslq_u16(vec1, tmp, mtx);
8011 mtx = vandq_u16(mtx, vec_mask);
8012 return _sse2neon_vaddvq_u16(mtx);
8013}
8014
8015#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
8016#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0
8017
8018#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
8019 static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
8020 int bound, int la, int lb, __m128i mtx[16]) \
8021 { \
8022 int res = 0; \
8023 int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
8024 uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
8025 vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
8026 vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
8027 uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
8028 vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
8029 vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
8030 vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
8031 uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
8032 uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
8033 for (int j = 0; j < lb; j++) { \
8034 mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
8035 vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
8036 } \
8037 for (int j = lb; j < bound; j++) { \
8038 mtx[j] = vreinterpretq_m128i_u##size( \
8039 vbslq_u##size(vec1, vec_minusone, vec_zero)); \
8040 } \
8041 unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
8042 (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
8043 for (int i = 0; i < bound; i++) { \
8044 int val = 1; \
8045 for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
8046 val &= ptr[k * bound + j]; \
8047 res += val << i; \
8048 } \
8049 return res; \
8050 }
8051
8052/* clang-format off */
8053#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
8054 prefix##IMPL(8, 16, prefix##IS_UBYTE) \
8055 prefix##IMPL(16, 8, prefix##IS_UWORD)
8056/* clang-format on */
8057
8058SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)
8059
8060#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
8061#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD
8062
8063/* clang-format off */
8064#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
8065 prefix##IMPL(byte) \
8066 prefix##IMPL(word)
8067/* clang-format on */
8068
8069SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
8070
8071#define SSE2NEON_CMPESTR_LIST \
8072 _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8073 _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
8074 _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
8075 _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
8076 _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
8077 _(CMP_UWORD_RANGES, cmp_uword_ranges) \
8078 _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
8079 _(CMP_SWORD_RANGES, cmp_sword_ranges) \
8080 _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8081 _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
8082 _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
8083 _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
8084 _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8085 _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
8086 _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
8087 _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
8088
8089enum {
8090#define _(name, func_suffix) name,
8091 SSE2NEON_CMPESTR_LIST
8092#undef _
8093};
8094typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
8095static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
8096#define _(name, func_suffix) _sse2neon_##func_suffix,
8097 SSE2NEON_CMPESTR_LIST
8098#undef _
8099};
8100
8101FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
8102{
8103 switch (imm8 & 0x30) {
8104 case _SIDD_NEGATIVE_POLARITY:
8105 res ^= 0xffffffff;
8106 break;
8107 case _SIDD_MASKED_NEGATIVE_POLARITY:
8108 res ^= (1 << lb) - 1;
8109 break;
8110 default:
8111 break;
8112 }
8113
8114 return res & ((bound == 8) ? 0xFF : 0xFFFF);
8115}
8116
8117FORCE_INLINE int _sse2neon_clz(unsigned int x)
8118{
8119#ifdef _MSC_VER
8120 unsigned long cnt = 0;
8121 if (_BitScanReverse(&cnt, x))
8122 return 31 - cnt;
8123 return 32;
8124#else
8125 return x != 0 ? __builtin_clz(x) : 32;
8126#endif
8127}
8128
8129FORCE_INLINE int _sse2neon_ctz(unsigned int x)
8130{
8131#ifdef _MSC_VER
8132 unsigned long cnt = 0;
8133 if (_BitScanForward(&cnt, x))
8134 return cnt;
8135 return 32;
8136#else
8137 return x != 0 ? __builtin_ctz(x) : 32;
8138#endif
8139}
8140
8141FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
8142{
8143#ifdef _MSC_VER
8144 unsigned long cnt;
8145#if defined(SSE2NEON_HAS_BITSCAN64)
8146 if (_BitScanForward64(&cnt, x))
8147 return (int) (cnt);
8148#else
8149 if (_BitScanForward(&cnt, (unsigned long) (x)))
8150 return (int) cnt;
8151 if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
8152 return (int) (cnt + 32);
8153#endif /* SSE2NEON_HAS_BITSCAN64 */
8154 return 64;
8155#else /* assume GNU compatible compilers */
8156 return x != 0 ? __builtin_ctzll(x) : 64;
8157#endif
8158}
8159
8160#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
8161
8162#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
8163 const int var = (imm & 0x01) ? 8 : 16
8164
8165#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
8166 int tmp1 = la ^ (la >> 31); \
8167 la = tmp1 - (la >> 31); \
8168 int tmp2 = lb ^ (lb >> 31); \
8169 lb = tmp2 - (lb >> 31); \
8170 la = SSE2NEON_MIN(la, bound); \
8171 lb = SSE2NEON_MIN(lb, bound)
8172
8173// Compare all pairs of characters in strings a and b,
8174// then aggregate the result.
8175// As the only difference between PCMPESTR* and PCMPISTR* is the way the string
8176// lengths are determined, SSE2NEON_CMP{E,I}STRX_LEN_PAIR is used to obtain the
8177// lengths of strings a and b.
8178#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
8179 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
8180 SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
8181 int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
8182 r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)
8183
8184#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
8185 return (r2 == 0) ? bound \
8186 : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
8187 : _sse2neon_ctz(r2))
8188
8189#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
8190 __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
8191 if (imm8 & 0x40) { \
8192 if (bound == 8) { \
8193 uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
8194 vld1q_u16(_sse2neon_cmpestr_mask16b)); \
8195 dst = vreinterpretq_m128i_u16(vbslq_u16( \
8196 tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
8197 } else { \
8198 uint8x16_t vec_r2 = \
8199 vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \
8200 uint8x16_t tmp = \
8201 vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
8202 dst = vreinterpretq_m128i_u8( \
8203 vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
8204 } \
8205 } else { \
8206 if (bound == 16) { \
8207 dst = vreinterpretq_m128i_u16( \
8208 vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
8209 } else { \
8210 dst = vreinterpretq_m128i_u8( \
8211 vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \
8212 } \
8213 } \
8214 return dst
8215
8216// Compare packed strings in a and b with lengths la and lb using the control
8217// in imm8, and returns 1 if b did not contain a null character and the
8218// resulting mask was zero, and 0 otherwise.
8219// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
8220FORCE_INLINE int _mm_cmpestra(__m128i a,
8221 int la,
8222 __m128i b,
8223 int lb,
8224 const int imm8)
8225{
8226 int lb_cpy = lb;
8227 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8228 return !r2 & (lb_cpy > bound);
8229}
8230
8231// Compare packed strings in a and b with lengths la and lb using the control in
8232// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8233// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
8234FORCE_INLINE int _mm_cmpestrc(__m128i a,
8235 int la,
8236 __m128i b,
8237 int lb,
8238 const int imm8)
8239{
8240 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8241 return r2 != 0;
8242}
8243
8244// Compare packed strings in a and b with lengths la and lb using the control
8245// in imm8, and store the generated index in dst.
8246// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
8247FORCE_INLINE int _mm_cmpestri(__m128i a,
8248 int la,
8249 __m128i b,
8250 int lb,
8251 const int imm8)
8252{
8253 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8254 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8255}
8256
8257// Compare packed strings in a and b with lengths la and lb using the control
8258// in imm8, and store the generated mask in dst.
8259// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
8260FORCE_INLINE __m128i
8261_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
8262{
8263 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8264 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8265}
8266
8267// Compare packed strings in a and b with lengths la and lb using the control in
8268// imm8, and returns bit 0 of the resulting bit mask.
8269// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
8270FORCE_INLINE int _mm_cmpestro(__m128i a,
8271 int la,
8272 __m128i b,
8273 int lb,
8274 const int imm8)
8275{
8276 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
8277 return r2 & 1;
8278}
8279
8280// Compare packed strings in a and b with lengths la and lb using the control in
8281// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8282// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
8283FORCE_INLINE int _mm_cmpestrs(__m128i a,
8284 int la,
8285 __m128i b,
8286 int lb,
8287 const int imm8)
8288{
8289 (void) a;
8290 (void) b;
8291 (void) lb;
8292 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8293 return la <= (bound - 1);
8294}
8295
8296// Compare packed strings in a and b with lengths la and lb using the control in
8297// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8298// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
8299FORCE_INLINE int _mm_cmpestrz(__m128i a,
8300 int la,
8301 __m128i b,
8302 int lb,
8303 const int imm8)
8304{
8305 (void) a;
8306 (void) b;
8307 (void) la;
8308 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8309 return lb <= (bound - 1);
8310}
8311
8312#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
8313 do { \
8314 if (imm8 & 0x01) { \
8315 uint16x8_t equal_mask_##str = \
8316 vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
8317 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8318 uint64_t matches_##str = \
8319 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8320 len = _sse2neon_ctzll(matches_##str) >> 3; \
8321 } else { \
8322 uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
8323 vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
8324 uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
8325 uint64_t matches_##str = \
8326 vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
8327 len = _sse2neon_ctzll(matches_##str) >> 2; \
8328 } \
8329 } while (0)
8330
8331#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
8332 int la, lb; \
8333 do { \
8334 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
8335 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
8336 } while (0)
8337
8338// Compare packed strings with implicit lengths in a and b using the control in
8339// imm8, and returns 1 if b did not contain a null character and the resulting
8340// mask was zero, and 0 otherwise.
8341// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
8342FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
8343{
8344 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8345 return !r2 & (lb >= bound);
8346}
8347
8348// Compare packed strings with implicit lengths in a and b using the control in
8349// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
8350// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
8351FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
8352{
8353 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8354 return r2 != 0;
8355}
8356
8357// Compare packed strings with implicit lengths in a and b using the control in
8358// imm8, and store the generated index in dst.
8359// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
8360FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
8361{
8362 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8363 SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
8364}
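// Illustrative usage sketch (not part of the upstream header): a strchr-like
// search that returns the index of the first byte of `chunk` matching any byte
// of `set`, or 16 when nothing matches. The helper name is hypothetical.
static inline int _sse2neon_example_find_any(__m128i chunk, __m128i set)
{
    return _mm_cmpistri(set, chunk,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                            _SIDD_LEAST_SIGNIFICANT);
}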
8365
8366// Compare packed strings with implicit lengths in a and b using the control in
8367// imm8, and store the generated mask in dst.
8368// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
8369FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
8370{
8371 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8372 SSE2NEON_CMPSTR_GENERATE_MASK(dst);
8373}
8374
8375// Compare packed strings with implicit lengths in a and b using the control in
8376// imm8, and returns bit 0 of the resulting bit mask.
8377// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
8378FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
8379{
8380 SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
8381 return r2 & 1;
8382}
8383
8384// Compare packed strings with implicit lengths in a and b using the control in
8385// imm8, and returns 1 if any character in a was null, and 0 otherwise.
8386// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
8387FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
8388{
8389 (void) b;
8390 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8391 int la;
8392 SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
8393 return la <= (bound - 1);
8394}
8395
8396// Compare packed strings with implicit lengths in a and b using the control in
8397// imm8, and returns 1 if any character in b was null, and 0 otherwise.
8398// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
8399FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
8400{
8401 (void) a;
8402 SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
8403 int lb;
8404 SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
8405 return lb <= (bound - 1);
8406}
8407
8408// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
8409// in b for greater than.
8410FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
8411{
8412#if defined(__aarch64__) || defined(_M_ARM64)
8413 return vreinterpretq_m128i_u64(
8414 vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
8415#else
8416 return vreinterpretq_m128i_s64(vshrq_n_s64(
8417 vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
8418 63));
8419#endif
8420}
8421
8422// Starting with the initial value in crc, accumulates a CRC32 value for
8423// unsigned 16-bit integer v, and stores the result in dst.
8424// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
8425FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
8426{
8427#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8428 __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
8429 : [c] "+r"(crc)
8430 : [v] "r"(v));
8431#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8432 (defined(_M_ARM64) && !defined(__clang__))
8433 crc = __crc32ch(crc, v);
8434#else
8435 crc = _mm_crc32_u8(crc, v & 0xff);
8436 crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
8437#endif
8438 return crc;
8439}
8440
8441// Starting with the initial value in crc, accumulates a CRC32 value for
8442// unsigned 32-bit integer v, and stores the result in dst.
8443// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
8444FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
8445{
8446#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8447 __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
8448 : [c] "+r"(crc)
8449 : [v] "r"(v));
8450#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8451 (defined(_M_ARM64) && !defined(__clang__))
8452 crc = __crc32cw(crc, v);
8453#else
8454 crc = _mm_crc32_u16(crc, v & 0xffff);
8455 crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
8456#endif
8457 return crc;
8458}
8459
8460// Starting with the initial value in crc, accumulates a CRC32 value for
8461// unsigned 64-bit integer v, and stores the result in dst.
8462// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
8463FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
8464{
8465#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8466 __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
8467 : [c] "+r"(crc)
8468 : [v] "r"(v));
8469#elif (defined(_M_ARM64) && !defined(__clang__))
8470 crc = __crc32cd((uint32_t) crc, v);
8471#else
8472 crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
8473 crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
8474#endif
8475 return crc;
8476}
8477
8478// Starting with the initial value in crc, accumulates a CRC32 value for
8479// unsigned 8-bit integer v, and stores the result in dst.
8480// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
8481FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
8482{
8483#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
8484 __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
8485 : [c] "+r"(crc)
8486 : [v] "r"(v));
8487#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
8488 (defined(_M_ARM64) && !defined(__clang__))
8489 crc = __crc32cb(crc, v);
8490#else
8491 crc ^= v;
8492 for (int bit = 0; bit < 8; bit++) {
8493 if (crc & 1)
8494 crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
8495 else
8496 crc = (crc >> 1);
8497 }
8498#endif
8499 return crc;
8500}
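// Illustrative usage sketch (not part of the upstream header): CRC32-C
// (Castagnoli) over a byte buffer, with the customary initial value and final
// inversion. The helper name is hypothetical.
static inline uint32_t _sse2neon_example_crc32c(const uint8_t *buf, size_t len)
{
    uint32_t crc = 0xffffffff;  // customary seed
    for (size_t i = 0; i < len; i++)
        crc = _mm_crc32_u8(crc, buf[i]);
    return ~crc;                // customary final inversion
}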
8501
8502/* AES */
8503
8504#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
8505/* clang-format off */
8506#define SSE2NEON_AES_SBOX(w) \
8507 { \
8508 w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
8509 w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
8510 w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
8511 w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
8512 w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
8513 w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
8514 w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
8515 w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
8516 w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
8517 w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
8518 w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
8519 w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
8520 w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
8521 w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
8522 w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
8523 w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
8524 w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
8525 w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
8526 w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
8527 w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
8528 w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
8529 w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
8530 w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
8531 w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
8532 w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
8533 w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
8534 w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
8535 w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
8536 w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
8537 w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
8538 w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
8539 w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
8540 w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
8541 w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
8542 w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
8543 w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
8544 w(0xb0), w(0x54), w(0xbb), w(0x16) \
8545 }
8546#define SSE2NEON_AES_RSBOX(w) \
8547 { \
8548 w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
8549 w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
8550 w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
8551 w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
8552 w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
8553 w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
8554 w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
8555 w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
8556 w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
8557 w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
8558 w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
8559 w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
8560 w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
8561 w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
8562 w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
8563 w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
8564 w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
8565 w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
8566 w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
8567 w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
8568 w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
8569 w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
8570 w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
8571 w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
8572 w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
8573 w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
8574 w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
8575 w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
8576 w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
8577 w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
8578 w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
8579 w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
8580 w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
8581 w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
8582 w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
8583 w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
8584 w(0x55), w(0x21), w(0x0c), w(0x7d) \
8585 }
8586/* clang-format on */
8587
8588/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
8589#define SSE2NEON_AES_H0(x) (x)
8590static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
8591static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
8592#undef SSE2NEON_AES_H0
8593
8594/* x_time function and matrix multiply function */
8595#if !defined(__aarch64__) && !defined(_M_ARM64)
8596#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
8597#define SSE2NEON_MULTIPLY(x, y) \
8598 (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
8599 ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
8600 ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
8601 ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
8602#endif
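// Worked example (not part of the upstream header): SSE2NEON_XT doubles a
// value in GF(2^8) modulo the AES polynomial 0x11b, e.g. SSE2NEON_XT(0x80) =
// (0x80 << 1) ^ 0x1b = 0x11b, which becomes 0x1b once truncated to a byte.
// SSE2NEON_MULTIPLY combines such doublings per set bit of y, e.g.
// SSE2NEON_MULTIPLY(x, 0x09) = x ^ SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))).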
8603
8604// In the absence of crypto extensions, implement aesenc using regular NEON
8605// intrinsics instead. See:
8606// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
8607// https://www.workofard.com/2017/07/ghash-for-low-end-cores/
8608// for more information.
8609FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
8610{
8611#if defined(__aarch64__) || defined(_M_ARM64)
8612 static const uint8_t shift_rows[] = {
8613 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8614 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8615 };
8616 static const uint8_t ror32by8[] = {
8617 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8618 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8619 };
8620
8621 uint8x16_t v;
8622 uint8x16_t w = vreinterpretq_u8_m128i(a);
8623
8624 /* shift rows */
8625 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8626
8627 /* sub bytes */
8628 // Here, we split the whole 256-byte S-box into four 64-byte tables and
8629 // look them up one after another. Each subsequent table covers the next
8630 // 64-byte slice of the S-box, so the index passed to `vqtbx4q_u8()` has to
8631 // be reduced by the same offset (0x40, 0x80, 0xc0) at which that table was
8632 // loaded.
8633 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
8634 // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))'
8635 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
8636 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
8637 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
8638
8639 /* mix columns */
8640 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8641 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8642 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8643
8644 /* add round key */
8645 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8646
8647#else /* ARMv7-A implementation for a table-based AES */
8648#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
8649 (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
8650 ((uint32_t) (b1) << 8) | (uint32_t) (b0))
8651// multiplying 'x' by 2 in GF(2^8)
8652#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
8653// multiplying 'x' by 3 in GF(2^8)
8654#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
8655#define SSE2NEON_AES_U0(p) \
8656 SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
8657#define SSE2NEON_AES_U1(p) \
8658 SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
8659#define SSE2NEON_AES_U2(p) \
8660 SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
8661#define SSE2NEON_AES_U3(p) \
8662 SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
8663
8664 // These tables combine sub_bytes() and mix_columns() for every possible
8665 // byte value; shift_rows() is handled by the indexing order used below.
8666 static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
8667 SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
8668 SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
8669 SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
8670 SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
8671 };
8672#undef SSE2NEON_AES_B2W
8673#undef SSE2NEON_AES_F2
8674#undef SSE2NEON_AES_F3
8675#undef SSE2NEON_AES_U0
8676#undef SSE2NEON_AES_U1
8677#undef SSE2NEON_AES_U2
8678#undef SSE2NEON_AES_U3
8679
8680 uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0]
8681 uint32_t x1 =
8682 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32]
8683 uint32_t x2 =
8684 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64]
8685 uint32_t x3 =
8686 _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96]
8687
8688 // finish the modulo addition step in mix_columns()
8689 __m128i out = _mm_set_epi32(
8690 (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
8691 aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
8692 (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
8693 aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
8694 (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
8695 aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
8696 (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
8697 aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
8698
8699 return _mm_xor_si128(out, RoundKey);
8700#endif
8701}
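// Usage sketch (editor's illustration, not part of sse2neon): a complete
// AES-128 block encryption expressed with the emulated SSE AES intrinsics.
// 'rk' is a hypothetical array assumed to hold the 11 expanded round keys.
#if 0
static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]);            /* initial AddRoundKey */
    for (int round = 1; round < 10; ++round)
        block = _mm_aesenc_si128(block, rk[round]); /* rounds 1..9 */
    return _mm_aesenclast_si128(block, rk[10]);     /* round 10: no MixColumns */
}
#endif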
8702
8703// Perform one round of an AES decryption flow on data (state) in a using the
8704// round key in RoundKey, and store the result in dst.
8705// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
8706FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
8707{
8708#if defined(__aarch64__) || defined(_M_ARM64)
8709 static const uint8_t inv_shift_rows[] = {
8710 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8711 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8712 };
8713 static const uint8_t ror32by8[] = {
8714 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8715 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8716 };
8717
8718 uint8x16_t v;
8719 uint8x16_t w = vreinterpretq_u8_m128i(a);
8720
8721 // inverse shift rows
8722 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8723
8724 // inverse sub bytes
8725 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
8726 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
8727 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
8728 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
8729
8730 // inverse mix columns
8731 // multiplying 'v' by 4 in GF(2^8)
8732 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8733 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8734 v ^= w;
8735 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8736
8737 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
8738 0x1b); // multiplying 'v' by 2 in GF(2^8)
8739 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8740 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8741
8742 // add round key
8743 return vreinterpretq_m128i_u8(w) ^ RoundKey;
8744
8745#else /* ARMv7-A NEON implementation */
8746 /* FIXME: optimize for NEON */
8747 uint8_t i, e, f, g, h, v[4][4];
8748 uint8_t *_a = (uint8_t *) &a;
8749 for (i = 0; i < 16; ++i) {
8750 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8751 }
8752
8753 // inverse mix columns
8754 for (i = 0; i < 4; ++i) {
8755 e = v[i][0];
8756 f = v[i][1];
8757 g = v[i][2];
8758 h = v[i][3];
8759
8760 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
8761 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
8762 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
8763 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
8764 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
8765 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
8766 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
8767 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
8768 }
8769
8770 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
8771#endif
8772}
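// Usage sketch (editor's illustration, not part of sse2neon): like its x86
// counterpart, _mm_aesdec_si128 implements the "equivalent inverse cipher",
// so the middle round keys of the encryption schedule must be passed through
// _mm_aesimc_si128 before use. 'rk' is a hypothetical array assumed to hold
// the 11 *encryption* round keys.
#if 0
static __m128i aes128_decrypt_block(__m128i block, const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[10]);
    for (int round = 9; round >= 1; --round)
        block = _mm_aesdec_si128(block, _mm_aesimc_si128(rk[round]));
    return _mm_aesdeclast_si128(block, rk[0]);
}
#endif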
8773
8774// Perform the last round of an AES encryption flow on data (state) in a using
8775// the round key in RoundKey, and store the result in dst.
8776// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
8777FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8778{
8779#if defined(__aarch64__) || defined(_M_ARM64)
8780 static const uint8_t shift_rows[] = {
8781 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
8782 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
8783 };
8784
8785 uint8x16_t v;
8786 uint8x16_t w = vreinterpretq_u8_m128i(a);
8787
8788 // shift rows
8789 w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
8790
8791 // sub bytes
8792 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
8793 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
8794 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
8795 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
8796
8797 // add round key
8798 return vreinterpretq_m128i_u8(v) ^ RoundKey;
8799
8800#else /* ARMv7-A implementation */
8801 uint8_t v[16] = {
8802 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
8803 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
8804 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
8805 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
8806 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
8807 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
8808 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
8809 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
8810 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
8811 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
8812 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
8813 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
8814 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
8815 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
8816 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
8817 _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
8818 };
8819
8820 return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
8821#endif
8822}
8823
8824// Perform the last round of an AES decryption flow on data (state) in a using
8825// the round key in RoundKey, and store the result in dst.
8826// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
8827FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
8828{
8829#if defined(__aarch64__) || defined(_M_ARM64)
8830 static const uint8_t inv_shift_rows[] = {
8831 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
8832 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
8833 };
8834
8835 uint8x16_t v;
8836 uint8x16_t w = vreinterpretq_u8_m128i(a);
8837
8838 // inverse shift rows
8839 w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));
8840
8841 // inverse sub bytes
8842 v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
8843 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
8844 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
8845 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);
8846
8847 // add round key
8848 return vreinterpretq_m128i_u8(v) ^ RoundKey;
8849
8850#else /* ARMv7-A NEON implementation */
8851 /* FIXME: optimize for NEON */
8852 uint8_t v[4][4];
8853 uint8_t *_a = (uint8_t *) &a;
8854 for (int i = 0; i < 16; ++i) {
8855 v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
8856 }
8857
8858 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
8859#endif
8860}
8861
8862// Perform the InvMixColumns transformation on a and store the result in dst.
8863// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
8864FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
8865{
8866#if defined(__aarch64__) || defined(_M_ARM64)
8867 static const uint8_t ror32by8[] = {
8868 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
8869 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
8870 };
8871 uint8x16_t v = vreinterpretq_u8_m128i(a);
8872 uint8x16_t w;
8873
8874 // multiplying 'v' by 4 in GF(2^8)
8875 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8876 w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
8877 v ^= w;
8878 v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);
8879
8880 // multiplying 'v' by 2 in GF(2^8)
8881 w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
8882 w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
8883 w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
8884 return vreinterpretq_m128i_u8(w);
8885
8886#else /* ARMv7-A NEON implementation */
8887 uint8_t i, e, f, g, h, v[4][4];
8888 vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
8889 for (i = 0; i < 4; ++i) {
8890 e = v[i][0];
8891 f = v[i][1];
8892 g = v[i][2];
8893 h = v[i][3];
8894
8895 v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
8896 SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
8897 v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
8898 SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
8899 v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
8900 SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
8901 v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
8902 SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
8903 }
8904
8905 return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
8906#endif
8907}
8908
8909// Assist in expanding the AES cipher key by computing steps towards generating
8910// a round key for encryption cipher using data from a and an 8-bit round
8911// constant specified in imm8, and store the result in dst.
8912// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
8913//
8914// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
8915// This instruction generates a round key for AES encryption. See
8916// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
8917// for details.
8918FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
8919{
8920#if defined(__aarch64__) || defined(_M_ARM64)
8921 uint8x16_t _a = vreinterpretq_u8_m128i(a);
8922 uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
8923 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
8924 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
8925 v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);
8926
8927 uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
8928 uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
8929 uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));
8930
8931 return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));
8932
8933#else /* ARMv7-A NEON implementation */
8934 uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
8935 uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
8936 for (int i = 0; i < 4; ++i) {
8937 ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
8938 ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
8939 }
8940 return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
8941 ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
8942#endif
8943}
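// Usage sketch (editor's illustration, not part of sse2neon): one AES-128
// key-schedule step built on _mm_aeskeygenassist_si128, following the
// well-known AES-NI key-expansion pattern. 'aes128_key_expand_step' is a
// hypothetical helper, not an sse2neon API.
#if 0
static __m128i aes128_key_expand_step(__m128i key, __m128i keygened)
{
    /* Broadcast the word holding ROT(SubBytes(X3)) ^ rcon. */
    keygened = _mm_shuffle_epi32(keygened, _MM_SHUFFLE(3, 3, 3, 3));
    /* Chain the previous key words: w[i] ^= w[i-1]. */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, keygened);
}
/* e.g. rk[1] = aes128_key_expand_step(rk[0],
 *              _mm_aeskeygenassist_si128(rk[0], 0x01)); */
#endif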
8944#undef SSE2NEON_AES_SBOX
8945#undef SSE2NEON_AES_RSBOX
8946
8947#if !defined(__aarch64__) && !defined(_M_ARM64)
8948#undef SSE2NEON_XT
8949#undef SSE2NEON_MULTIPLY
8950#endif
8951
8952#else /* __ARM_FEATURE_CRYPTO */
8953// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
8954// AESMC and then manually applying the real key as an xor operation. This
8955// unfortunately means an additional xor op; the compiler should be able to
8956// optimize this away for repeated calls however. See
8957// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
8958// for more details.
8959FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
8960{
8961 return vreinterpretq_m128i_u8(veorq_u8(
8962 vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8963 vreinterpretq_u8_m128i(b)));
8964}
8965
8966// Perform one round of an AES decryption flow on data (state) in a using the
8967// round key in RoundKey, and store the result in dst.
8968// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
8969FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
8970{
8971 return vreinterpretq_m128i_u8(veorq_u8(
8972 vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8973 vreinterpretq_u8_m128i(RoundKey)));
8974}
8975
8976// Perform the last round of an AES encryption flow on data (state) in a using
8977// the round key in RoundKey, and store the result in dst.
8978// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
8979FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
8980{
8981 return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
8982 vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
8983 RoundKey);
8984}
8985
8986// Perform the last round of an AES decryption flow on data (state) in a using
8987// the round key in RoundKey, and store the result in dst.
8988// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
8989FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
8990{
8991 return vreinterpretq_m128i_u8(
8992 veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
8993 vreinterpretq_u8_m128i(RoundKey)));
8994}
8995
8996// Perform the InvMixColumns transformation on a and store the result in dst.
8997// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
8998FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
8999{
9000 return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
9001}
9002
9003// Assist in expanding the AES cipher key by computing steps towards generating
9004// a round key for encryption cipher using data from a and an 8-bit round
9005// constant specified in imm8, and store the result in dst."
9006// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
9007FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
9008{
9009 // AESE does ShiftRows and SubBytes on A
9010 uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
9011
9012#ifndef _MSC_VER
9013 uint8x16_t dest = {
9014 // Undo ShiftRows step from AESE and extract X1 and X3
9015 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
9016 u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
9017 u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
9018 u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
9019 };
9020 uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
9021 return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
9022#else
9023 // We have to do this hack because MSVC strictly adheres to the C++
9024 // standard, in particular C++03 8.5.1 sub-section 15, which states that
9025 // unions must be initialized by their first member type.
9026
9027 // As per the Windows ARM64 ABI, it is always little endian, so this works
9028 __n128 dest{
9029 ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
9030 ((uint64_t) u8.n128_u8[0xE] << 16) |
9031 ((uint64_t) u8.n128_u8[0xB] << 24) |
9032 ((uint64_t) u8.n128_u8[0x1] << 32) |
9033 ((uint64_t) u8.n128_u8[0xE] << 40) |
9034 ((uint64_t) u8.n128_u8[0xB] << 48) |
9035 ((uint64_t) u8.n128_u8[0x4] << 56),
9036 ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
9037 ((uint64_t) u8.n128_u8[0x6] << 16) |
9038 ((uint64_t) u8.n128_u8[0x3] << 24) |
9039 ((uint64_t) u8.n128_u8[0x9] << 32) |
9040 ((uint64_t) u8.n128_u8[0x6] << 40) |
9041 ((uint64_t) u8.n128_u8[0x3] << 48) |
9042 ((uint64_t) u8.n128_u8[0xC] << 56)};
9043
9044 dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
9045 dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
9046
9047 return dest;
9048#endif
9049}
9050#endif
9051
9052/* Others */
9053
9054// Perform a carry-less multiplication of two 64-bit integers, selected from a
9055// and b according to imm8, and store the results in dst.
9056// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
9057FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
9058{
9059 uint64x2_t a = vreinterpretq_u64_m128i(_a);
9060 uint64x2_t b = vreinterpretq_u64_m128i(_b);
9061 switch (imm & 0x11) {
9062 case 0x00:
9063 return vreinterpretq_m128i_u64(
9064 _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
9065 case 0x01:
9066 return vreinterpretq_m128i_u64(
9067 _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
9068 case 0x10:
9069 return vreinterpretq_m128i_u64(
9070 _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
9071 case 0x11:
9072 return vreinterpretq_m128i_u64(
9073 _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
9074 default:
9075 abort();
9076 }
9077}
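// Usage sketch (editor's illustration, not part of sse2neon): bit 0 of imm8
// selects which 64-bit half of 'a' enters the carry-less multiply and bit 4
// selects the half of 'b', matching the switch above.
#if 0
static void clmul_halves(__m128i a, __m128i b, __m128i out[4])
{
    out[0] = _mm_clmulepi64_si128(a, b, 0x00); /* a[63:0]   x b[63:0]   */
    out[1] = _mm_clmulepi64_si128(a, b, 0x01); /* a[127:64] x b[63:0]   */
    out[2] = _mm_clmulepi64_si128(a, b, 0x10); /* a[63:0]   x b[127:64] */
    out[3] = _mm_clmulepi64_si128(a, b, 0x11); /* a[127:64] x b[127:64] */
}
#endif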
9078
9079FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
9080{
9081 union {
9082 fpcr_bitfield field;
9083#if defined(__aarch64__) || defined(_M_ARM64)
9084 uint64_t value;
9085#else
9086 uint32_t value;
9087#endif
9088 } r;
9089
9090#if defined(__aarch64__) || defined(_M_ARM64)
9091 r.value = _sse2neon_get_fpcr();
9092#else
9093 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
9094#endif
9095
9096 return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
9097}
9098
9099// Count the number of bits set to 1 in unsigned 32-bit integer a, and
9100// return that count in dst.
9101// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
9102FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
9103{
9104#if defined(__aarch64__) || defined(_M_ARM64)
9105#if __has_builtin(__builtin_popcount)
9106 return __builtin_popcount(a);
9107#elif defined(_MSC_VER)
9108 return _CountOneBits(a);
9109#else
9110 return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
9111#endif
9112#else
9113 uint32_t count = 0;
9114 uint8x8_t input_val, count8x8_val;
9115 uint16x4_t count16x4_val;
9116 uint32x2_t count32x2_val;
9117
9118 input_val = vcreate_u8((uint64_t) a); // avoid reading 8 bytes from the 4-byte 'a'
9119 count8x8_val = vcnt_u8(input_val);
9120 count16x4_val = vpaddl_u8(count8x8_val);
9121 count32x2_val = vpaddl_u16(count16x4_val);
9122
9123 vst1_lane_u32(&count, count32x2_val, 0); // store lane 0 only into 'count'
9124 return count;
9125#endif
9126}
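// Quick self-check (editor's illustration, not part of sse2neon).
#if 0
#include <assert.h>
static void popcnt_demo(void)
{
    assert(_mm_popcnt_u32(0x00000000u) == 0);
    assert(_mm_popcnt_u32(0xF0F0F0F0u) == 16); /* 16 bits set */
    assert(_mm_popcnt_u32(0xFFFFFFFFu) == 32);
}
#endif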
9127
9128// Count the number of bits set to 1 in unsigned 64-bit integer a, and
9129// return that count in dst.
9130// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
9131FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
9132{
9133#if defined(__aarch64__) || defined(_M_ARM64)
9134#if __has_builtin(__builtin_popcountll)
9135 return __builtin_popcountll(a);
9136#elif defined(_MSC_VER)
9137 return _CountOneBits64(a);
9138#else
9139 return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
9140#endif
9141#else
9142 uint64_t count = 0;
9143 uint8x8_t input_val, count8x8_val;
9144 uint16x4_t count16x4_val;
9145 uint32x2_t count32x2_val;
9146 uint64x1_t count64x1_val;
9147
9148 input_val = vld1_u8((uint8_t *) &a);
9149 count8x8_val = vcnt_u8(input_val);
9150 count16x4_val = vpaddl_u8(count8x8_val);
9151 count32x2_val = vpaddl_u16(count16x4_val);
9152 count64x1_val = vpaddl_u32(count32x2_val);
9153 vst1_u64(&count, count64x1_val);
9154 return count;
9155#endif
9156}
9157
9158FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
9159{
9160 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
9161 // regardless of the value of the FZ bit.
9162 union {
9163 fpcr_bitfield field;
9164#if defined(__aarch64__) || defined(_M_ARM64)
9165 uint64_t value;
9166#else
9167 uint32_t value;
9168#endif
9169 } r;
9170
9171#if defined(__aarch64__) || defined(_M_ARM64)
9172 r.value = _sse2neon_get_fpcr();
9173#else
9174 __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
9175#endif
9176
9177 r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
9178
9179#if defined(__aarch64__) || defined(_M_ARM64)
9180 _sse2neon_set_fpcr(r.value);
9181#else
9182 __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
9183#endif
9184}
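// Usage sketch (editor's illustration, not part of sse2neon): flush denormal
// inputs to zero, mirroring the x86 DAZ bit in MXCSR, then read the setting
// back. Calls the helpers defined directly above.
#if 0
static void denormals_zero_demo(void)
{
    _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON);
    unsigned int mode = _sse2neon_mm_get_denormals_zero_mode();
    (void) mode; /* expected: _MM_DENORMALS_ZERO_ON */
}
#endif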
9185
9186// Return the current 64-bit value of the processor's time-stamp counter.
9187// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
9188FORCE_INLINE uint64_t _rdtsc(void)
9189{
9190#if defined(__aarch64__) || defined(_M_ARM64)
9191 uint64_t val;
9192
9193 /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
9194 * system counter is at least 56 bits wide; from Armv8.6, the counter
9195 * must be 64 bits wide. So the system counter could be narrower than
9196 * 64 bits; in that case the kernel reports it with the
9197 * 'cap_user_time_short' flag set.
9198 */
9199#if defined(_MSC_VER)
9200 val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
9201#else
9202 __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
9203#endif
9204
9205 return val;
9206#else
9207 uint32_t pmccntr, pmuseren, pmcntenset;
9208 // Read the user mode Performance Monitoring Unit (PMU)
9209 // User Enable Register (PMUSERENR) access permissions.
9210 __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
9211 if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
9212 __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
9213 if (pmcntenset & 0x80000000UL) { // Is it counting?
9214 __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
9215 // The counter is set up to count every 64th cycle
9216 return (uint64_t) (pmccntr) << 6;
9217 }
9218 }
9219
9220 // Fallback to syscall as we can't enable PMUSERENR in user mode.
9221 struct timeval tv;
9222 gettimeofday(&tv, NULL);
9223 return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
9224#endif
9225}
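// Usage sketch (editor's illustration, not part of sse2neon): coarse interval
// measurement. On AArch64 this reads the generic timer (CNTVCT_EL0), so the
// delta is in timer ticks rather than core clock cycles.
#if 0
static uint64_t measure_ticks(void (*work)(void))
{
    uint64_t start = _rdtsc();
    work();
    return _rdtsc() - start;
}
#endif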
9226
9227#if defined(__GNUC__) || defined(__clang__)
9228#pragma pop_macro("ALIGN_STRUCT")
9229#pragma pop_macro("FORCE_INLINE")
9230#endif
9231
9232#if defined(__GNUC__) && !defined(__clang__)
9233#pragma GCC pop_options
9234#endif
9235
9236#endif