Grok 10.0.5
highway.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// This include guard is checked by foreach_target, so avoid the usual _H_
17// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
18// after/outside this include guard.
19#ifndef HWY_HIGHWAY_INCLUDED
20#define HWY_HIGHWAY_INCLUDED
21
22// Main header required before using vector types.
23
24#include "hwy/base.h"
25#include "hwy/targets.h"
26
27namespace hwy {
28
29// API version (https://semver.org/); keep in sync with CMakeLists.txt.
30#define HWY_MAJOR 1
31#define HWY_MINOR 0
32#define HWY_PATCH 2
33
34//------------------------------------------------------------------------------
35// Shorthand for tags (defined in shared-inl.h) used to select overloads.
36// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
37// HWY_CAPPED(T, N).
38
39// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
40// registers in the group, and is ignored on targets that do not support groups.
41#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
42#define HWY_FULL2(T, LMUL) \
43 hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
44#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
45// Workaround for MSVC grouping __VA_ARGS__ into a single argument
46#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
47// Trailing comma avoids -pedantic false alarm
48#define HWY_CHOOSE_FULL(...) \
49 HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
50#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
51
52// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
53#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
54
55//------------------------------------------------------------------------------
56// Export user functions for static/dynamic dispatch
57
58// Evaluates to 0 inside a translation unit if it is generating anything but the
59// static target (the last one if multiple targets are enabled). Used to prevent
60// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
61// compile once anyway, so this is 1 unless it is or has been included.
62#ifndef HWY_ONCE
63#define HWY_ONCE 1
64#endif
65
66// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
67// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
68// defined), and can be used to deduce the return type of Choose*.
69#if HWY_STATIC_TARGET == HWY_SCALAR
70#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
71#elif HWY_STATIC_TARGET == HWY_EMU128
72#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
73#elif HWY_STATIC_TARGET == HWY_RVV
74#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
75#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
76#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
77#elif HWY_STATIC_TARGET == HWY_WASM
78#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
79#elif HWY_STATIC_TARGET == HWY_NEON
80#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
81#elif HWY_STATIC_TARGET == HWY_SVE
82#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
83#elif HWY_STATIC_TARGET == HWY_SVE2
84#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
85#elif HWY_STATIC_TARGET == HWY_SVE_256
86#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
87#elif HWY_STATIC_TARGET == HWY_SVE2_128
88#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
89#elif HWY_STATIC_TARGET == HWY_PPC8
90#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
91#elif HWY_STATIC_TARGET == HWY_SSSE3
92#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
93#elif HWY_STATIC_TARGET == HWY_SSE4
94#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
95#elif HWY_STATIC_TARGET == HWY_AVX2
96#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
97#elif HWY_STATIC_TARGET == HWY_AVX3
98#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
99#elif HWY_STATIC_TARGET == HWY_AVX3_DL
100#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
101#endif
102
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
105#if HWY_TARGETS & HWY_EMU128
106#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
107#elif HWY_TARGETS & HWY_SCALAR
108#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
109#else
110// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
111// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
112#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
113#endif
114
115#if HWY_TARGETS & HWY_WASM_EMU256
116#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
117#else
118#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
119#endif
120
121#if HWY_TARGETS & HWY_WASM
122#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
123#else
124#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
125#endif
126
127#if HWY_TARGETS & HWY_RVV
128#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
129#else
130#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
131#endif
132
133#if HWY_TARGETS & HWY_NEON
134#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
135#else
136#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
137#endif
138
139#if HWY_TARGETS & HWY_SVE
140#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
141#else
142#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
143#endif
144
145#if HWY_TARGETS & HWY_SVE2
146#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
147#else
148#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
149#endif
150
151#if HWY_TARGETS & HWY_SVE_256
152#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
153#else
154#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
155#endif
156
157#if HWY_TARGETS & HWY_SVE2_128
158#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
159#else
160#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
161#endif
162
// Expands to the address of the PPC8 implementation of FUNC_NAME when that
// target is among the compiled HWY_TARGETS, otherwise to nullptr. Both branches
// must define the same macro name (the enabled branch was previously misspelled
// HWY_CHOOSE_PCC8, which left HWY_CHOOSE_PPC8 undefined when PPC8 was enabled).
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif
168
169#if HWY_TARGETS & HWY_SSSE3
170#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
171#else
172#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
173#endif
174
175#if HWY_TARGETS & HWY_SSE4
176#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
177#else
178#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
179#endif
180
181#if HWY_TARGETS & HWY_AVX2
182#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
183#else
184#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
185#endif
186
187#if HWY_TARGETS & HWY_AVX3
188#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
189#else
190#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
191#endif
192
193#if HWY_TARGETS & HWY_AVX3_DL
194#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
195#else
196#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
197#endif
198
199// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
200// apparently cannot be an array. Use a function pointer instead, which has the
201// disadvantage that we call the static (not best) target on the first call to
202// any HWY_DYNAMIC_DISPATCH.
203#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
204#define HWY_DISPATCH_WORKAROUND 1
205#else
206#define HWY_DISPATCH_WORKAROUND 0
207#endif
208
209// Provides a static member function which is what is called during the first
210// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
211// this function are the first entry in the tables created by HWY_EXPORT.
212template <typename RetType, typename... Args>
214 public:
215 typedef RetType(FunctionType)(Args...);
216
217#if HWY_DISPATCH_WORKAROUND
218 template <FunctionType* const func>
219 static RetType ChooseAndCall(Args... args) {
220 ChosenTarget& chosen_target = GetChosenTarget();
221 chosen_target.Update(SupportedTargets());
222 return (*func)(args...);
223 }
224#else
225 // A template function that when instantiated has the same signature as the
226 // function being called. This function initializes the bit array of targets
227 // supported by the current CPU and then calls the appropriate entry within
228 // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
229 // exported functions, even those defined by different translation units,
230 // will dispatch directly to the best available target.
231 template <FunctionType* const table[]>
232 static RetType ChooseAndCall(Args... args) {
233 ChosenTarget& chosen_target = GetChosenTarget();
234 chosen_target.Update(SupportedTargets());
235 return (table[chosen_target.GetIndex()])(args...);
236 }
237#endif // HWY_DISPATCH_WORKAROUND
238};
239
240// Used to deduce the template parameters RetType and Args from a function.
241template <typename RetType, typename... Args>
242FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
243 return FunctionCache<RetType, Args...>();
244}
245
246#define HWY_DISPATCH_TABLE(FUNC_NAME) \
247 HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
248
249// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
250// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
251// static array must be defined at the same namespace level as the function
252// it is exporting.
253// After being exported, it can be called from other parts of the same source
254// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
255// like in the following example:
256//
257// #include "hwy/highway.h"
258// HWY_BEFORE_NAMESPACE();
259// namespace skeleton {
260// namespace HWY_NAMESPACE {
261//
262// void MyFunction(int a, char b, const char* c) { ... }
263//
264// // NOLINTNEXTLINE(google-readability-namespace-comments)
265// } // namespace HWY_NAMESPACE
266// } // namespace skeleton
267// HWY_AFTER_NAMESPACE();
268//
269// namespace skeleton {
270// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
271//
272// void MyFunction(int a, char b, const char* c) {
273// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
274// }
275// } // namespace skeleton
276//
277
278#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
279
280// Simplified version for IDE or the dynamic dispatch case with only one target.
281// This case still uses a table, although of a single element, to provide the
282// same compile error conditions as with the dynamic dispatch case when multiple
283// targets are being compiled.
284#define HWY_EXPORT(FUNC_NAME) \
285 HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
286 HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
287#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
288
289#else
290
// Simplified version for MSVC 2017: the first table entry instantiates
// ChooseAndCall with a function pointer (the static target) instead of the
// dispatch table itself.
292#if HWY_DISPATCH_WORKAROUND
293
294#define HWY_EXPORT(FUNC_NAME) \
295 static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
296 FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
297 /* The first entry in the table initializes the global cache and \
298 * calls the function from HWY_STATIC_TARGET. */ \
299 &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
300 FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
301 HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
302 HWY_CHOOSE_FALLBACK(FUNC_NAME), \
303 }
304
305#else
306
307// Dynamic dispatch case with one entry per dynamic target plus the fallback
308// target and the initialization wrapper.
309#define HWY_EXPORT(FUNC_NAME) \
310 static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
311 FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
312 /* The first entry in the table initializes the global cache and \
313 * calls the appropriate function. */ \
314 &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
315 FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
316 HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
317 HWY_CHOOSE_FALLBACK(FUNC_NAME), \
318 }
319
320#endif // HWY_DISPATCH_WORKAROUND
321
322#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
323 (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
324
325#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
326
327// DEPRECATED names; please use HWY_HAVE_* instead.
328#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
329#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
330#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
331
332} // namespace hwy
333
334#endif // HWY_HIGHWAY_INCLUDED
335
336//------------------------------------------------------------------------------
337
338// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
339// to include them once per target, which is ensured by the toggle check.
340// Because ops/*.h are included under it, they do not need their own guard.
341#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
342#ifdef HWY_HIGHWAY_PER_TARGET
343#undef HWY_HIGHWAY_PER_TARGET
344#else
345#define HWY_HIGHWAY_PER_TARGET
346#endif
347
348// These define ops inside namespace hwy::HWY_NAMESPACE.
349#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
350#include "hwy/ops/x86_128-inl.h"
351#elif HWY_TARGET == HWY_AVX2
352#include "hwy/ops/x86_256-inl.h"
353#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
354#include "hwy/ops/x86_512-inl.h"
355#elif HWY_TARGET == HWY_PPC8
356#error "PPC is not yet supported"
357#elif HWY_TARGET == HWY_NEON
358#include "hwy/ops/arm_neon-inl.h"
359#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
360 HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
361#include "hwy/ops/arm_sve-inl.h"
362#elif HWY_TARGET == HWY_WASM_EMU256
363#include "hwy/ops/wasm_256-inl.h"
364#elif HWY_TARGET == HWY_WASM
365#include "hwy/ops/wasm_128-inl.h"
366#elif HWY_TARGET == HWY_RVV
367#include "hwy/ops/rvv-inl.h"
368#elif HWY_TARGET == HWY_EMU128
369#include "hwy/ops/emu128-inl.h"
370#elif HWY_TARGET == HWY_SCALAR
371#include "hwy/ops/scalar-inl.h"
372#else
373#pragma message("HWY_TARGET does not match any known target")
374#endif // HWY_TARGET
375
377
378#endif // HWY_HIGHWAY_PER_TARGET
Definition aligned_allocator.h:27
FunctionCache< RetType, Args... > DeduceFunctionCache(RetType(*)(Args...))
Definition highway.h:242
HWY_DLLEXPORT ChosenTarget & GetChosenTarget()
HWY_DLLEXPORT int64_t SupportedTargets()
Definition targets.h:278
size_t HWY_INLINE GetIndex() const
Definition targets.h:301
void Update(int64_t targets)
Definition targets.h:282
Definition highway.h:213
RetType() FunctionType(Args...)
Definition highway.h:215
static RetType ChooseAndCall(Args... args)
Definition highway.h:232