Grok 10.0.5
ops/shared-inl.h
Go to the documentation of this file.
1// Copyright 2020 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target definitions shared by ops/*.h and user code.
17
18// We are covered by the highway.h include guard, but generic_ops-inl.h
19// includes this again #if HWY_IDE.
20#if defined(HIGHWAY_HWY_OPS_SHARED_TOGGLE) == \
21 defined(HWY_TARGET_TOGGLE)
22#ifdef HIGHWAY_HWY_OPS_SHARED_TOGGLE
23#undef HIGHWAY_HWY_OPS_SHARED_TOGGLE
24#else
25#define HIGHWAY_HWY_OPS_SHARED_TOGGLE
26#endif
27
28#ifndef HWY_NO_LIBCXX
29#include <math.h>
30#endif
31
32#include "hwy/base.h"
33
34// Separate header because foreach_target.h re-enables its include guard.
36
37// Relies on the external include guard in highway.h.
39namespace hwy {
40namespace HWY_NAMESPACE {
41
// Highway operations are implemented as overloaded functions selected using an
// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
// shift count applied to scalable vectors. Instead of referring to Simd<>
// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
// full vector, or fractions/groups if the argument is negative/positive),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
// cap. For constexpr-size vectors, N is the actual number of lanes. This
// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
template <typename Lane, size_t N, int kPow2>
struct Simd {
  constexpr Simd() = default;
  using T = Lane;
  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");

  // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
  // warns when using enums and non-enums in the same expression. Cannot be
  // static constexpr function (another MSVC limitation).
  static constexpr size_t kPrivateN = N;
  static constexpr int kPrivatePow2 = kPow2;

  // Number of NewT lanes that occupy the same number of bytes as N lanes of T.
  template <typename NewT>
  static constexpr size_t NewN() {
    // Round up to correctly handle scalars with N=1.
    return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
  }

#if HWY_HAVE_SCALABLE
  // Signed log2 of sizeof(NewT) / sizeof(T): positive when NewT is wider than
  // T, otherwise zero or negative.
  template <typename NewT>
  static constexpr int Pow2Ratio() {
    return (sizeof(NewT) > sizeof(T))
               ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
               : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
  }
#endif

  // Widening/narrowing ops change the number of lanes and/or their type.
  // To initialize such vectors, we need the corresponding tag types:

  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
#if HWY_HAVE_SCALABLE
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
#else
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2>;
#endif

  // Change lane type while keeping the same vector size, e.g. for MulEven.
  template <typename NewT>
  using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;

  // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
  // Round up to correctly handle scalars with N=1.
#if HWY_HAVE_SCALABLE
  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
#else
  using Half = Simd<T, (N + 1) / 2, kPow2>;
#endif

  // Twice the lanes while keeping the same lane type, e.g. for Combine.
#if HWY_HAVE_SCALABLE
  using Twice = Simd<T, 2 * N, kPow2 + 1>;
#else
  using Twice = Simd<T, 2 * N, kPow2>;
#endif
};
111
112namespace detail {
113
114template <typename T, size_t N, int kPow2>
115constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
116 return N == HWY_LANES(T) && kPow2 == 0;
117}
118
// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
// Positive shifts only multiply N on RVV; on other targets they leave N
// unchanged.
constexpr size_t ScaleByPower(size_t N, int pow2) {
#if HWY_TARGET == HWY_RVV
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#else
  return pow2 >= 0 ? N : (N >> (-pow2));
#endif
}
130
131// Struct wrappers enable validation of arguments via static_assert.
132template <typename T, int kPow2>
134 static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
135#if HWY_TARGET == HWY_RVV
136 // Only RVV supports register groups.
137 using type = Simd<T, HWY_LANES(T), kPow2>;
138#elif HWY_HAVE_SCALABLE
139 // For SVE[2], only allow full or fractions.
140 using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
141#elif HWY_TARGET == HWY_SCALAR
142 using type = Simd<T, /*N=*/1, 0>;
143#else
144 // Only allow full or fractions.
145 using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
146#endif
147};
148
149template <typename T, size_t kLimit>
151 static_assert(kLimit != 0, "Does not make sense to have zero lanes");
152 // Safely handle non-power-of-two inputs by rounding down, which is allowed by
153 // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
154 static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
155 using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
156};
157
158template <typename T, size_t kNumLanes>
160 static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
161 static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
163};
164
165} // namespace detail
166
167// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
168// e.g. 1D loops where the application does not care about the vector size) or a
169// fraction/multiple of one. Multiples are the same as full vectors for all
170// targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
171// value of type promotion and demotion.
172template <typename T, int kPow2 = 0>
174
175// Alias for a tag describing a vector with *up to* kLimit active lanes, even on
176// targets with scalable vectors and HWY_SCALAR. The runtime lane count
177// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
178// typically used for 1D loops with a relatively low application-defined upper
179// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
180// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
181// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
182// this would enable vector-length-agnostic loops using ScalableTag).
183template <typename T, size_t kLimit>
185
186// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
187// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
188// two not exceeding `HWY_LANES(T)`.
189//
190// NOTE: if the application does not need to support HWY_SCALAR (+), use this
191// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
192// This is useful for data structures that rely on exactly 128-bit SIMD, but
193// these are discouraged because they cannot benefit from wider vectors.
194// Instead, applications would ideally define a larger problem size and loop
195// over it with the (unknown size) vectors from ScalableTag.
196//
197// + e.g. if the baseline is known to support SIMD, or the application requires
198// ops such as TableLookupBytes not supported by HWY_SCALAR.
199template <typename T, size_t kNumLanes>
201
// Lane type of the tag D, i.e. its nested `T` typedef.
template <class D>
using TFromD = typename D::T;
204
205// Tag for the same number of lanes as D, but with the LaneType T.
206template <class T, class D>
207using Rebind = typename D::template Rebind<T>;
208
209template <class D>
211template <class D>
213template <class D>
215
216// Tag for the same total size as D, but with the LaneType T.
217template <class T, class D>
218using Repartition = typename D::template Repartition<T>;
219
220template <class D>
222template <class D>
224
// Tag for the same lane type as D, but half the lanes.
template <class D>
using Half = typename D::Half;

// Tag for the same lane type as D, but twice the lanes.
template <class D>
using Twice = typename D::Twice;
232
233template <typename T>
234using Full16 = Simd<T, 2 / sizeof(T), 0>;
235
236template <typename T>
237using Full32 = Simd<T, 4 / sizeof(T), 0>;
238
239template <typename T>
240using Full64 = Simd<T, 8 / sizeof(T), 0>;
241
242template <typename T>
243using Full128 = Simd<T, 16 / sizeof(T), 0>;
244
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_LANE_SIZE_ONE_OF_D(D, bit_array) \
  HWY_IF_LANE_SIZE_ONE_OF(TFromD<D>, bit_array)

// MSVC workaround: use PrivateN directly instead of MaxLanes.
#define HWY_IF_LT128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
#define HWY_IF_GE128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr

// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
#define HWY_IF_LANE_SIZE_ONE_OF_V(V, bit_array) \
  HWY_IF_LANE_SIZE_ONE_OF(TFromV<V>, bit_array)
269
270template <class D>
271HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
272 return D::kPrivatePow2;
273}
274
275// MSVC requires the explicit <D>.
276#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
277
278#if HWY_HAVE_SCALABLE
279
280// Upper bound on the number of lanes. Intended for template arguments and
281// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
282// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
283// actual size for allocating storage. WARNING: MSVC might not be able to deduce
284// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
285template <class D>
286HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
287 return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
288 D::kPrivatePow2);
289}
290
291#else
292// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
293// is not an option, nor does a member function work.
294template <class D>
296 return D::kPrivateN;
297}
298
299// (Potentially) non-constant actual size of the vector at runtime, subject to
300// the limit imposed by the Simd. Useful for advancing loop counters.
301// Targets with scalable vectors define this themselves.
302template <typename T, size_t N, int kPow2>
306
307#endif // !HWY_HAVE_SCALABLE
308
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else
template <class V>
using VecArg = V;
#endif
326
327// NOLINTNEXTLINE(google-readability-namespace-comments)
328} // namespace HWY_NAMESPACE
329} // namespace hwy
331
332#endif // HIGHWAY_HWY_OPS_SHARED_TOGGLE
#define HWY_MIN(a, b)
Definition base.h:134
#define HWY_INLINE
Definition base.h:70
#define HWY_MAYBE_UNUSED
Definition base.h:82
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition ops/shared-inl.h:123
constexpr bool IsFull(Simd< T, N, kPow2 >)
Definition ops/shared-inl.h:115
V VecArg
Definition ops/shared-inl.h:324
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:221
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:295
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition ops/shared-inl.h:184
Rebind< MakeFloat< TFromD< D > >, D > RebindToFloat
Definition ops/shared-inl.h:214
typename D::Twice Twice
Definition ops/shared-inl.h:231
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:210
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
Repartition< MakeNarrow< TFromD< D > >, D > RepartitionToNarrow
Definition ops/shared-inl.h:223
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:173
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:207
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D)
Definition ops/shared-inl.h:271
typename detail::FixedTagChecker< T, kNumLanes >::type FixedTag
Definition ops/shared-inl.h:200
typename D::Half Half
Definition ops/shared-inl.h:227
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:218
N
Definition rvv-inl.h:1998
typename D::T TFromD
Definition ops/shared-inl.h:203
Definition aligned_allocator.h:27
constexpr size_t FloorLog2(TI x)
Definition base.h:892
constexpr size_t CeilLog2(TI x)
Definition base.h:899
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
#define HWY_LANES(T)
Definition set_macros-inl.h:85
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
Definition ops/shared-inl.h:52
constexpr Simd()=default
Simd< NewT, N, kPow2 > Rebind
Definition ops/shared-inl.h:87
static constexpr size_t NewN()
Definition ops/shared-inl.h:64
static constexpr int kPrivatePow2
Definition ops/shared-inl.h:61
static constexpr size_t kPrivateN
Definition ops/shared-inl.h:60
Lane T
Definition ops/shared-inl.h:54
Definition ops/shared-inl.h:150
static constexpr size_t kLimitPow2
Definition ops/shared-inl.h:154
Definition ops/shared-inl.h:159
Definition ops/shared-inl.h:133