Grok 10.0.5
transform-inl.h
Go to the documentation of this file.
1// Copyright 2022 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target include guard
17#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE)
19#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
20#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
21#else
22#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
23#endif
24
25#include "hwy/highway.h"
26
28namespace hwy {
29namespace HWY_NAMESPACE {
30
31// These functions avoid having to write a loop plus remainder handling in the
32// (unfortunately still common) case where arrays are not aligned/padded. If the
33// inputs are known to be aligned/padded, it is more efficient to write a single
34// loop using Load(). We do not provide a TransformAlignedPadded because it
35// would be more verbose than such a loop.
36//
37// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
38// generic lambda if using C++14. Due to apparent limitations of Clang on
39// Windows, it is currently necessary to add HWY_ATTR before the opening { of
40// the lambda to avoid errors about "always_inline function .. requires target".
41//
42// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
43// we use `MaskedLoad` and `BlendedStore` to read/write the final partial
44// vector.
45
46// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
47// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
48// the value of its lane i is i, and increases by `Lanes(d)` after every call.
49// Note that some of these indices may be `>= count`, but the elements that
50// `func` returns in those lanes will not be written to `out`.
51template <class D, class Func, typename T = TFromD<D>>
52void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
53 const RebindToUnsigned<D> du;
54 using TU = TFromD<decltype(du)>;
55 const size_t N = Lanes(d);
56
57 size_t idx = 0;
58 Vec<decltype(du)> vidx = Iota(du, 0);
59 for (; idx + N <= count; idx += N) {
60 StoreU(func(d, vidx), d, out + idx);
61 vidx = Add(vidx, Set(du, static_cast<TU>(N)));
62 }
63
64 // `count` was a multiple of the vector length `N`: already done.
65 if (HWY_UNLIKELY(idx == count)) return;
66
67#if HWY_MEM_OPS_MIGHT_FAULT
68 // Proceed one by one.
69 const CappedTag<T, 1> d1;
70 const RebindToUnsigned<decltype(d1)> du1;
71 for (; idx < count; ++idx) {
72 StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
73 }
74#else
75 const size_t remaining = count - idx;
76 HWY_DASSERT(0 != remaining && remaining < N);
77 const Mask<D> mask = FirstN(d, remaining);
78 BlendedStore(func(d, vidx), mask, d, out + idx);
79#endif
80}
81
82// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
83// array elements by a constant.
84template <class D, class Func, typename T = TFromD<D>>
85void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
86 const size_t N = Lanes(d);
87
88 size_t idx = 0;
89 for (; idx + N <= count; idx += N) {
90 const Vec<D> v = LoadU(d, inout + idx);
91 StoreU(func(d, v), d, inout + idx);
92 }
93
94 // `count` was a multiple of the vector length `N`: already done.
95 if (HWY_UNLIKELY(idx == count)) return;
96
97#if HWY_MEM_OPS_MIGHT_FAULT
98 // Proceed one by one.
99 const CappedTag<T, 1> d1;
100 for (; idx < count; ++idx) {
101 using V1 = Vec<decltype(d1)>;
102 const V1 v = LoadU(d1, inout + idx);
103 StoreU(func(d1, v), d1, inout + idx);
104 }
105#else
106 const size_t remaining = count - idx;
107 HWY_DASSERT(0 != remaining && remaining < N);
108 const Mask<D> mask = FirstN(d, remaining);
109 const Vec<D> v = MaskedLoad(mask, d, inout + idx);
110 BlendedStore(func(d, v), mask, d, inout + idx);
111#endif
112}
113
114// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
115// multiplying array elements by those of another array.
116template <class D, class Func, typename T = TFromD<D>>
117void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
118 const T* HWY_RESTRICT in1, const Func& func) {
119 const size_t N = Lanes(d);
120
121 size_t idx = 0;
122 for (; idx + N <= count; idx += N) {
123 const Vec<D> v = LoadU(d, inout + idx);
124 const Vec<D> v1 = LoadU(d, in1 + idx);
125 StoreU(func(d, v, v1), d, inout + idx);
126 }
127
128 // `count` was a multiple of the vector length `N`: already done.
129 if (HWY_UNLIKELY(idx == count)) return;
130
131#if HWY_MEM_OPS_MIGHT_FAULT
132 // Proceed one by one.
133 const CappedTag<T, 1> d1;
134 for (; idx < count; ++idx) {
135 using V1 = Vec<decltype(d1)>;
136 const V1 v = LoadU(d1, inout + idx);
137 const V1 v1 = LoadU(d1, in1 + idx);
138 StoreU(func(d1, v, v1), d1, inout + idx);
139 }
140#else
141 const size_t remaining = count - idx;
142 HWY_DASSERT(0 != remaining && remaining < N);
143 const Mask<D> mask = FirstN(d, remaining);
144 const Vec<D> v = MaskedLoad(mask, d, inout + idx);
145 const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
146 BlendedStore(func(d, v, v1), mask, d, inout + idx);
147#endif
148}
149
150// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
151// usage: FMA of elements from three arrays, stored into the first array.
152template <class D, class Func, typename T = TFromD<D>>
153void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
154 const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
155 const Func& func) {
156 const size_t N = Lanes(d);
157
158 size_t idx = 0;
159 for (; idx + N <= count; idx += N) {
160 const Vec<D> v = LoadU(d, inout + idx);
161 const Vec<D> v1 = LoadU(d, in1 + idx);
162 const Vec<D> v2 = LoadU(d, in2 + idx);
163 StoreU(func(d, v, v1, v2), d, inout + idx);
164 }
165
166 // `count` was a multiple of the vector length `N`: already done.
167 if (HWY_UNLIKELY(idx == count)) return;
168
169#if HWY_MEM_OPS_MIGHT_FAULT
170 // Proceed one by one.
171 const CappedTag<T, 1> d1;
172 for (; idx < count; ++idx) {
173 using V1 = Vec<decltype(d1)>;
174 const V1 v = LoadU(d1, inout + idx);
175 const V1 v1 = LoadU(d1, in1 + idx);
176 const V1 v2 = LoadU(d1, in2 + idx);
177 StoreU(func(d1, v, v1, v2), d1, inout + idx);
178 }
179#else
180 const size_t remaining = count - idx;
181 HWY_DASSERT(0 != remaining && remaining < N);
182 const Mask<D> mask = FirstN(d, remaining);
183 const Vec<D> v = MaskedLoad(mask, d, inout + idx);
184 const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
185 const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
186 BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
187#endif
188}
189
190template <class D, typename T = TFromD<D>>
191void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
192 const size_t N = Lanes(d);
193 const Vec<D> old_v = Set(d, old_t);
194 const Vec<D> new_v = Set(d, new_t);
195
196 size_t idx = 0;
197 for (; idx + N <= count; idx += N) {
198 Vec<D> v = LoadU(d, inout + idx);
199 StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
200 }
201
202 // `count` was a multiple of the vector length `N`: already done.
203 if (HWY_UNLIKELY(idx == count)) return;
204
205#if HWY_MEM_OPS_MIGHT_FAULT
206 // Proceed one by one.
207 const CappedTag<T, 1> d1;
208 const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
209 const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
210 for (; idx < count; ++idx) {
211 using V1 = Vec<decltype(d1)>;
212 const V1 v1 = LoadU(d1, inout + idx);
213 StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
214 }
215#else
216 const size_t remaining = count - idx;
217 HWY_DASSERT(0 != remaining && remaining < N);
218 const Mask<D> mask = FirstN(d, remaining);
219 const Vec<D> v = MaskedLoad(mask, d, inout + idx);
220 BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
221#endif
222}
223
224template <class D, class Func, typename T = TFromD<D>>
225void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
226 const Func& func) {
227 const size_t N = Lanes(d);
228 const Vec<D> new_v = Set(d, new_t);
229
230 size_t idx = 0;
231 for (; idx + N <= count; idx += N) {
232 Vec<D> v = LoadU(d, inout + idx);
233 StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
234 }
235
236 // `count` was a multiple of the vector length `N`: already done.
237 if (HWY_UNLIKELY(idx == count)) return;
238
239#if HWY_MEM_OPS_MIGHT_FAULT
240 // Proceed one by one.
241 const CappedTag<T, 1> d1;
242 const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
243 for (; idx < count; ++idx) {
244 using V1 = Vec<decltype(d1)>;
245 const V1 v = LoadU(d1, inout + idx);
246 StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
247 }
248#else
249 const size_t remaining = count - idx;
250 HWY_DASSERT(0 != remaining && remaining < N);
251 const Mask<D> mask = FirstN(d, remaining);
252 const Vec<D> v = MaskedLoad(mask, d, inout + idx);
253 BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
254#endif
255}
256
257// NOLINTNEXTLINE(google-readability-namespace-comments)
258} // namespace HWY_NAMESPACE
259} // namespace hwy
261
262#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#define HWY_RESTRICT
Definition base.h:64
#define HWY_DASSERT(condition)
Definition base.h:238
#define HWY_UNLIKELY(expr)
Definition base.h:76
d
Definition rvv-inl.h:1998
void Generate(D d, T *HWY_RESTRICT out, size_t count, const Func &func)
Definition transform-inl.h:52
void ReplaceIf(D d, T *HWY_RESTRICT inout, size_t count, T new_t, const Func &func)
Definition transform-inl.h:225
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition arm_neon-inl.h:2456
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:212
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition ops/shared-inl.h:184
void Transform2(D d, T *HWY_RESTRICT inout, size_t count, const T *HWY_RESTRICT in1, const T *HWY_RESTRICT in2, const Func &func)
Definition transform-inl.h:153
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:2758
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition arm_neon-inl.h:2941
HWY_API constexpr size_t Lanes(Simd< T, N, kPow2 >)
Definition arm_sve-inl.h:243
HWY_API Vec128< T, N > IfThenElse(const Mask128< T, N > mask, const Vec128< T, N > yes, const Vec128< T, N > no)
Definition emu128-inl.h:303
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2772
void Replace(D d, T *HWY_RESTRICT inout, size_t count, T new_t, T old_t)
Definition transform-inl.h:191
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition arm_sve-inl.h:322
Vec128< T, N > Iota(const Simd< T, N, 0 > d, const T2 first)
Definition arm_neon-inl.h:1049
void Transform(D d, T *HWY_RESTRICT inout, size_t count, const Func &func)
Definition transform-inl.h:85
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:2591
decltype(MaskFromVec(Zero(D()))) Mask
Definition generic_ops-inl.h:46
N
Definition rvv-inl.h:1998
void Transform1(D d, T *HWY_RESTRICT inout, size_t count, const T *HWY_RESTRICT in1, const Func &func)
Definition transform-inl.h:117
const vfloat64m1_t v
Definition rvv-inl.h:1998
typename D::T TFromD
Definition ops/shared-inl.h:203
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:40
Definition aligned_allocator.h:27
FuncOutput(*)(const void *, FuncInput) Func
Definition nanobenchmark.h:105
#define HWY_NAMESPACE
Definition set_macros-inl.h:82
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()