v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
simd-shuffle.h
Go to the documentation of this file.
1// Copyright 2020 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_WASM_SIMD_SHUFFLE_H_
6#define V8_WASM_SIMD_SHUFFLE_H_
7
8#if !V8_ENABLE_WEBASSEMBLY
9#error This header should only be included if WebAssembly is enabled.
10#endif // !V8_ENABLE_WEBASSEMBLY
11
12#include "src/base/macros.h"
13#include "src/common/globals.h"
15
16namespace v8 {
17namespace internal {
18namespace wasm {
19
20#ifdef V8_TARGET_ARCH_X64
21template <int simd_size,
22 typename = std::enable_if_t<simd_size == kSimd128Size ||
23 simd_size == kSimd256Size>>
24struct ShuffleEntry {};
25template <>
26struct ShuffleEntry<kSimd128Size> {
27 uint8_t shuffle[kSimd128Size];
28 compiler::ArchOpcode opcode;
29 bool src0_needs_reg;
30 bool src1_needs_reg;
31 // If AVX is supported, this shuffle can use AVX's three-operand encoding,
32 // so does not require same as first. We conservatively set this to false
33 // (original behavior), and selectively enable for specific arch shuffles.
34 bool no_same_as_first_if_avx;
35};
36
37template <>
38struct ShuffleEntry<kSimd256Size> {
39 uint8_t shuffle[kSimd256Size];
40 compiler::ArchOpcode opcode;
41};
42#endif // V8_TARGET_ARCH_X64
43
45 public:
46 // is in the range [0 .. 15] (or [0 .. 31] if simd_size is kSimd256Size). Set
47 // |inputs_equal| true if this is an explicit swizzle. Returns canonicalized
48 // |shuffle|, |needs_swap|, and |is_swizzle|. If |needs_swap| is true, inputs
49 // must be swapped. If |is_swizzle| is true, the second input can be ignored.
50 template <const int simd_size = kSimd128Size>
51 static void CanonicalizeShuffle(bool inputs_equal, uint8_t* shuffle,
52 bool* needs_swap, bool* is_swizzle)
53 requires(simd_size == kSimd128Size || simd_size == kSimd256Size)
54 {
55 *needs_swap = false;
56 // Inputs equal, then it's a swizzle.
57 if (inputs_equal) {
58 *is_swizzle = true;
59 } else {
60 // Inputs are distinct; check that both are required.
61 bool src0_is_used = false;
62 bool src1_is_used = false;
63 for (int i = 0; i < simd_size; ++i) {
64 if (shuffle[i] < simd_size) {
65 src0_is_used = true;
66 } else {
67 src1_is_used = true;
68 }
69 }
70 if (src0_is_used && !src1_is_used) {
71 *is_swizzle = true;
72 } else if (src1_is_used && !src0_is_used) {
73 *needs_swap = true;
74 *is_swizzle = true;
75 } else {
76 *is_swizzle = false;
77 // Canonicalize general 2 input shuffles so that the first input lanes
78 // are encountered first. This makes architectural shuffle pattern
79 // matching easier, since we only need to consider 1 input ordering
80 // instead of 2.
81 if (shuffle[0] >= simd_size) {
82 // The second operand is used first. Swap inputs and adjust the
83 // shuffle.
84 *needs_swap = true;
85 for (int i = 0; i < simd_size; ++i) {
86 shuffle[i] ^= simd_size;
87 }
88 }
89 }
90 }
91 if (*is_swizzle) {
92 for (int i = 0; i < simd_size; ++i) shuffle[i] &= simd_size - 1;
93 }
94 }
95
96 // Tries to match an 8x16 byte shuffle to the identity shuffle, which is
97 // [0 1 ... 15]. This should be called after canonicalizing the shuffle, so
98 // the second identity shuffle, [16 17 .. 31] is converted to the first one.
99 static bool TryMatchIdentity(const uint8_t* shuffle);
100
101 // Tries to match a byte shuffle to a scalar splat operation. Returns the
102 // index of the lane if successful.
103 template <int LANES>
104 static bool TryMatchSplat(const uint8_t* shuffle, int* index) {
105 const int kBytesPerLane = kSimd128Size / LANES;
106 // Get the first lane's worth of bytes and check that indices start at a
107 // lane boundary and are consecutive.
108 uint8_t lane0[kBytesPerLane];
109 lane0[0] = shuffle[0];
110 if (lane0[0] % kBytesPerLane != 0) return false;
111 for (int i = 1; i < kBytesPerLane; ++i) {
112 lane0[i] = shuffle[i];
113 if (lane0[i] != lane0[0] + i) return false;
114 }
115 // Now check that the other lanes are identical to lane0.
116 for (int i = 1; i < LANES; ++i) {
117 for (int j = 0; j < kBytesPerLane; ++j) {
118 if (lane0[j] != shuffle[i * kBytesPerLane + j]) return false;
119 }
120 }
121 *index = lane0[0] / kBytesPerLane;
122 return true;
123 }
124
125 // Tries to match a 32x4 rotate, only makes sense if the inputs are equal
126 // (is_swizzle). A rotation is a shuffle like [1, 2, 3, 0]. This will always
127 // match a Concat, but can have better codegen.
128 static bool TryMatch32x4Rotate(const uint8_t* shuffle, uint8_t* shuffle32x4,
129 bool is_swizzle);
130
131 // Tries to match a 32x4 reverse shuffle: [3, 2, 1, 0].
132 static bool TryMatch32x4Reverse(const uint8_t* shuffle32x4);
133
134 // Tries to match a one lane copy of 4x32.
135 static bool TryMatch32x4OneLaneSwizzle(const uint8_t* shuffle32x4,
136 uint8_t* from, uint8_t* to);
137
138 // Tries to match an 8x8 byte shuffle to an equivalent 64x1 shuffle. If
139 // successful, it writes the 64x1 shuffle word indices. E.g.
140 // [8 9 10 11 12 13 14 15] == [1]
141 static bool TryMatch64x1Shuffle(const uint8_t* shuffle, uint8_t* shuffle64x1);
142
143 // Tries to match an 8x16 byte shuffle to an equivalent 64x2 shuffle. If
144 // successful, it writes the 64x2 shuffle word indices. E.g.
145 // [8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7] == [1 0]
146 static bool TryMatch64x2Shuffle(const uint8_t* shuffle, uint8_t* shuffle64x2);
147
148 // Tries to match an 8x4 byte shuffle to an equivalent 32x1 shuffle. If
149 // successful, it writes the 32x1 shuffle word indices. E.g.
150 // [8 9 10 11] == [2]
151 static bool TryMatch32x1Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x1);
152
153 // Tries to match an 8x8 byte shuffle to an equivalent 32x2 shuffle. If
154 // successful, it writes the 32x2 shuffle word indices. E.g.
155 // [0 1 2 3 8 9 10 11] == [0 2]
156 static bool TryMatch32x2Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x2);
157
158 // Tries to match an 8x16 byte shuffle to an equivalent 32x4 shuffle. If
159 // successful, it writes the 32x4 shuffle word indices. E.g.
160 // [0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15] == [0 2 1 3]
161 static bool TryMatch32x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x4);
162
163 // Tries to match an 8x32 byte shuffle to an equivalent 32x8 shuffle. If
164 // successful, it writes the 32x8 shuffle word indices. E.g.
165 // [0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15 16 17 18 19 24 25 26 27 20 21 22 23
166 // 28 29 30 31 == [0 2 1 3 4 6 5 7]
167 static bool TryMatch32x8Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x8);
168
169 // Tries to match an 8x2 byte shuffle to an equivalent 16x1 shuffle. If
170 // successful, it writes the 16x1 shuffle word indices. E.g.
171 // [8 9] == [4]
172 static bool TryMatch16x1Shuffle(const uint8_t* shuffle, uint8_t* shuffle16x1);
173
174 // Tries to match an 8x4 byte shuffle to an equivalent 16x2 shuffle. If
175 // successful, it writes the 16x2 shuffle word indices. E.g.
176 // [0 1 8 9] == [0 4]
177 static bool TryMatch16x2Shuffle(const uint8_t* shuffle, uint8_t* shuffle16x2);
178
179 // Tries to match an 8x8 byte shuffle to an equivalent 16x4 shuffle. If
180 // successful, it writes the 16x4 shuffle word indices. E.g.
181 // [0 1 8 9 2 3 10 11] == [0 4 1 5]
182 static bool TryMatch16x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle16x4);
183
184 // Tries to match an 8x16 byte shuffle to an equivalent 16x8 shuffle. If
185 // successful, it writes the 16x8 shuffle word indices. E.g.
186 // [0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15] == [0 4 1 5 2 6 3 7]
187 static bool TryMatch16x8Shuffle(const uint8_t* shuffle, uint8_t* shuffle16x8);
188
189 // Tries to match a byte shuffle to a concatenate operation, formed by taking
190 // 16 bytes from the 32 byte concatenation of the inputs. If successful, it
191 // writes the byte offset. E.g. [4 5 6 7 .. 16 17 18 19] concatenates both
192 // source vectors with offset 4. The shuffle should be canonicalized.
193 static bool TryMatchConcat(const uint8_t* shuffle, uint8_t* offset);
194
195 // Tries to match a byte shuffle to a blend operation, which is a shuffle
196 // where no lanes change position. E.g. [0 9 2 11 .. 14 31] interleaves the
197 // even lanes of the first source with the odd lanes of the second. The
198 // shuffle should be canonicalized.
199 static bool TryMatchBlend(const uint8_t* shuffle);
200
201 // Tries to match a byte shuffle to a packed byte to dword zero extend
202 // operation. E.g. [8 x x x 9 x x x 10 x x x 11 x x x ] (x is arbitrary value
203 // large than 15). The shuffle should be canonicalized. Its second input
204 // should be zero.
205 static bool TryMatchByteToDwordZeroExtend(const uint8_t* shuffle);
206
207 // Tries to match a four-step reduction shuffle where, in each step, the
208 // upper half of the vector is shuffled into the bottom half. This is only
209 // valid when only lane 0 of the final shuffle result is used.
210 static bool TryMatch8x16UpperToLowerReduce(const uint8_t* shuffle1,
211 const uint8_t* shuffle2,
212 const uint8_t* shuffle3,
213 const uint8_t* shuffle4);
214
215 // Tries to match a three-step reduction shuffle where, in each step, the
216 // upper half of the vector is shuffled into the bottom half. This is only
217 // valid when only lane 0 of the final shuffle result is used.
218 static bool TryMatch16x8UpperToLowerReduce(const uint8_t* shuffle1,
219 const uint8_t* shuffle2,
220 const uint8_t* shuffle3);
221
222 // Tries to match a two-step reduction shuffle where, in each step, the
223 // upper half of the vector is shuffled into the bottom half. This is only
224 // valid when only lane 0 of the final shuffle result is used.
225 static bool TryMatch32x4UpperToLowerReduce(const uint8_t* shuffle1,
226 const uint8_t* shuffle2);
227
228 // Tries to match a 32x4 pairwise shuffle chain where, in each step, every
229 // other element is shuffled into the lower adjacent position. This is only
230 // valid when only lane 0 of the final shuffle result is used.
231 static bool TryMatch32x4PairwiseReduce(const uint8_t* shuffle1,
232 const uint8_t* shuffle2);
233
234 // Tries to match a 64-bit reduction, where element 1 is shuffled into 0.
235 // This is only valid when only lane 0 of the result is used.
236 static bool TryMatch64x2Reduce(const uint8_t* shuffle64x2);
237
238 // Packs a 4 lane shuffle into a single imm8 suitable for use by pshufd,
239 // pshuflw, and pshufhw.
240 static uint8_t PackShuffle4(uint8_t* shuffle);
241 // Gets an 8 bit lane mask suitable for 16x8 pblendw.
242 static uint8_t PackBlend8(const uint8_t* shuffle16x8);
243 // Gets an 8 bit lane mask suitable for 32x4 pblendw.
244 static uint8_t PackBlend4(const uint8_t* shuffle32x4);
245 // Packs 2 bytes of shuffle into a 32 bit immediate.
246 static int32_t Pack2Lanes(const std::array<uint8_t, 2>& shuffle);
247 // Packs 4 bytes of shuffle into a 32 bit immediate.
248 static int32_t Pack4Lanes(const uint8_t* shuffle);
249 // Packs 16 bytes of shuffle into an array of 4 uint32_t.
250 static void Pack16Lanes(uint32_t* dst, const uint8_t* shuffle);
251
252 enum class CanonicalShuffle : uint8_t {
253 kUnknown,
254 kIdentity,
255 kS64x2Even,
256 kS64x2Odd,
257 kS64x2ReverseBytes,
258 kS64x2Reverse,
259 kS32x4Even,
260 kS32x4Odd,
261 kS32x4InterleaveLowHalves,
262 kS32x4InterleaveHighHalves,
263 kS32x4ReverseBytes,
264 kS32x4Reverse,
265 kS32x2Reverse,
266 kS32x4TransposeEven,
267 kS32x4TransposeOdd,
268 kS16x8Even,
269 kS16x8Odd,
270 kS16x8InterleaveLowHalves,
271 kS16x8InterleaveHighHalves,
272 kS16x8ReverseBytes,
273 kS16x2Reverse,
274 kS16x4Reverse,
275 kS16x8TransposeEven,
276 kS16x8TransposeOdd,
277 kS8x16Even,
278 kS8x16Odd,
279 kS8x16InterleaveLowHalves,
280 kS8x16InterleaveHighHalves,
281 kS8x16TransposeEven,
282 kS8x16TransposeOdd,
283 kMaxShuffles,
284 };
285
286 using ShuffleArray = std::array<uint8_t, kSimd128Size>;
287 static CanonicalShuffle TryMatchCanonical(const ShuffleArray& shuffle);
288
289#ifdef V8_TARGET_ARCH_X64
290 // If matching success, the corresponding instrution should be:
291 // vpshufd ymm, ymm, imm8
292 // The augument 'control' is 'imm8' in the instruction.
293 static bool TryMatchVpshufd(const uint8_t* shuffle32x8, uint8_t* control);
294
295 // If matching success, the corresponding instrution should be:
296 // vshufps ymm, ymm, ymm, imm8
297 // The augument 'control' is 'imm8' in the instruction.
298 static bool TryMatchShufps256(const uint8_t* shuffle32x8, uint8_t* control);
299
300 // Shuffles that map to architecture-specific instruction sequences. These are
301 // matched very early, so we shouldn't include shuffles that match better in
302 // later tests, like 32x4 and 16x8 shuffles. In general, these patterns should
303 // map to either a single instruction, or be finer grained, such as zip/unzip
304 // or transpose patterns.
305 static constexpr ShuffleEntry<kSimd128Size> arch_shuffles128[] = {
306 {{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23},
307 compiler::kX64S64x2UnpackLow,
308 true,
309 true,
310 true},
311 {{8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
312 compiler::kX64S64x2UnpackHigh,
313 true,
314 true,
315 true},
316 {{0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23},
317 compiler::kX64S32x4UnpackLow,
318 true,
319 true,
320 true},
321 {{8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31},
322 compiler::kX64S32x4UnpackHigh,
323 true,
324 true,
325 true},
326 {{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23},
327 compiler::kX64S16x8UnpackLow,
328 true,
329 true,
330 true},
331 {{8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31},
332 compiler::kX64S16x8UnpackHigh,
333 true,
334 true,
335 true},
336 {{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23},
337 compiler::kX64S8x16UnpackLow,
338 true,
339 true,
340 true},
341 {{8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31},
342 compiler::kX64S8x16UnpackHigh,
343 true,
344 true,
345 true},
346
347 {{0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29},
348 compiler::kX64S16x8UnzipLow,
349 true,
350 true,
351 false},
352 {{2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31},
353 compiler::kX64S16x8UnzipHigh,
354 true,
355 true,
356 false},
357 {{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
358 compiler::kX64S8x16UnzipLow,
359 true,
360 true,
361 false},
362 {{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31},
363 compiler::kX64S8x16UnzipHigh,
364 true,
365 true,
366 false},
367 {{0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30},
368 compiler::kX64S8x16TransposeLow,
369 true,
370 true,
371 false},
372 {{1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31},
373 compiler::kX64S8x16TransposeHigh,
374 true,
375 true,
376 false},
377 {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8},
378 compiler::kX64S8x8Reverse,
379 true,
380 true,
381 false},
382 {{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12},
383 compiler::kX64S8x4Reverse,
384 true,
385 true,
386 false},
387 {{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
388 compiler::kX64S8x2Reverse,
389 true,
390 true,
391 false}};
392
393 static constexpr ShuffleEntry<kSimd256Size> arch_shuffles256[] = {
394 {{0, 1, 2, 3, 32, 33, 34, 35, 4, 5, 6, 7, 36, 37, 38, 39,
395 16, 17, 18, 19, 48, 49, 50, 51, 20, 21, 22, 23, 52, 53, 54, 55},
396 compiler::kX64S32x8UnpackLow},
397
398 {{8, 9, 10, 11, 40, 41, 42, 43, 12, 13, 14, 15, 44, 45, 46, 47,
399 24, 25, 26, 27, 56, 57, 58, 59, 28, 29, 30, 31, 60, 61, 62, 63},
400 compiler::kX64S32x8UnpackHigh}};
401
402 template <int simd_size>
403 static bool TryMatchArchShuffle(const uint8_t* shuffle, bool is_swizzle,
404 const ShuffleEntry<simd_size>** arch_shuffle)
405 requires(simd_size == kSimd128Size || simd_size == kSimd256Size)
406 {
407 uint8_t mask = is_swizzle ? simd_size - 1 : 2 * simd_size - 1;
408
409 const ShuffleEntry<simd_size>* table;
410 size_t num_entries;
411 if constexpr (simd_size == kSimd128Size) {
412 table = arch_shuffles128;
413 num_entries = arraysize(arch_shuffles128);
414 } else {
415 table = arch_shuffles256;
416 num_entries = arraysize(arch_shuffles256);
417 }
418
419 for (size_t i = 0; i < num_entries; ++i) {
420 const ShuffleEntry<simd_size>& entry = table[i];
421 int j = 0;
422 for (; j < simd_size; ++j) {
423 if ((entry.shuffle[j] & mask) != (shuffle[j] & mask)) {
424 break;
425 }
426 }
427 if (j == simd_size) {
428 *arch_shuffle = &entry;
429 return true;
430 }
431 }
432 return false;
433 }
434#endif // V8_TARGET_ARCH_X64
435};
436
438 public:
439 // Checks if all the immediates are in range (< kSimd128Size), and if they are
440 // not, the top bit is set.
441 static bool AllInRangeOrTopBitSet(std::array<uint8_t, kSimd128Size> shuffle);
442};
443
444} // namespace wasm
445} // namespace internal
446} // namespace v8
447
448#endif // V8_WASM_SIMD_SHUFFLE_H_
static bool TryMatchSplat(const uint8_t *shuffle, int *index)
static void CanonicalizeShuffle(bool inputs_equal, uint8_t *shuffle, bool *needs_swap, bool *is_swizzle)
std::array< uint8_t, kSimd128Size > ShuffleArray
int32_t offset
uint32_t const mask
constexpr int kSimd128Size
Definition globals.h:706
constexpr int kSimd256Size
Definition globals.h:709
Definition c-api.cc:87
#define V8_EXPORT_PRIVATE
Definition macros.h:460
#define arraysize(array)
Definition macros.h:67