wasm-shuffle-reducer.cc

// Copyright 2025 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "src/compiler/turboshaft/wasm-shuffle-reducer.h"

#include "src/wasm/simd-shuffle.h"

namespace v8::internal::compiler::turboshaft {

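// Propagate the demanded byte lanes of a unary SIMD op to its input. The
// widening conversions in low_half_ops only read the low half of their input,
// so the lane set recorded for the input is half as wide as the set demanded
// of the result. This is only done when this op is the input's sole user,
// since other users could still demand the full width.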
void DemandedElementAnalysis::AddUnaryOp(const Simd128UnaryOp& unop,
                                         LaneBitSet lanes) {
  if (Visited(&unop)) return;
  visited_.insert(&unop);

  const Operation& input = input_graph().Get(unop.input());
  if (!input.saturated_use_count.IsOne()) {
    return;
  }
  // TODO(sparker): Add floating-point conversions:
  // - PromoteLow
  // - ConvertLow
  static constexpr std::array low_half_ops = {
      Simd128UnaryOp::Kind::kI16x8SConvertI8x16Low,
      Simd128UnaryOp::Kind::kI16x8UConvertI8x16Low,
      Simd128UnaryOp::Kind::kI32x4SConvertI16x8Low,
      Simd128UnaryOp::Kind::kI32x4UConvertI16x8Low,
      Simd128UnaryOp::Kind::kI64x2SConvertI32x4Low,
      Simd128UnaryOp::Kind::kI64x2UConvertI32x4Low,
  };

  for (auto const kind : low_half_ops) {
    if (kind == unop.kind) {
      DCHECK(lanes == k8x16 || lanes == k8x8Low || lanes == k8x4Low);
      lanes >>= lanes.count() / 2;
      RecordOp(&input, lanes);
      return;
    }
  }
}

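// Propagate demanded byte lanes through a binary SIMD op. The extending
// multiplies in low_half_ops only read the low halves of both operands, so a
// half-width lane set is recorded for each single-use operand.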
void DemandedElementAnalysis::AddBinaryOp(const Simd128BinopOp& binop,
                                          LaneBitSet lanes) {
  if (Visited(&binop)) return;
  visited_.insert(&binop);

  static constexpr std::array low_half_ops = {
      Simd128BinopOp::Kind::kI16x8ExtMulLowI8x16S,
      Simd128BinopOp::Kind::kI16x8ExtMulLowI8x16U,
      Simd128BinopOp::Kind::kI32x4ExtMulLowI16x8S,
      Simd128BinopOp::Kind::kI32x4ExtMulLowI16x8U,
      Simd128BinopOp::Kind::kI64x2ExtMulLowI32x4S,
      Simd128BinopOp::Kind::kI64x2ExtMulLowI32x4U,
  };
  const Operation& left = input_graph().Get(binop.left());
  const Operation& right = input_graph().Get(binop.right());
  for (auto const& kind : low_half_ops) {
    if (kind == binop.kind) {
      DCHECK(lanes == k8x16 || lanes == k8x8Low);
      lanes >>= lanes.count() / 2;
      if (left.saturated_use_count.IsOne()) {
        RecordOp(&left, lanes);
      }
      if (right.saturated_use_count.IsOne()) {
        RecordOp(&right, lanes);
      }
    }
  }
}

void DemandedElementAnalysis::RecordOp(const Operation* op, LaneBitSet lanes) {
  if (auto* unop = op->TryCast<Simd128UnaryOp>()) {
    AddUnaryOp(*unop, lanes);
  } else if (auto* binop = op->TryCast<Simd128BinopOp>()) {
    AddBinaryOp(*binop, lanes);
  } else if (auto* shuffle = op->TryCast<Simd128ShuffleOp>()) {
    demanded_elements_.emplace_back(shuffle, lanes);
  }
}

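// Run the analysis by walking the graph backwards, visiting blocks and the
// operations within them in reverse order. Visiting uses before definitions
// lets demanded-lane information flow from shuffles down into their inputs.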
void WasmShuffleAnalyzer::Run() {
  for (uint32_t processed = input_graph().block_count(); processed > 0;
       --processed) {
    BlockIndex block_index = static_cast<BlockIndex>(processed - 1);
    const Block& block = input_graph().Get(block_index);
    auto idx_range = input_graph().OperationIndices(block);
    for (auto it = idx_range.rbegin(); it != idx_range.rend(); ++it) {
      const Operation& op = input_graph().Get(*it);
      Process(op);
    }
  }
}

void WasmShuffleAnalyzer::Process(const Operation& op) {
  if (ShouldSkipOperation(op)) {
    return;
  }

  if (auto* unop = op.TryCast<Simd128UnaryOp>()) {
    ProcessUnary(*unop);
    return;
  }

  if (auto* binop = op.TryCast<Simd128BinopOp>()) {
    ProcessBinary(*binop);
    return;
  }

  if (auto* shuffle_op = op.TryCast<Simd128ShuffleOp>()) {
    ProcessShuffle(*shuffle_op);
    return;
  }
}

void WasmShuffleAnalyzer::ProcessShuffleOfShuffle(
    const Simd128ShuffleOp& shuffle_op, const Simd128ShuffleOp& shuffle,
    uint8_t lower_limit, uint8_t upper_limit) {
  // Suppose we have two 16-byte shuffles:
  // |---a1---|---b3---|--------|--------| shuffle_op = (a, b)
  //
  // |---a1---|---b3---|---c?---|---c?---| shuffle = (shf0, c)
  //
  // As only half of shf0 is used, half of the work it performs is wasted, so
  // here we try to reduce shf0 to a narrower kind. In the case above we can
  // simply truncate shf0.shuffle, but other situations involve more work:
  //
  // In the following case, shf0.shuffle needs to be shifted left so that it
  // writes the required lanes to the low half of the result. This then means
  // that shf1.shuffle needs to be updated to read from the low half.
  //
  // |--------|--------|---a1---|---b3---| shuffle_op = (a, b)
  //
  // |---a1---|---b3---|---c?---|---c?---| shuffle = (shf0, c)
  //
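  // As a concrete, purely illustrative example of the first case with i32x4
  // lanes: if shuffle_op has the byte mask
  //   {4,5,6,7, 28,29,30,31, 8,9,10,11, 12,13,14,15}   (a1, b3, a2, a3)
  // and shuffle, with shuffle_op as its left input, has the byte mask
  //   {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}     (shf0 lanes 0-1, c lanes 0-1)
  // then only bytes 0-7 of shuffle_op are ever read, so its mask can be
  // truncated to the first eight entries and the op narrowed accordingly.
  //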

  struct ShuffleHelper {
    explicit ShuffleHelper(const uint8_t* shuffle) : shuffle(shuffle) {}

    const uint8_t* begin() const { return shuffle; }

    const uint8_t* midpoint() const {
      constexpr size_t half_lanes = kSimd128Size / 2;
      return shuffle + half_lanes;
    }

    const uint8_t* end() const { return shuffle + kSimd128Size; }

    const uint8_t* shuffle;
  };

  ShuffleHelper view(shuffle.shuffle);

  // Test whether the low half of the shuffle is within the inclusive range.
  auto all_low_half = [&view](uint8_t lower_limit, uint8_t upper_limit) {
    return std::all_of(view.begin(), view.midpoint(),
                       [lower_limit, upper_limit](uint8_t i) {
                         return i >= lower_limit && i <= upper_limit;
                       });
  };
  // Test whether the high half of the shuffle is within the inclusive range.
  auto all_high_half = [&view](uint8_t lower_limit, uint8_t upper_limit) {
    return std::all_of(view.midpoint(), view.end(),
                       [lower_limit, upper_limit](uint8_t i) {
                         return i >= lower_limit && i <= upper_limit;
                       });
  };
  // Test that no lane in the low half of the shuffle is within the inclusive
  // range.
  auto none_low_half = [&view](uint8_t lower_limit, uint8_t upper_limit) {
    return std::none_of(view.begin(), view.midpoint(),
                        [lower_limit, upper_limit](uint8_t i) {
                          return i >= lower_limit && i <= upper_limit;
                        });
  };
  // Test that no lane in the high half of the shuffle is within the inclusive
  // range.
  auto none_high_half = [&view](uint8_t lower_limit, uint8_t upper_limit) {
    return std::none_of(view.midpoint(), view.end(),
                        [lower_limit, upper_limit](uint8_t i) {
                          return i >= lower_limit && i <= upper_limit;
                        });
  };

  // lower_limit and upper_limit are set by the caller depending on whether
  // we're examining the left or right operand of shuffle. So, here we check
  // whether shuffle_op is shuffled exclusively into the low or high half,
  // using either the lower and upper limits of {0,15} or {16,31}.
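  // For instance (illustrative), when shuffle_op is the right operand the
  // limits are {16,31}; a mask such as
  //   {16,18,20,22, 24,26,28,30, 0,2,4,6, 8,10,12,14}
  // reads shuffle_op only in the low half of the result (all of the first
  // eight indices are in range, none of the last eight are), so
  // shf_into_low_half holds.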
  bool shf_into_low_half = all_low_half(lower_limit, upper_limit) &&
                           none_high_half(lower_limit, upper_limit);
  bool shf_into_high_half = all_high_half(lower_limit, upper_limit) &&
                            none_low_half(lower_limit, upper_limit);
  DCHECK(!(shf_into_low_half && shf_into_high_half));

  constexpr size_t quarter_lanes = kSimd128Size / 4;
  if (shf_into_low_half) {
    if (all_low_half(lower_limit + quarter_lanes, upper_limit)) {
      // Low half of shuffle is sourced from the high half of shuffle_op.
      shift_shuffles_.push_back(&shuffle_op);
    } else if (all_low_half(lower_limit, upper_limit - quarter_lanes)) {
      // Low half of shuffle is sourced from the low half of shuffle_op.
    }
  } else if (shf_into_high_half) {
    if (all_high_half(lower_limit + quarter_lanes, upper_limit)) {
      // High half of shuffle is sourced from the high half of shuffle_op.
      shift_shuffles_.push_back(&shuffle_op);
    } else if (all_high_half(lower_limit, upper_limit - quarter_lanes)) {
      // High half of shuffle is sourced from the low half of shuffle_op.
    }
  }
}

#if V8_ENABLE_WASM_DEINTERLEAVED_MEM_OPS
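// Decide whether two 128-bit loads could be replaced by a single paired load:
// they must have the same kind, live in the same block, share the same base
// and index, sit exactly kSimd128Size bytes apart, and the later load must be
// movable up to the earlier one without reordering conflicting effects.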
bool WasmShuffleAnalyzer::CouldLoadPair(const LoadOp& load0,
                                        const LoadOp& load1) const {
  DCHECK_NE(load0, load1);
  // Before adding a candidate, ensure that the loads can be scheduled together
  // and that the accesses are consecutive.
  if (load0.kind != load1.kind) {
    return false;
  }

  // Only try this for loads in the same block.
  if (input_graph().BlockIndexOf(load0) != input_graph().BlockIndexOf(load1)) {
    return false;
  }

  if (load0.base() != load1.base()) {
    return false;
  }
  if (load0.index() != load1.index()) {
    return false;
  }

  // TODO(sparker): Can we use absolute diff..? I guess this would require
  // some permutation after the load.
  if (load1.offset - load0.offset != kSimd128Size) {
    return false;
  }

  // Calculate whether second can be moved next to first by comparing the
  // effects through the sequence of operations in between.
  auto CanReschedule = [this](OpIndex first_idx, OpIndex second_idx) {
    const Operation& second = input_graph().Get(second_idx);
    OpEffects second_effects = second.Effects();
    OpIndex prev_idx = input_graph().PreviousIndex(second_idx);
    bool second_is_protected_load = second.IsProtectedLoad();

    while (prev_idx != first_idx) {
      const Operation& prev_op = input_graph().Get(prev_idx);
      OpEffects prev_effects = prev_op.Effects();
      bool both_protected =
          second_is_protected_load && prev_op.IsProtectedLoad();
      if (both_protected &&
          CannotSwapProtectedLoads(prev_effects, second_effects)) {
        return false;
      } else if (CannotSwapOperations(prev_effects, second_effects)) {
        return false;
      }
      prev_idx = input_graph().PreviousIndex(prev_idx);
    }
    return true;
  };

  OpIndex first_idx = input_graph().Index(load0);
  OpIndex second_idx = input_graph().Index(load1);
  if (first_idx.offset() > second_idx.offset()) {
    std::swap(first_idx, second_idx);
  }

  return CanReschedule(first_idx, second_idx);
}

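// Record a deinterleave-load candidate once both the even and the odd shuffle
// of the same two loads have been found and the loads can be paired.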
void WasmShuffleAnalyzer::AddLoadMultipleCandidate(
    const Simd128ShuffleOp& even_shuffle, const Simd128ShuffleOp& odd_shuffle,
    const LoadOp& left, const LoadOp& right,
    Simd128LoadPairDeinterleaveOp::Kind kind) {
  DCHECK_NE(even_shuffle, odd_shuffle);

  if (CouldLoadPair(left, right)) {
    deinterleave_load_candidates_.emplace_back(kind, left, right, even_shuffle,
                                               odd_shuffle);
  }
}
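
// Given an i8x16 shuffle whose two inputs are loads, look for the even/odd
// deinterleaving pattern that a load-pair-deinterleave instruction (such as
// LD2 on Arm64) provides directly. For example (illustrative), with 32-bit
// elements across two 16-byte loads, the even shuffle uses the byte mask
//   {0,1,2,3, 8,9,10,11, 16,17,18,19, 24,25,26,27}
// and the odd shuffle uses
//   {4,5,6,7, 12,13,14,15, 20,21,22,23, 28,29,30,31}.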
303
304void WasmShuffleAnalyzer::ProcessShuffleOfLoads(const Simd128ShuffleOp& shuffle,
305 const LoadOp& left,
306 const LoadOp& right) {
307 DCHECK_EQ(shuffle.left(), input_graph().Index(left));
308 DCHECK_EQ(shuffle.right(), input_graph().Index(right));
309 // We're looking for two shuffle users of these two loads, so ensure the
310 // number of users doesn't exceed two.
311 if (left.saturated_use_count.Get() != 2 ||
312 right.saturated_use_count.Get() != 2) {
313 return;
314 }
315
316 // Look for even and odd shuffles. If we find one, then also check if we've
317 // already found another shuffle, performing the opposite action, on the same
318 // two loads. And if this is true, then we add a candidate including both
319 // shuffles and both loads.
320 auto AddShuffle = [this, &shuffle, &left, &right](
321 SmallShuffleVector& shuffles,
322 const SmallShuffleVector& other_shuffles, bool is_even,
323 Simd128LoadPairDeinterleaveOp::Kind kind) {
324 shuffles.push_back(&shuffle);
325 if (auto other_shuffle = GetOtherShuffleUser(left, right, other_shuffles)) {
326 if (is_even) {
327 AddLoadMultipleCandidate(shuffle, *other_shuffle.value(), left, right,
328 kind);
329 } else {
330 AddLoadMultipleCandidate(*other_shuffle.value(), shuffle, left, right,
331 kind);
332 }
333 }
334 };

  if (!DemandedByteLanes(&shuffle)) {
    // Full width shuffles.
    wasm::SimdShuffle::ShuffleArray shuffle_bytes;
    std::copy_n(shuffle.shuffle, kSimd128Size, shuffle_bytes.begin());
    auto canonical = wasm::SimdShuffle::TryMatchCanonical(shuffle_bytes);
    switch (canonical) {
      default:
        return;
      case wasm::SimdShuffle::CanonicalShuffle::kS64x2Even:
        AddShuffle(even_64x2_shuffles_, odd_64x2_shuffles_, true,
                   Simd128LoadPairDeinterleaveOp::Kind::k64x4);
        break;
      case wasm::SimdShuffle::CanonicalShuffle::kS64x2Odd:
        AddShuffle(odd_64x2_shuffles_, even_64x2_shuffles_, false,
                   Simd128LoadPairDeinterleaveOp::Kind::k64x4);
        break;
      case wasm::SimdShuffle::CanonicalShuffle::kS32x4Even:
        AddShuffle(even_32x4_shuffles_, odd_32x4_shuffles_, true,
                   Simd128LoadPairDeinterleaveOp::Kind::k32x8);
        break;
      case wasm::SimdShuffle::CanonicalShuffle::kS32x4Odd:
        AddShuffle(odd_32x4_shuffles_, even_32x4_shuffles_, false,
                   Simd128LoadPairDeinterleaveOp::Kind::k32x8);
        break;
      case wasm::SimdShuffle::CanonicalShuffle::kS16x8Even:
        AddShuffle(even_16x8_shuffles_, odd_16x8_shuffles_, true,
                   Simd128LoadPairDeinterleaveOp::Kind::k16x16);
        break;
      case wasm::SimdShuffle::CanonicalShuffle::kS16x8Odd:
        AddShuffle(odd_16x8_shuffles_, even_16x8_shuffles_, false,
                   Simd128LoadPairDeinterleaveOp::Kind::k16x16);
        break;
      case wasm::SimdShuffle::CanonicalShuffle::kS8x16Even:
        AddShuffle(even_8x16_shuffles_, odd_8x16_shuffles_, true,
                   Simd128LoadPairDeinterleaveOp::Kind::k8x32);
        break;
      case wasm::SimdShuffle::CanonicalShuffle::kS8x16Odd:
        AddShuffle(odd_8x16_shuffles_, even_8x16_shuffles_, false,
                   Simd128LoadPairDeinterleaveOp::Kind::k8x32);
        break;
    }
  }
}
#endif  // V8_ENABLE_WASM_DEINTERLEAVED_MEM_OPS

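// Entry point for each i8x16 shuffle found in the graph: a shuffle of two
// loads may become a deinterleaving paired load, while a shuffle whose input
// is another single-use i8x16 shuffle may allow the inner shuffle to be
// narrowed.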
void WasmShuffleAnalyzer::ProcessShuffle(const Simd128ShuffleOp& shuffle) {
  if (shuffle.kind != Simd128ShuffleOp::Kind::kI8x16) {
    return;
  }
  const Operation& left = input_graph().Get(shuffle.left());
  const Operation& right = input_graph().Get(shuffle.right());

#if V8_ENABLE_WASM_DEINTERLEAVED_MEM_OPS
  auto* load_left = left.TryCast<LoadOp>();
  auto* load_right = right.TryCast<LoadOp>();

  // TODO(sparker): Handle I8x8 shuffles, which will likely mean that the
  // inputs to the shuffles are Simd128LoadTransformOp::Load64Zero.
  if (load_left && load_right) {
    ProcessShuffleOfLoads(shuffle, *load_left, *load_right);
    return;
  }
#endif  // V8_ENABLE_WASM_DEINTERLEAVED_MEM_OPS

  auto* shuffle_left = left.TryCast<Simd128ShuffleOp>();
  auto* shuffle_right = right.TryCast<Simd128ShuffleOp>();
  if (!shuffle_left && !shuffle_right) {
    return;
  }
  constexpr uint8_t left_lower = 0;
  constexpr uint8_t left_upper = 15;
  constexpr uint8_t right_lower = 16;
  constexpr uint8_t right_upper = 31;
  if (shuffle_left && shuffle_left->kind == Simd128ShuffleOp::Kind::kI8x16 &&
      shuffle_left->saturated_use_count.IsOne()) {
    ProcessShuffleOfShuffle(*shuffle_left, shuffle, left_lower, left_upper);
  }
  if (shuffle_right && shuffle_right->kind == Simd128ShuffleOp::Kind::kI8x16 &&
      shuffle_right->saturated_use_count.IsOne()) {
    ProcessShuffleOfShuffle(*shuffle_right, shuffle, right_lower, right_upper);
  }
}

}  // namespace v8::internal::compiler::turboshaft