inlining-tree.h
// Copyright 2023 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_WASM_INLINING_TREE_H_
#define V8_WASM_INLINING_TREE_H_

#if !V8_ENABLE_WEBASSEMBLY
#error This header should only be included if WebAssembly is enabled.
#endif  // !V8_ENABLE_WEBASSEMBLY

#include <cinttypes>
#include <cstdint>
#include <queue>
#include <vector>

#include "src/utils/utils.h"
#include "src/wasm/wasm-module.h"
#include "src/zone/zone.h"

namespace v8::internal::wasm {

// Represents a tree of inlining decisions.
// A node in the tree represents a function frame, and `function_calls_`
// represents all direct/call_ref/call_indirect function calls in this frame.
// Each element of `function_calls_` is itself a `Vector` of `InliningTree`s,
// corresponding to the different speculative candidates for a
// call_ref/call_indirect; for a direct call, it has a single element.
// If a transitive element of `function_calls_` has its `is_inlined_` field
// set, it should be inlined into the caller.
// We need this additional data structure for Turboshaft because nodes in the
// Turboshaft IR aren't easily expanded incrementally; therefore, all inlining
// decisions are made on this abstracted form of the code before graph
// building.
class InliningTree : public ZoneObject {
 private:
  struct Data;

 public:
  using CasesPerCallSite = base::Vector<InliningTree*>;

  static InliningTree* CreateRoot(Zone* zone, const WasmModule* module,
                                  uint32_t function_index) {
    InliningTree* tree = zone->New<InliningTree>(
        zone->New<Data>(zone, module, function_index), function_index,
        0,  // Call count.
        0,  // Wire byte size. `0` causes the root node to always get
            // expanded, regardless of budget.
        -1, -1, -1,  // Caller, feedback slot, case.
        0            // Inlining depth.
    );
    tree->FullyExpand();
    return tree;
  }
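  // Rough usage sketch (illustrative only; the real call sites live in V8's
  // Turboshaft wasm compiler): create the root for the function being
  // compiled, then consult the expanded tree when lowering its calls.
  //
  //   InliningTree* tree = InliningTree::CreateRoot(zone, module, func_index);
  //   for (CasesPerCallSite cases : tree->function_calls()) {
  //     for (InliningTree* candidate : cases) {
  //       if (candidate != nullptr && candidate->is_inlined()) {
  //         // Emit the callee's body instead of a call.
  //       }
  //     }
  //   }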

  // This should stay roughly in sync with the full logic below, but not rely
  // on having observed any call counts. Since it therefore can't simulate
  // regular behavior accurately anyway, it may be a very coarse approximation.
  static int NoLiftoffBudget(const WasmModule* module, uint32_t func_index) {
    size_t wirebytes = module->functions[func_index].code.length();
    double scaled = BudgetScaleFactor(module);
    // TODO(jkummerow): When TF is gone, remove this adjustment by folding
    // it into the flag's default value.
    constexpr int kTurboshaftAdjustment = 2;
    int high_growth =
        static_cast<int>(v8_flags.wasm_inlining_factor) + kTurboshaftAdjustment;
    constexpr int kLowestUsefulValue = 2;
    int low_growth = std::max(kLowestUsefulValue, high_growth - 3);
    double max_growth_factor = low_growth * (1 - scaled) + high_growth * scaled;
    return std::max(static_cast<int>(v8_flags.wasm_inlining_min_budget),
                    static_cast<int>(max_growth_factor * wirebytes));
  }
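  // Worked example (with assumed inputs, not necessarily the real flag
  // defaults): for --wasm-inlining-factor=3, a scale factor of 0.5, and a
  // 200-byte function: high_growth = 3 + 2 = 5, low_growth = max(2, 5 - 3) = 2,
  // max_growth_factor = 2 * 0.5 + 5 * 0.5 = 3.5, so the returned budget is
  // max(wasm_inlining_min_budget, 3.5 * 200 = 700).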

  int64_t score() const {
    // Note that the zero-point is arbitrary. Functions with negative score
    // can still get inlined.
    constexpr int count_factor = 2;
    constexpr int size_factor = 3;
    return int64_t{call_count_} * count_factor -
           int64_t{wire_byte_size_} * size_factor;
  }
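  // Worked example: a call site observed 100 times whose callee is 50 wire
  // bytes scores 100 * 2 - 50 * 3 = 50, while a rarely-called large callee
  // (10 calls, 200 bytes) scores 10 * 2 - 200 * 3 = -580 and is therefore
  // expanded much later (or not at all) by `FullyExpand`.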

  // TODO(dlehmann,manoskouk): We are running into this limit, e.g., for the
  // "argon2-wasm" benchmark.
  // IIUC, this limit is in place because of the encoding of inlining IDs in
  // a 6-bit bitfield in Turboshaft IR, which we should revisit.
  static constexpr int kMaxInlinedCount = 60;

  // Limit the nesting depth of inlining. Inlining decisions are based on call
  // counts. A small function with high call counts that is called recursively
  // would be inlined until all budget is used.
  // TODO(14108): This still might not lead to ideal results. Other options
  // could be explored like penalizing nested inlinees.
  static constexpr uint32_t kMaxInliningNestingDepth = 7;

  base::Vector<CasesPerCallSite> function_calls() { return function_calls_; }
  base::Vector<bool> has_non_inlineable_targets() {
    return has_non_inlineable_targets_;
  }
  bool feedback_found() { return feedback_found_; }
  bool is_inlined() { return is_inlined_; }
  uint32_t function_index() { return function_index_; }

 private:
  friend class v8::internal::Zone;  // For `zone->New<InliningTree>`.

  static double BudgetScaleFactor(const WasmModule* module) {
    // If there are few small functions, that indicates that the toolchain
    // already performed significant inlining, so we reduce the budget
    // significantly as further inlining has diminishing benefits.
    // For both major knobs, we apply a smoothened step function based on
    // the module's percentage of small functions (sfp):
    // sfp <= 25%: use "low" budget
    // sfp >= 50%: use "high" budget
    // 25% < sfp < 50%: interpolate linearly between both budgets.
    double small_function_percentage =
        module->num_small_functions * 100.0 / module->num_declared_functions;
    if (small_function_percentage <= 25) {
      return 0;
    } else if (small_function_percentage >= 50) {
      return 1;
    } else {
      return (small_function_percentage - 25) / 25;
    }
  }
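  // Worked example: a module with 30 small functions out of 100 declared
  // functions has sfp = 30%, which falls into the interpolation range and
  // yields (30 - 25) / 25 = 0.2, i.e. a budget close to the "low" end.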

  struct Data {
    Data(Zone* zone, const WasmModule* module, uint32_t topmost_caller_index)
        : zone(zone),
          module(module),
          topmost_caller_index(topmost_caller_index) {
      double scaled = BudgetScaleFactor(module);
      // We found experimentally that we need to allow a larger growth factor
      // for Turboshaft to achieve similar inlining decisions as in Turbofan;
      // presumably because some functions that have a small wire size of their
      // own still need to be allowed to inline some callees.
      // TODO(jkummerow): When TF is gone, remove this adjustment by folding
      // it into the flag's default value.
      constexpr int kTurboshaftAdjustment = 2;
      int high_growth = static_cast<int>(v8_flags.wasm_inlining_factor) +
                        kTurboshaftAdjustment;
      // A value of 1 would be equivalent to disabling inlining entirely.
      constexpr int kLowestUsefulValue = 2;
      int low_growth = std::max(kLowestUsefulValue, high_growth - 3);
      max_growth_factor = low_growth * (1 - scaled) + high_growth * scaled;
      // The {wasm_inlining_budget} value has been tuned for Turbofan node
      // counts. Turboshaft looks at wire bytes instead, and on average there
      // are about 0.74 TF nodes per wire byte, so we apply a small factor to
      // account for the difference, so that we get similar inlining decisions
      // in both compilers.
      // TODO(jkummerow): When TF is gone, remove this factor by folding it
      // into the flag's default value.
      constexpr double kTurboshaftCorrectionFactor = 1.4;
      double high_cap =
          v8_flags.wasm_inlining_budget * kTurboshaftCorrectionFactor;
      double low_cap = high_cap / 10;
      budget_cap = low_cap * (1 - scaled) + high_cap * scaled;
    }

    Zone* zone;
    const WasmModule* module;
    double max_growth_factor;
    size_t budget_cap;
    uint32_t topmost_caller_index;
  };
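  // Worked example (with an assumed --wasm-inlining-budget of 5000, not
  // necessarily the real default): high_cap = 5000 * 1.4 = 7000 and
  // low_cap = 700, so a module with scale factor 0.2 ends up with
  // budget_cap = 700 * 0.8 + 7000 * 0.2 = 1960 wire bytes.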

  InliningTree(Data* shared, uint32_t function_index, int call_count,
               int wire_byte_size, uint32_t caller_index, int feedback_slot,
               int the_case, uint32_t depth)
      : data_(shared),
        function_index_(function_index),
        call_count_(call_count),
        wire_byte_size_(wire_byte_size),
        depth_(depth),
        caller_index_(caller_index),
        feedback_slot_(feedback_slot),
        case_(the_case) {}

  // Recursively expand the tree by expanding this node and children nodes etc.
  // Nodes are prioritized by their `score`. Expansion continues until
  // `kMaxInlinedCount` nodes are expanded or `budget` (in wire-bytes size) is
  // depleted.
  void FullyExpand();

  // Mark this function call as inlined and initialize `function_calls_` based
  // on the `module_->type_feedback`.
  void Inline();
  bool SmallEnoughToInline(size_t initial_wire_byte_size,
                           size_t inlined_wire_byte_count);

  Data* data_;
  uint32_t function_index_;
  int call_count_;
  int wire_byte_size_;
  bool is_inlined_ = false;
  bool feedback_found_ = false;

  base::Vector<CasesPerCallSite> function_calls_;
  base::Vector<bool> has_non_inlineable_targets_;

  uint32_t depth_;

  // For tracing.
  uint32_t caller_index_;
  int feedback_slot_;
  int case_;
};

void InliningTree::Inline() {
  is_inlined_ = true;
  auto& feedback_map = data_->module->type_feedback.feedback_for_function;
  auto feedback_it = feedback_map.find(function_index_);
  if (feedback_it == feedback_map.end()) return;
  const FunctionTypeFeedback& feedback = feedback_it->second;
  base::Vector<CallSiteFeedback> type_feedback =
      feedback.feedback_vector.as_vector();
  if (type_feedback.empty()) return;  // No feedback yet.
  DCHECK_EQ(type_feedback.size(), feedback.call_targets.size());
  feedback_found_ = true;
  function_calls_ =
      data_->zone->AllocateVector<CasesPerCallSite>(type_feedback.size());
  has_non_inlineable_targets_ =
      data_->zone->AllocateVector<bool>(type_feedback.size());
  for (size_t i = 0; i < type_feedback.size(); i++) {
    function_calls_[i] = data_->zone->AllocateVector<InliningTree*>(
        type_feedback[i].num_cases());
    has_non_inlineable_targets_[i] =
        type_feedback[i].has_non_inlineable_targets();
    for (int the_case = 0; the_case < type_feedback[i].num_cases();
         the_case++) {
      uint32_t callee_index = type_feedback[i].function_index(the_case);
      // TODO(jkummerow): Experiment with propagating relative call counts
      // into the nested InliningTree, and weighting scores there accordingly.
      function_calls_[i][the_case] = data_->zone->New<InliningTree>(
          data_, callee_index, type_feedback[i].call_count(the_case),
          data_->module->functions[callee_index].code.length(), function_index_,
          static_cast<int>(i), the_case, depth_ + 1);
    }
  }
}
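
// Illustrative shape of the result: for a function whose feedback vector has
// two call sites, where site #0 is a direct call and site #1 is a call_ref
// observed with two different targets, `Inline()` produces
//   function_calls_[0] == { tree for the direct callee }
//   function_calls_[1] == { tree for target A, tree for target B }
// and each nested tree starts out with `is_inlined_ == false` until
// `FullyExpand()` decides to inline it.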

struct TreeNodeOrdering {
  bool operator()(InliningTree* t1, InliningTree* t2) {
    // Prefer callees with a higher score, and if the scores are equal,
    // those with a lower function index (to make the queue ordering strict).
    return std::make_pair(t1->score(), t2->function_index()) <
           std::make_pair(t2->score(), t1->function_index());
  }
};

void InliningTree::FullyExpand() {
  size_t initial_wire_byte_size =
      data_->module->functions[function_index_].code.length();
  size_t inlined_wire_byte_count = 0;
  std::priority_queue<InliningTree*, std::vector<InliningTree*>,
                      TreeNodeOrdering>
      queue;
  queue.push(this);
  int inlined_count = 0;
  base::MutexGuard mutex_guard(&data_->module->type_feedback.mutex);
  while (!queue.empty() && inlined_count < kMaxInlinedCount) {
    InliningTree* top = queue.top();
    if (v8_flags.trace_wasm_inlining) {
      if (top != this) {
        PrintF(
            "[function %d: in function %d, considering call #%d, case #%d, to "
            "function %d (count=%d, size=%d, score=%" PRId64 ")... ",
            data_->topmost_caller_index, top->caller_index_,
            top->feedback_slot_, static_cast<int>(top->case_),
            static_cast<int>(top->function_index_), top->call_count_,
            top->wire_byte_size_, static_cast<int64_t>(top->score()));
      } else {
        PrintF("[function %d: expanding topmost caller... ",
               data_->topmost_caller_index);
      }
    }
    queue.pop();
    if (top->function_index_ < data_->module->num_imported_functions) {
      if (v8_flags.trace_wasm_inlining && top != this) {
        PrintF("imported function]\n");
      }
      continue;
    }
    if (is_asmjs_module(data_->module)) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("cannot inline asm.js function]\n");
      }
      continue;
    }

    // Key idea: inlining hot calls is good, inlining big functions is bad,
    // so inline when a candidate is "hotter than it is big". Exception:
    // tiny candidates can get inlined regardless of their call count.
    if (top != this && top->wire_byte_size_ >= 12 &&
        !v8_flags.wasm_inlining_ignore_call_counts) {
      if (top->call_count_ < top->wire_byte_size_ / 2) {
        if (v8_flags.trace_wasm_inlining) {
          PrintF("not called often enough]\n");
        }
        continue;
      }
    }
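    // Example of this threshold: a 40-byte candidate needs at least
    // 40 / 2 = 20 observed calls to stay in the running, whereas an 11-byte
    // candidate skips the check entirely because it is below the 12-byte
    // "tiny" cutoff above.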

    if (!top->SmallEnoughToInline(initial_wire_byte_size,
                                  inlined_wire_byte_count)) {
      if (v8_flags.trace_wasm_inlining && top != this) {
        PrintF("not enough inlining budget]\n");
      }
      continue;
    }
    if (v8_flags.trace_wasm_inlining && top != this) {
      PrintF("decided to inline! ");
    }
    top->Inline();
    inlined_count++;
    // For tiny functions, inlining may actually decrease generated code size
    // because we have one less call and don't need to push arguments, etc.
    // Subtract a little bit from the code size increase, such that inlining
    // these tiny functions doesn't use up any of the budget.
    constexpr int kOneLessCall = 6;  // Guesstimated savings per call.
    inlined_wire_byte_count += std::max(top->wire_byte_size_ - kOneLessCall, 0);

    if (!top->feedback_found()) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("no feedback yet or no callees]\n");
      }
    } else if (top->depth_ < kMaxInliningNestingDepth) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("queueing %zu callee(s)]\n", top->function_calls_.size());
      }
      for (CasesPerCallSite cases : top->function_calls_) {
        for (InliningTree* call : cases) {
          if (call != nullptr) {
            queue.push(call);
          }
        }
      }
    } else if (v8_flags.trace_wasm_inlining) {
      PrintF("max inlining depth reached]\n");
    }
  }
  if (v8_flags.trace_wasm_inlining && !queue.empty()) {
    PrintF("[function %d: too many inlining candidates, stopping...]\n",
           data_->topmost_caller_index);
  }
}

// Returns true if there is still enough budget left to inline the current
// candidate given the initial graph size and the already inlined wire bytes.
bool InliningTree::SmallEnoughToInline(size_t initial_wire_byte_size,
                                       size_t inlined_wire_byte_count) {
  if (wire_byte_size_ > static_cast<int>(v8_flags.wasm_inlining_max_size)) {
    return false;
  }
  // For tiny functions, let's be a bit more generous.
  // TODO(dlehmann): Since we don't use up budget (i.e., increase
  // `inlined_wire_byte_count`, see above) for very tiny functions, we might be
  // able to remove/simplify this code in the future.
  if (wire_byte_size_ < 12) {
    if (inlined_wire_byte_count > 100) {
      inlined_wire_byte_count -= 100;
    } else {
      inlined_wire_byte_count = 0;
    }
  }
  // For small-ish functions, the inlining budget is defined by the larger of
  // 1) the wasm_inlining_min_budget and
  // 2) the max_growth_factor * initial_wire_byte_size.
  // Inlining a little bit should always be fine even for tiny functions (1);
  // otherwise (2) makes sure that the budget scales with the original
  // function size, to limit the compile-time increase caused by inlining.
  size_t budget_small_function =
      std::max<size_t>(v8_flags.wasm_inlining_min_budget,
                       data_->max_growth_factor * initial_wire_byte_size);

  // For large functions, growing by the same factor would add too much
  // compilation effort, so we also apply a fixed cap. However, independent
  // of the budget cap, for large functions we should still allow a little
  // inlining, which is why we allow 10% of the graph size as the minimal
  // budget even for large functions that exceed the regular budget.
  //
  // Note for future tuning: it might make sense to allow 20% here, and in
  // turn perhaps lower --wasm-inlining-budget. The drawback is that this
  // would allow truly huge functions to grow even bigger; the benefit is
  // that we wouldn't fall off as steep a cliff when hitting the cap.
  size_t budget_large_function =
      std::max<size_t>(data_->budget_cap, initial_wire_byte_size * 1.1);
  size_t total_size = initial_wire_byte_size + inlined_wire_byte_count +
                      static_cast<size_t>(wire_byte_size_);
  if (v8_flags.trace_wasm_inlining) {
    PrintF("budget=min(%zu, %zu), size %zu->%zu ", budget_small_function,
           budget_large_function,
           (initial_wire_byte_size + inlined_wire_byte_count), total_size);
  }
  return total_size <
         std::min<size_t>(budget_small_function, budget_large_function);
}
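
// Worked example (with assumed numbers rather than the real flag defaults):
// for an 800-byte caller with max_growth_factor = 3.5 and budget_cap = 1960,
// budget_small_function = max(wasm_inlining_min_budget, 3.5 * 800) = 2800
// (assuming the min budget is smaller), and
// budget_large_function = max(1960, 800 * 1.1) = 1960. The effective limit is
// therefore min(2800, 1960) = 1960: inlining stops once the caller plus its
// inlined callees would reach 1960 wire bytes.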

}  // namespace v8::internal::wasm

#endif  // V8_WASM_INLINING_TREE_H_