inlining-tree.h
// Copyright 2023 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_WASM_INLINING_TREE_H_
#define V8_WASM_INLINING_TREE_H_

#if !V8_ENABLE_WEBASSEMBLY
#error This header should only be included if WebAssembly is enabled.
#endif  // !V8_ENABLE_WEBASSEMBLY

#include <cinttypes>
#include <cstdint>
#include <queue>
#include <vector>

#include "src/utils/utils.h"
#include "src/wasm/wasm-module.h"
#include "src/zone/zone.h"

namespace v8::internal::wasm {

// Represents a tree of inlining decisions.
// A node in the tree represents a function frame, and `function_calls_`
// represents all direct/call_ref/call_indirect function calls in this frame.
// Each element of `function_calls_` is itself a `Vector` of `InliningTree`s,
// corresponding to the different speculative candidates for a
// call_ref/call_indirect; for a direct call, it has a single element.
// If a transitive element of `function_calls_` has its `is_inlined_` field
// set, it should be inlined into the caller.
// We need this additional data structure for Turboshaft because nodes in the
// Turboshaft IR aren't easily expanded incrementally; therefore, all inlining
// decisions are made on this abstracted form of the code before graph
// building.
class InliningTree : public ZoneObject {
 private:
  struct Data;

 public:
  using CasesPerCallSite = base::Vector<InliningTree*>;

  static InliningTree* CreateRoot(Zone* zone, const WasmModule* module,
                                  uint32_t function_index) {
    InliningTree* tree = zone->New<InliningTree>(
        zone->New<Data>(zone, module, function_index), function_index,
        0,  // Call count.
        0,  // Wire byte size. `0` causes the root node to always get
            // expanded, regardless of budget.
        -1, -1, -1,  // Caller, feedback slot, case.
        0            // Inlining depth.
    );
    tree->FullyExpand();
    return tree;
  }
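  // Rough usage sketch (illustrative only; the real call sites live in V8's
  // Turboshaft wasm compiler): create the root for the function being
  // compiled, then consult the expanded tree when lowering its calls.
  //
  //   InliningTree* tree = InliningTree::CreateRoot(zone, module, func_index);
  //   for (CasesPerCallSite cases : tree->function_calls()) {
  //     for (InliningTree* candidate : cases) {
  //       if (candidate != nullptr && candidate->is_inlined()) {
  //         // Emit the callee's body instead of a call.
  //       }
  //     }
  //   }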

  // This should stay roughly in sync with the full logic below, but not rely
  // on having observed any call counts. Since it therefore can't simulate
  // regular behavior accurately anyway, it may be a very coarse approximation.
  static int NoLiftoffBudget(const WasmModule* module, uint32_t func_index) {
    size_t wirebytes = module->functions[func_index].code.length();
    double scaled = BudgetScaleFactor(module);
    // TODO(jkummerow): When TF is gone, remove this adjustment by folding
    // it into the flag's default value.
    constexpr int kTurboshaftAdjustment = 2;
    int high_growth =
        static_cast<int>(v8_flags.wasm_inlining_factor) + kTurboshaftAdjustment;
    constexpr int kLowestUsefulValue = 2;
    int low_growth = std::max(kLowestUsefulValue, high_growth - 3);
    double max_growth_factor = low_growth * (1 - scaled) + high_growth * scaled;
    return std::max(static_cast<int>(v8_flags.wasm_inlining_min_budget),
                    static_cast<int>(max_growth_factor * wirebytes));
  }
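  // Worked example (with assumed inputs, not necessarily the real flag
  // defaults): for --wasm-inlining-factor=3, a scale factor of 0.5, and a
  // 200-byte function: high_growth = 3 + 2 = 5, low_growth = max(2, 5 - 3) = 2,
  // max_growth_factor = 2 * 0.5 + 5 * 0.5 = 3.5, so the returned budget is
  // max(wasm_inlining_min_budget, 3.5 * 200 = 700).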

  int64_t score() const {
    // Note that the zero-point is arbitrary. Functions with negative score
    // can still get inlined.
    constexpr int count_factor = 2;
    constexpr int size_factor = 3;
    return int64_t{call_count_} * count_factor -
           int64_t{wire_byte_size_} * size_factor;
  }
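  // Worked example: a call site observed 100 times whose callee is 50 wire
  // bytes scores 100 * 2 - 50 * 3 = 50, while a rarely-called large callee
  // (10 calls, 200 bytes) scores 10 * 2 - 200 * 3 = -580 and is therefore
  // expanded much later (or not at all) by `FullyExpand`.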

  // TODO(dlehmann,manoskouk): We are running into this limit, e.g., for the
  // "argon2-wasm" benchmark.
  // IIUC, this limit is in place because of the encoding of inlining IDs in
  // a 6-bit bitfield in Turboshaft IR, which we should revisit.
  static constexpr int kMaxInlinedCount = 60;

  // Limit the nesting depth of inlining. Inlining decisions are based on call
  // counts. A small function with high call counts that is called recursively
  // would be inlined until all budget is used.
  // TODO(14108): This still might not lead to ideal results. Other options
  // could be explored like penalizing nested inlinees.
  static constexpr uint32_t kMaxInliningNestingDepth = 7;

  base::Vector<CasesPerCallSite> function_calls() { return function_calls_; }
  base::Vector<bool> has_non_inlineable_targets() {
    return has_non_inlineable_targets_;
  }
  bool feedback_found() { return feedback_found_; }
  bool is_inlined() { return is_inlined_; }
  uint32_t function_index() { return function_index_; }

 private:
  friend class v8::internal::Zone;  // For `zone->New<InliningTree>`.

  static double BudgetScaleFactor(const WasmModule* module) {
    // If there are few small functions, that indicates that the toolchain
    // already performed significant inlining, so we reduce the budget
    // significantly as further inlining has diminishing benefits.
    // For both major knobs, we apply a smoothened step function based on
    // the module's percentage of small functions (sfp):
    // sfp <= 25%: use "low" budget
    // sfp >= 50%: use "high" budget
    // 25% < sfp < 50%: interpolate linearly between both budgets.
    double small_function_percentage =
        module->num_small_functions * 100.0 / module->num_declared_functions;
    if (small_function_percentage <= 25) {
      return 0;
    } else if (small_function_percentage >= 50) {
      return 1;
    } else {
      return (small_function_percentage - 25) / 25;
    }
  }
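  // Worked example: a module with 30 small functions out of 100 declared
  // functions has sfp = 30%, which falls into the interpolation range and
  // yields (30 - 25) / 25 = 0.2, i.e. a budget close to the "low" end.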

  struct Data {
    Data(Zone* zone, const WasmModule* module, uint32_t topmost_caller_index)
        : zone(zone),
          module(module),
          topmost_caller_index(topmost_caller_index) {
      double scaled = BudgetScaleFactor(module);
      // We found experimentally that we need to allow a larger growth factor
      // for Turboshaft to achieve similar inlining decisions as in Turbofan;
      // presumably because some functions that have a small wire size of their
      // own still need to be allowed to inline some callees.
      // TODO(jkummerow): When TF is gone, remove this adjustment by folding
      // it into the flag's default value.
      constexpr int kTurboshaftAdjustment = 2;
      int high_growth = static_cast<int>(v8_flags.wasm_inlining_factor) +
                        kTurboshaftAdjustment;
      // A value of 1 would be equivalent to disabling inlining entirely.
      constexpr int kLowestUsefulValue = 2;
      int low_growth = std::max(kLowestUsefulValue, high_growth - 3);
      max_growth_factor = low_growth * (1 - scaled) + high_growth * scaled;
      // The {wasm_inlining_budget} value has been tuned for Turbofan node
      // counts. Turboshaft looks at wire bytes instead, and on average there
      // are about 0.74 TF nodes per wire byte, so we apply a small factor to
      // account for the difference, so that we get similar inlining decisions
      // in both compilers.
      // TODO(jkummerow): When TF is gone, remove this factor by folding it
      // into the flag's default value.
      constexpr double kTurboshaftCorrectionFactor = 1.4;
      double high_cap =
          v8_flags.wasm_inlining_budget * kTurboshaftCorrectionFactor;
      double low_cap = high_cap / 10;
      budget_cap = low_cap * (1 - scaled) + high_cap * scaled;
    }

    Zone* zone;
    const WasmModule* module;
    double max_growth_factor;
    size_t budget_cap;
    uint32_t topmost_caller_index;
  };
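  // Worked example (with an assumed --wasm-inlining-budget of 5000, not
  // necessarily the real default): high_cap = 5000 * 1.4 = 7000 and
  // low_cap = 700, so a module with scale factor 0.2 ends up with
  // budget_cap = 700 * 0.8 + 7000 * 0.2 = 1960 wire bytes.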

  InliningTree(Data* shared, uint32_t function_index, int call_count,
               int wire_byte_size, uint32_t caller_index, int feedback_slot,
               int the_case, uint32_t depth)
      : data_(shared),
        function_index_(function_index),
        call_count_(call_count),
        wire_byte_size_(wire_byte_size),
        depth_(depth),
        caller_index_(caller_index),
        feedback_slot_(feedback_slot),
        case_(the_case) {}

  // Recursively expand the tree by expanding this node and children nodes etc.
  // Nodes are prioritized by their `score`. Expansion continues until
  // `kMaxInlinedCount` nodes are expanded or `budget` (in wire-bytes size) is
  // depleted.
  void FullyExpand();

  // Mark this function call as inlined and initialize `function_calls_` based
  // on the `module_->type_feedback`.
  void Inline();
  bool SmallEnoughToInline(size_t initial_wire_byte_size,
                           size_t inlined_wire_byte_count);

  Data* data_;
  uint32_t function_index_;
  int call_count_;
  int wire_byte_size_;
  bool is_inlined_ = false;
  bool feedback_found_ = false;

  base::Vector<CasesPerCallSite> function_calls_;
  base::Vector<bool> has_non_inlineable_targets_;

  uint32_t depth_;

  // For tracing.
  uint32_t caller_index_;
  int feedback_slot_;
  int case_;
};

void InliningTree::Inline() {
  is_inlined_ = true;
  auto& feedback_map = data_->module->type_feedback.feedback_for_function;
  auto feedback_it = feedback_map.find(function_index_);
  if (feedback_it == feedback_map.end()) return;
  const FunctionTypeFeedback& feedback = feedback_it->second;
  base::Vector<CallSiteFeedback> type_feedback =
      feedback.feedback_vector.as_vector();
  if (type_feedback.empty()) return;  // No feedback yet.
  DCHECK_EQ(type_feedback.size(), feedback.call_targets.size());
  feedback_found_ = true;
  function_calls_ =
      data_->zone->AllocateVector<CasesPerCallSite>(type_feedback.size());
  has_non_inlineable_targets_ =
      data_->zone->AllocateVector<bool>(type_feedback.size());
  for (size_t i = 0; i < type_feedback.size(); i++) {
    function_calls_[i] = data_->zone->AllocateVector<InliningTree*>(
        type_feedback[i].num_cases());
    has_non_inlineable_targets_[i] =
        type_feedback[i].has_non_inlineable_targets();
    for (int the_case = 0; the_case < type_feedback[i].num_cases();
         the_case++) {
      uint32_t callee_index = type_feedback[i].function_index(the_case);
      // TODO(jkummerow): Experiment with propagating relative call counts
      // into the nested InliningTree, and weighting scores there accordingly.
      function_calls_[i][the_case] = data_->zone->New<InliningTree>(
          data_, callee_index, type_feedback[i].call_count(the_case),
          data_->module->functions[callee_index].code.length(), function_index_,
          static_cast<int>(i), the_case, depth_ + 1);
    }
  }
}
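
// Illustrative shape of the result: for a function whose feedback vector has
// two call sites, where site #0 is a direct call and site #1 is a call_ref
// observed with two different targets, `Inline()` produces
//   function_calls_[0] == { tree for the direct callee }
//   function_calls_[1] == { tree for target A, tree for target B }
// and each nested tree starts out with `is_inlined_ == false` until
// `FullyExpand()` decides to inline it.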

struct TreeNodeOrdering {
  bool operator()(InliningTree* t1, InliningTree* t2) {
    // Prefer callees with a higher score, and if the scores are equal,
    // those with a lower function index (to make the queue ordering strict).
    return std::make_pair(t1->score(), t2->function_index()) <
           std::make_pair(t2->score(), t1->function_index());
  }
};

void InliningTree::FullyExpand() {
  size_t initial_wire_byte_size =
      data_->module->functions[function_index_].code.length();
  size_t inlined_wire_byte_count = 0;
  std::priority_queue<InliningTree*, std::vector<InliningTree*>,
                      TreeNodeOrdering>
      queue;
  queue.push(this);
  int inlined_count = 0;
  base::MutexGuard mutex_guard(&data_->module->type_feedback.mutex);
  while (!queue.empty() && inlined_count < kMaxInlinedCount) {
    InliningTree* top = queue.top();
    if (v8_flags.trace_wasm_inlining) {
      if (top != this) {
        PrintF(
            "[function %d: in function %d, considering call #%d, case #%d, to "
            "function %d (count=%d, size=%d, score=%" PRId64 ")... ",
            data_->topmost_caller_index, top->caller_index_,
            top->feedback_slot_, static_cast<int>(top->case_),
            static_cast<int>(top->function_index_), top->call_count_,
            top->wire_byte_size_, static_cast<int64_t>(top->score()));
      } else {
        PrintF("[function %d: expanding topmost caller... ",
               data_->topmost_caller_index);
      }
    }
    queue.pop();
    if (top->function_index_ < data_->module->num_imported_functions) {
      if (v8_flags.trace_wasm_inlining && top != this) {
        PrintF("imported function]\n");
      }
      continue;
    }
    if (is_asmjs_module(data_->module)) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("cannot inline asm.js function]\n");
      }
      continue;
    }

    // Key idea: inlining hot calls is good, inlining big functions is bad,
    // so inline when a candidate is "hotter than it is big". Exception:
    // tiny candidates can get inlined regardless of their call count.
    if (top != this && top->wire_byte_size_ >= 12 &&
        !v8_flags.wasm_inlining_ignore_call_counts) {
      if (top->call_count_ < top->wire_byte_size_ / 2) {
        if (v8_flags.trace_wasm_inlining) {
          PrintF("not called often enough]\n");
        }
        continue;
      }
    }
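    // Example of this threshold: a 40-byte candidate needs at least
    // 40 / 2 = 20 observed calls to stay in the running, whereas an 11-byte
    // candidate skips the check entirely because it is below the 12-byte
    // "tiny" cutoff above.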

    if (!top->SmallEnoughToInline(initial_wire_byte_size,
                                  inlined_wire_byte_count)) {
      if (v8_flags.trace_wasm_inlining && top != this) {
        PrintF("not enough inlining budget]\n");
      }
      continue;
    }
    if (v8_flags.trace_wasm_inlining && top != this) {
      PrintF("decided to inline! ");
    }
    top->Inline();
    inlined_count++;
    // For tiny functions, inlining may actually decrease generated code size
    // because we have one less call and don't need to push arguments, etc.
    // Subtract a little bit from the code size increase, such that inlining
    // these tiny functions doesn't use up any of the budget.
    constexpr int kOneLessCall = 6;  // Guesstimated savings per call.
    inlined_wire_byte_count += std::max(top->wire_byte_size_ - kOneLessCall, 0);

    if (!top->feedback_found()) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("no feedback yet or no callees]\n");
      }
    } else if (top->depth_ < kMaxInliningNestingDepth) {
      if (v8_flags.trace_wasm_inlining) {
        PrintF("queueing %zu callee(s)]\n", top->function_calls_.size());
      }
      for (CasesPerCallSite cases : top->function_calls_) {
        for (InliningTree* call : cases) {
          if (call != nullptr) {
            queue.push(call);
          }
        }
      }
    } else if (v8_flags.trace_wasm_inlining) {
      PrintF("max inlining depth reached]\n");
    }
  }
  if (v8_flags.trace_wasm_inlining && !queue.empty()) {
    PrintF("[function %d: too many inlining candidates, stopping...]\n",
           data_->topmost_caller_index);
  }
}

// Returns true if there is still enough budget left to inline the current
// candidate given the initial graph size and the already inlined wire bytes.
bool InliningTree::SmallEnoughToInline(size_t initial_wire_byte_size,
                                       size_t inlined_wire_byte_count) {
  if (wire_byte_size_ > static_cast<int>(v8_flags.wasm_inlining_max_size)) {
    return false;
  }
  // For tiny functions, let's be a bit more generous.
  // TODO(dlehmann): Since we don't use up budget (i.e., increase
  // `inlined_wire_byte_count`, see above) for very tiny functions, we might be
  // able to remove/simplify this code in the future.
  if (wire_byte_size_ < 12) {
    if (inlined_wire_byte_count > 100) {
      inlined_wire_byte_count -= 100;
    } else {
      inlined_wire_byte_count = 0;
    }
  }
  // For small-ish functions, the inlining budget is defined by the larger of
  // 1) the wasm_inlining_min_budget and
  // 2) the max_growth_factor * initial_wire_byte_size.
  // Inlining a little bit should always be fine even for tiny functions (1);
  // otherwise (2) makes sure that the budget scales with the original
  // function size, to limit the compile-time increase caused by inlining.
  size_t budget_small_function =
      std::max<size_t>(v8_flags.wasm_inlining_min_budget,
                       data_->max_growth_factor * initial_wire_byte_size);

  // For large functions, growing by the same factor would add too much
  // compilation effort, so we also apply a fixed cap. However, independent
  // of the budget cap, for large functions we should still allow a little
  // inlining, which is why we allow 10% of the graph size as the minimal
  // budget even for large functions that exceed the regular budget.
  //
  // Note for future tuning: it might make sense to allow 20% here, and in
  // turn perhaps lower --wasm-inlining-budget. The drawback is that this
  // would allow truly huge functions to grow even bigger; the benefit is
  // that we wouldn't fall off as steep a cliff when hitting the cap.
  size_t budget_large_function =
      std::max<size_t>(data_->budget_cap, initial_wire_byte_size * 1.1);
  size_t total_size = initial_wire_byte_size + inlined_wire_byte_count +
                      static_cast<size_t>(wire_byte_size_);
  if (v8_flags.trace_wasm_inlining) {
    PrintF("budget=min(%zu, %zu), size %zu->%zu ", budget_small_function,
           budget_large_function,
           (initial_wire_byte_size + inlined_wire_byte_count), total_size);
  }
  return total_size <
         std::min<size_t>(budget_small_function, budget_large_function);
}
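
// Worked example (with assumed numbers rather than the real flag defaults):
// for an 800-byte caller with max_growth_factor = 3.5 and budget_cap = 1960,
// budget_small_function = max(wasm_inlining_min_budget, 3.5 * 800) = 2800
// (assuming the min budget is smaller), and
// budget_large_function = max(1960, 800 * 1.1) = 1960. The effective limit is
// therefore min(2800, 1960) = 1960: inlining stops once the caller plus its
// inlined callees would reach 1960 wire bytes.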

}  // namespace v8::internal::wasm

#endif  // V8_WASM_INLINING_TREE_H_