mul-toom_8cc_source.html

// Copyright 2021 the V8 project authors. All rights reserved.

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.


// Toom-Cook multiplication.

// Reference: https://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication


#include <algorithm>


#include "src/bigint/bigint-internal.h"

#include "src/bigint/digit-arithmetic.h"

#include "src/bigint/vector-arithmetic.h"


namespace v8 {

namespace bigint {


namespace {


void TimesTwo(RWDigits X) {

  digit_t carry = 0;

  for (int i = 0; i < X.len(); i++) {

    digit_t d = X[i];

    X[i] = (d << 1) | carry;

    carry = d >> (kDigitBits - 1);

  }

}


void DivideByTwo(RWDigits X) {

  digit_t carry = 0;

  for (int i = X.len() - 1; i >= 0; i--) {

    digit_t d = X[i];

    X[i] = (d >> 1) | carry;

    carry = d << (kDigitBits - 1);

  }

}


void DivideByThree(RWDigits X) {

  digit_t remainder = 0;

  for (int i = X.len() - 1; i >= 0; i--) {

    digit_t d = X[i];

    digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits);

    digit_t u_result = upper / 3;

    remainder = upper - 3 * u_result;

    digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask);

    digit_t l_result = lower / 3;

    remainder = lower - 3 * l_result;

    X[i] = (u_result << kHalfDigitBits) | l_result;

  }

}


}  // namespace


#if DEBUG

// Set {len_} to 1 rather than 0 so that attempts to access the first digit

// will crash.

#define MARK_INVALID(D) D = RWDigits(nullptr, 1)

#else

#define MARK_INVALID(D) (void(0))

#endif


void ProcessorImpl::Toom3Main(RWDigits Z, Digits X, Digits Y) {

  DCHECK(Z.len() >= X.len() + Y.len());

  // Phase 1: Splitting.

  int i = DIV_CEIL(std::max(X.len(), Y.len()), 3);

  Digits X0(X, 0, i);

  Digits X1(X, i, i);

  Digits X2(X, 2 * i, i);

  Digits Y0(Y, 0, i);

  Digits Y1(Y, i, i);

  Digits Y2(Y, 2 * i, i);


  // Temporary storage.

  int p_len = i + 1;      // For all px, qx below.

  int r_len = 2 * p_len;  // For all r_x, Rx below.

  Storage temp_storage(4 * r_len);

  // We will use the same variable names as the Wikipedia article, as much as

  // C++ lets us: our "p_m1" is their "p(-1)" etc. For consistency with other

  // algorithms, we use X and Y where Wikipedia uses m and n.

  // We will use and reuse the temporary storage as follows:

  //

  //   chunk                  | -------- time ----------->

  //   [0 .. i]               |( po )( p_m1 ) ( r_m2  )

  //   [i+1 .. rlen-1]        |( qo )( q_m1 ) ( r_m2  )

  //   [rlen .. rlen+i]       | (p_1 ) ( p_m2 ) (r_inf)

  //   [rlen+i+1 .. 2*rlen-1] | (q_1 ) ( q_m2 ) (r_inf)

  //   [2*rlen .. 3*rlen-1]   |      (   r_1          )

  //   [3*rlen .. 4*rlen-1]   |             (  r_m1   )

  //

  // This requires interleaving phases 2 and 3 a bit: after computing

  // r_1 = p_1 * q_1, we can reuse p_1's storage for p_m2, and so on.

  digit_t* t = temp_storage.get();

  RWDigits po(t, p_len);

  RWDigits qo(t + p_len, p_len);

  RWDigits p_1(t + r_len, p_len);

  RWDigits q_1(t + r_len + p_len, p_len);

  RWDigits r_1(t + 2 * r_len, r_len);

  RWDigits r_m1(t + 3 * r_len, r_len);


  // We can also share the  backing stores of Z, r_0, R0.

  DCHECK(Z.len() >= r_len);

  RWDigits r_0(Z, 0, r_len);


  // Phase 2a: Evaluation, steps 0, 1, m1.

  // po = X0 + X2

  Add(po, X0, X2);

  // p_0 = X0

  // p_1 = po + X1

  Add(p_1, po, X1);

  // p_m1 = po - X1

  RWDigits p_m1 = po;

  bool p_m1_sign = SubtractSigned(p_m1, po, false, X1, false);

  MARK_INVALID(po);


  // qo = Y0 + Y2

  Add(qo, Y0, Y2);

  // q_0 = Y0

  // q_1 = qo + Y1

  Add(q_1, qo, Y1);

  // q_m1 = qo - Y1

  RWDigits q_m1 = qo;

  bool q_m1_sign = SubtractSigned(q_m1, qo, false, Y1, false);

  MARK_INVALID(qo);


  // Phase 3a: Pointwise multiplication, steps 0, 1, m1.

  Multiply(r_0, X0, Y0);

  Multiply(r_1, p_1, q_1);

  Multiply(r_m1, p_m1, q_m1);

  bool r_m1_sign = p_m1_sign != q_m1_sign;


  // Phase 2b: Evaluation, steps m2 and inf.

  // p_m2 = (p_m1 + X2) * 2 - X0

  RWDigits p_m2 = p_1;

  MARK_INVALID(p_1);

  bool p_m2_sign = AddSigned(p_m2, p_m1, p_m1_sign, X2, false);

  TimesTwo(p_m2);

  p_m2_sign = SubtractSigned(p_m2, p_m2, p_m2_sign, X0, false);

  // p_inf = X2


  // q_m2 = (q_m1 + Y2) * 2 - Y0

  RWDigits q_m2 = q_1;

  MARK_INVALID(q_1);

  bool q_m2_sign = AddSigned(q_m2, q_m1, q_m1_sign, Y2, false);

  TimesTwo(q_m2);

  q_m2_sign = SubtractSigned(q_m2, q_m2, q_m2_sign, Y0, false);

  // q_inf = Y2


  // Phase 3b: Pointwise multiplication, steps m2 and inf.

  RWDigits r_m2(t, r_len);

  MARK_INVALID(p_m1);

  MARK_INVALID(q_m1);

  Multiply(r_m2, p_m2, q_m2);

  bool r_m2_sign = p_m2_sign != q_m2_sign;


  RWDigits r_inf(t + r_len, r_len);

  MARK_INVALID(p_m2);

  MARK_INVALID(q_m2);

  Multiply(r_inf, X2, Y2);


  // Phase 4: Interpolation.

  Digits R0 = r_0;

  Digits R4 = r_inf;

  // R3 <- (r_m2 - r_1) / 3

  RWDigits R3 = r_m2;

  bool R3_sign = SubtractSigned(R3, r_m2, r_m2_sign, r_1, false);

  DivideByThree(R3);

  // R1 <- (r_1 - r_m1) / 2

  RWDigits R1 = r_1;

  bool R1_sign = SubtractSigned(R1, r_1, false, r_m1, r_m1_sign);

  DivideByTwo(R1);

  // R2 <- r_m1 - r_0

  RWDigits R2 = r_m1;

  bool R2_sign = SubtractSigned(R2, r_m1, r_m1_sign, R0, false);

  // R3 <- (R2 - R3) / 2 + 2 * r_inf

  R3_sign = SubtractSigned(R3, R2, R2_sign, R3, R3_sign);

  DivideByTwo(R3);

  // TODO(jkummerow): Would it be a measurable improvement to write an

  // "AddTwice" helper?

  R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);

  R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);

  // R2 <- R2 + R1 - R4

  R2_sign = AddSigned(R2, R2, R2_sign, R1, R1_sign);

  R2_sign = SubtractSigned(R2, R2, R2_sign, R4, false);

  // R1 <- R1 - R3

  R1_sign = SubtractSigned(R1, R1, R1_sign, R3, R3_sign);


#if DEBUG

  R1.Normalize();

  R2.Normalize();

  R3.Normalize();

  DCHECK(R1_sign == false || R1.len() == 0);

  DCHECK(R2_sign == false || R2.len() == 0);

  DCHECK(R3_sign == false || R3.len() == 0);

#endif


  // Phase 5: Recomposition. R0 is already in place. Overflow can't happen.

  for (int j = R0.len(); j < Z.len(); j++) Z[j] = 0;

  AddAndReturnOverflow(Z + i, R1);

  AddAndReturnOverflow(Z + 2 * i, R2);

  AddAndReturnOverflow(Z + 3 * i, R3);

  AddAndReturnOverflow(Z + 4 * i, R4);

}


void ProcessorImpl::MultiplyToomCook(RWDigits Z, Digits X, Digits Y) {

  DCHECK(X.len() >= Y.len());

  int k = Y.len();

  // TODO(jkummerow): Would it be a measurable improvement to share the

  // scratch memory for several invocations?

  Digits X0(X, 0, k);

  Toom3Main(Z, X0, Y);

  if (X.len() > Y.len()) {

    ScratchDigits T(2 * k);

    for (int i = k; i < X.len(); i += k) {

      Digits Xi(X, i, k);

      // TODO(jkummerow): would it be a measurable improvement to craft a

      // "ToomChunk" method in the style of {KaratsubaChunk}?

      Toom3Main(T, Xi, Y);

      AddAndReturnOverflow(Z + i, T);  // Can't overflow.

    }

  }

}


}  // namespace bigint

}  // namespace v8

T
#define T

bigint-internal.h

v8::bigint::ProcessorImpl::Multiply
void Multiply(RWDigits Z, Digits X, Digits Y)
Definition bigint-internal.cc:36

digit-arithmetic.h

X
too high values may cause the compiler to set high thresholds for inlining to as much as possible avoid inlined allocation of objects that cannot escape trace load stores from virtual maglev objects use TurboFan fast string builder analyze liveness of environment slots and zap dead values trace TurboFan load elimination emit data about basic block usage in builtins to this enable builtin reordering when run mksnapshot flag for emit warnings when applying builtin profile data verify register allocation in TurboFan randomly schedule instructions to stress dependency tracking enable store store elimination in TurboFan rewrite far to near simulate GC compiler thread race related to allow float parameters to be passed in simulator mode JS Wasm Run additional turbo_optimize_inlined_js_wasm_wrappers enable experimental feedback collection in generic lowering enable Turboshaft s WasmLoadElimination enable Turboshaft s low level load elimination for JS enable Turboshaft s escape analysis for string concatenation use enable Turbolev features that we want to ship in the not too far future trace individual Turboshaft reduction steps trace intermediate Turboshaft reduction steps invocation count threshold for early optimization Enables optimizations which favor memory size over execution speed Enables sampling allocation profiler with X as a sample interval min size of a semi the new space consists of two semi spaces max size of the Collect garbage after Collect garbage after keeps maps alive for< n > old space garbage collections print one detailed trace line in allocation gc speed threshold for starting incremental marking via a task in percent of available threshold for starting incremental marking immediately in percent of available Use a single schedule for determining a marking schedule between JS and C objects schedules the minor GC task with kUserVisible priority max worker number of concurrent for NumberOfWorkerThreads start background threads that allocate memory concurrent_array_buffer_sweeping use parallel threads to clear weak refs in the atomic pause trace progress of the incremental marking trace object counts and memory usage report a tick only when allocated zone memory changes by this amount TracingFlags::gc_stats TracingFlags::gc_stats track native contexts that are expected to be garbage collected verify heap pointers before and after GC memory reducer runs GC with ReduceMemoryFootprint flag Maximum number of memory reducer GCs scheduled Old gen GC speed is computed directly from gc tracer counters Perform compaction on full GCs based on V8 s default heuristics Perform compaction on every full GC Perform code space compaction when finalizing a full GC with stack Stress GC compaction to flush out bugs with moving objects flush of baseline code when it has not been executed recently Use time base code flushing instead of age Use a progress bar to scan large objects in increments when incremental marking is active force incremental marking for small heaps and run it more often force marking at random points between and X(inclusive) percent " "of the regular marking start limit") DEFINE_INT(stress_scavenge

remainder
double remainder
Definition js-temporal-objects.cc:108

MARK_INVALID
#define MARK_INVALID(D)
Definition mul-toom.cc:58

v8::bigint::kHalfDigitBits
static constexpr int kHalfDigitBits
Definition digit-arithmetic.h:16

v8::bigint::kDigitBits
static constexpr int kDigitBits
Definition bigint.h:51

v8::bigint::kHalfDigitMask
static constexpr digit_t kHalfDigitMask
Definition digit-arithmetic.h:18

v8::bigint::Add
void Add(RWDigits Z, Digits X, Digits Y)
Definition vector-arithmetic.cc:41

v8::bigint::SubtractSigned
bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, bool y_negative)
Definition vector-arithmetic.cc:107

v8::bigint::AddSigned
bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, bool y_negative)
Definition vector-arithmetic.cc:93

v8::bigint::AddAndReturnOverflow
digit_t AddAndReturnOverflow(RWDigits Z, Digits X)
Definition vector-arithmetic.cc:13

v8::bigint::digit_t
uintptr_t digit_t
Definition bigint.h:34

v8::internal
Definition api-arguments-inl.h:20

v8::internal::carry
@ carry
Definition assembler-ia32.h:77

v8
Definition api-arguments-inl.h:19

DCHECK
#define DCHECK(condition)
Definition logging.h:482

DIV_CEIL
#define DIV_CEIL(x, y)
Definition util.h:19

vector-arithmetic.h