v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
mul-toom.cc
Go to the documentation of this file.
1// Copyright 2021 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Toom-Cook multiplication.
6// Reference: https://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication
7
8#include <algorithm>
9
13
14namespace v8 {
15namespace bigint {
16
17namespace {
18
19void TimesTwo(RWDigits X) {
20 digit_t carry = 0;
21 for (int i = 0; i < X.len(); i++) {
22 digit_t d = X[i];
23 X[i] = (d << 1) | carry;
24 carry = d >> (kDigitBits - 1);
25 }
26}
27
28void DivideByTwo(RWDigits X) {
29 digit_t carry = 0;
30 for (int i = X.len() - 1; i >= 0; i--) {
31 digit_t d = X[i];
32 X[i] = (d >> 1) | carry;
33 carry = d << (kDigitBits - 1);
34 }
35}
36
37void DivideByThree(RWDigits X) {
39 for (int i = X.len() - 1; i >= 0; i--) {
40 digit_t d = X[i];
41 digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits);
42 digit_t u_result = upper / 3;
43 remainder = upper - 3 * u_result;
45 digit_t l_result = lower / 3;
46 remainder = lower - 3 * l_result;
47 X[i] = (u_result << kHalfDigitBits) | l_result;
48 }
49}
50
51} // namespace
52
53#if DEBUG
54// Set {len_} to 1 rather than 0 so that attempts to access the first digit
55// will crash.
56#define MARK_INVALID(D) D = RWDigits(nullptr, 1)
57#else
58#define MARK_INVALID(D) (void(0))
59#endif
60
61void ProcessorImpl::Toom3Main(RWDigits Z, Digits X, Digits Y) {
62 DCHECK(Z.len() >= X.len() + Y.len());
63 // Phase 1: Splitting.
64 int i = DIV_CEIL(std::max(X.len(), Y.len()), 3);
65 Digits X0(X, 0, i);
66 Digits X1(X, i, i);
67 Digits X2(X, 2 * i, i);
68 Digits Y0(Y, 0, i);
69 Digits Y1(Y, i, i);
70 Digits Y2(Y, 2 * i, i);
71
72 // Temporary storage.
73 int p_len = i + 1; // For all px, qx below.
74 int r_len = 2 * p_len; // For all r_x, Rx below.
75 Storage temp_storage(4 * r_len);
76 // We will use the same variable names as the Wikipedia article, as much as
77 // C++ lets us: our "p_m1" is their "p(-1)" etc. For consistency with other
78 // algorithms, we use X and Y where Wikipedia uses m and n.
79 // We will use and reuse the temporary storage as follows:
80 //
81 // chunk | -------- time ----------->
82 // [0 .. i] |( po )( p_m1 ) ( r_m2 )
83 // [i+1 .. rlen-1] |( qo )( q_m1 ) ( r_m2 )
84 // [rlen .. rlen+i] | (p_1 ) ( p_m2 ) (r_inf)
85 // [rlen+i+1 .. 2*rlen-1] | (q_1 ) ( q_m2 ) (r_inf)
86 // [2*rlen .. 3*rlen-1] | ( r_1 )
87 // [3*rlen .. 4*rlen-1] | ( r_m1 )
88 //
89 // This requires interleaving phases 2 and 3 a bit: after computing
90 // r_1 = p_1 * q_1, we can reuse p_1's storage for p_m2, and so on.
91 digit_t* t = temp_storage.get();
92 RWDigits po(t, p_len);
93 RWDigits qo(t + p_len, p_len);
94 RWDigits p_1(t + r_len, p_len);
95 RWDigits q_1(t + r_len + p_len, p_len);
96 RWDigits r_1(t + 2 * r_len, r_len);
97 RWDigits r_m1(t + 3 * r_len, r_len);
98
99 // We can also share the backing stores of Z, r_0, R0.
100 DCHECK(Z.len() >= r_len);
101 RWDigits r_0(Z, 0, r_len);
102
103 // Phase 2a: Evaluation, steps 0, 1, m1.
104 // po = X0 + X2
105 Add(po, X0, X2);
106 // p_0 = X0
107 // p_1 = po + X1
108 Add(p_1, po, X1);
109 // p_m1 = po - X1
110 RWDigits p_m1 = po;
111 bool p_m1_sign = SubtractSigned(p_m1, po, false, X1, false);
112 MARK_INVALID(po);
113
114 // qo = Y0 + Y2
115 Add(qo, Y0, Y2);
116 // q_0 = Y0
117 // q_1 = qo + Y1
118 Add(q_1, qo, Y1);
119 // q_m1 = qo - Y1
120 RWDigits q_m1 = qo;
121 bool q_m1_sign = SubtractSigned(q_m1, qo, false, Y1, false);
122 MARK_INVALID(qo);
123
124 // Phase 3a: Pointwise multiplication, steps 0, 1, m1.
125 Multiply(r_0, X0, Y0);
126 Multiply(r_1, p_1, q_1);
127 Multiply(r_m1, p_m1, q_m1);
128 bool r_m1_sign = p_m1_sign != q_m1_sign;
129
130 // Phase 2b: Evaluation, steps m2 and inf.
131 // p_m2 = (p_m1 + X2) * 2 - X0
132 RWDigits p_m2 = p_1;
133 MARK_INVALID(p_1);
134 bool p_m2_sign = AddSigned(p_m2, p_m1, p_m1_sign, X2, false);
135 TimesTwo(p_m2);
136 p_m2_sign = SubtractSigned(p_m2, p_m2, p_m2_sign, X0, false);
137 // p_inf = X2
138
139 // q_m2 = (q_m1 + Y2) * 2 - Y0
140 RWDigits q_m2 = q_1;
141 MARK_INVALID(q_1);
142 bool q_m2_sign = AddSigned(q_m2, q_m1, q_m1_sign, Y2, false);
143 TimesTwo(q_m2);
144 q_m2_sign = SubtractSigned(q_m2, q_m2, q_m2_sign, Y0, false);
145 // q_inf = Y2
146
147 // Phase 3b: Pointwise multiplication, steps m2 and inf.
148 RWDigits r_m2(t, r_len);
149 MARK_INVALID(p_m1);
150 MARK_INVALID(q_m1);
151 Multiply(r_m2, p_m2, q_m2);
152 bool r_m2_sign = p_m2_sign != q_m2_sign;
153
154 RWDigits r_inf(t + r_len, r_len);
155 MARK_INVALID(p_m2);
156 MARK_INVALID(q_m2);
157 Multiply(r_inf, X2, Y2);
158
159 // Phase 4: Interpolation.
160 Digits R0 = r_0;
161 Digits R4 = r_inf;
162 // R3 <- (r_m2 - r_1) / 3
163 RWDigits R3 = r_m2;
164 bool R3_sign = SubtractSigned(R3, r_m2, r_m2_sign, r_1, false);
165 DivideByThree(R3);
166 // R1 <- (r_1 - r_m1) / 2
167 RWDigits R1 = r_1;
168 bool R1_sign = SubtractSigned(R1, r_1, false, r_m1, r_m1_sign);
169 DivideByTwo(R1);
170 // R2 <- r_m1 - r_0
171 RWDigits R2 = r_m1;
172 bool R2_sign = SubtractSigned(R2, r_m1, r_m1_sign, R0, false);
173 // R3 <- (R2 - R3) / 2 + 2 * r_inf
174 R3_sign = SubtractSigned(R3, R2, R2_sign, R3, R3_sign);
175 DivideByTwo(R3);
176 // TODO(jkummerow): Would it be a measurable improvement to write an
177 // "AddTwice" helper?
178 R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
179 R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
180 // R2 <- R2 + R1 - R4
181 R2_sign = AddSigned(R2, R2, R2_sign, R1, R1_sign);
182 R2_sign = SubtractSigned(R2, R2, R2_sign, R4, false);
183 // R1 <- R1 - R3
184 R1_sign = SubtractSigned(R1, R1, R1_sign, R3, R3_sign);
185
186#if DEBUG
187 R1.Normalize();
188 R2.Normalize();
189 R3.Normalize();
190 DCHECK(R1_sign == false || R1.len() == 0);
191 DCHECK(R2_sign == false || R2.len() == 0);
192 DCHECK(R3_sign == false || R3.len() == 0);
193#endif
194
195 // Phase 5: Recomposition. R0 is already in place. Overflow can't happen.
196 for (int j = R0.len(); j < Z.len(); j++) Z[j] = 0;
197 AddAndReturnOverflow(Z + i, R1);
198 AddAndReturnOverflow(Z + 2 * i, R2);
199 AddAndReturnOverflow(Z + 3 * i, R3);
200 AddAndReturnOverflow(Z + 4 * i, R4);
201}
202
203void ProcessorImpl::MultiplyToomCook(RWDigits Z, Digits X, Digits Y) {
204 DCHECK(X.len() >= Y.len());
205 int k = Y.len();
206 // TODO(jkummerow): Would it be a measurable improvement to share the
207 // scratch memory for several invocations?
208 Digits X0(X, 0, k);
209 Toom3Main(Z, X0, Y);
210 if (X.len() > Y.len()) {
211 ScratchDigits T(2 * k);
212 for (int i = k; i < X.len(); i += k) {
213 Digits Xi(X, i, k);
214 // TODO(jkummerow): would it be a measurable improvement to craft a
215 // "ToomChunk" method in the style of {KaratsubaChunk}?
216 Toom3Main(T, Xi, Y);
217 AddAndReturnOverflow(Z + i, T); // Can't overflow.
218 }
219 }
220}
221
222} // namespace bigint
223} // namespace v8
#define T
void Multiply(RWDigits Z, Digits X, Digits Y)
too high values may cause the compiler to set high thresholds for inlining to as much as possible avoid inlined allocation of objects that cannot escape trace load stores from virtual maglev objects use TurboFan fast string builder analyze liveness of environment slots and zap dead values trace TurboFan load elimination emit data about basic block usage in builtins to this enable builtin reordering when run mksnapshot flag for emit warnings when applying builtin profile data verify register allocation in TurboFan randomly schedule instructions to stress dependency tracking enable store store elimination in TurboFan rewrite far to near simulate GC compiler thread race related to allow float parameters to be passed in simulator mode JS Wasm Run additional turbo_optimize_inlined_js_wasm_wrappers enable experimental feedback collection in generic lowering enable Turboshaft s WasmLoadElimination enable Turboshaft s low level load elimination for JS enable Turboshaft s escape analysis for string concatenation use enable Turbolev features that we want to ship in the not too far future trace individual Turboshaft reduction steps trace intermediate Turboshaft reduction steps invocation count threshold for early optimization Enables optimizations which favor memory size over execution speed Enables sampling allocation profiler with X as a sample interval min size of a semi the new space consists of two semi spaces max size of the Collect garbage after Collect garbage after keeps maps alive for< n > old space garbage collections print one detailed trace line in allocation gc speed threshold for starting incremental marking via a task in percent of available threshold for starting incremental marking immediately in percent of available Use a single schedule for determining a marking schedule between JS and C objects schedules the minor GC task with kUserVisible priority max worker number of concurrent for NumberOfWorkerThreads start background threads that allocate memory concurrent_array_buffer_sweeping use parallel threads to clear weak refs in the atomic pause trace progress of the incremental marking trace object counts and memory usage report a tick only when allocated zone memory changes by this amount TracingFlags::gc_stats TracingFlags::gc_stats track native contexts that are expected to be garbage collected verify heap pointers before and after GC memory reducer runs GC with ReduceMemoryFootprint flag Maximum number of memory reducer GCs scheduled Old gen GC speed is computed directly from gc tracer counters Perform compaction on full GCs based on V8 s default heuristics Perform compaction on every full GC Perform code space compaction when finalizing a full GC with stack Stress GC compaction to flush out bugs with moving objects flush of baseline code when it has not been executed recently Use time base code flushing instead of age Use a progress bar to scan large objects in increments when incremental marking is active force incremental marking for small heaps and run it more often force marking at random points between and X(inclusive) percent " "of the regular marking start limit") DEFINE_INT(stress_scavenge
double remainder
#define MARK_INVALID(D)
Definition mul-toom.cc:58
static constexpr int kHalfDigitBits
static constexpr int kDigitBits
Definition bigint.h:51
static constexpr digit_t kHalfDigitMask
void Add(RWDigits Z, Digits X, Digits Y)
bool SubtractSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, bool y_negative)
bool AddSigned(RWDigits Z, Digits X, bool x_negative, Digits Y, bool y_negative)
digit_t AddAndReturnOverflow(RWDigits Z, Digits X)
uintptr_t digit_t
Definition bigint.h:34
#define DCHECK(condition)
Definition logging.h:482
#define DIV_CEIL(x, y)
Definition util.h:19