v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
div-barrett.cc
Go to the documentation of this file.
1// Copyright 2021 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Barrett division, finding the inverse with Newton's method.
6// Reference: "Fast Division of Large Integers" by Karl Hasselström,
7// found at https://treskal.com/s/masters-thesis.pdf
8
9// Many thanks to Karl Wiberg, k@w5.se, for both writing up an
10// understandable theoretical description of the algorithm and privately
11// providing a demo implementation, on which the implementation in this file is
12// based.
13
14#include <algorithm>
15
20
21namespace v8 {
22namespace bigint {
23
24namespace {
25
26void DcheckIntegerPartRange(Digits X, digit_t min, digit_t max) {
27#if DEBUG
28 digit_t integer_part = X.msd();
29 DCHECK(integer_part >= min);
30 DCHECK(integer_part <= max);
31#else
32 USE(X);
33 USE(min);
34 USE(max);
35#endif
36}
37
38} // namespace
39
40// Z := (the fractional part of) 1/V, via naive division.
41// See comments at {Invert} and {InvertNewton} below for details.
42void ProcessorImpl::InvertBasecase(RWDigits Z, Digits V, RWDigits scratch) {
43 DCHECK(Z.len() > V.len());
44 DCHECK(V.len() > 0);
45 DCHECK(scratch.len() >= 2 * V.len());
46 int n = V.len();
47 RWDigits X(scratch, 0, 2 * n);
48 digit_t borrow = 0;
49 int i = 0;
50 for (; i < n; i++) X[i] = 0;
51 for (; i < 2 * n; i++) X[i] = digit_sub2(0, V[i - n], borrow, &borrow);
52 DCHECK(borrow == 1);
53 RWDigits R(nullptr, 0); // We don't need the remainder.
54 if (n < kBurnikelThreshold) {
55 DivideSchoolbook(Z, R, X, V);
56 } else {
58 }
59}
60
61// This is Algorithm 4.2 from the paper.
62// Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to
63// V.len+1 digits. The V.len low digits of the result digits will be written
64// to Z, plus there is an implicit top digit with value 1.
65// Needs InvertNewtonScratchSpace(V.len) of scratch space.
66// The result is either correct or off by one (about half the time it is
67// correct, half the time it is one too much, and in the corner case where V is
68// minimal and the implicit top digit would have to be 2 it is one too little).
69// Barrett's division algorithm can handle that, so we don't care.
70void ProcessorImpl::InvertNewton(RWDigits Z, Digits V, RWDigits scratch) {
71 const int vn = V.len();
72 DCHECK(Z.len() >= vn);
73 DCHECK(scratch.len() >= InvertNewtonScratchSpace(vn));
74 const int kSOffset = 0;
75 const int kWOffset = 0; // S and W can share their scratch space.
76 const int kUOffset = vn + kInvertNewtonExtraSpace;
77
78 // The base case won't work otherwise.
79 DCHECK(V.len() >= 3);
80
81 constexpr int kBasecasePrecision = kNewtonInversionThreshold - 1;
82 // V must have more digits than the basecase.
83 DCHECK(V.len() > kBasecasePrecision);
85
86 // Step (1): Setup.
87 // Calculate precision required at each step.
88 // {k} is the number of fraction bits for the current iteration.
89 int k = vn * kDigitBits;
90 int target_fraction_bits[8 * sizeof(vn)]; // "k_i" in the paper.
91 int iteration = -1; // "i" in the paper, except inverted to run downwards.
92 while (k > kBasecasePrecision * kDigitBits) {
93 iteration++;
94 target_fraction_bits[iteration] = k;
95 k = DIV_CEIL(k, 2);
96 }
97 // At this point, k <= kBasecasePrecision*kDigitBits is the number of
98 // fraction bits to use in the base case. {iteration} is the highest index
99 // in use for f[].
100
101 // Step (2): Initial approximation.
102 int initial_digits = DIV_CEIL(k + 1, kDigitBits);
103 Digits top_part_of_v(V, vn - initial_digits, initial_digits);
104 InvertBasecase(Z, top_part_of_v, scratch);
105 Z[initial_digits] = Z[initial_digits] + 1; // Implicit top digit.
106 // From now on, we'll keep Z.len updated to the part that's already computed.
107 Z.set_len(initial_digits + 1);
108
109 // Step (3): Precision doubling loop.
110 while (true) {
111 DcheckIntegerPartRange(Z, 1, 2);
112
113 // (3b): S = Z^2
114 RWDigits S(scratch, kSOffset, 2 * Z.len());
115 Multiply(S, Z, Z);
116 if (should_terminate()) return;
117 S.TrimOne(); // Top digit of S is unused.
118 DcheckIntegerPartRange(S, 1, 4);
119
120 // (3c): T = V, truncated so that at least 2k+3 fraction bits remain.
121 int fraction_digits = DIV_CEIL(2 * k + 3, kDigitBits);
122 int t_len = std::min(V.len(), fraction_digits);
123 Digits T(V, V.len() - t_len, t_len);
124
125 // (3d): U = T * S, truncated so that at least 2k+1 fraction bits remain
126 // (U has one integer digit, which might be zero).
127 fraction_digits = DIV_CEIL(2 * k + 1, kDigitBits);
128 RWDigits U(scratch, kUOffset, S.len() + T.len());
129 DCHECK(U.len() > fraction_digits);
130 Multiply(U, S, T);
131 if (should_terminate()) return;
132 U = U + (U.len() - (1 + fraction_digits));
133 DcheckIntegerPartRange(U, 0, 3);
134
135 // (3e): W = 2 * Z, padded with "0" fraction bits so that it has the
136 // same number of fraction bits as U.
137 DCHECK(U.len() >= Z.len());
138 RWDigits W(scratch, kWOffset, U.len());
139 int padding_digits = U.len() - Z.len();
140 for (int i = 0; i < padding_digits; i++) W[i] = 0;
141 LeftShift(W + padding_digits, Z, 1);
142 DcheckIntegerPartRange(W, 2, 4);
143
144 // (3f): Z = W - U.
145 // This check is '<=' instead of '<' because U's top digit is its
146 // integer part, and we want vn fraction digits.
147 if (U.len() <= vn) {
148 // Normal subtraction.
149 // This is not the last iteration.
150 DCHECK(iteration > 0);
151 Z.set_len(U.len());
152 digit_t borrow = SubtractAndReturnBorrow(Z, W, U);
153 DCHECK(borrow == 0);
154 USE(borrow);
155 DcheckIntegerPartRange(Z, 1, 2);
156 } else {
157 // Truncate some least significant digits so that we get vn
158 // fraction digits, and compute the integer digit separately.
159 // This is the last iteration.
160 DCHECK(iteration == 0);
161 Z.set_len(vn);
162 Digits W_part(W, W.len() - vn - 1, vn);
163 Digits U_part(U, U.len() - vn - 1, vn);
164 digit_t borrow = SubtractAndReturnBorrow(Z, W_part, U_part);
165 digit_t integer_part = W.msd() - U.msd() - borrow;
166 DCHECK(integer_part == 1 || integer_part == 2);
167 if (integer_part == 2) {
168 // This is the rare case where the correct result would be 2.0, but
169 // since we can't express that by returning only the fractional part
170 // with an implicit 1-digit, we have to return [1.]9999... instead.
171 for (int i = 0; i < Z.len(); i++) Z[i] = ~digit_t{0};
172 }
173 break;
174 }
175 // (3g, 3h): Update local variables and loop.
176 k = target_fraction_bits[iteration];
177 iteration--;
178 }
179}
180
181// Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to
182// V.len+1 digits. The V.len low digits of the result digits will be written
183// to Z, plus there is an implicit top digit with value 1.
184// (Corner case: if V is minimal, the implicit digit should be 2; in that case
185// we return one less than the correct answer. DivideBarrett can handle that.)
186// Needs InvertScratchSpace(V.len) digits of scratch space.
187void ProcessorImpl::Invert(RWDigits Z, Digits V, RWDigits scratch) {
188 DCHECK(Z.len() > V.len());
189 DCHECK(V.len() >= 1);
191 DCHECK(scratch.len() >= InvertScratchSpace(V.len()));
192
193 int vn = V.len();
194 if (vn >= kNewtonInversionThreshold) {
195 return InvertNewton(Z, V, scratch);
196 }
197 if (vn == 1) {
198 digit_t d = V[0];
199 digit_t dummy_remainder;
200 Z[0] = digit_div(~d, ~digit_t{0}, d, &dummy_remainder);
201 Z[1] = 0;
202 } else {
203 InvertBasecase(Z, V, scratch);
204 if (Z[vn] == 1) {
205 for (int i = 0; i < vn; i++) Z[i] = ~digit_t{0};
206 Z[vn] = 0;
207 }
208 }
209}
210
211// This is algorithm 3.5 from the paper.
212// Computes Q(uotient) and R(emainder) for A/B using I, which is a
213// precomputed approximation of 1/B (e.g. with Invert() above).
214// Needs DivideBarrettScratchSpace(A.len) scratch space.
215void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B,
216 Digits I, RWDigits scratch) {
217 DCHECK(Q.len() > A.len() - B.len());
218 DCHECK(R.len() >= B.len());
219 DCHECK(A.len() > B.len()); // Careful: This is *not* '>=' !
220 DCHECK(A.len() <= 2 * B.len());
221 DCHECK(B.len() > 0);
223 DCHECK(I.len() == A.len() - B.len());
224 DCHECK(scratch.len() >= DivideBarrettScratchSpace(A.len()));
225
226 int orig_q_len = Q.len();
227
228 // (1): A1 = A with B.len fewer digits.
229 Digits A1 = A + B.len();
230 DCHECK(A1.len() == I.len());
231
232 // (2): Q = A1*I with I.len fewer digits.
233 // {I} has an implicit high digit with value 1, so we add {A1} to the high
234 // part of the multiplication result.
235 RWDigits K(scratch, 0, 2 * I.len());
236 Multiply(K, A1, I);
237 if (should_terminate()) return;
238 Q.set_len(I.len() + 1);
239 Add(Q, K + I.len(), A1);
240 // K is no longer used, can reuse {scratch} for P.
241
242 // (3): R = A - B*Q (approximate remainder).
243 RWDigits P(scratch, 0, A.len() + 1);
244 Multiply(P, B, Q);
245 if (should_terminate()) return;
246 digit_t borrow = SubtractAndReturnBorrow(R, A, Digits(P, 0, B.len()));
247 // R may be allocated wider than B, zero out any extra digits if so.
248 for (int i = B.len(); i < R.len(); i++) R[i] = 0;
249 digit_t r_high = A[B.len()] - P[B.len()] - borrow;
250
251 // Adjust R and Q so that they become the correct remainder and quotient.
252 // The number of iterations is guaranteed to be at most some very small
253 // constant, unless the caller gave us a bad approximate quotient.
254 if (r_high >> (kDigitBits - 1) == 1) {
255 // (5b): R < 0, so R += B
256 digit_t q_sub = 0;
257 do {
258 r_high += AddAndReturnCarry(R, R, B);
259 q_sub++;
260 DCHECK(q_sub <= 5);
261 } while (r_high != 0);
262 Subtract(Q, q_sub);
263 } else {
264 digit_t q_add = 0;
265 while (r_high != 0 || GreaterThanOrEqual(R, B)) {
266 // (5c): R >= B, so R -= B
267 r_high -= SubtractAndReturnBorrow(R, R, B);
268 q_add++;
269 DCHECK(q_add <= 5);
270 }
271 Add(Q, q_add);
272 }
273 // (5a): Return.
274 int final_q_len = Q.len();
275 Q.set_len(orig_q_len);
276 for (int i = final_q_len; i < orig_q_len; i++) Q[i] = 0;
277}
278
279// Computes Q(uotient) and R(emainder) for A/B, using Barrett division.
280void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B) {
281 DCHECK(Q.len() > A.len() - B.len());
282 DCHECK(R.len() >= B.len());
283 DCHECK(A.len() > B.len()); // Careful: This is *not* '>=' !
284 DCHECK(B.len() > 0);
285
286 // Normalize B, and shift A by the same amount.
287 ShiftedDigits b_normalized(B);
288 ShiftedDigits a_normalized(A, b_normalized.shift());
289 // Keep the code below more concise.
290 B = b_normalized;
291 A = a_normalized;
292
293 // The core DivideBarrett function above only supports A having at most
294 // twice as many digits as B. We generalize this to arbitrary inputs
295 // similar to Burnikel-Ziegler division by performing a t-by-1 division
296 // of B-sized chunks. It's easy to special-case the situation where we
297 // don't need to bother.
298 int barrett_dividend_length = A.len() <= 2 * B.len() ? A.len() : 2 * B.len();
299 int i_len = barrett_dividend_length - B.len();
300 ScratchDigits I(i_len + 1); // +1 is for temporary use by Invert().
301 int scratch_len =
302 std::max(InvertScratchSpace(i_len),
303 DivideBarrettScratchSpace(barrett_dividend_length));
304 ScratchDigits scratch(scratch_len);
305 Invert(I, Digits(B, B.len() - i_len, i_len), scratch);
306 if (should_terminate()) return;
307 I.TrimOne();
308 DCHECK(I.len() == i_len);
309 if (A.len() > 2 * B.len()) {
310 // This follows the variable names and and algorithmic steps of
311 // DivideBurnikelZiegler().
312 int n = B.len(); // Chunk length.
313 // (5): {t} is the number of B-sized chunks of A.
314 int t = DIV_CEIL(A.len(), n);
315 DCHECK(t >= 3);
316 // (6)/(7): Z is used for the current 2-chunk block to be divided by B,
317 // initialized to the two topmost chunks of A.
318 int z_len = n * 2;
319 ScratchDigits Z(z_len);
320 PutAt(Z, A + n * (t - 2), z_len);
321 // (8): For i from t-2 downto 0 do
322 int qi_len = n + 1;
323 ScratchDigits Qi(qi_len);
324 ScratchDigits Ri(n);
325 // First iteration unrolled and specialized.
326 {
327 int i = t - 2;
328 DivideBarrett(Qi, Ri, Z, B, I, scratch);
329 if (should_terminate()) return;
330 RWDigits target = Q + n * i;
331 // In the first iteration, all qi_len = n + 1 digits may be used.
332 int to_copy = std::min(qi_len, target.len());
333 for (int j = 0; j < to_copy; j++) target[j] = Qi[j];
334 for (int j = to_copy; j < target.len(); j++) target[j] = 0;
335#if DEBUG
336 for (int j = to_copy; j < Qi.len(); j++) {
337 DCHECK(Qi[j] == 0);
338 }
339#endif
340 }
341 // Now loop over any remaining iterations.
342 for (int i = t - 3; i >= 0; i--) {
343 // (8b): If i > 0, set Z_(i-1) = [Ri, A_(i-1)].
344 // (De-duped with unrolled first iteration, hence reading A_(i).)
345 PutAt(Z + n, Ri, n);
346 PutAt(Z, A + n * i, n);
347 // (8a): Compute Qi, Ri such that Zi = B*Qi + Ri.
348 DivideBarrett(Qi, Ri, Z, B, I, scratch);
349 DCHECK(Qi[qi_len - 1] == 0);
350 if (should_terminate()) return;
351 // (9): Return Q = [Q_(t-2), ..., Q_0]...
352 PutAt(Q + n * i, Qi, n);
353 }
354 Ri.Normalize();
355 DCHECK(Ri.len() <= R.len());
356 // (9): ...and R = R_0 * 2^(-leading_zeros).
357 RightShift(R, Ri, b_normalized.shift());
358 } else {
359 DivideBarrett(Q, R, A, B, I, scratch);
360 if (should_terminate()) return;
361 RightShift(R, R, b_normalized.shift());
362 }
363}
364
365} // namespace bigint
366} // namespace v8
#define V(Name)
#define T
void DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A, Digits B)
void Multiply(RWDigits Z, Digits X, Digits Y)
void DivideSchoolbook(RWDigits Q, RWDigits R, Digits A, Digits B)
too high values may cause the compiler to set high thresholds for inlining to as much as possible avoid inlined allocation of objects that cannot escape trace load stores from virtual maglev objects use TurboFan fast string builder analyze liveness of environment slots and zap dead values trace TurboFan load elimination emit data about basic block usage in builtins to this enable builtin reordering when run mksnapshot flag for emit warnings when applying builtin profile data verify register allocation in TurboFan randomly schedule instructions to stress dependency tracking enable store store elimination in TurboFan rewrite far to near simulate GC compiler thread race related to allow float parameters to be passed in simulator mode JS Wasm Run additional turbo_optimize_inlined_js_wasm_wrappers enable experimental feedback collection in generic lowering enable Turboshaft s WasmLoadElimination enable Turboshaft s low level load elimination for JS enable Turboshaft s escape analysis for string concatenation use enable Turbolev features that we want to ship in the not too far future trace individual Turboshaft reduction steps trace intermediate Turboshaft reduction steps invocation count threshold for early optimization Enables optimizations which favor memory size over execution speed Enables sampling allocation profiler with X as a sample interval min size of a semi the new space consists of two semi spaces max size of the Collect garbage after Collect garbage after keeps maps alive for< n > old space garbage collections print one detailed trace line in allocation gc speed threshold for starting incremental marking via a task in percent of available threshold for starting incremental marking immediately in percent of available Use a single schedule for determining a marking schedule between JS and C objects schedules the minor GC task with kUserVisible priority max worker number of concurrent for NumberOfWorkerThreads start background threads that allocate memory concurrent_array_buffer_sweeping use parallel threads to clear weak refs in the atomic pause trace progress of the incremental marking trace object counts and memory usage report a tick only when allocated zone memory changes by this amount TracingFlags::gc_stats TracingFlags::gc_stats track native contexts that are expected to be garbage collected verify heap pointers before and after GC memory reducer runs GC with ReduceMemoryFootprint flag Maximum number of memory reducer GCs scheduled Old gen GC speed is computed directly from gc tracer counters Perform compaction on full GCs based on V8 s default heuristics Perform compaction on every full GC Perform code space compaction when finalizing a full GC with stack Stress GC compaction to flush out bugs with moving objects flush of baseline code when it has not been executed recently Use time base code flushing instead of age Use a progress bar to scan large objects in increments when incremental marking is active force incremental marking for small heaps and run it more often force marking at random points between and X(inclusive) percent " "of the regular marking start limit") DEFINE_INT(stress_scavenge
int K
Definition mul-fft.cc:295
int n
Definition mul-fft.cc:296
static digit_t digit_div(digit_t high, digit_t low, digit_t divisor, digit_t *remainder)
void PutAt(RWDigits Z, Digits A, int count)
Definition div-helpers.h:19
bool IsBitNormalized(Digits X)
void LeftShift(RWDigits Z, Digits X, digit_t shift)
Definition bitwise.cc:136
static constexpr int kDigitBits
Definition bigint.h:51
constexpr int InvertNewtonScratchSpace(int n)
void Add(RWDigits Z, Digits X, Digits Y)
bool GreaterThanOrEqual(Digits A, Digits B)
constexpr int kInvertNewtonExtraSpace
void Subtract(RWDigits Z, Digits X, Digits Y)
digit_t digit_sub2(digit_t a, digit_t b, digit_t borrow_in, digit_t *borrow_out)
digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y)
void RightShift(RWDigits Z, Digits X, digit_t shift, const RightShiftState &state)
Definition bitwise.cc:195
digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y)
constexpr int kNewtonInversionThreshold
constexpr int kBurnikelThreshold
constexpr int InvertScratchSpace(int n)
constexpr int DivideBarrettScratchSpace(int n)
uintptr_t digit_t
Definition bigint.h:34
constexpr int W
constexpr int B
constexpr int U
constexpr int S
constexpr int A
#define P(name, number_of_args, result_size)
Definition runtime.cc:22
#define I(name, number_of_args, result_size)
Definition runtime.cc:36
#define DCHECK(condition)
Definition logging.h:482
#define USE(...)
Definition macros.h:293
#define DIV_CEIL(x, y)
Definition util.h:19