v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
div-burnikel.cc
Go to the documentation of this file.
1// Copyright 2021 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Burnikel-Ziegler division.
6// Reference: "Fast Recursive Division" by Christoph Burnikel and Joachim
7// Ziegler, found at http://cr.yp.to/bib/1998/burnikel.ps
8
9#include <string.h>
10
14#include "src/bigint/util.h"
16
17namespace v8 {
18namespace bigint {
19
20namespace {
21
22// Compares [a_high, A] with B.
23// Returns:
24// - a value < 0 if [a_high, A] < B
25// - 0 if [a_high, A] == B
26// - a value > 0 if [a_high, A] > B.
27int SpecialCompare(digit_t a_high, Digits A, Digits B) {
28 B.Normalize();
29 int a_len;
30 if (a_high == 0) {
31 A.Normalize();
32 a_len = A.len();
33 } else {
34 a_len = A.len() + 1;
35 }
36 int diff = a_len - B.len();
37 if (diff != 0) return diff;
38 int i = a_len - 1;
39 if (a_high != 0) {
40 if (a_high > B[i]) return 1;
41 if (a_high < B[i]) return -1;
42 i--;
43 }
44 while (i >= 0 && A[i] == B[i]) i--;
45 if (i < 0) return 0;
46 return A[i] > B[i] ? 1 : -1;
47}
48
49void SetOnes(RWDigits X) {
50 memset(X.digits(), 0xFF, X.len() * sizeof(digit_t));
51}
52
53// Since the Burnikel-Ziegler method is inherently recursive, we put
54// non-changing data into a container object.
55class BZ {
56 public:
57 BZ(ProcessorImpl* proc, int scratch_space)
58 : proc_(proc),
59 scratch_mem_(scratch_space >= kBurnikelThreshold ? scratch_space : 0) {}
60
61 void DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B);
62 void D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B);
63 void D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B);
64
65 private:
66 ProcessorImpl* proc_;
67 Storage scratch_mem_;
68};
69
70void BZ::DivideBasecase(RWDigits Q, RWDigits R, Digits A, Digits B) {
71 A.Normalize();
72 B.Normalize();
73 DCHECK(B.len() > 0);
74 int cmp = Compare(A, B);
75 if (cmp <= 0) {
76 Q.Clear();
77 if (cmp == 0) {
78 // If A == B, then Q=1, R=0.
79 R.Clear();
80 Q[0] = 1;
81 } else {
82 // If A < B, then Q=0, R=A.
83 PutAt(R, A, R.len());
84 }
85 return;
86 }
87 if (B.len() == 1) {
88 return proc_->DivideSingle(Q, R.digits(), A, B[0]);
89 }
90 return proc_->DivideSchoolbook(Q, R, A, B);
91}
92
93// Algorithm 2 from the paper. Variable names same as there.
94// Returns Q(uotient) and R(emainder) for A/B, with B having two thirds
95// the size of A = [A1, A2, A3].
96void BZ::D3n2n(RWDigits Q, RWDigits R, Digits A1A2, Digits A3, Digits B) {
97 DCHECK((B.len() & 1) == 0);
98 int n = B.len() / 2;
99 DCHECK(A1A2.len() == 2 * n);
100 // Actual condition is stricter than length: A < B * 2^(kDigitBits * n)
101 DCHECK(Compare(A1A2, B) < 0);
102 DCHECK(A3.len() == n);
103 DCHECK(Q.len() == n);
104 DCHECK(R.len() == 2 * n);
105 // 1. Split A into three parts A = [A1, A2, A3] with Ai < 2^(kDigitBits * n).
106 Digits A1(A1A2, n, n);
107 // 2. Split B into two parts B = [B1, B2] with Bi < 2^(kDigitBits * n).
108 Digits B1(B, n, n);
109 Digits B2(B, 0, n);
110 // 3. Distinguish the cases A1 < B1 or A1 >= B1.
111 RWDigits Qhat = Q;
112 RWDigits R1(R, n, n);
113 digit_t r1_high = 0;
114 if (Compare(A1, B1) < 0) {
115 // 3a. If A1 < B1, compute Qhat = floor([A1, A2] / B1) with remainder R1
116 // using algorithm D2n1n.
117 D2n1n(Qhat, R1, A1A2, B1);
118 if (proc_->should_terminate()) return;
119 } else {
120 // 3b. If A1 >= B1, set Qhat = 2^(kDigitBits * n) - 1 and set
121 // R1 = [A1, A2] - [B1, 0] + [0, B1]
122 SetOnes(Qhat);
123 // Step 1: compute A1 - B1, which can't underflow because of the comparison
124 // guarding this else-branch, and always has a one-digit result because
125 // of this function's preconditions.
126 RWDigits temp = R1;
127 Subtract(temp, A1, B1);
128 temp.Normalize();
129 DCHECK(temp.len() <= 1);
130 if (temp.len() > 0) r1_high = temp[0];
131 // Step 2: compute A2 + B1.
132 Digits A2(A1A2, 0, n);
133 r1_high += AddAndReturnCarry(R1, A2, B1);
134 }
135 // 4. Compute D = Qhat * B2 using (Karatsuba) multiplication.
136 RWDigits D(scratch_mem_.get(), 2 * n);
137 proc_->Multiply(D, Qhat, B2);
138 if (proc_->should_terminate()) return;
139
140 // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D.
141 PutAt(R, A3, n);
142 // 6. As long as Rhat < 0, repeat:
143 while (SpecialCompare(r1_high, R, D) < 0) {
144 // 6a. Rhat = Rhat + B
145 r1_high += AddAndReturnCarry(R, R, B);
146 // 6b. Qhat = Qhat - 1
147 Subtract(Qhat, 1);
148 }
149 // 5. Compute Rhat = R1*2^(kDigitBits * n) + A3 - D = [R1, A3] - D.
150 digit_t borrow = SubtractAndReturnBorrow(R, R, D);
151 DCHECK(borrow == r1_high);
152 DCHECK(Compare(R, B) < 0);
153 (void)borrow;
154 // 7. Return R = Rhat, Q = Qhat.
155}
156
157// Algorithm 1 from the paper. Variable names same as there.
158// Returns Q(uotient) and (R)emainder for A/B, with A twice the size of B.
159void BZ::D2n1n(RWDigits Q, RWDigits R, Digits A, Digits B) {
160 int n = B.len();
161 DCHECK(A.len() <= 2 * n);
162 // A < B * 2^(kDigitsBits * n)
163 DCHECK(Compare(Digits(A, n, n), B) < 0);
164 DCHECK(Q.len() == n);
165 DCHECK(R.len() == n);
166 // 1. If n is odd or smaller than some convenient constant, compute Q and R
167 // by school division and return.
168 if ((n & 1) == 1 || n < kBurnikelThreshold) {
169 return DivideBasecase(Q, R, A, B);
170 }
171 // 2. Split A into four parts A = [A1, ..., A4] with
172 // Ai < 2^(kDigitBits * n / 2). Split B into two parts [B2, B1] with
173 // Bi < 2^(kDigitBits * n / 2).
174 Digits A1A2(A, n, n);
175 Digits A3(A, n / 2, n / 2);
176 Digits A4(A, 0, n / 2);
177 // 3. Compute the high part Q1 of floor(A/B) as
178 // Q1 = floor([A1, A2, A3] / [B1, B2]) with remainder R1 = [R11, R12],
179 // using algorithm D3n2n.
180 RWDigits Q1(Q, n / 2, n / 2);
181 ScratchDigits R1(n);
182 D3n2n(Q1, R1, A1A2, A3, B);
183 if (proc_->should_terminate()) return;
184 // 4. Compute the low part Q2 of floor(A/B) as
185 // Q2 = floor([R11, R12, A4] / [B1, B2]) with remainder R, using
186 // algorithm D3n2n.
187 RWDigits Q2(Q, 0, n / 2);
188 D3n2n(Q2, R, R1, A4, B);
189 // 5. Return Q = [Q1, Q2] and R.
190}
191
192} // namespace
193
194// Algorithm 3 from the paper. Variables names same as there.
195// Returns Q(uotient) and R(emainder) for A/B (no size restrictions).
196// R is optional, Q is not.
198 Digits B) {
199 DCHECK(A.len() >= B.len());
200 DCHECK(R.len() == 0 || R.len() >= B.len());
201 DCHECK(Q.len() > A.len() - B.len());
202 int r = A.len();
203 int s = B.len();
204 // The requirements are:
205 // - n >= s, n as small as possible.
206 // - m must be a power of two.
207 // 1. Set m = min {2^k | 2^k * kBurnikelThreshold > s}.
208 int m = 1 << BitLength(s / kBurnikelThreshold);
209 // 2. Set j = roundup(s/m) and n = j * m.
210 int j = DIV_CEIL(s, m);
211 int n = j * m;
212 // 3. Set sigma = max{tao | 2^tao * B < 2^(kDigitBits * n)}.
213 int sigma = CountLeadingZeros(B[s - 1]);
214 int digit_shift = n - s;
215 // 4. Set B = B * 2^sigma to normalize B. Shift A by the same amount.
216 ScratchDigits B_shifted(n);
217 LeftShift(B_shifted + digit_shift, B, sigma);
218 for (int i = 0; i < digit_shift; i++) B_shifted[i] = 0;
219 B = B_shifted;
220 // We need an extra digit if A's top digit does not have enough space for
221 // the left-shift by {sigma}. Additionally, the top bit of A must be 0
222 // (see "-1" in step 5 below), which combined with B being normalized (i.e.
223 // B's top bit is 1) ensures the preconditions of the helper functions.
224 int extra_digit = CountLeadingZeros(A[r - 1]) < (sigma + 1) ? 1 : 0;
225 r = A.len() + digit_shift + extra_digit;
226 ScratchDigits A_shifted(r);
227 LeftShift(A_shifted + digit_shift, A, sigma);
228 for (int i = 0; i < digit_shift; i++) A_shifted[i] = 0;
229 A = A_shifted;
230 // 5. Set t = min{t >= 2 | A < 2^(kDigitBits * t * n - 1)}.
231 int t = std::max(DIV_CEIL(r, n), 2);
232 // 6. Split A conceptually into t blocks.
233 // 7. Set Z_(t-2) = [A_(t-1), A_(t-2)].
234 int z_len = n * 2;
235 ScratchDigits Z(z_len);
236 PutAt(Z, A + n * (t - 2), z_len);
237 // 8. For i from t-2 downto 0 do:
238 BZ bz(this, n);
239 ScratchDigits Ri(n);
240 {
241 // First iteration unrolled and specialized.
242 // We might not have n digits at the top of Q, so use temporary storage
243 // for Qi...
244 ScratchDigits Qi(n);
245 bz.D2n1n(Qi, Ri, Z, B);
246 if (should_terminate()) return;
247 // ...but there *will* be enough space for any non-zero result digits!
248 Qi.Normalize();
249 RWDigits target = Q + n * (t - 2);
250 DCHECK(Qi.len() <= target.len());
251 PutAt(target, Qi, target.len());
252 }
253 // Now loop over any remaining iterations.
254 for (int i = t - 3; i >= 0; i--) {
255 // 8b. If i > 0, set Z_(i-1) = [Ri, A_(i-1)].
256 // (De-duped with unrolled first iteration, hence reading A_(i).)
257 PutAt(Z + n, Ri, n);
258 PutAt(Z, A + n * i, n);
259 // 8a. Using algorithm D2n1n compute Qi, Ri such that Zi = B*Qi + Ri.
260 RWDigits Qi(Q, i * n, n);
261 bz.D2n1n(Qi, Ri, Z, B);
262 if (should_terminate()) return;
263 }
264 // 9. Return Q = [Q_(t-2), ..., Q_0] and R = R_0 * 2^(-sigma).
265#if DEBUG
266 for (int i = 0; i < digit_shift; i++) {
267 DCHECK(Ri[i] == 0);
268 }
269#endif
270 if (R.len() != 0) {
271 Digits Ri_part(Ri, digit_shift, Ri.len());
272 Ri_part.Normalize();
273 DCHECK(Ri_part.len() <= R.len());
274 RightShift(R, Ri_part, sigma);
275 }
276}
277
278} // namespace bigint
279} // namespace v8
void DivideBurnikelZiegler(RWDigits Q, RWDigits R, Digits A, Digits B)
Storage scratch_mem_
ProcessorImpl * proc_
too high values may cause the compiler to set high thresholds for inlining to as much as possible avoid inlined allocation of objects that cannot escape trace load stores from virtual maglev objects use TurboFan fast string builder analyze liveness of environment slots and zap dead values trace TurboFan load elimination emit data about basic block usage in builtins to this enable builtin reordering when run mksnapshot flag for emit warnings when applying builtin profile data verify register allocation in TurboFan randomly schedule instructions to stress dependency tracking enable store store elimination in TurboFan rewrite far to near simulate GC compiler thread race related to allow float parameters to be passed in simulator mode JS Wasm Run additional turbo_optimize_inlined_js_wasm_wrappers enable experimental feedback collection in generic lowering enable Turboshaft s WasmLoadElimination enable Turboshaft s low level load elimination for JS enable Turboshaft s escape analysis for string concatenation use enable Turbolev features that we want to ship in the not too far future trace individual Turboshaft reduction steps trace intermediate Turboshaft reduction steps invocation count threshold for early optimization Enables optimizations which favor memory size over execution speed Enables sampling allocation profiler with X as a sample interval min size of a semi the new space consists of two semi spaces max size of the Collect garbage after Collect garbage after keeps maps alive for< n > old space garbage collections print one detailed trace line in allocation gc speed threshold for starting incremental marking via a task in percent of available threshold for starting incremental marking immediately in percent of available Use a single schedule for determining a marking schedule between JS and C objects schedules the minor GC task with kUserVisible priority max worker number of concurrent for NumberOfWorkerThreads start background threads that allocate memory concurrent_array_buffer_sweeping use parallel threads to clear weak refs in the atomic pause trace progress of the incremental marking trace object counts and memory usage report a tick only when allocated zone memory changes by this amount TracingFlags::gc_stats TracingFlags::gc_stats track native contexts that are expected to be garbage collected verify heap pointers before and after GC memory reducer runs GC with ReduceMemoryFootprint flag Maximum number of memory reducer GCs scheduled Old gen GC speed is computed directly from gc tracer counters Perform compaction on full GCs based on V8 s default heuristics Perform compaction on every full GC Perform code space compaction when finalizing a full GC with stack Stress GC compaction to flush out bugs with moving objects flush of baseline code when it has not been executed recently Use time base code flushing instead of age Use a progress bar to scan large objects in increments when incremental marking is active force incremental marking for small heaps and run it more often force marking at random points between and X(inclusive) percent " "of the regular marking start limit") DEFINE_INT(stress_scavenge
#define D(Name)
Definition maglev-ir.h:6426
int s
Definition mul-fft.cc:297
int m
Definition mul-fft.cc:294
int r
Definition mul-fft.cc:298
constexpr int BitLength(int n)
Definition util.h:65
void PutAt(RWDigits Z, Digits A, int count)
Definition div-helpers.h:19
void LeftShift(RWDigits Z, Digits X, digit_t shift)
Definition bitwise.cc:136
constexpr int CountLeadingZeros(T value)
Definition util.h:30
void Subtract(RWDigits Z, Digits X, Digits Y)
digit_t SubtractAndReturnBorrow(RWDigits Z, Digits X, Digits Y)
void RightShift(RWDigits Z, Digits X, digit_t shift, const RightShiftState &state)
Definition bitwise.cc:195
digit_t AddAndReturnCarry(RWDigits Z, Digits X, Digits Y)
constexpr int kBurnikelThreshold
uintptr_t digit_t
Definition bigint.h:34
constexpr int B
constexpr int A
int Compare(const T &a, const T &b)
#define DCHECK(condition)
Definition logging.h:482
#define DIV_CEIL(x, y)
Definition util.h:19