1 // Copyright 2021 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/bigint/bigint-internal.h"
6 #include "src/bigint/vector-arithmetic.h"
7
8 namespace v8 {
9 namespace bigint {
10
11 // The classic algorithm: for every part, multiply the accumulator with
12 // the appropriate multiplier, and add the part. O(n²) overall.
FromStringClassic(RWDigits Z,FromStringAccumulator * accumulator)13 void ProcessorImpl::FromStringClassic(RWDigits Z,
14 FromStringAccumulator* accumulator) {
15 // We always have at least one part to process.
16 DCHECK(accumulator->stack_parts_used_ > 0);
17 Z[0] = accumulator->stack_parts_[0];
18 RWDigits already_set(Z, 0, 1);
19 for (int i = 1; i < Z.len(); i++) Z[i] = 0;
20
21 // The {FromStringAccumulator} uses stack-allocated storage for the first
22 // few parts; if heap storage is used at all then all parts are copied there.
23 int num_stack_parts = accumulator->stack_parts_used_;
24 if (num_stack_parts == 1) return;
25 const std::vector<digit_t>& heap_parts = accumulator->heap_parts_;
26 int num_heap_parts = static_cast<int>(heap_parts.size());
27 // All multipliers are the same, except possibly for the last.
28 const digit_t max_multiplier = accumulator->max_multiplier_;
29
30 if (num_heap_parts == 0) {
31 for (int i = 1; i < num_stack_parts - 1; i++) {
32 MultiplySingle(Z, already_set, max_multiplier);
33 Add(Z, accumulator->stack_parts_[i]);
34 already_set.set_len(already_set.len() + 1);
35 }
36 MultiplySingle(Z, already_set, accumulator->last_multiplier_);
37 Add(Z, accumulator->stack_parts_[num_stack_parts - 1]);
38 return;
39 }
40 // Parts are stored on the heap.
41 for (int i = 1; i < num_heap_parts - 1; i++) {
42 MultiplySingle(Z, already_set, max_multiplier);
43 Add(Z, accumulator->heap_parts_[i]);
44 already_set.set_len(already_set.len() + 1);
45 }
46 MultiplySingle(Z, already_set, accumulator->last_multiplier_);
47 Add(Z, accumulator->heap_parts_.back());
48 }
49
50 // The fast algorithm: combine parts in a balanced-binary-tree like order:
51 // Multiply-and-add neighboring pairs of parts, then loop, until only one
52 // part is left. The benefit is that the multiplications will have inputs of
53 // similar sizes, which makes them amenable to fast multiplication algorithms.
54 // We have to do more multiplications than the classic algorithm though,
55 // because we also have to multiply the multipliers.
56 // Optimizations:
57 // - We can skip the multiplier for the first part, because we never need it.
58 // - Most multipliers are the same; we can avoid repeated multiplications and
59 // just copy the previous result. (In theory we could even de-dupe them, but
60 // as the parts/multipliers grow, we'll need most of the memory anyway.)
61 // Copied results are marked with a * below.
62 // - We can re-use memory using a system of three buffers whose usage rotates:
63 // - one is considered empty, and is overwritten with the new parts,
64 // - one holds the multipliers (and will be "empty" in the next round), and
65 // - one initially holds the parts and is overwritten with the new multipliers
66 // Parts and multipliers both grow in each iteration, and get fewer, so we
67 // use the space of two adjacent old chunks for one new chunk.
68 // Since the {heap_parts_} vectors has the right size, and so does the
69 // result {Z}, we can use that memory, and only need to allocate one scratch
70 // vector. If the final result ends up in the wrong bucket, we have to copy it
71 // to the correct one.
72 // - We don't have to keep track of the positions and sizes of the chunks,
73 // because we can deduce their precise placement from the iteration index.
74 //
75 // Example, assuming digit_t is 4 bits, fitting one decimal digit:
76 // Initial state:
77 // parts_: 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
78 // multipliers_: 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
79 // After the first iteration of the outer loop:
80 // parts: 12 34 56 78 90 12 34 5
81 // multipliers: 100 *100 *100 *100 *100 *100 10
82 // After the second iteration:
83 // parts: 1234 5678 9012 345
84 // multipliers: 10000 *10000 1000
85 // After the third iteration:
86 // parts: 12345678 9012345
87 // multipliers: 10000000
88 // And then there's an obvious last iteration.
FromStringLarge(RWDigits Z,FromStringAccumulator * accumulator)89 void ProcessorImpl::FromStringLarge(RWDigits Z,
90 FromStringAccumulator* accumulator) {
91 int num_parts = static_cast<int>(accumulator->heap_parts_.size());
92 DCHECK(num_parts >= 2);
93 DCHECK(Z.len() >= num_parts);
94 RWDigits parts(accumulator->heap_parts_.data(), num_parts);
95 Storage multipliers_storage(num_parts);
96 RWDigits multipliers(multipliers_storage.get(), num_parts);
97 RWDigits temp(Z, 0, num_parts);
98 // Unrolled and specialized first iteration: part_len == 1, so instead of
99 // Digits sub-vectors we have individual digit_t values, and the multipliers
100 // are known up front.
101 {
102 digit_t max_multiplier = accumulator->max_multiplier_;
103 digit_t last_multiplier = accumulator->last_multiplier_;
104 RWDigits new_parts = temp;
105 RWDigits new_multipliers = parts;
106 int i = 0;
107 for (; i + 1 < num_parts; i += 2) {
108 digit_t p_in = parts[i];
109 digit_t p_in2 = parts[i + 1];
110 digit_t m_in = max_multiplier;
111 digit_t m_in2 = i == num_parts - 2 ? last_multiplier : max_multiplier;
112 // p[j] = p[i] * m[i+1] + p[i+1]
113 digit_t p_high;
114 digit_t p_low = digit_mul(p_in, m_in2, &p_high);
115 digit_t carry;
116 new_parts[i] = digit_add2(p_low, p_in2, &carry);
117 new_parts[i + 1] = p_high + carry;
118 // m[j] = m[i] * m[i+1]
119 if (i > 0) {
120 if (i > 2 && m_in2 != last_multiplier) {
121 new_multipliers[i] = new_multipliers[i - 2];
122 new_multipliers[i + 1] = new_multipliers[i - 1];
123 } else {
124 digit_t m_high;
125 new_multipliers[i] = digit_mul(m_in, m_in2, &m_high);
126 new_multipliers[i + 1] = m_high;
127 }
128 }
129 }
130 // Trailing last part (if {num_parts} was odd).
131 if (i < num_parts) {
132 new_parts[i] = parts[i];
133 new_multipliers[i] = last_multiplier;
134 i += 2;
135 }
136 num_parts = i >> 1;
137 RWDigits new_temp = multipliers;
138 parts = new_parts;
139 multipliers = new_multipliers;
140 temp = new_temp;
141 AddWorkEstimate(num_parts);
142 }
143 int part_len = 2;
144
145 // Remaining iterations.
146 while (num_parts > 1) {
147 RWDigits new_parts = temp;
148 RWDigits new_multipliers = parts;
149 int new_part_len = part_len * 2;
150 int i = 0;
151 for (; i + 1 < num_parts; i += 2) {
152 int start = i * part_len;
153 Digits p_in(parts, start, part_len);
154 Digits p_in2(parts, start + part_len, part_len);
155 Digits m_in(multipliers, start, part_len);
156 Digits m_in2(multipliers, start + part_len, part_len);
157 RWDigits p_out(new_parts, start, new_part_len);
158 RWDigits m_out(new_multipliers, start, new_part_len);
159 // p[j] = p[i] * m[i+1] + p[i+1]
160 Multiply(p_out, p_in, m_in2);
161 if (should_terminate()) return;
162 digit_t overflow = AddAndReturnOverflow(p_out, p_in2);
163 DCHECK(overflow == 0);
164 USE(overflow);
165 // m[j] = m[i] * m[i+1]
166 if (i > 0) {
167 bool copied = false;
168 if (i > 2) {
169 int prev_start = (i - 2) * part_len;
170 Digits m_in_prev(multipliers, prev_start, part_len);
171 Digits m_in2_prev(multipliers, prev_start + part_len, part_len);
172 if (Compare(m_in, m_in_prev) == 0 &&
173 Compare(m_in2, m_in2_prev) == 0) {
174 copied = true;
175 Digits m_out_prev(new_multipliers, prev_start, new_part_len);
176 for (int k = 0; k < new_part_len; k++) m_out[k] = m_out_prev[k];
177 }
178 }
179 if (!copied) {
180 Multiply(m_out, m_in, m_in2);
181 if (should_terminate()) return;
182 }
183 }
184 }
185 // Trailing last part (if {num_parts} was odd).
186 if (i < num_parts) {
187 Digits p_in(parts, i * part_len, part_len);
188 Digits m_in(multipliers, i * part_len, part_len);
189 RWDigits p_out(new_parts, i * part_len, new_part_len);
190 RWDigits m_out(new_multipliers, i * part_len, new_part_len);
191 int k = 0;
192 for (; k < p_in.len(); k++) p_out[k] = p_in[k];
193 for (; k < p_out.len(); k++) p_out[k] = 0;
194 k = 0;
195 for (; k < m_in.len(); k++) m_out[k] = m_in[k];
196 for (; k < m_out.len(); k++) m_out[k] = 0;
197 i += 2;
198 }
199 num_parts = i >> 1;
200 part_len = new_part_len;
201 RWDigits new_temp = multipliers;
202 parts = new_parts;
203 multipliers = new_multipliers;
204 temp = new_temp;
205 }
206 // Copy the result to Z, if it doesn't happen to be there already.
207 if (parts.digits() != Z.digits()) {
208 int i = 0;
209 for (; i < parts.len(); i++) Z[i] = parts[i];
210 // Z might be bigger than we requested; be robust towards that.
211 for (; i < Z.len(); i++) Z[i] = 0;
212 }
213 }
214
215 // Specialized algorithms for power-of-two radixes. Designed to work with
216 // {ParsePowerTwo}: {max_multiplier_} isn't saved, but {radix_} is, and
217 // {last_multiplier_} has special meaning, namely the number of unpopulated bits
218 // in the last part.
219 // For these radixes, {parts} already is a list of correct bit sequences, we
220 // just have to put them together in the right way:
221 // - The parts are currently in reversed order. The highest-index parts[i]
222 // will go into Z[0].
223 // - All parts, possibly except for the last, are maximally populated.
224 // - A maximally populated part stores a non-fractional number of characters,
225 // i.e. the largest fitting multiple of {char_bits} of it is populated.
226 // - The populated bits in a part are at the low end.
227 // - The number of unused bits in the last part is stored in
228 // {accumulator->last_multiplier_}.
229 //
230 // Example: Given the following parts vector, where letters are used to
231 // label bits, bit order is big endian (i.e. [00000101] encodes "5"),
232 // 'x' means "unpopulated", kDigitBits == 8, radix == 8, and char_bits == 3:
233 //
234 // parts[0] -> [xxABCDEF][xxGHIJKL][xxMNOPQR][xxxxxSTU] <- parts[3]
235 //
236 // We have to assemble the following result:
237 //
238 // Z[0] -> [NOPQRSTU][FGHIJKLM][xxxABCDE] <- Z[2]
239 //
FromStringBasePowerOfTwo(RWDigits Z,FromStringAccumulator * accumulator)240 void ProcessorImpl::FromStringBasePowerOfTwo(
241 RWDigits Z, FromStringAccumulator* accumulator) {
242 const int num_parts = accumulator->ResultLength();
243 DCHECK(num_parts >= 1);
244 DCHECK(Z.len() >= num_parts);
245 Digits parts(accumulator->heap_parts_.size() > 0
246 ? accumulator->heap_parts_.data()
247 : accumulator->stack_parts_,
248 num_parts);
249 uint8_t radix = accumulator->radix_;
250 DCHECK(radix == 2 || radix == 4 || radix == 8 || radix == 16 || radix == 32);
251 const int char_bits = BitLength(radix - 1);
252 const int unused_last_part_bits =
253 static_cast<int>(accumulator->last_multiplier_);
254 const int unused_part_bits = kDigitBits % char_bits;
255 const int max_part_bits = kDigitBits - unused_part_bits;
256 int z_index = 0;
257 int part_index = num_parts - 1;
258
259 // If the last part is fully populated, then all parts must be, and we can
260 // simply copy them (in reversed order).
261 if (unused_last_part_bits == 0) {
262 DCHECK(kDigitBits % char_bits == 0);
263 while (part_index >= 0) {
264 Z[z_index++] = parts[part_index--];
265 }
266 for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
267 return;
268 }
269
270 // Otherwise we have to shift parts contents around as needed.
271 // Holds the next Z digit that we want to store...
272 digit_t digit = parts[part_index--];
273 // ...and the number of bits (at the right end) we already know.
274 int digit_bits = kDigitBits - unused_last_part_bits;
275 while (part_index >= 0) {
276 // Holds the last part that we read from {parts}...
277 digit_t part;
278 // ...and the number of bits (at the right end) that we haven't used yet.
279 int part_bits;
280 while (digit_bits < kDigitBits) {
281 part = parts[part_index--];
282 part_bits = max_part_bits;
283 digit |= part << digit_bits;
284 int part_shift = kDigitBits - digit_bits;
285 if (part_shift > part_bits) {
286 digit_bits += part_bits;
287 part = 0;
288 part_bits = 0;
289 if (part_index < 0) break;
290 } else {
291 digit_bits = kDigitBits;
292 part >>= part_shift;
293 part_bits -= part_shift;
294 }
295 }
296 Z[z_index++] = digit;
297 digit = part;
298 digit_bits = part_bits;
299 }
300 if (digit_bits > 0) {
301 Z[z_index++] = digit;
302 }
303 for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
304 }
305
FromString(RWDigits Z,FromStringAccumulator * accumulator)306 void ProcessorImpl::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
307 if (accumulator->inline_everything_) {
308 int i = 0;
309 for (; i < accumulator->stack_parts_used_; i++) {
310 Z[i] = accumulator->stack_parts_[i];
311 }
312 for (; i < Z.len(); i++) Z[i] = 0;
313 } else if (accumulator->stack_parts_used_ == 0) {
314 for (int i = 0; i < Z.len(); i++) Z[i] = 0;
315 } else if (IsPowerOfTwo(accumulator->radix_)) {
316 FromStringBasePowerOfTwo(Z, accumulator);
317 } else if (accumulator->ResultLength() < kFromStringLargeThreshold) {
318 FromStringClassic(Z, accumulator);
319 } else {
320 FromStringLarge(Z, accumulator);
321 }
322 }
323
FromString(RWDigits Z,FromStringAccumulator * accumulator)324 Status Processor::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
325 ProcessorImpl* impl = static_cast<ProcessorImpl*>(this);
326 impl->FromString(Z, accumulator);
327 return impl->get_and_clear_status();
328 }
329
330 } // namespace bigint
331 } // namespace v8
332