// Copyright 2021 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "src/bigint/bigint-internal.h" #include "src/bigint/vector-arithmetic.h" namespace v8 { namespace bigint { // The classic algorithm: for every part, multiply the accumulator with // the appropriate multiplier, and add the part. O(n²) overall. void ProcessorImpl::FromStringClassic(RWDigits Z, FromStringAccumulator* accumulator) { // We always have at least one part to process. DCHECK(accumulator->stack_parts_used_ > 0); Z[0] = accumulator->stack_parts_[0]; RWDigits already_set(Z, 0, 1); for (int i = 1; i < Z.len(); i++) Z[i] = 0; // The {FromStringAccumulator} uses stack-allocated storage for the first // few parts; if heap storage is used at all then all parts are copied there. int num_stack_parts = accumulator->stack_parts_used_; if (num_stack_parts == 1) return; const std::vector& heap_parts = accumulator->heap_parts_; int num_heap_parts = static_cast(heap_parts.size()); // All multipliers are the same, except possibly for the last. const digit_t max_multiplier = accumulator->max_multiplier_; if (num_heap_parts == 0) { for (int i = 1; i < num_stack_parts - 1; i++) { MultiplySingle(Z, already_set, max_multiplier); Add(Z, accumulator->stack_parts_[i]); already_set.set_len(already_set.len() + 1); } MultiplySingle(Z, already_set, accumulator->last_multiplier_); Add(Z, accumulator->stack_parts_[num_stack_parts - 1]); return; } // Parts are stored on the heap. for (int i = 1; i < num_heap_parts - 1; i++) { MultiplySingle(Z, already_set, max_multiplier); Add(Z, accumulator->heap_parts_[i]); already_set.set_len(already_set.len() + 1); } MultiplySingle(Z, already_set, accumulator->last_multiplier_); Add(Z, accumulator->heap_parts_.back()); } // The fast algorithm: combine parts in a balanced-binary-tree like order: // Multiply-and-add neighboring pairs of parts, then loop, until only one // part is left. The benefit is that the multiplications will have inputs of // similar sizes, which makes them amenable to fast multiplication algorithms. // We have to do more multiplications than the classic algorithm though, // because we also have to multiply the multipliers. // Optimizations: // - We can skip the multiplier for the first part, because we never need it. // - Most multipliers are the same; we can avoid repeated multiplications and // just copy the previous result. (In theory we could even de-dupe them, but // as the parts/multipliers grow, we'll need most of the memory anyway.) // Copied results are marked with a * below. // - We can re-use memory using a system of three buffers whose usage rotates: // - one is considered empty, and is overwritten with the new parts, // - one holds the multipliers (and will be "empty" in the next round), and // - one initially holds the parts and is overwritten with the new multipliers // Parts and multipliers both grow in each iteration, and get fewer, so we // use the space of two adjacent old chunks for one new chunk. // Since the {heap_parts_} vectors has the right size, and so does the // result {Z}, we can use that memory, and only need to allocate one scratch // vector. If the final result ends up in the wrong bucket, we have to copy it // to the correct one. // - We don't have to keep track of the positions and sizes of the chunks, // because we can deduce their precise placement from the iteration index. // // Example, assuming digit_t is 4 bits, fitting one decimal digit: // Initial state: // parts_: 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 // multipliers_: 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 // After the first iteration of the outer loop: // parts: 12 34 56 78 90 12 34 5 // multipliers: 100 *100 *100 *100 *100 *100 10 // After the second iteration: // parts: 1234 5678 9012 345 // multipliers: 10000 *10000 1000 // After the third iteration: // parts: 12345678 9012345 // multipliers: 10000000 // And then there's an obvious last iteration. void ProcessorImpl::FromStringLarge(RWDigits Z, FromStringAccumulator* accumulator) { int num_parts = static_cast(accumulator->heap_parts_.size()); DCHECK(num_parts >= 2); DCHECK(Z.len() >= num_parts); RWDigits parts(accumulator->heap_parts_.data(), num_parts); Storage multipliers_storage(num_parts); RWDigits multipliers(multipliers_storage.get(), num_parts); RWDigits temp(Z, 0, num_parts); // Unrolled and specialized first iteration: part_len == 1, so instead of // Digits sub-vectors we have individual digit_t values, and the multipliers // are known up front. { digit_t max_multiplier = accumulator->max_multiplier_; digit_t last_multiplier = accumulator->last_multiplier_; RWDigits new_parts = temp; RWDigits new_multipliers = parts; int i = 0; for (; i + 1 < num_parts; i += 2) { digit_t p_in = parts[i]; digit_t p_in2 = parts[i + 1]; digit_t m_in = max_multiplier; digit_t m_in2 = i == num_parts - 2 ? last_multiplier : max_multiplier; // p[j] = p[i] * m[i+1] + p[i+1] digit_t p_high; digit_t p_low = digit_mul(p_in, m_in2, &p_high); digit_t carry; new_parts[i] = digit_add2(p_low, p_in2, &carry); new_parts[i + 1] = p_high + carry; // m[j] = m[i] * m[i+1] if (i > 0) { if (i > 2 && m_in2 != last_multiplier) { new_multipliers[i] = new_multipliers[i - 2]; new_multipliers[i + 1] = new_multipliers[i - 1]; } else { digit_t m_high; new_multipliers[i] = digit_mul(m_in, m_in2, &m_high); new_multipliers[i + 1] = m_high; } } } // Trailing last part (if {num_parts} was odd). if (i < num_parts) { new_parts[i] = parts[i]; new_multipliers[i] = last_multiplier; i += 2; } num_parts = i >> 1; RWDigits new_temp = multipliers; parts = new_parts; multipliers = new_multipliers; temp = new_temp; AddWorkEstimate(num_parts); } int part_len = 2; // Remaining iterations. while (num_parts > 1) { RWDigits new_parts = temp; RWDigits new_multipliers = parts; int new_part_len = part_len * 2; int i = 0; for (; i + 1 < num_parts; i += 2) { int start = i * part_len; Digits p_in(parts, start, part_len); Digits p_in2(parts, start + part_len, part_len); Digits m_in(multipliers, start, part_len); Digits m_in2(multipliers, start + part_len, part_len); RWDigits p_out(new_parts, start, new_part_len); RWDigits m_out(new_multipliers, start, new_part_len); // p[j] = p[i] * m[i+1] + p[i+1] Multiply(p_out, p_in, m_in2); if (should_terminate()) return; digit_t overflow = AddAndReturnOverflow(p_out, p_in2); DCHECK(overflow == 0); USE(overflow); // m[j] = m[i] * m[i+1] if (i > 0) { bool copied = false; if (i > 2) { int prev_start = (i - 2) * part_len; Digits m_in_prev(multipliers, prev_start, part_len); Digits m_in2_prev(multipliers, prev_start + part_len, part_len); if (Compare(m_in, m_in_prev) == 0 && Compare(m_in2, m_in2_prev) == 0) { copied = true; Digits m_out_prev(new_multipliers, prev_start, new_part_len); for (int k = 0; k < new_part_len; k++) m_out[k] = m_out_prev[k]; } } if (!copied) { Multiply(m_out, m_in, m_in2); if (should_terminate()) return; } } } // Trailing last part (if {num_parts} was odd). if (i < num_parts) { Digits p_in(parts, i * part_len, part_len); Digits m_in(multipliers, i * part_len, part_len); RWDigits p_out(new_parts, i * part_len, new_part_len); RWDigits m_out(new_multipliers, i * part_len, new_part_len); int k = 0; for (; k < p_in.len(); k++) p_out[k] = p_in[k]; for (; k < p_out.len(); k++) p_out[k] = 0; k = 0; for (; k < m_in.len(); k++) m_out[k] = m_in[k]; for (; k < m_out.len(); k++) m_out[k] = 0; i += 2; } num_parts = i >> 1; part_len = new_part_len; RWDigits new_temp = multipliers; parts = new_parts; multipliers = new_multipliers; temp = new_temp; } // Copy the result to Z, if it doesn't happen to be there already. if (parts.digits() != Z.digits()) { int i = 0; for (; i < parts.len(); i++) Z[i] = parts[i]; // Z might be bigger than we requested; be robust towards that. for (; i < Z.len(); i++) Z[i] = 0; } } // Specialized algorithms for power-of-two radixes. Designed to work with // {ParsePowerTwo}: {max_multiplier_} isn't saved, but {radix_} is, and // {last_multiplier_} has special meaning, namely the number of unpopulated bits // in the last part. // For these radixes, {parts} already is a list of correct bit sequences, we // just have to put them together in the right way: // - The parts are currently in reversed order. The highest-index parts[i] // will go into Z[0]. // - All parts, possibly except for the last, are maximally populated. // - A maximally populated part stores a non-fractional number of characters, // i.e. the largest fitting multiple of {char_bits} of it is populated. // - The populated bits in a part are at the low end. // - The number of unused bits in the last part is stored in // {accumulator->last_multiplier_}. // // Example: Given the following parts vector, where letters are used to // label bits, bit order is big endian (i.e. [00000101] encodes "5"), // 'x' means "unpopulated", kDigitBits == 8, radix == 8, and char_bits == 3: // // parts[0] -> [xxABCDEF][xxGHIJKL][xxMNOPQR][xxxxxSTU] <- parts[3] // // We have to assemble the following result: // // Z[0] -> [NOPQRSTU][FGHIJKLM][xxxABCDE] <- Z[2] // void ProcessorImpl::FromStringBasePowerOfTwo( RWDigits Z, FromStringAccumulator* accumulator) { const int num_parts = accumulator->ResultLength(); DCHECK(num_parts >= 1); DCHECK(Z.len() >= num_parts); Digits parts(accumulator->heap_parts_.size() > 0 ? accumulator->heap_parts_.data() : accumulator->stack_parts_, num_parts); uint8_t radix = accumulator->radix_; DCHECK(radix == 2 || radix == 4 || radix == 8 || radix == 16 || radix == 32); const int char_bits = BitLength(radix - 1); const int unused_last_part_bits = static_cast(accumulator->last_multiplier_); const int unused_part_bits = kDigitBits % char_bits; const int max_part_bits = kDigitBits - unused_part_bits; int z_index = 0; int part_index = num_parts - 1; // If the last part is fully populated, then all parts must be, and we can // simply copy them (in reversed order). if (unused_last_part_bits == 0) { DCHECK(kDigitBits % char_bits == 0); while (part_index >= 0) { Z[z_index++] = parts[part_index--]; } for (; z_index < Z.len(); z_index++) Z[z_index] = 0; return; } // Otherwise we have to shift parts contents around as needed. // Holds the next Z digit that we want to store... digit_t digit = parts[part_index--]; // ...and the number of bits (at the right end) we already know. int digit_bits = kDigitBits - unused_last_part_bits; while (part_index >= 0) { // Holds the last part that we read from {parts}... digit_t part; // ...and the number of bits (at the right end) that we haven't used yet. int part_bits; while (digit_bits < kDigitBits) { part = parts[part_index--]; part_bits = max_part_bits; digit |= part << digit_bits; int part_shift = kDigitBits - digit_bits; if (part_shift > part_bits) { digit_bits += part_bits; part = 0; part_bits = 0; if (part_index < 0) break; } else { digit_bits = kDigitBits; part >>= part_shift; part_bits -= part_shift; } } Z[z_index++] = digit; digit = part; digit_bits = part_bits; } if (digit_bits > 0) { Z[z_index++] = digit; } for (; z_index < Z.len(); z_index++) Z[z_index] = 0; } void ProcessorImpl::FromString(RWDigits Z, FromStringAccumulator* accumulator) { if (accumulator->inline_everything_) { int i = 0; for (; i < accumulator->stack_parts_used_; i++) { Z[i] = accumulator->stack_parts_[i]; } for (; i < Z.len(); i++) Z[i] = 0; } else if (accumulator->stack_parts_used_ == 0) { for (int i = 0; i < Z.len(); i++) Z[i] = 0; } else if (IsPowerOfTwo(accumulator->radix_)) { FromStringBasePowerOfTwo(Z, accumulator); } else if (accumulator->ResultLength() < kFromStringLargeThreshold) { FromStringClassic(Z, accumulator); } else { FromStringLarge(Z, accumulator); } } Status Processor::FromString(RWDigits Z, FromStringAccumulator* accumulator) { ProcessorImpl* impl = static_cast(this); impl->FromString(Z, accumulator); return impl->get_and_clear_status(); } } // namespace bigint } // namespace v8