• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Toom-Cook multiplication.
6 // Reference: https://en.wikipedia.org/wiki/Toom%E2%80%93Cook_multiplication
7 
8 #include <algorithm>
9 
10 #include "src/bigint/bigint-internal.h"
11 #include "src/bigint/digit-arithmetic.h"
12 #include "src/bigint/vector-arithmetic.h"
13 
14 namespace v8 {
15 namespace bigint {
16 
17 namespace {
18 
TimesTwo(RWDigits X)19 void TimesTwo(RWDigits X) {
20   digit_t carry = 0;
21   for (int i = 0; i < X.len(); i++) {
22     digit_t d = X[i];
23     X[i] = (d << 1) | carry;
24     carry = d >> (kDigitBits - 1);
25   }
26 }
27 
DivideByTwo(RWDigits X)28 void DivideByTwo(RWDigits X) {
29   digit_t carry = 0;
30   for (int i = X.len() - 1; i >= 0; i--) {
31     digit_t d = X[i];
32     X[i] = (d >> 1) | carry;
33     carry = d << (kDigitBits - 1);
34   }
35 }
36 
DivideByThree(RWDigits X)37 void DivideByThree(RWDigits X) {
38   digit_t remainder = 0;
39   for (int i = X.len() - 1; i >= 0; i--) {
40     digit_t d = X[i];
41     digit_t upper = (remainder << kHalfDigitBits) | (d >> kHalfDigitBits);
42     digit_t u_result = upper / 3;
43     remainder = upper - 3 * u_result;
44     digit_t lower = (remainder << kHalfDigitBits) | (d & kHalfDigitMask);
45     digit_t l_result = lower / 3;
46     remainder = lower - 3 * l_result;
47     X[i] = (u_result << kHalfDigitBits) | l_result;
48   }
49 }
50 
51 }  // namespace
52 
53 #if DEBUG
54 // Set {len_} to 1 rather than 0 so that attempts to access the first digit
55 // will crash.
56 #define MARK_INVALID(D) D = RWDigits(nullptr, 1)
57 #else
58 #define MARK_INVALID(D) (void(0))
59 #endif
60 
Toom3Main(RWDigits Z,Digits X,Digits Y)61 void ProcessorImpl::Toom3Main(RWDigits Z, Digits X, Digits Y) {
62   DCHECK(Z.len() >= X.len() + Y.len());
63   // Phase 1: Splitting.
64   int i = DIV_CEIL(std::max(X.len(), Y.len()), 3);
65   Digits X0(X, 0, i);
66   Digits X1(X, i, i);
67   Digits X2(X, 2 * i, i);
68   Digits Y0(Y, 0, i);
69   Digits Y1(Y, i, i);
70   Digits Y2(Y, 2 * i, i);
71 
72   // Temporary storage.
73   int p_len = i + 1;      // For all px, qx below.
74   int r_len = 2 * p_len;  // For all r_x, Rx below.
75   Storage temp_storage(4 * r_len);
76   // We will use the same variable names as the Wikipedia article, as much as
77   // C++ lets us: our "p_m1" is their "p(-1)" etc. For consistency with other
78   // algorithms, we use X and Y where Wikipedia uses m and n.
79   // We will use and re-use the temporary storage as follows:
80   //
81   //   chunk                  | -------- time ----------->
82   //   [0 .. i]               |( po )( p_m1 ) ( r_m2  )
83   //   [i+1 .. rlen-1]        |( qo )( q_m1 ) ( r_m2  )
84   //   [rlen .. rlen+i]       | (p_1 ) ( p_m2 ) (r_inf)
85   //   [rlen+i+1 .. 2*rlen-1] | (q_1 ) ( q_m2 ) (r_inf)
86   //   [2*rlen .. 3*rlen-1]   |      (   r_1          )
87   //   [3*rlen .. 4*rlen-1]   |             (  r_m1   )
88   //
89   // This requires interleaving phases 2 and 3 a bit: after computing
90   // r_1 = p_1 * q_1, we can re-use p_1's storage for p_m2, and so on.
91   digit_t* t = temp_storage.get();
92   RWDigits po(t, p_len);
93   RWDigits qo(t + p_len, p_len);
94   RWDigits p_1(t + r_len, p_len);
95   RWDigits q_1(t + r_len + p_len, p_len);
96   RWDigits r_1(t + 2 * r_len, r_len);
97   RWDigits r_m1(t + 3 * r_len, r_len);
98 
99   // We can also share the  backing stores of Z, r_0, R0.
100   DCHECK(Z.len() >= r_len);
101   RWDigits r_0(Z, 0, r_len);
102 
103   // Phase 2a: Evaluation, steps 0, 1, m1.
104   // po = X0 + X2
105   Add(po, X0, X2);
106   // p_0 = X0
107   // p_1 = po + X1
108   Add(p_1, po, X1);
109   // p_m1 = po - X1
110   RWDigits p_m1 = po;
111   bool p_m1_sign = SubtractSigned(p_m1, po, false, X1, false);
112   MARK_INVALID(po);
113 
114   // qo = Y0 + Y2
115   Add(qo, Y0, Y2);
116   // q_0 = Y0
117   // q_1 = qo + Y1
118   Add(q_1, qo, Y1);
119   // q_m1 = qo - Y1
120   RWDigits q_m1 = qo;
121   bool q_m1_sign = SubtractSigned(q_m1, qo, false, Y1, false);
122   MARK_INVALID(qo);
123 
124   // Phase 3a: Pointwise multiplication, steps 0, 1, m1.
125   Multiply(r_0, X0, Y0);
126   Multiply(r_1, p_1, q_1);
127   Multiply(r_m1, p_m1, q_m1);
128   bool r_m1_sign = p_m1_sign != q_m1_sign;
129 
130   // Phase 2b: Evaluation, steps m2 and inf.
131   // p_m2 = (p_m1 + X2) * 2 - X0
132   RWDigits p_m2 = p_1;
133   MARK_INVALID(p_1);
134   bool p_m2_sign = AddSigned(p_m2, p_m1, p_m1_sign, X2, false);
135   TimesTwo(p_m2);
136   p_m2_sign = SubtractSigned(p_m2, p_m2, p_m2_sign, X0, false);
137   // p_inf = X2
138 
139   // q_m2 = (q_m1 + Y2) * 2 - Y0
140   RWDigits q_m2 = q_1;
141   MARK_INVALID(q_1);
142   bool q_m2_sign = AddSigned(q_m2, q_m1, q_m1_sign, Y2, false);
143   TimesTwo(q_m2);
144   q_m2_sign = SubtractSigned(q_m2, q_m2, q_m2_sign, Y0, false);
145   // q_inf = Y2
146 
147   // Phase 3b: Pointwise multiplication, steps m2 and inf.
148   RWDigits r_m2(t, r_len);
149   MARK_INVALID(p_m1);
150   MARK_INVALID(q_m1);
151   Multiply(r_m2, p_m2, q_m2);
152   bool r_m2_sign = p_m2_sign != q_m2_sign;
153 
154   RWDigits r_inf(t + r_len, r_len);
155   MARK_INVALID(p_m2);
156   MARK_INVALID(q_m2);
157   Multiply(r_inf, X2, Y2);
158 
159   // Phase 4: Interpolation.
160   Digits R0 = r_0;
161   Digits R4 = r_inf;
162   // R3 <- (r_m2 - r_1) / 3
163   RWDigits R3 = r_m2;
164   bool R3_sign = SubtractSigned(R3, r_m2, r_m2_sign, r_1, false);
165   DivideByThree(R3);
166   // R1 <- (r_1 - r_m1) / 2
167   RWDigits R1 = r_1;
168   bool R1_sign = SubtractSigned(R1, r_1, false, r_m1, r_m1_sign);
169   DivideByTwo(R1);
170   // R2 <- r_m1 - r_0
171   RWDigits R2 = r_m1;
172   bool R2_sign = SubtractSigned(R2, r_m1, r_m1_sign, R0, false);
173   // R3 <- (R2 - R3) / 2 + 2 * r_inf
174   R3_sign = SubtractSigned(R3, R2, R2_sign, R3, R3_sign);
175   DivideByTwo(R3);
176   // TODO(jkummerow): Would it be a measurable improvement to write an
177   // "AddTwice" helper?
178   R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
179   R3_sign = AddSigned(R3, R3, R3_sign, r_inf, false);
180   // R2 <- R2 + R1 - R4
181   R2_sign = AddSigned(R2, R2, R2_sign, R1, R1_sign);
182   R2_sign = SubtractSigned(R2, R2, R2_sign, R4, false);
183   // R1 <- R1 - R3
184   R1_sign = SubtractSigned(R1, R1, R1_sign, R3, R3_sign);
185 
186 #if DEBUG
187   R1.Normalize();
188   R2.Normalize();
189   R3.Normalize();
190   DCHECK(R1_sign == false || R1.len() == 0);
191   DCHECK(R2_sign == false || R2.len() == 0);
192   DCHECK(R3_sign == false || R3.len() == 0);
193 #endif
194 
195   // Phase 5: Recomposition. R0 is already in place. Overflow can't happen.
196   for (int j = R0.len(); j < Z.len(); j++) Z[j] = 0;
197   AddAndReturnOverflow(Z + i, R1);
198   AddAndReturnOverflow(Z + 2 * i, R2);
199   AddAndReturnOverflow(Z + 3 * i, R3);
200   AddAndReturnOverflow(Z + 4 * i, R4);
201 }
202 
MultiplyToomCook(RWDigits Z,Digits X,Digits Y)203 void ProcessorImpl::MultiplyToomCook(RWDigits Z, Digits X, Digits Y) {
204   DCHECK(X.len() >= Y.len());
205   int k = Y.len();
206   // TODO(jkummerow): Would it be a measurable improvement to share the
207   // scratch memory for several invocations?
208   Digits X0(X, 0, k);
209   Toom3Main(Z, X0, Y);
210   if (X.len() > Y.len()) {
211     ScratchDigits T(2 * k);
212     for (int i = k; i < X.len(); i += k) {
213       Digits Xi(X, i, k);
214       // TODO(jkummerow): would it be a measurable improvement to craft a
215       // "ToomChunk" method in the style of {KaratsubaChunk}?
216       Toom3Main(T, Xi, Y);
217       AddAndReturnOverflow(Z + i, T);  // Can't overflow.
218     }
219   }
220 }
221 
222 }  // namespace bigint
223 }  // namespace v8
224