#!/usr/bin/env perl
# Copyright 2019 The BoringSSL Authors
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and looks them up in the table. (A Perl
# sketch of this windowed multiplication appears after this comment block.)
# Visually, it multiplies as in the schoolbook multiplication diagram below,
# but with more terms. (Each term is 4 bits, so there are 32 terms in each
# row.) First it incorporates the terms labeled '1' by indexing the most
# significant term of X into the table. Then it shifts and repeats for '2' and
# so on.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        666666
#       555555
#      444444
#     333333
#    222222
#   111111
#
# This implementation changes the order. We treat the table as a 16x16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. (See the build_transposed_htable_sketch example below.) We then
# reorder terms as below. Observe that the terms labeled '1' and '2' are all
# lookups into the first row, etc. This maps well to the SSSE3 pshufb
# instruction, using alternating terms of X in parallel as indices. The
# alternation is needed because pshufb maps a 4-bit index to an 8-bit byte, so
# each lookup spans two term positions. Then we shift and repeat for each row.
#
#        hhhhhh
#  *     xxxxxx
#  ============
#        224466
#       113355
#      224466
#     113355
#    224466
#   113355
#
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a
# byte as first. Bytes are little-endian, and bits are big-endian. We reverse
# the bytes in XMM registers for a consistent bit and byte ordering, but this
# means the least significant bit is the most significant coefficient and vice
# versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move bits from the MSB toward the LSB and
# correspond to increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. (See the
# reverse4_sketch example below.) It also means earlier table rows contain
# more significant coefficients, so we iterate forwards.

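# As an illustrative aside, here is a rough Perl model of the background 4-bit
# windowed multiplication described above, written with Math::BigInt as plain
# carry-less arithmetic. It is a sketch only and is never called by this
# generator; it ignores GCM's bit reversal and the reduction, and it assumes
# $htable->[$w] holds the multiple H*w for each 4-bit value $w, with $windows
# the number of 4-bit windows in $x.
use Math::BigInt;

sub windowed_mult_sketch {
    my ($x, $htable, $windows) = @_;
    my $acc = Math::BigInt->bzero();
    for (my $i = $windows - 1; $i >= 0; $i--) {
        # Shift the accumulator, then add the table row selected by the next
        # most significant 4-bit window of X, exactly as in the '1', '2', ...
        # passes of the first diagram.
        $acc->blsft(4);
        my $w = $x->copy()->brsft(4 * $i)->band(0xf);
        $acc->bxor($htable->[$w]);
    }
    return $acc;
}
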
use strict;

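# Another illustrative aside (also unused by the generator): the 4-bit index
# reversal described in the header comment. The multiple of H whose
# coefficient nibble is $n lives in column reverse4_sketch($n) of the table,
# e.g. H*1 (nibble 0b0001) in column 0b1000 and H*x^3 (nibble 0b1000) in
# column 0b0001.
sub reverse4_sketch {
    my ($n) = @_;
    return (($n & 1) << 3) | (($n & 2) << 1) | (($n & 4) >> 1) | (($n & 8) >> 3);
}
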
my $flavour = shift;
my $output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
                                        ("%rdi", "%rsi", "%rdx", "%rcx");

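# A third illustrative sketch (never called here): building the transposed
# table described in the header. It assumes @multiples holds the 16 multiples
# of H as 16-byte strings, where $multiples[reverse4_sketch($n)] is the
# multiple whose coefficient nibble is $n. Row $i of the result is byte $i of
# every multiple, so each 16-byte row is exactly one pshufb lookup table.
sub build_transposed_htable_sketch {
    my (@multiples) = @_;
    my @rows;
    for my $i (0 .. 15) {
        $rows[$i] = join '', map { substr($_, $i, 1) } @multiples;
    }
    return @rows;
}
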
my $code = <<____;
.text

# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
.type	gcm_gmult_ssse3, \@abi-omnipotent
.globl	gcm_gmult_ssse3
.align	16
gcm_gmult_ssse3:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
____
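# On Win64, %xmm6 through %xmm15 are callee-saved, so the prologue emitted
# below stashes the clobbered %xmm6 and %xmm10 on the stack (with SEH unwind
# annotations) and the epilogue restores them.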
$code .= <<____ if ($win64);
	subq	\$40, %rsp
.seh_stackalloc	40
	movdqa	%xmm6, (%rsp)
.seh_savexmm	%xmm6, 0
	movdqa	%xmm10, 16(%rsp)
.seh_savexmm	%xmm10, 16
.seh_endprologue
____
$code .= <<____;
	movdqu	($Xi), %xmm0
	movdqa	.Lreverse_bytes(%rip), %xmm10
	movdqa	.Llow4_mask(%rip), %xmm2

	# Reverse input bytes to deserialize.
	pshufb	%xmm10, %xmm0

	# Split each byte into low (%xmm0) and high (%xmm1) halves.
	movdqa	%xmm2, %xmm1
	pandn	%xmm0, %xmm1
	psrld	\$4, %xmm1
	pand	%xmm2, %xmm0

	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
	# that, due to bit reversal, %xmm3 contains bits that fall off when
	# right-shifting, not left-shifting.
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
____

my $call_counter = 0;
# process_rows returns assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6. (An
# illustrative Perl model of one such chunk appears after this sub.)
sub process_rows {
    my ($rows) = @_;
    $call_counter++;

    # Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
    # and psrlq shifts the two 64-bit halves separately. Each row produces 8
    # bits of carry, and the reduction needs an additional 7-bit shift. This
    # must fit in 64 bits so reduction can use psrlq, which allows up to 7 rows
    # at a time: 7*8 + 7 = 63 bits, while an 8th row would need 71.
    die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);

    return <<____;
	movq	\$$rows, %rax
.Loop_row_$call_counter:
	movdqu	($Htable), %xmm4
	leaq	16($Htable), $Htable

	# Right-shift %xmm2 and %xmm3 by 8 bits.
	movdqa	%xmm2, %xmm6
	palignr	\$1, %xmm3, %xmm6
	movdqa	%xmm6, %xmm3
	psrldq	\$1, %xmm2

	# Index the low and high halves of the input into the table row loaded
	# above. Note the low (respectively, high) half corresponds to more
	# (respectively, less) significant coefficients.
	movdqa	%xmm4, %xmm5
	pshufb	%xmm0, %xmm4
	pshufb	%xmm1, %xmm5

	# Add the high half (%xmm5) without shifting.
	pxor	%xmm5, %xmm2

	# Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
	# add into the carry register (%xmm3).
	movdqa	%xmm4, %xmm5
	psllq	\$60, %xmm5
	movdqa	%xmm5, %xmm6
	pslldq	\$8, %xmm6
	pxor	%xmm6, %xmm3

	# Next, add into %xmm2.
	psrldq	\$8, %xmm5
	pxor	%xmm5, %xmm2
	psrlq	\$4, %xmm4
	pxor	%xmm4, %xmm2

	subq	\$1, %rax
	jnz	.Loop_row_$call_counter

	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
	# x^7, so we shift and XOR four times.
	pxor	%xmm3, %xmm2	# x^0 = 1
	psrlq	\$1, %xmm3
	pxor	%xmm3, %xmm2	# x^1 = x
	psrlq	\$1, %xmm3
	pxor	%xmm3, %xmm2	# x^(1+1) = x^2
	psrlq	\$5, %xmm3
	pxor	%xmm3, %xmm2	# x^(1+1+5) = x^7
	pxor	%xmm3, %xmm3
____
}

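# An illustrative Perl model (never called) of what one process_rows() chunk
# computes, with Math::BigInt values standing in for XMM registers. $acc and
# $carry model %xmm2 and %xmm3, @$rows holds this chunk's table rows as
# 16-element arrays of byte values, and @$lo/@$hi hold the sixteen 4-bit
# indices carried by %xmm0/%xmm1. All of these names are assumptions of the
# model, not data used by the generator.
sub process_rows_model_sketch {
    my ($acc, $carry, $rows, $lo, $hi) = @_;
    for my $row (@$rows) {
        # Right-shift the combined 256-bit value [$acc:$carry] by 8 bits
        # (the palignr/psrldq pair above).
        $carry->brsft(8)->bxor($acc->copy()->band(0xff)->blsft(120));
        $acc->brsft(8);
        # pshufb: look up one byte of the row for each 4-bit index.
        my $t_lo = Math::BigInt->bzero();
        my $t_hi = Math::BigInt->bzero();
        for my $i (0 .. 15) {
            $t_lo->bxor(Math::BigInt->new($row->[$lo->[$i]])->blsft(8 * $i));
            $t_hi->bxor(Math::BigInt->new($row->[$hi->[$i]])->blsft(8 * $i));
        }
        # The high-nibble lookup is added unshifted; the low-nibble lookup is
        # right-shifted by 4, its dropped bits landing at the top of $carry.
        $acc->bxor($t_hi);
        $carry->bxor($t_lo->copy()->band(0xf)->blsft(124));
        $acc->bxor($t_lo->brsft(4));
    }
    # Reduce: fold the carry back in as 1 + x + x^2 + x^7.
    $acc->bxor($carry);
    $acc->bxor($carry->copy()->brsft(1));
    $acc->bxor($carry->copy()->brsft(2));
    $acc->bxor($carry->copy()->brsft(7));
    $carry->bzero();    # the chunk leaves %xmm3 zeroed
    return $acc;
}
# The process_rows(5), process_rows(5), process_rows(6) calls below correspond
# to three such chunks covering all 16 rows.
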
# We must reduce at least once every 7 rows, so divide the 16 rows into three
# chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);

$code .= <<____;
	# Store the result. Reverse bytes to serialize.
	pshufb	%xmm10, %xmm2
	movdqu	%xmm2, ($Xi)

	# Zero any registers which contain secrets.
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
____
$code .= <<____ if ($win64);
	movdqa	(%rsp), %xmm6
	movdqa	16(%rsp), %xmm10
	addq	\$40, %rsp
____
$code .= <<____;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_gmult_ssse3,.-gcm_gmult_ssse3
____

$code .= <<____;
# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as
# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in,
#                      size_t len);
.type	gcm_ghash_ssse3, \@abi-omnipotent
.globl	gcm_ghash_ssse3
.align	16
gcm_ghash_ssse3:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
____
$code .= <<____ if ($win64);
	subq	\$56, %rsp
.seh_stackalloc	56
	movdqa	%xmm6, (%rsp)
.seh_savexmm	%xmm6, 0
	movdqa	%xmm10, 16(%rsp)
.seh_savexmm	%xmm10, 16
	movdqa	%xmm11, 32(%rsp)
.seh_savexmm	%xmm11, 32
.seh_endprologue
____
$code .= <<____;
	movdqu	($Xi), %xmm0
	movdqa	.Lreverse_bytes(%rip), %xmm10
	movdqa	.Llow4_mask(%rip), %xmm11

	# This function only processes whole blocks.
	andq	\$-16, $len

	# Reverse input bytes to deserialize. We maintain the running
	# total in %xmm0.
	pshufb	%xmm10, %xmm0

	# Iterate over each block. On entry to each iteration, %xmm3 is zero.
	pxor	%xmm3, %xmm3
.Loop_ghash:
	# Incorporate the next block of input.
	movdqu	($in), %xmm1
	pshufb	%xmm10, %xmm1	# Reverse bytes.
	pxor	%xmm1, %xmm0

	# Split each byte into low (%xmm0) and high (%xmm1) halves.
	movdqa	%xmm11, %xmm1
	pandn	%xmm0, %xmm1
	psrld	\$4, %xmm1
	pand	%xmm11, %xmm0

	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
	# that, due to bit reversal, %xmm3 contains bits that fall off when
	# right-shifting, not left-shifting.
	pxor	%xmm2, %xmm2
	# %xmm3 is already zero at this point.
____

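# Together with the three process_rows() chunks appended next and the loop
# tail below, the code above implements the GHASH recurrence
# Y = (Y xor X_i) * H for each 16-byte block X_i of the input.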
# We must reduce at least once every 7 rows, so divide the 16 rows into three
# chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);

$code .= <<____;
	movdqa	%xmm2, %xmm0

	# Rewind $Htable for the next iteration.
	leaq	-256($Htable), $Htable

	# Advance input and continue.
	leaq	16($in), $in
	subq	\$16, $len
	jnz	.Loop_ghash

	# Reverse bytes and store the result.
	pshufb	%xmm10, %xmm0
	movdqu	%xmm0, ($Xi)

	# Zero any registers which contain secrets.
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
____
$code .= <<____ if ($win64);
	movdqa	(%rsp), %xmm6
	movdqa	16(%rsp), %xmm10
	movdqa	32(%rsp), %xmm11
	addq	\$56, %rsp
____
$code .= <<____;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_ssse3,.-gcm_ghash_ssse3

.section .rodata
.align	16
# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
.Lreverse_bytes:
.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
.Llow4_mask:
.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.text
____

print $code;
close STDOUT or die "error closing STDOUT: $!";