#!/usr/bin/env perl
# Copyright 2019 The BoringSSL Authors
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# ghash-ssse3-x86_64.pl is a constant-time variant of the traditional 4-bit
# table-based GHASH implementation. It requires SSSE3 instructions.
#
# For background, the table-based strategy is a 4-bit windowed multiplication.
# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops
# over 4-bit windows of the input and looks them up in the table. Visually, it
# multiplies as in the schoolbook multiplication diagram below, but with more
# terms. (Each term is 4 bits, so there are 32 terms in each row.) First it
# incorporates the terms labeled '1' by looking up the most significant term
# of X in the table. Then it shifts and repeats for '2' and so on.
#
#         hhhhhh
#  *      xxxxxx
#  ============
#         666666
#        555555
#       444444
#      333333
#     222222
#    111111
#
# This implementation changes the order. We treat the table as a 16×16 matrix
# and transpose it. The first row is then the first byte of each multiple of H,
# and so on. We then reorder terms as below. Observe that the terms labeled '1'
# and '2' are all lookups into the first row, etc. This maps well to the SSSE3
# pshufb instruction, using alternating terms of X in parallel as indices. This
# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and
# repeat for each row.
#
#         hhhhhh
#  *      xxxxxx
#  ============
#         224466
#        113355
#       224466
#      113355
#     224466
#    113355
#
# Next we account for GCM's confusing bit order. The "first" bit is the least
# significant coefficient, but GCM treats the most significant bit within a
# byte as first. Bytes are little-endian, and bits are big-endian. We reverse
# the bytes in XMM registers for a consistent bit and byte ordering, but this
# means the least significant bit is the most significant coefficient and vice
# versa.
#
# For consistency, "low", "high", "left-shift", and "right-shift" refer to the
# bit ordering within the XMM register, rather than the reversed coefficient
# ordering. Low bits are less significant bits and more significant
# coefficients. Right-shifts move bits from the MSB to the LSB and correspond
# to increasing the power of each coefficient.
#
# Note this bit reversal enters into the table's column indices. H*1 is stored
# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier
# table rows contain more significant coefficients, so we iterate forwards.
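#
# As a hedged illustration (this file only consumes the table; in BoringSSL
# the key-setup code builds it), the transposition described above amounts to
# the following C sketch, ignoring the bit-reversed column indexing:
#
#   // rows[i] holds the 16 bytes of the i-th 4-bit multiple of H.
#   // table[r][i] is then byte r of that multiple, so each 16-byte table row
#   // can be used directly as a pshufb lookup table.
#   uint8_t rows[16][16], table[16][16];
#   for (int r = 0; r < 16; r++) {
#     for (int i = 0; i < 16; i++) {
#       table[r][i] = rows[i][r];
#     }
#   }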

use strict;

my $flavour = shift;
my $output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/;
my $dir = $1;
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

my ($Xi, $Htable, $in, $len) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9") :
                               ("%rdi", "%rsi", "%rdx", "%rcx");

my $code = <<____;
.text

# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|.
# |Xi| is represented in GHASH's serialized byte representation. |Htable| is
# formatted as described above.
# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]);
.type	gcm_gmult_ssse3, \@abi-omnipotent
.globl	gcm_gmult_ssse3
.align	16
gcm_gmult_ssse3:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
____
$code .= <<____ if ($win64);
	subq	\$40, %rsp
.seh_stackalloc	40
	movdqa	%xmm6, (%rsp)
.seh_savexmm	%xmm6, 0
	movdqa	%xmm10, 16(%rsp)
.seh_savexmm	%xmm10, 16
.seh_endprologue
____
$code .= <<____;
	movdqu	($Xi), %xmm0
	movdqa	.Lreverse_bytes(%rip), %xmm10
	movdqa	.Llow4_mask(%rip), %xmm2

	# Reverse input bytes to deserialize.
	pshufb	%xmm10, %xmm0

	# Split each byte into low (%xmm0) and high (%xmm1) halves.
	movdqa	%xmm2, %xmm1
	pandn	%xmm0, %xmm1
	psrld	\$4, %xmm1
	pand	%xmm2, %xmm0

	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
	# that, due to bit reversal, %xmm3 contains bits that fall off when
	# right-shifting, not left-shifting.
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
____

my $call_counter = 0;
# process_rows returns assembly code to process $rows rows of the table. On
# input, $Htable stores the pointer to the next row. %xmm0 and %xmm1 store the
# low and high halves of the input. The result so far is passed in %xmm2. %xmm3
# must be zero. On output, $Htable is advanced to the next row and %xmm2 is
# updated. %xmm3 remains zero. It clobbers %rax, %xmm4, %xmm5, and %xmm6.
sub process_rows {
  my ($rows) = @_;
  $call_counter++;

  # Shifting whole XMM registers by bits is complex. psrldq shifts by bytes,
  # and psrlq shifts the two 64-bit halves separately. Each row produces 8
  # bits of carry, and the reduction needs an additional 7-bit shift. This
  # must fit in 64 bits so reduction can use psrlq. This allows up to 7 rows
  # at a time.
  die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64);

  return <<____;
	movq	\$$rows, %rax
.Loop_row_$call_counter:
	movdqu	($Htable), %xmm4
	leaq	16($Htable), $Htable

	# Right-shift %xmm2 and %xmm3 by 8 bits. Bits shifted off the bottom of
	# %xmm2 enter the top of %xmm3.
	movdqa	%xmm2, %xmm6
	palignr	\$1, %xmm3, %xmm6
	movdqa	%xmm6, %xmm3
	psrldq	\$1, %xmm2

	# Index the low and high halves of the input into the table row loaded
	# above. Note the low (respectively, high) half corresponds to more
	# (respectively, less) significant coefficients.
	movdqa	%xmm4, %xmm5
	pshufb	%xmm0, %xmm4
	pshufb	%xmm1, %xmm5

	# Add the high half (%xmm5) without shifting.
	pxor	%xmm5, %xmm2

	# Add the low half (%xmm4). This must be right-shifted by 4 bits. First,
	# add into the carry register (%xmm3).
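	# (A hedged scalar sketch of this step: with v, acc, and carry the
	# 128-bit values in %xmm4, %xmm2, and %xmm3, the instructions below
	# compute
	#     carry ^= (v & 0xf) << 124;   // the 4 bits shifted off the bottom
	#     acc   ^= v >> 4;             // a full 128-bit right-shift
	# using psllq/pslldq/psrldq to route bits across the 64-bit halves.)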
	movdqa	%xmm4, %xmm5
	psllq	\$60, %xmm5
	movdqa	%xmm5, %xmm6
	pslldq	\$8, %xmm6
	pxor	%xmm6, %xmm3

	# Next, add into %xmm2.
	psrldq	\$8, %xmm5
	pxor	%xmm5, %xmm2
	psrlq	\$4, %xmm4
	pxor	%xmm4, %xmm2

	subq	\$1, %rax
	jnz	.Loop_row_$call_counter

	# Reduce the carry register. The reduction polynomial is 1 + x + x^2 +
	# x^7, so we shift and XOR four times.
	pxor	%xmm3, %xmm2	# x^0 = 1
	psrlq	\$1, %xmm3
	pxor	%xmm3, %xmm2	# x^1 = x
	psrlq	\$1, %xmm3
	pxor	%xmm3, %xmm2	# x^(1+1) = x^2
	psrlq	\$5, %xmm3
	pxor	%xmm3, %xmm2	# x^(1+1+5) = x^7
	pxor	%xmm3, %xmm3
____
}

# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);

$code .= <<____;
	# Store the result. Reverse bytes to serialize.
	pshufb	%xmm10, %xmm2
	movdqu	%xmm2, ($Xi)

	# Zero any registers which contain secrets.
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
____
$code .= <<____ if ($win64);
	movdqa	(%rsp), %xmm6
	movdqa	16(%rsp), %xmm10
	addq	\$40, %rsp
____
$code .= <<____;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_gmult_ssse3,.-gcm_gmult_ssse3
____

$code .= <<____;
# gcm_ghash_ssse3 incorporates |len| bytes from |in| into |Xi|, using |Htable|
# as the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's
# serialized byte representation. |Htable| is formatted as described above.
# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16],
#                      const uint8_t *in, size_t len);
.type	gcm_ghash_ssse3, \@abi-omnipotent
.globl	gcm_ghash_ssse3
.align	16
gcm_ghash_ssse3:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
____
$code .= <<____ if ($win64);
	subq	\$56, %rsp
.seh_stackalloc	56
	movdqa	%xmm6, (%rsp)
.seh_savexmm	%xmm6, 0
	movdqa	%xmm10, 16(%rsp)
.seh_savexmm	%xmm10, 16
	movdqa	%xmm11, 32(%rsp)
.seh_savexmm	%xmm11, 32
.seh_endprologue
____
$code .= <<____;
	movdqu	($Xi), %xmm0
	movdqa	.Lreverse_bytes(%rip), %xmm10
	movdqa	.Llow4_mask(%rip), %xmm11

	# This function only processes whole blocks.
	andq	\$-16, $len

	# Reverse input bytes to deserialize. We maintain the running
	# total in %xmm0.
	pshufb	%xmm10, %xmm0

	# Iterate over each block. On entry to each iteration, %xmm3 is zero.
	pxor	%xmm3, %xmm3
.Loop_ghash:
	# Incorporate the next block of input.
	movdqu	($in), %xmm1
	pshufb	%xmm10, %xmm1	# Reverse bytes.
	pxor	%xmm1, %xmm0

	# Split each byte into low (%xmm0) and high (%xmm1) halves.
	movdqa	%xmm11, %xmm1
	pandn	%xmm0, %xmm1
	psrld	\$4, %xmm1
	pand	%xmm11, %xmm0

	# Maintain the result in %xmm2 (the value) and %xmm3 (carry bits). Note
	# that, due to bit reversal, %xmm3 contains bits that fall off when
	# right-shifting, not left-shifting.
	pxor	%xmm2, %xmm2
	# %xmm3 is already zero at this point.
____

# We must reduce at least once every 7 rows, so divide into three chunks.
$code .= process_rows(5);
$code .= process_rows(5);
$code .= process_rows(6);

$code .= <<____;
	# The multiply result in %xmm2 becomes the new running total in %xmm0.
	movdqa	%xmm2, %xmm0

	# Rewind $Htable for the next iteration.
	leaq	-256($Htable), $Htable

	# Advance input and continue.
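	# ($len was rounded down to a whole number of blocks above, so this
	# count reaches exactly zero after the final block.)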
	leaq	16($in), $in
	subq	\$16, $len
	jnz	.Loop_ghash

	# Reverse bytes and store the result.
	pshufb	%xmm10, %xmm0
	movdqu	%xmm0, ($Xi)

	# Zero any registers which contain secrets.
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
____
$code .= <<____ if ($win64);
	movdqa	(%rsp), %xmm6
	movdqa	16(%rsp), %xmm10
	movdqa	32(%rsp), %xmm11
	addq	\$56, %rsp
____
$code .= <<____;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_ssse3,.-gcm_ghash_ssse3

.section	.rodata
.align	16
# .Lreverse_bytes is a permutation which, if applied with pshufb, reverses the
# bytes in an XMM register.
.Lreverse_bytes:
.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
# .Llow4_mask is an XMM mask which selects the low four bits of each byte.
.Llow4_mask:
.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.text
____

print $code;
close STDOUT or die "error closing STDOUT: $!";
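
# A hedged usage sketch of the generated functions from C, based only on the
# prototypes in the comments above. Building the transposed |Htable| is the
# key setup's job (gcm_init_ssse3 in BoringSSL) and is not shown here:
#
#   uint64_t Xi[2] = {0};         // running GHASH state, serialized bytes
#   u128 Htable[16];              // transposed table, filled in elsewhere
#   gcm_ghash_ssse3(Xi, Htable, buf, buf_len & ~(size_t)15); // whole blocks
#   gcm_gmult_ssse3(Xi, Htable);  // one extra multiply by H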