Lines Matching +full:16 +full:- +full:bits
2 # Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
50 # /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
68 movdqu \offset+16(buf), %xmm12
95 # Assumes len >= 16.
97 .align 16
108 movdqu 16*0(buf), %xmm0
109 movdqu 16*1(buf), %xmm1
110 movdqu 16*2(buf), %xmm2
111 movdqu 16*3(buf), %xmm3
112 movdqu 16*4(buf), %xmm4
113 movdqu 16*5(buf), %xmm5
114 movdqu 16*6(buf), %xmm6
115 movdqu 16*7(buf), %xmm7
126 # XOR the first 16 data *bits* with the initial CRC value.
137 # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
138 # bytes xmm0-7 into them, storing the result back into xmm0-7.
148 # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
160 # Fold across 16 bytes.
166 # Then subtract 16 to simplify the termination condition of the
168 add $128-16, len
170 # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
181 add $16, buf
182 sub $16, len
186 # Add 16 to get the correct number of data bytes remaining in 0...15
187 # (not counting xmm7), following the previous extra subtraction by 16.
188 add $16, len
192 # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
195 # the bytes into a first chunk of 'len' bytes and a second chunk of 16
200 # xmm1 = last 16 original data bytes
201 movdqu -16(buf, len), %xmm1
204 # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
205 lea .Lbyteshift_table+16(%rip), %rax
210 # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
214 # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
215 # then '16-len' bytes from xmm2 (high-order bytes).
226 # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
231 # Fold the high 64 bits into the low 64 bits, while also multiplying by
232 # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
233 # whose low 48 bits are 0.
235 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
237 pxor %xmm0, %xmm7 # + low bits * x^64
239 # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
240 # value congruent to x^64 * M(x) and whose low 48 bits are 0.
242 pand .Lmask2(%rip), %xmm0 # zero high 32 bits
243 psrldq $12, %xmm7 # extract high 32 bits
244 pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
245 pxor %xmm0, %xmm7 # + low bits
252 pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
256 pxor %xmm7, %xmm0 # + low 16 nonzero bits
257 # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
262 .align 16
264 # Checksumming a buffer of length 16...255 bytes
266 # Load the first 16 data bytes.
269 add $16, buf
271 # XOR the first 16 data *bits* with the initial CRC value.
277 cmp $16, len
278 je .Lreduce_final_16_bytes # len == 16
281 add $16, len
286 .align 16
289 # G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
309 .section .rodata.cst16.mask1, "aM", @progbits, 16
310 .align 16
314 .section .rodata.cst16.mask2, "aM", @progbits, 16
315 .align 16
319 .section .rodata.cst16.bswap_mask, "aM", @progbits, 16
320 .align 16
325 .align 16
326 # For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
328 # 0x80} XOR the index vector to shift right by '16 - len' bytes.