/*
 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
 *
 * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
 * downloaded from:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
 *
 * Authors:
 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
 *	James Guilford <james.guilford@intel.com>
 *	David Cote <david.m.cote@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <asm/inst.h>
#include <linux/linkage.h>
#include <asm/nospec-branch.h>

## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction

/* Emit the label "<prefix><n>:" (used with .altmacro so n can be %expr). */
.macro LABEL prefix n
\prefix\n\():
.endm

/* Emit one 16-bit jump table entry: distance from crc_array to crc_<i>. */
.macro JMPTBL_ENTRY i
.word crc_\i - crc_array
.endm

/* Branch to less_than_<j> when the carry flag is clear. */
.macro JNC_LESS_THAN j
	jnc less_than_\j
.endm

/*
 * Threshold below which buffers are considered "small" and routed to the
 * more efficient "by-1" code.  The "by-1" code only handles up to 255
 * bytes, so SMALL_SIZE can be no larger than 255.
 */

#define SMALL_SIZE 200

.if (SMALL_SIZE > 255)
.error "SMALL_ SIZE must be < 256"
.endif

/*
 * unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
 *
 * In:    %rdi = buffer
 *        %esi = len, handled as an unsigned 32-bit byte count
 *        %edx = crc_init
 * Out:   %rax = updated CRC (zero-extended 32-bit value)
 * Saved: %rbx, %rdi, %rsi are pushed on entry and popped before ret
 * Clobb: %rcx, %rdx, %r8-%r11, %xmm0-%xmm2, flags
 *
 * Outline:
 *   1) consume leading bytes until the buffer pointer is 8-byte aligned
 *   2) split the data into three equal blocks (up to 128 quadwords each)
 *      and run three crc32q chains in parallel by entering crc_array at
 *      the right depth
 *   3) merge the three partial CRCs with PCLMULQDQ and K_table constants
 *   4) finish the tail (< SMALL_SIZE or < 24 bytes) by testing the length
 *      one bit at a time
 *
 * The crc32 instructions do not modify the flags, so every "jz do_return"
 * below tests the ZF left by the preceding shl of len_dw.
 */

.text
ENTRY(crc_pcl)
#define    bufp			%rdi
#define    bufp_dw		%edi
#define    bufp_w		%di
#define    bufp_b		%dil
#define    bufptmp		%rcx
#define    block_0		%rcx
#define    block_1		%rdx
#define    block_2		%r11
#define    len			%rsi
#define    len_dw		%esi
#define    len_w		%si
#define    len_b		%sil
#define    crc_init_arg		%rdx
#define    crc_init_arg_dw	%edx
#define    tmp			%rbx
#define    crc_init		%r8
#define    crc_init_dw		%r8d
#define    crc1			%r9
#define    crc2			%r10

	pushq   %rbx
	pushq   %rdi
	pushq   %rsi

	/*
	 * Both the byte count and the initial CRC are 32-bit C arguments;
	 * the calling convention does not define the upper halves of the
	 * registers that carry them.  Writing a 32-bit register clears
	 * bits 63:32, so these two moves make every later 64-bit use of
	 * the byte count and the CRC accumulator well defined.
	 */
	movl    len_dw, len_dw			# zero-extend the byte count
	movl    crc_init_arg_dw, crc_init_dw	# %rdx is reused as block_1

	################################################################
	## 1) ALIGN:
	################################################################

	mov     bufp, bufptmp		# rdi = *buf
	neg     bufp
	and     $7, bufp		# calculate the unalignment amount of
					# the address
	je      proc_block		# Skip if aligned

	## If fewer than 8 bytes remain and the pointer is unaligned, jump
	## to special code to avoid reading beyond the end of the buffer
	cmp     $8, len
	jae     do_align
	# less_than_8 expects length in upper 3 bits of len_dw
	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
	shl     $32-3+1, len_dw
	jmp     less_than_8_post_shl1

do_align:
	#### Calculate CRC of unaligned bytes of the buffer (if any)
	movq    (bufptmp), tmp		# load a quadword from the buffer
	add     bufp, bufptmp		# align buffer pointer for quadword
					# processing
	sub     bufp, len		# update buffer length
align_loop:
	crc32b  %bl, crc_init_dw	# compute crc32 of 1-byte
	shr     $8, tmp			# get next byte
	dec     bufp
	jne     align_loop

proc_block:

	################################################################
	## 2) PROCESS  BLOCKS:
	################################################################

	## compute num of bytes to be processed
	movq    len, tmp		# save num bytes in tmp

	cmpq    $128*24, len
	jae     full_block

continue_block:
	cmpq    $SMALL_SIZE, len
	jb      small

	## len < 128*24
	movq    $2731, %rax		# 2731 = ceil(2^16 / 24)
	mul     len_dw
	shrq    $16, %rax

	## eax contains floor(bytes / 24) = num 24-byte chunks to do

	## process rax 24-byte chunks (128 >= rax >= 0)

	## compute end address of each block
	## block 0 (base addr + RAX * 8)
	## block 1 (base addr + RAX * 16)
	## block 2 (base addr + RAX * 24)
	lea     (bufptmp, %rax, 8), block_0
	lea     (block_0, %rax, 8), block_1
	lea     (block_1, %rax, 8), block_2

	xor     crc1, crc1
	xor     crc2, crc2

	## branch into array
	lea     jump_table(%rip), bufp
	movzxw  (bufp, %rax, 2), len
	offset=crc_array-jump_table
	lea     offset(bufp, len, 1), bufp
	JMP_NOSPEC bufp

	################################################################
	## 2a) PROCESS FULL BLOCKS:
	################################################################
full_block:
	movl    $128,%eax
	lea     128*8*2(block_0), block_1
	lea     128*8*3(block_0), block_2
	add     $128*8*1, block_0

	xor     crc1,crc1
	xor     crc2,crc2

	# Fall through into top of crc array (crc_128)

	################################################################
	## 3) CRC Array:
	################################################################

crc_array:
	i=128
.rept 128-1
.altmacro
LABEL crc_ %i
.noaltmacro
	crc32q   -i*8(block_0), crc_init
	crc32q   -i*8(block_1), crc1
	crc32q   -i*8(block_2), crc2
	i=(i-1)
.endr

.altmacro
LABEL crc_ %i
.noaltmacro
	crc32q   -i*8(block_0), crc_init
	crc32q   -i*8(block_1), crc1
	/*
	 * The last quadword of block 2 is deliberately not fed to crc2
	 * here; it is XORed into the folded value in step 4 instead.
	 */

	mov     block_2, block_0

	################################################################
	## 4) Combine three results:
	################################################################

	lea     (K_table-8)(%rip), bufp		# first entry is for idx 1
	shlq    $3, %rax			# rax *= 8
	pmovzxdq (bufp,%rax), %xmm0		# 2 consts: K1:K2
	leal    (%eax,%eax,2), %eax		# rax *= 3 (total *24)
	subq    %rax, tmp			# tmp -= rax*24

	movq    crc_init, %xmm1			# CRC for block 1
	PCLMULQDQ 0x00,%xmm0,%xmm1		# Multiply by K2

	movq    crc1, %xmm2			# CRC for block 2
	PCLMULQDQ 0x10, %xmm0, %xmm2		# Multiply by K1

	pxor    %xmm2,%xmm1
	movq    %xmm1, %rax
	xor     -i*8(block_2), %rax
	mov     crc2, crc_init
	crc32   %rax, crc_init

	################################################################
	## 5) Check for end:
	################################################################

LABEL crc_ 0
	mov     tmp, len
	cmp     $128*24, tmp
	jae     full_block
	cmp     $24, tmp
	jae     continue_block

less_than_24:
	shl     $32-4, len_dw			# less_than_16 expects length
						# in upper 4 bits of len_dw
	jnc     less_than_16
	crc32q  (bufptmp), crc_init
	crc32q  8(bufptmp), crc_init
	jz      do_return
	add     $16, bufptmp
	# len is less than 8 if we got here
	# less_than_8 expects length in upper 3 bits of len_dw
	# less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
	shl     $2, len_dw
	jmp     less_than_8_post_shl1

	#######################################################################
	## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
	#######################################################################
small:
	shl     $32-8, len_dw		# Prepare len_dw for less_than_256
	j=256
.rept 5					# j = {256, 128, 64, 32, 16}
.altmacro
LABEL less_than_ %j			# less_than_j: Length should be in
					# upper lg(j) bits of len_dw
	j=(j/2)
	shl     $1, len_dw		# Get next MSB
	JNC_LESS_THAN %j
.noaltmacro
	i=0
.rept (j/8)
	crc32q  i(bufptmp), crc_init	# Compute crc32 of 8-byte data
	i=i+8
.endr
	jz      do_return		# Return if remaining length is zero
	add     $j, bufptmp		# Advance buf
.endr

less_than_8:				# Length should be stored in
					# upper 3 bits of len_dw
	shl     $1, len_dw
less_than_8_post_shl1:
	jnc     less_than_4
	crc32l  (bufptmp), crc_init_dw	# CRC of 4 bytes
	jz      do_return		# return if remaining data is zero
	add     $4, bufptmp
less_than_4:				# Length should be stored in
					# upper 2 bits of len_dw
	shl     $1, len_dw
	jnc     less_than_2
	crc32w  (bufptmp), crc_init_dw	# CRC of 2 bytes
	jz      do_return		# return if remaining data is zero
	add     $2, bufptmp
less_than_2:				# Length should be stored in the MSB
					# of len_dw
	shl     $1, len_dw
	jnc     less_than_1
	crc32b  (bufptmp), crc_init_dw	# CRC of 1 byte
less_than_1:				# Length should be zero
do_return:
	movq    crc_init, %rax
	popq    %rsi
	popq    %rdi
	popq    %rbx
	ret

	################################################################
	## jump table        Table is 129 entries x 2 bytes each
	################################################################
.align 4
jump_table:
	i=0
.rept 129
.altmacro
JMPTBL_ENTRY %i
.noaltmacro
	i=i+1
.endr

ENDPROC(crc_pcl)

330 ################################################################ 331 ## PCLMULQDQ tables 332 ## Table is 128 entries x 2 words (8 bytes) each 333 ################################################################ 334.section .rodata, "a", %progbits 335.align 8 336K_table: 337 .long 0x493c7d27, 0x00000001 338 .long 0xba4fc28e, 0x493c7d27 339 .long 0xddc0152b, 0xf20c0dfe 340 .long 0x9e4addf8, 0xba4fc28e 341 .long 0x39d3b296, 0x3da6d0cb 342 .long 0x0715ce53, 0xddc0152b 343 .long 0x47db8317, 0x1c291d04 344 .long 0x0d3b6092, 0x9e4addf8 345 .long 0xc96cfdc0, 0x740eef02 346 .long 0x878a92a7, 0x39d3b296 347 .long 0xdaece73e, 0x083a6eec 348 .long 0xab7aff2a, 0x0715ce53 349 .long 0x2162d385, 0xc49f4f67 350 .long 0x83348832, 0x47db8317 351 .long 0x299847d5, 0x2ad91c30 352 .long 0xb9e02b86, 0x0d3b6092 353 .long 0x18b33a4e, 0x6992cea2 354 .long 0xb6dd949b, 0xc96cfdc0 355 .long 0x78d9ccb7, 0x7e908048 356 .long 0xbac2fd7b, 0x878a92a7 357 .long 0xa60ce07b, 0x1b3d8f29 358 .long 0xce7f39f4, 0xdaece73e 359 .long 0x61d82e56, 0xf1d0f55e 360 .long 0xd270f1a2, 0xab7aff2a 361 .long 0xc619809d, 0xa87ab8a8 362 .long 0x2b3cac5d, 0x2162d385 363 .long 0x65863b64, 0x8462d800 364 .long 0x1b03397f, 0x83348832 365 .long 0xebb883bd, 0x71d111a8 366 .long 0xb3e32c28, 0x299847d5 367 .long 0x064f7f26, 0xffd852c6 368 .long 0xdd7e3b0c, 0xb9e02b86 369 .long 0xf285651c, 0xdcb17aa4 370 .long 0x10746f3c, 0x18b33a4e 371 .long 0xc7a68855, 0xf37c5aee 372 .long 0x271d9844, 0xb6dd949b 373 .long 0x8e766a0c, 0x6051d5a2 374 .long 0x93a5f730, 0x78d9ccb7 375 .long 0x6cb08e5c, 0x18b0d4ff 376 .long 0x6b749fb2, 0xbac2fd7b 377 .long 0x1393e203, 0x21f3d99c 378 .long 0xcec3662e, 0xa60ce07b 379 .long 0x96c515bb, 0x8f158014 380 .long 0xe6fc4e6a, 0xce7f39f4 381 .long 0x8227bb8a, 0xa00457f7 382 .long 0xb0cd4768, 0x61d82e56 383 .long 0x39c7ff35, 0x8d6d2c43 384 .long 0xd7a4825c, 0xd270f1a2 385 .long 0x0ab3844b, 0x00ac29cf 386 .long 0x0167d312, 0xc619809d 387 .long 0xf6076544, 0xe9adf796 388 .long 0x26f6a60a, 0x2b3cac5d 389 
.long 0xa741c1bf, 0x96638b34 390 .long 0x98d8d9cb, 0x65863b64 391 .long 0x49c3cc9c, 0xe0e9f351 392 .long 0x68bce87a, 0x1b03397f 393 .long 0x57a3d037, 0x9af01f2d 394 .long 0x6956fc3b, 0xebb883bd 395 .long 0x42d98888, 0x2cff42cf 396 .long 0x3771e98f, 0xb3e32c28 397 .long 0xb42ae3d9, 0x88f25a3a 398 .long 0x2178513a, 0x064f7f26 399 .long 0xe0ac139e, 0x4e36f0b0 400 .long 0x170076fa, 0xdd7e3b0c 401 .long 0x444dd413, 0xbd6f81f8 402 .long 0x6f345e45, 0xf285651c 403 .long 0x41d17b64, 0x91c9bd4b 404 .long 0xff0dba97, 0x10746f3c 405 .long 0xa2b73df1, 0x885f087b 406 .long 0xf872e54c, 0xc7a68855 407 .long 0x1e41e9fc, 0x4c144932 408 .long 0x86d8e4d2, 0x271d9844 409 .long 0x651bd98b, 0x52148f02 410 .long 0x5bb8f1bc, 0x8e766a0c 411 .long 0xa90fd27a, 0xa3c6f37a 412 .long 0xb3af077a, 0x93a5f730 413 .long 0x4984d782, 0xd7c0557f 414 .long 0xca6ef3ac, 0x6cb08e5c 415 .long 0x234e0b26, 0x63ded06a 416 .long 0xdd66cbbb, 0x6b749fb2 417 .long 0x4597456a, 0x4d56973c 418 .long 0xe9e28eb4, 0x1393e203 419 .long 0x7b3ff57a, 0x9669c9df 420 .long 0xc9c8b782, 0xcec3662e 421 .long 0x3f70cc6f, 0xe417f38a 422 .long 0x93e106a4, 0x96c515bb 423 .long 0x62ec6c6d, 0x4b9e0f71 424 .long 0xd813b325, 0xe6fc4e6a 425 .long 0x0df04680, 0xd104b8fc 426 .long 0x2342001e, 0x8227bb8a 427 .long 0x0a2a8d7e, 0x5b397730 428 .long 0x6d9a4957, 0xb0cd4768 429 .long 0xe8b6368b, 0xe78eb416 430 .long 0xd2c3ed1a, 0x39c7ff35 431 .long 0x995a5724, 0x61ff0e01 432 .long 0x9ef68d35, 0xd7a4825c 433 .long 0x0c139b31, 0x8d96551c 434 .long 0xf2271e60, 0x0ab3844b 435 .long 0x0b0bf8ca, 0x0bf80dd2 436 .long 0x2664fd8b, 0x0167d312 437 .long 0xed64812d, 0x8821abed 438 .long 0x02ee03b2, 0xf6076544 439 .long 0x8604ae0f, 0x6a45d2b2 440 .long 0x363bd6b3, 0x26f6a60a 441 .long 0x135c83fd, 0xd8d26619 442 .long 0x5fabe670, 0xa741c1bf 443 .long 0x35ec3279, 0xde87806c 444 .long 0x00bcf5f6, 0x98d8d9cb 445 .long 0x8ae00689, 0x14338754 446 .long 0x17f27698, 0x49c3cc9c 447 .long 0x58ca5f00, 0x5bd2011f 448 .long 0xaa7c7ad5, 0x68bce87a 449 .long 0xb5cfca28, 
0xdd07448e 450 .long 0xded288f8, 0x57a3d037 451 .long 0x59f229bc, 0xdde8f5b9 452 .long 0x6d390dec, 0x6956fc3b 453 .long 0x37170390, 0xa3e3e02c 454 .long 0x6353c1cc, 0x42d98888 455 .long 0xc4584f5c, 0xd73c7bea 456 .long 0xf48642e9, 0x3771e98f 457 .long 0x531377e2, 0x80ff0093 458 .long 0xdd35bc8d, 0xb42ae3d9 459 .long 0xb25b29f2, 0x8fe4c34d 460 .long 0x9a5ede41, 0x2178513a 461 .long 0xa563905d, 0xdf99fc11 462 .long 0x45cddf4e, 0xe0ac139e 463 .long 0xacfa3103, 0x6c23e841 464 .long 0xa51b6135, 0x170076fa 465