/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 *  Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 *  Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.align		6
	.cpu		generic+crypto+crc

.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]'  << 1  = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]'  << 1  = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.octa		0x00000001c6e415960000000154442bd4

	/*
	 * [(x128+32 mod P(x) << 32)]'    << 1  = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]'    << 1  = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.octa		0x00000000ccaa009e00000001751997d0

	/*
	 * [(x64 mod P(x) << 32)]'        << 1  = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 *                                        = 0x1F7011641LL
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.octa		0x00000001F701164100000001DB710641

.Lcrc32c_constants:
	.octa		0x000000009e4addf800000000740eef02
	.octa		0x000000014cd00bd600000000f20c0dfe
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.octa		0x00000000dea713f10000000105ec76f0

	vCONSTANT	.req	v0
	dCONSTANT	.req	d0
	qCONSTANT	.req	q0

	BUF		.req	x0
	LEN		.req	x1
	CRC		.req	x2

	vzr		.req	v9

	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return crc32 result in w0
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 */
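	/*
	 * Illustrative usage sketch (a minimal sketch: the C prototypes
	 * below are assumptions inferred from the register interface
	 * described above, not declarations taken from this file):
	 *
	 *	u32 crc32_pmull_le(const u8 *buf, size_t len, u32 init_crc);
	 *	u32 crc32c_pmull_le(const u8 *buf, size_t len, u32 init_crc);
	 *
	 * Callers are expected to pass len > 63; the low four bits of len
	 * are ignored (bic LEN, LEN, #15), so any 1..15 byte tail has to
	 * be folded in separately, e.g. via the crc32_armv8_le() /
	 * crc32c_armv8_le() routines defined further down.
	 */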
ENTRY(crc32_pmull_le)
	adr		x3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr		x3, .Lcrc32c_constants

0:	bic		LEN, LEN, #15
	ld1		{v1.16b-v4.16b}, [BUF], #0x40
	movi		vzr.16b, #0
	fmov		dCONSTANT, CRC
	eor		v1.16b, v1.16b, vCONSTANT.16b
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	b.lt		less_64

	ldr		qCONSTANT, [x3]

loop_64:		/* 64 bytes Full cache line folding */
	sub		LEN, LEN, #0x40

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull2		v6.1q, v2.2d, vCONSTANT.2d
	pmull2		v7.1q, v3.2d, vCONSTANT.2d
	pmull2		v8.1q, v4.2d, vCONSTANT.2d

	pmull		v1.1q, v1.1d, vCONSTANT.1d
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	pmull		v3.1q, v3.1d, vCONSTANT.1d
	pmull		v4.1q, v4.1d, vCONSTANT.1d

	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [BUF], #0x10
	eor		v2.16b, v2.16b, v6.16b
	ld1		{v6.16b}, [BUF], #0x10
	eor		v3.16b, v3.16b, v7.16b
	ld1		{v7.16b}, [BUF], #0x10
	eor		v4.16b, v4.16b, v8.16b
	ld1		{v8.16b}, [BUF], #0x10

	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v8.16b

	cmp		LEN, #0x40
	b.ge		loop_64

less_64:		/* Folding cache line into 128bit */
	ldr		qCONSTANT, [x3, #16]

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v3.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v4.16b

	cbz		LEN, fold_64

loop_16:		/* Folding rest buffer into 128bit */
	subs		LEN, LEN, #0x10

	ld1		{v2.16b}, [BUF], #0x10
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	b.ne		loop_16

fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	ext		v2.16b, v1.16b, v1.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	ext		v1.16b, v1.16b, vzr.16b, #8
	eor		v1.16b, v1.16b, v2.16b

	/* final 32-bit fold */
	ldr		dCONSTANT, [x3, #32]
	ldr		d3, [x3, #40]

	ext		v2.16b, v1.16b, vzr.16b, #4
	and		v1.16b, v1.16b, v3.16b
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
	ldr		qCONSTANT, [x3, #48]

	and		v2.16b, v1.16b, v3.16b
	ext		v2.16b, vzr.16b, v2.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	and		v2.16b, v2.16b, v3.16b
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
	mov		w0, v1.s[1]

	ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)

	.macro		__crc32, c
0:	subs		x2, x2, #16
	b.mi		8f
	ldp		x3, x4, [x1], #16
CPU_BE(	rev		x3, x3		)
CPU_BE(	rev		x4, x4		)
	crc32\c\()x	w0, w0, x3
	crc32\c\()x	w0, w0, x4
	b.ne		0b
	ret

8:	tbz		x2, #3, 4f
	ldr		x3, [x1], #8
CPU_BE(	rev		x3, x3		)
	crc32\c\()x	w0, w0, x3
4:	tbz		x2, #2, 2f
	ldr		w3, [x1], #4
CPU_BE(	rev		w3, w3		)
	crc32\c\()w	w0, w0, w3
2:	tbz		x2, #1, 1f
	ldrh		w3, [x1], #2
CPU_BE(	rev16		w3, w3		)
	crc32\c\()h	w0, w0, w3
1:	tbz		x2, #0, 0f
	ldrb		w3, [x1]
	crc32\c\()b	w0, w0, w3
0:	ret
	.endm
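	/*
	 * The two entry points below simply instantiate the macro above:
	 * with no suffix it emits the crc32{x,w,h,b} instructions (CRC-32,
	 * reflected polynomial 0xEDB88320), and with the "c" suffix it
	 * emits crc32c{x,w,h,b} (CRC-32C, reflected polynomial 0x82F63B78).
	 * Each loop iteration consumes 16 bytes; the tail is handled by
	 * testing bits 3..0 of the remaining length, so e.g. a 23-byte
	 * buffer is consumed as 16 + 4 + 2 + 1 bytes.
	 */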
	.align		5
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

	.align		5
ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)