/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CHACHA20

.text

// Rotate lanes so the state moves between column and diagonal form:
// the d rows (VREGx4) by three words, the b rows (VREGx2) by one word.
.macro CHA512_EXTA
    VEXT2 VREG04.16b, VREG14.16b, #12
    VEXT2 VREG24.16b, VREG34.16b, #12
    VEXT2 VREG44.16b, VREG54.16b, #12
    VEXT2 VREG02.16b, VREG12.16b, #4
    VEXT2 VREG22.16b, VREG32.16b, #4
    VEXT2 VREG42.16b, VREG52.16b, #4
.endm

// Inverse lane rotation of CHA512_EXTA: d rows by one word, b rows by three.
.macro CHA512_EXTB
    VEXT2 VREG04.16b, VREG14.16b, #4
    VEXT2 VREG24.16b, VREG34.16b, #4
    VEXT2 VREG44.16b, VREG54.16b, #4
    VEXT2 VREG02.16b, VREG12.16b, #12
    VEXT2 VREG22.16b, VREG32.16b, #12
    VEXT2 VREG42.16b, VREG52.16b, #12
.endm

// Fill the six vector working copies of the state. All copies share the
// constant row (VSIGMA) and the two key rows (VKEY01/VKEY02); only the
// counter rows differ, covering counters +2 through +7.
.macro CHA512_SET_VDATA
    mov VREG01.16b, VSIGMA.16b
    mov VREG11.16b, VSIGMA.16b
    mov VREG21.16b, VSIGMA.16b
    mov VREG31.16b, VSIGMA.16b
    mov VREG41.16b, VSIGMA.16b
    mov VREG51.16b, VSIGMA.16b
    mov VREG02.16b, VKEY01.16b
    mov VREG12.16b, VKEY01.16b
    mov VREG22.16b, VKEY01.16b
    mov VREG32.16b, VKEY01.16b
    mov VREG42.16b, VKEY01.16b
    mov VREG52.16b, VKEY01.16b
    mov VREG03.16b, VKEY02.16b
    mov VREG13.16b, VKEY02.16b
    mov VREG23.16b, VKEY02.16b
    mov VREG33.16b, VKEY02.16b
    mov VREG43.16b, VKEY02.16b
    mov VREG53.16b, VKEY02.16b
    mov VREG04.16b, VCUR01.16b              // Counter + 2
    mov VREG14.16b, VCUR02.16b              // Counter + 3
    mov VREG24.16b, VCUR03.16b              // Counter + 4
    mov VREG34.16b, VCUR04.16b              // Counter + 5
    add VREG44.4s, VREG04.4s, VADDER.4s     // Counter + 6 = 2 + 4
    add VREG54.4s, VREG14.4s, VADDER.4s     // Counter + 7 = 3 + 4
.endm

// After the rounds are complete, add the original input state back into each
// working copy (the ChaCha20 feed-forward).
.macro CHA512_ROUND_END
    add VREG01.4s, VREG01.4s, VSIGMA.4s     // Add back the constant row.
    add VREG11.4s, VREG11.4s, VSIGMA.4s
    add VREG21.4s, VREG21.4s, VSIGMA.4s
    add VREG31.4s, VREG31.4s, VSIGMA.4s
    add VREG41.4s, VREG41.4s, VSIGMA.4s
    add VREG51.4s, VREG51.4s, VSIGMA.4s
    add VREG02.4s, VREG02.4s, VKEY01.4s     // Add back the first key row.
    add VREG12.4s, VREG12.4s, VKEY01.4s
    add VREG22.4s, VREG22.4s, VKEY01.4s
    add VREG32.4s, VREG32.4s, VKEY01.4s
    add VREG42.4s, VREG42.4s, VKEY01.4s
    add VREG52.4s, VREG52.4s, VKEY01.4s
    add VREG03.4s, VREG03.4s, VKEY02.4s     // Add back the second key row.
    add VREG13.4s, VREG13.4s, VKEY02.4s
    add VREG23.4s, VREG23.4s, VKEY02.4s
    add VREG33.4s, VREG33.4s, VKEY02.4s
    add VREG43.4s, VREG43.4s, VKEY02.4s
    add VREG53.4s, VREG53.4s, VKEY02.4s
    add VREG44.4s, VREG44.4s, VCUR01.4s     // + (counter + 2)
    add VREG54.4s, VREG54.4s, VCUR02.4s     // + (counter + 3)
    add VREG04.4s, VREG04.4s, VCUR01.4s     // + (counter + 2)
    add VREG14.4s, VREG14.4s, VCUR02.4s     // + (counter + 3)
    add VREG24.4s, VREG24.4s, VCUR03.4s     // + (counter + 4)
    add VREG34.4s, VREG34.4s, VCUR04.4s     // + (counter + 5)
    add VREG44.4s, VREG44.4s, VADDER.4s     // + 4 -> counter + 6
    add VREG54.4s, VREG54.4s, VADDER.4s     // + 4 -> counter + 7
.endm
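// Presumably the general-register WCHA_* path produces the two leading blocks
// (counters +0/+1), which is why the vector copies start at counter +2; this
// macro handles only the six vector blocks. It XORs 384 bytes of keystream
// with the input, stores the result, and steps every counter vector by 8,
// the total number of blocks in one 512-byte batch. Loads, XORs, and stores
// are interleaved so the load/store unit and the SIMD pipeline overlap.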
.macro CHA512_WRITE_BACK
    ld1 {VCUR01.16b, VCUR02.16b, VCUR03.16b, VCUR04.16b}, [REGINC], #64     // Load 64 bytes.
    eor VREG01.16b, VREG01.16b, VCUR01.16b
    eor VREG02.16b, VREG02.16b, VCUR02.16b
    eor VREG03.16b, VREG03.16b, VCUR03.16b
    eor VREG04.16b, VREG04.16b, VCUR04.16b
    ld1 {VCUR01.16b, VCUR02.16b, VCUR03.16b, VCUR04.16b}, [REGINC], #64     // Load 64 bytes.
    st1 {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGOUT], #64     // Write 64 bytes.
    eor VREG11.16b, VREG11.16b, VCUR01.16b
    eor VREG12.16b, VREG12.16b, VCUR02.16b
    eor VREG13.16b, VREG13.16b, VCUR03.16b
    eor VREG14.16b, VREG14.16b, VCUR04.16b
    ld1 {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGINC], #64     // Load 64 bytes.
    st1 {VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b}, [REGOUT], #64     // Write 64 bytes.
    eor VREG21.16b, VREG21.16b, VREG01.16b
    eor VREG22.16b, VREG22.16b, VREG02.16b
    eor VREG23.16b, VREG23.16b, VREG03.16b
    eor VREG24.16b, VREG24.16b, VREG04.16b
    ld1 {VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b}, [REGINC], #64     // Load 64 bytes.
    st1 {VREG21.16b, VREG22.16b, VREG23.16b, VREG24.16b}, [REGOUT], #64     // Write 64 bytes.
    eor VREG31.16b, VREG31.16b, VREG11.16b
    eor VREG32.16b, VREG32.16b, VREG12.16b
    eor VREG33.16b, VREG33.16b, VREG13.16b
    eor VREG34.16b, VREG34.16b, VREG14.16b
    ld1 {VREG01.16b, VREG02.16b, VREG03.16b, VREG04.16b}, [REGINC], #64     // Load 64 bytes.
    st1 {VREG31.16b, VREG32.16b, VREG33.16b, VREG34.16b}, [REGOUT], #64     // Write 64 bytes.
    shl VREG21.4s, VADDER.4s, #1                                            // 4 -> 8: counter step for a full batch.
    eor VREG41.16b, VREG41.16b, VREG01.16b
    eor VREG42.16b, VREG42.16b, VREG02.16b
    eor VREG43.16b, VREG43.16b, VREG03.16b
    eor VREG44.16b, VREG44.16b, VREG04.16b
    ld1 {VREG11.16b, VREG12.16b, VREG13.16b, VREG14.16b}, [REGINC], #64     // Load 64 bytes.
    st1 {VREG41.16b, VREG42.16b, VREG43.16b, VREG44.16b}, [REGOUT], #64     // Write 64 bytes.
    ldp QCUR01, QCUR02, [sp, #32]                                           // Restore the saved counter vectors.
    ldp QCUR03, QCUR04, [sp, #64]
    eor VREG51.16b, VREG51.16b, VREG11.16b
    eor VREG52.16b, VREG52.16b, VREG12.16b
    eor VREG53.16b, VREG53.16b, VREG13.16b
    eor VREG54.16b, VREG54.16b, VREG14.16b
    st1 {VREG51.16b, VREG52.16b, VREG53.16b, VREG54.16b}, [REGOUT], #64     // Write 64 bytes.
    add VCUR01.4s, VCUR01.4s, VREG21.4s                                     // Advance each counter vector by 8 blocks.
    add VCUR02.4s, VCUR02.4s, VREG21.4s
    add VCUR03.4s, VCUR03.4s, VREG21.4s
    add VCUR04.4s, VCUR04.4s, VREG21.4s
.endm
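// One ChaCha20 quarter-round pass over the six vector state copies,
// interleaved with the general-register WCHA_* macros that advance further
// blocks through the scalar pipeline. For reference, the quarter round
// (RFC 8439) applied to every column is:
//     a += b; d ^= a; d <<<= 16;
//     c += d; b ^= c; b <<<= 12;
//     a += b; d ^= a; d <<<= 8;
//     c += d; b ^= c; b <<<= 7;
// NEON has no 32-bit rotate, so <<< 12, 8, and 7 are composed from ushr
// (logical shift right by 32 - n) plus sli (shift left by n and insert),
// while <<< 16 uses rev32 on 16-bit lanes; the scalar path simply uses ror
// by 32 - n. The trailing VEXT2 on the c rows rotates them by two words as
// part of the column/diagonal lane reshuffle.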
.macro CHA512_ROUND
    WCHA_ADD_A_B                                            // a += b
    VADD2 VREG02.4s, VREG01.4s, VREG12.4s, VREG11.4s        // a[0,1,2,3] += b[4,5,6,7]
    VADD2 VREG22.4s, VREG21.4s, VREG32.4s, VREG31.4s
    WCHA_EOR_D_A                                            // d ^= a
    VADD2 VREG42.4s, VREG41.4s, VREG52.4s, VREG51.4s
    VEOR2 VREG01.16b, VREG04.16b, VREG11.16b, VREG14.16b    // d[12,13,14,15] ^= a[0,1,2,3]
    WCHA_ROR_D #16                                          // d <<<= 16 (ror #16; identical either direction for 32-bit words)
    VEOR2 VREG21.16b, VREG24.16b, VREG31.16b, VREG34.16b
    VEOR2 VREG41.16b, VREG44.16b, VREG51.16b, VREG54.16b
    WCHA_ADD_C_D                                            // c += d
    VREV322 VREG04.8h, VREG14.8h                            // d[12,13,14,15] <<<= 16 (rev32 swaps the 16-bit halves)
    VREV322 VREG24.8h, VREG34.8h
    WCHA_EOR_B_C                                            // b ^= c
    VREV322 VREG44.8h, VREG54.8h
    VADD2 VREG04.4s, VREG03.4s, VREG14.4s, VREG13.4s        // c[8,9,10,11] += d[12,13,14,15]
    WCHA_ROR_B #20                                          // b <<<= 12 (ror #20)
    VADD2 VREG24.4s, VREG23.4s, VREG34.4s, VREG33.4s
    VADD2 VREG44.4s, VREG43.4s, VREG54.4s, VREG53.4s
    WCHA_ADD_A_B                                            // a += b
    VEORX VREG03.16b, VREG02.16b, VCUR01.16b, VREG13.16b, VREG12.16b, VCUR02.16b    // m = b[4,5,6,7] ^ c[8,9,10,11]
    VEORX VREG23.16b, VREG22.16b, VCUR03.16b, VREG33.16b, VREG32.16b, VCUR04.16b
    WCHA_EOR_D_A                                            // d ^= a
    VEORX VREG43.16b, VREG42.16b, VCUR05.16b, VREG53.16b, VREG52.16b, VCUR06.16b
    VUSHR2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #20  // b[4,5,6,7] = m >> 20
    WCHA_ROR_D #24                                          // d <<<= 8 (ror #24)
    VUSHR2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #20
    VUSHR2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #20
    WCHA_ADD_C_D                                            // c += d
    VSLI2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #12   // b[4,5,6,7] |= m << 12, completing b <<<= 12
    VSLI2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #12
    WCHA_EOR_B_C                                            // b ^= c
    VSLI2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #12
    VADD2 VREG02.4s, VREG01.4s, VREG12.4s, VREG11.4s        // a[0,1,2,3] += b[4,5,6,7]
    WCHA_ROR_B #25                                          // b <<<= 7 (ror #25)
    VADD2 VREG22.4s, VREG21.4s, VREG32.4s, VREG31.4s
    VADD2 VREG42.4s, VREG41.4s, VREG52.4s, VREG51.4s
    WCHA_ADD2_A_B                                           // a += b
    VEORX VREG04.16b, VREG01.16b, VCUR01.16b, VREG14.16b, VREG11.16b, VCUR02.16b    // m = d[12,13,14,15] ^ a[0,1,2,3]
    VEORX VREG24.16b, VREG21.16b, VCUR03.16b, VREG34.16b, VREG31.16b, VCUR04.16b
    WCHA_EOR2_D_A                                           // d ^= a
    VEORX VREG44.16b, VREG41.16b, VCUR05.16b, VREG54.16b, VREG51.16b, VCUR06.16b
    VUSHR2 VCUR01.4s, VREG04.4s, VCUR02.4s, VREG14.4s, #24  // d[12,13,14,15] = m >> 24
    WCHA_ROR_D #16                                          // d <<<= 16 (ror #16)
    VUSHR2 VCUR03.4s, VREG24.4s, VCUR04.4s, VREG34.4s, #24
    VUSHR2 VCUR05.4s, VREG44.4s, VCUR06.4s, VREG54.4s, #24
    WCHA_ADD2_C_D                                           // c += d
    VSLI2 VCUR01.4s, VREG04.4s, VCUR02.4s, VREG14.4s, #8    // d[12,13,14,15] |= m << 8, completing d <<<= 8
    VSLI2 VCUR03.4s, VREG24.4s, VCUR04.4s, VREG34.4s, #8
    WCHA_EOR2_B_C                                           // b ^= c
    VSLI2 VCUR05.4s, VREG44.4s, VCUR06.4s, VREG54.4s, #8
    VADD2 VREG04.4s, VREG03.4s, VREG14.4s, VREG13.4s        // c[8,9,10,11] += d[12,13,14,15]
    WCHA_ROR_B #20                                          // b <<<= 12 (ror #20)
    VADD2 VREG24.4s, VREG23.4s, VREG34.4s, VREG33.4s
    VADD2 VREG44.4s, VREG43.4s, VREG54.4s, VREG53.4s
    WCHA_ADD2_A_B                                           // a += b
    VEORX VREG03.16b, VREG02.16b, VCUR01.16b, VREG13.16b, VREG12.16b, VCUR02.16b    // m = b[4,5,6,7] ^ c[8,9,10,11]
    VEORX VREG23.16b, VREG22.16b, VCUR03.16b, VREG33.16b, VREG32.16b, VCUR04.16b
    WCHA_EOR2_D_A                                           // d ^= a
    VEORX VREG43.16b, VREG42.16b, VCUR05.16b, VREG53.16b, VREG52.16b, VCUR06.16b
    VUSHR2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #25  // b[4,5,6,7] = m >> 25
    WCHA_ROR_D #24                                          // d <<<= 8 (ror #24)
    VUSHR2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #25
    VUSHR2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #25
    WCHA_ADD2_C_D                                           // c += d
    VSLI2 VCUR01.4s, VREG02.4s, VCUR02.4s, VREG12.4s, #7    // b[4,5,6,7] |= m << 7, completing b <<<= 7
    VSLI2 VCUR03.4s, VREG22.4s, VCUR04.4s, VREG32.4s, #7
    WCHA_EOR2_B_C                                           // b ^= c
    VSLI2 VCUR05.4s, VREG42.4s, VCUR06.4s, VREG52.4s, #7
    VEXT2 VREG03.16b, VREG13.16b, #8                        // Rotate the c rows by two words.
    WCHA_ROR_B #25                                          // b <<<= 7 (ror #25)
    VEXT2 VREG23.16b, VREG33.16b, #8
    VEXT2 VREG43.16b, VREG53.16b, #8
.endm

#endif