Lines matching "16" (query full:16) in the vpaes AArch64 assembly; the // comments carry the x86-64 instructions each line was translated from.
123 movi v17.16b, #0x0f
150 adrp x11, .Lk_mc_forward+16
151 add x11, x11, :lo12:.Lk_mc_forward+16
153 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
154 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
155 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
156 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
157 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
158 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
159 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
160 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
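Note: the entry sequence above is the basic vpaes trick: split every byte of the state into its low and high nibble (and with 0x0f, ushr #4) and use tbl as a 16-entry byte lookup, the AArch64 counterpart of the vpshufb calls in the comments. A minimal C sketch of what the and/ushr/tbl/eor chain computes; ipt_lo and ipt_hi are placeholders for the two halves of .Lk_ipt, whose actual values are omitted here.

#include <stdint.h>

/* out = ipt_lo[low nibble] ^ ipt_hi[high nibble] ^ round-0 key, per byte */
static void input_transform(uint8_t out[16], const uint8_t in[16],
                            const uint8_t ipt_lo[16], const uint8_t ipt_hi[16],
                            const uint8_t rk0[16])
{
    for (int i = 0; i < 16; i++) {
        uint8_t lo = in[i] & 0x0f;                 /* and  v1, v7, v17 (v17 = 0x0f) */
        uint8_t hi = in[i] >> 4;                   /* ushr v0, v7, #4               */
        out[i] = ipt_lo[lo] ^ ipt_hi[hi] ^ rk0[i]; /* tbl, tbl, eor, eor            */
    }
}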
167 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
168 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
169 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
170 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
171 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
172 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
173 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
175 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
176 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
177 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
178 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
179 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
180 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
182 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
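Note: in the round body above, v1 is loaded from .Lk_mc_forward and used as a tbl index vector; assuming the usual vpaes constants, it rotates each 4-byte column of the state by one byte (the .Lk_mc_backward load sits on a line that does not match this search and rotates the other way). Repeated rotate-and-XOR of the S-box outputs builds the 2A+3B+C+D combination named in the comments, which is the MixColumns layer. A sketch of the forward rotation only:

#include <stdint.h>

/* tbl with the first .Lk_mc_forward entry as index: rotate every 4-byte
 * column one byte forward; .Lk_mc_backward is the inverse rotation. */
static void mc_forward_rotate(uint8_t out[16], const uint8_t in[16])
{
    for (int i = 0; i < 16; i++)
        out[i] = in[(i & ~3) | ((i + 1) & 3)];
}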
187 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
188 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
189 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
190 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
191 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
192 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
193 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
194 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
195 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
196 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
197 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
198 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
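Note: the entry block above (comments k, i, j, a/k, 1/i, 1/j, iak, jak, io, jo) is the vpaes inversion step: the pair of values fed to the final S-box lookups is computed purely from 16-entry nibble tables. A dataflow sketch that mirrors the register comments; inv_ak and inv_1 stand for the lookups done through v19 and v18 (the two halves of .Lk_inv), whose contents are omitted, and the & 0x0f masks only keep the placeholder tables in range (the real tables already return nibbles).

#include <stdint.h>

static void enc_entry(uint8_t io[16], uint8_t jo[16], const uint8_t x[16],
                      const uint8_t inv_ak[16], const uint8_t inv_1[16])
{
    for (int n = 0; n < 16; n++) {
        uint8_t k   = x[n] & 0x0f;              /* and                 # 0 = k   */
        uint8_t i   = x[n] >> 4;                /* ushr                # 1 = i   */
        uint8_t ak  = inv_ak[k];                /* tbl {v19}, k        # 2 = a/k */
        uint8_t j   = k ^ i;                    /* eor                 # 0 = j   */
        uint8_t iak = (inv_1[i] ^ ak) & 0x0f;   /* tbl {v18}, i; eor   # 3 = iak */
        uint8_t jak = (inv_1[j] ^ ak) & 0x0f;   /* tbl {v18}, j; eor   # 4 = jak */
        io[n] = inv_1[iak] ^ j;                 /* tbl {v18}, iak; eor # 2 = io  */
        jo[n] = inv_1[jak] ^ i;                 /* tbl {v18}, jak; eor # 3 = jo  */
    }
}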
199 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
205 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
206 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
208 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
209 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
210 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
211 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
221 stp x29,x30,[sp,#-16]!
224 ld1 {v7.16b}, [x0]
227 st1 {v0.16b}, [x1]
229 ldp x29,x30,[sp],#16
239 adrp x11, .Lk_mc_forward+16
240 add x11, x11, :lo12:.Lk_mc_forward+16
242 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
243 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
244 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
245 and v9.16b, v15.16b, v17.16b
246 ushr v8.16b, v15.16b, #4
247 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
248 tbl v9.16b, {v20.16b}, v9.16b
249 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
250 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
251 tbl v10.16b, {v21.16b}, v8.16b
252 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
253 eor v8.16b, v9.16b, v16.16b
254 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
255 eor v8.16b, v8.16b, v10.16b
262 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
263 tbl v12.16b, {v25.16b}, v10.16b
264 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
265 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
266 tbl v8.16b, {v24.16b}, v11.16b
267 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
268 eor v12.16b, v12.16b, v16.16b
269 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
270 tbl v13.16b, {v27.16b}, v10.16b
271 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
272 eor v8.16b, v8.16b, v12.16b
273 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
274 tbl v10.16b, {v26.16b}, v11.16b
276 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
277 tbl v11.16b, {v8.16b}, v1.16b
278 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
279 eor v10.16b, v10.16b, v13.16b
280 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
281 tbl v8.16b, {v8.16b}, v4.16b
282 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
283 eor v11.16b, v11.16b, v10.16b
284 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
285 tbl v12.16b, {v11.16b},v1.16b
286 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
287 eor v8.16b, v8.16b, v11.16b
289 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
290 eor v8.16b, v8.16b, v12.16b
295 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
296 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
297 and v9.16b, v8.16b, v17.16b
298 ushr v8.16b, v8.16b, #4
299 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
300 tbl v13.16b, {v19.16b},v9.16b
301 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
302 eor v9.16b, v9.16b, v8.16b
303 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
304 tbl v11.16b, {v18.16b},v8.16b
305 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
306 tbl v12.16b, {v18.16b},v9.16b
307 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
308 eor v11.16b, v11.16b, v13.16b
309 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
310 eor v12.16b, v12.16b, v13.16b
311 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
312 tbl v10.16b, {v18.16b},v11.16b
313 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
314 tbl v11.16b, {v18.16b},v12.16b
315 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
316 eor v10.16b, v10.16b, v9.16b
317 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
318 eor v11.16b, v11.16b, v8.16b
319 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
325 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
326 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
327 tbl v12.16b, {v22.16b}, v10.16b
329 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
330 tbl v8.16b, {v23.16b}, v11.16b
331 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
332 eor v12.16b, v12.16b, v16.16b
333 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
334 eor v8.16b, v8.16b, v12.16b
335 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
336 tbl v1.16b, {v8.16b},v1.16b

345 movi v17.16b, #0x0f
376 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
377 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
378 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
379 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
381 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
382 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
383 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
384 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
394 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
395 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
396 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
398 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
401 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
402 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
403 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
404 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
406 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
409 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
410 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
411 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
412 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
414 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
417 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
418 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
419 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
420 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
421 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
422 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
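Note: between decrypt rounds the running value is shuffled through v5 (an .Lk_mc_forward-style vector, per the # MC ch comments) and v5 itself is then rotated by 12 bytes with ext, the AArch64 counterpart of the vpalignr $12 in the comment. A sketch of ext's byte selection, which with equal sources is a rotation:

#include <stdint.h>

/* ext vd, vn, vm, #imm: bytes imm..15 of vn followed by bytes 0..imm-1 of vm.
 * With vn == vm this rotates the vector; #12 advances the MC shuffle per round. */
static void ext_bytes(uint8_t d[16], const uint8_t n[16],
                      const uint8_t m[16], int imm)
{
    for (int i = 0; i < 16; i++)
        d[i] = (i + imm < 16) ? n[i + imm] : m[i + imm - 16];
}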
427 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
428 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
429 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
430 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
431 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
432 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
433 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
434 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
435 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
436 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
437 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
438 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
439 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
444 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
447 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
448 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
449 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
450 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
460 stp x29,x30,[sp,#-16]!
463 ld1 {v7.16b}, [x0]
466 st1 {v0.16b}, [x1]
468 ldp x29,x30,[sp],#16
490 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
491 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
492 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
493 and v9.16b, v15.16b, v17.16b
494 ushr v8.16b, v15.16b, #4
495 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
496 tbl v10.16b, {v20.16b},v9.16b
498 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
499 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
500 tbl v8.16b, {v21.16b},v8.16b
501 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
502 eor v10.16b, v10.16b, v16.16b
503 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
504 eor v8.16b, v8.16b, v10.16b
514 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
515 tbl v12.16b, {v24.16b}, v10.16b
516 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
517 tbl v9.16b, {v25.16b}, v11.16b
518 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
519 eor v8.16b, v12.16b, v16.16b
521 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
522 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
525 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
526 tbl v12.16b, {v26.16b}, v10.16b
527 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
528 tbl v8.16b, {v8.16b},v5.16b
529 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
530 tbl v9.16b, {v27.16b}, v11.16b
531 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
532 eor v8.16b, v8.16b, v12.16b
534 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
535 eor v8.16b, v8.16b, v9.16b
538 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
539 tbl v12.16b, {v28.16b}, v10.16b
540 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
541 tbl v8.16b, {v8.16b},v5.16b
542 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
543 tbl v9.16b, {v29.16b}, v11.16b
544 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
545 eor v8.16b, v8.16b, v12.16b
547 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
548 eor v8.16b, v8.16b, v9.16b
551 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
552 tbl v12.16b, {v30.16b}, v10.16b
553 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
554 tbl v8.16b, {v8.16b},v5.16b
555 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
556 tbl v9.16b, {v31.16b}, v11.16b
557 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
558 eor v8.16b, v8.16b, v12.16b
559 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
560 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
561 eor v8.16b, v8.16b, v9.16b
566 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
567 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
568 and v9.16b, v8.16b, v17.16b
569 ushr v8.16b, v8.16b, #4
570 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
571 tbl v10.16b, {v19.16b},v9.16b
572 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
573 eor v9.16b, v9.16b, v8.16b
574 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
575 tbl v11.16b, {v18.16b},v8.16b
576 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
577 tbl v12.16b, {v18.16b},v9.16b
578 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
579 eor v11.16b, v11.16b, v10.16b
580 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
581 eor v12.16b, v12.16b, v10.16b
582 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
583 tbl v10.16b, {v18.16b},v11.16b
584 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
585 tbl v11.16b, {v18.16b},v12.16b
586 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
587 eor v10.16b, v10.16b, v9.16b
588 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
589 eor v11.16b, v11.16b, v8.16b
590 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
595 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
596 tbl v12.16b, {v22.16b}, v10.16b
598 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
599 tbl v9.16b, {v23.16b}, v11.16b
601 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
602 eor v12.16b, v12.16b, v16.16b
603 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
604 eor v8.16b, v9.16b, v12.16b
605 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
606 tbl v1.16b, {v8.16b},v2.16b
619 movi v16.16b, #0x5b // .Lk_s63
622 movi v17.16b, #0x0f // .Lk_s0F
640 stp x29, x30, [sp,#-16]!
645 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
648 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
650 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
665 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
711 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
713 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
714 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
721 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
743 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
750 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
759 movi v4.16b, #0
760 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
761 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
763 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
790 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
794 sub x2, x2, #16 // add $-16, %rdx
795 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
800 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
801 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
802 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
803 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
804 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
805 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
806 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
807 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
808 ldp x29, x30, [sp],#16
830 movi v1.16b, #0
834 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
835 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
836 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
837 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
864 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
865 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
866 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
867 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
871 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
878 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
879 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
880 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
883 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
884 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
885 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
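Note: NEON has no direct pslldq/psrldq, so the schedule code shifts by whole bytes with ext against a register that was zeroed with movi (v4 above): ext v1, v4, v7, #12 is vpslldq $4 of v7, and ext v4, v4, v7, #8 is vpslldq $8. Together with the two eors this is the key-schedule smear. A sketch, assuming only the byte-shift reading of those ext lines:

#include <stdint.h>

/* w ^= (w << 4 bytes); w ^= (w << 8 bytes) -- the smear step */
static void smear(uint8_t w[16])
{
    uint8_t t[16] = {0}, u[16] = {0};
    for (int i = 4; i < 16; i++) t[i] = w[i - 4];   /* ext v1, v4(=0), v7, #12 */
    for (int i = 0; i < 16; i++) w[i] ^= t[i];      /* eor v7, v7, v1          */
    for (int i = 8; i < 16; i++) u[i] = w[i - 8];   /* ext v4, v4(=0), v7, #8  */
    for (int i = 0; i < 16; i++) w[i] ^= u[i];      /* eor v7, v7, v4          */
}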
886 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
887 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
888 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
889 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
890 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
891 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
892 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
893 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
894 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
895 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
896 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
897 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
898 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
899 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
902 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
903 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
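Note: the rest of this schedule-round block extracts the round constant with the #15 ext and folds it into v7, rotates the key word by one byte (ext #1 / vpalignr $1), smears as above, and pushes the result through the inversion plus the sbou/sb1t output tables to get the "sbox output" XORed into v7. Ignoring the internal .Lk_s63 bias and the basis change handled by the transform and mangle steps, the net effect for AES-128 is the textbook expansion step; a sketch with an ordinary byte S-box passed in as a table (vpaes reaches the same values through the nibble lookups above):

#include <stdint.h>

/* One AES-128 key-expansion round: RotWord + SubWord of the last column,
 * XOR the round constant, then chain the XORs across the four columns. */
static void aes128_expand_round(uint8_t key[16], uint8_t rcon,
                                const uint8_t sbox[256])
{
    uint8_t t[4];
    for (int i = 0; i < 4; i++)
        t[i] = sbox[key[12 + ((i + 1) & 3)]];       /* RotWord + SubWord */
    t[0] ^= rcon;                                   /* Rcon              */
    for (int c = 0; c < 4; c++)
        for (int i = 0; i < 4; i++)
            key[4 * c + i] ^= (c == 0) ? t[i] : key[4 * (c - 1) + i];
}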
919 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
920 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
922 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
923 // vmovdqa 16(%r11), %xmm1 # hi
924 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
925 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
955 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
960 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
961 add x2, x2, #16 // add $16, %rdx
962 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
963 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
964 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
965 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
967 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
974 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
975 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
978 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
980 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
981 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
982 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
985 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
986 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
988 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
989 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
990 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
993 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
994 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
996 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
997 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1000 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1001 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1003 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
1005 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1006 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
1008 sub x2, x2, #16 // add $-16, %rdx
1011 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1012 add x8, x8, #48 // add $-16, %r8
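Note: the add x8, x8, #48 here really is the add $-16 in the comment: x8 steps through table offsets that are multiples of 16 below 64, and the neighbouring and (a line that does not match this search) masks the sum back into that range, so adding 48 modulo 64 equals subtracting 16. A one-line check of that arithmetic, under the stated assumption about x8's range:

/* For x in {0, 16, 32, 48}: (x + 48) % 64 == (x - 16 + 64) % 64 */
unsigned next_offset(unsigned x) { return (x + 48) & 0x3f; }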
1024 stp x29,x30,[sp,#-16]!
1026 stp d8,d9,[sp,#-16]! // ABI spec says so
1037 ldp d8,d9,[sp],#16
1038 ldp x29,x30,[sp],#16
1049 stp x29,x30,[sp,#-16]!
1051 stp d8,d9,[sp,#-16]! // ABI spec says so
1057 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
1066 ldp d8,d9,[sp],#16
1067 ldp x29,x30,[sp],#16
1081 stp x29,x30,[sp,#-16]!
1087 ld1 {v0.16b}, [x4] // load ivec
1093 ld1 {v7.16b}, [x0],#16 // load input
1094 eor v7.16b, v7.16b, v0.16b // xor with ivec
1096 st1 {v0.16b}, [x1],#16 // save output
1097 subs x17, x17, #16
1100 st1 {v0.16b}, [x4] // write ivec
1102 ldp x29,x30,[sp],#16
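Note: the block of matches ending here is the CBC-encrypt path: the IV is loaded from [x4], each plaintext block is XORed with it, encrypted, stored, and the ciphertext becomes the next chaining value, which is written back to [x4] at the end. A sketch of that loop; block_encrypt() stands in for the call to the encrypt core, which does not match this search.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void cbc_encrypt_sketch(uint8_t *out, const uint8_t *in, size_t len,
                               uint8_t iv[16],
                               void (*block_encrypt)(uint8_t blk[16]))
{
    for (size_t off = 0; off + 16 <= len; off += 16) {
        uint8_t blk[16];
        for (int i = 0; i < 16; i++)
            blk[i] = in[off + i] ^ iv[i];   /* eor v7, v7, v0   # xor with ivec */
        block_encrypt(blk);                 /* encrypt core (not in this list)  */
        memcpy(out + off, blk, 16);         /* st1 {v0.16b}, [x1], #16          */
        memcpy(iv, blk, 16);                /* ciphertext is the next ivec      */
    }
    /* st1 {v0.16b}, [x4] writes the final block back as the new ivec */
}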
1113 stp x29,x30,[sp,#-16]!
1115 stp d8,d9,[sp,#-16]! // ABI spec says so
1116 stp d10,d11,[sp,#-16]!
1117 stp d12,d13,[sp,#-16]!
1118 stp d14,d15,[sp,#-16]!
1122 ld1 {v6.16b}, [x4] // load ivec
1124 tst x17, #16
1127 ld1 {v7.16b}, [x0], #16 // load input
1129 eor v0.16b, v0.16b, v6.16b // xor with ivec
1130 orr v6.16b, v7.16b, v7.16b // next ivec value
1131 st1 {v0.16b}, [x1], #16
1132 subs x17, x17, #16
1137 ld1 {v14.16b,v15.16b}, [x0], #32
1139 eor v0.16b, v0.16b, v6.16b // xor with ivec
1140 eor v1.16b, v1.16b, v14.16b
1141 orr v6.16b, v15.16b, v15.16b
1142 st1 {v0.16b,v1.16b}, [x1], #32
1147 st1 {v6.16b}, [x4]
1149 ldp d14,d15,[sp],#16
1150 ldp d12,d13,[sp],#16
1151 ldp d10,d11,[sp],#16
1152 ldp d8,d9,[sp],#16
1153 ldp x29,x30,[sp],#16
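Note: the CBC-decrypt path (this block of matches) goes the other way: decrypt the ciphertext block, XOR with the previous ciphertext or the IV, and keep the current ciphertext (orr v6, v7, v7 / orr v6, v15, v15) as the next chaining value; the two-block ld1/st1 pairs belong to the interleaved _2x loop. A sketch, with block_decrypt() standing in for the decrypt core:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void cbc_decrypt_sketch(uint8_t *out, const uint8_t *in, size_t len,
                               uint8_t iv[16],
                               void (*block_decrypt)(uint8_t dst[16],
                                                     const uint8_t src[16]))
{
    for (size_t off = 0; off + 16 <= len; off += 16) {
        uint8_t tmp[16];
        block_decrypt(tmp, in + off);        /* decrypt core (not in this list) */
        for (int i = 0; i < 16; i++)
            out[off + i] = tmp[i] ^ iv[i];   /* eor v0, v0, v6  # xor with ivec */
        memcpy(iv, in + off, 16);            /* orr v6, v7, v7  # next ivec     */
    }
}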
1163 stp x29,x30,[sp,#-16]!
1165 stp d8,d9,[sp,#-16]! // ABI spec says so
1166 stp d10,d11,[sp,#-16]!
1167 stp d12,d13,[sp,#-16]!
1168 stp d14,d15,[sp,#-16]!
1179 ld1 {v7.16b}, [x4]
1188 ld1 {v6.16b}, [x0], #16 // load input ahead of time
1190 eor v0.16b, v0.16b, v6.16b // XOR input and result
1191 st1 {v0.16b}, [x1], #16
1202 mov v15.16b, v7.16b
1203 mov v14.16b, v7.16b
1209 ld1 {v6.16b,v7.16b}, [x0], #32 // load input ahead of time
1211 eor v0.16b, v0.16b, v6.16b // XOR input and result
1212 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
1213 st1 {v0.16b,v1.16b}, [x1], #32
1225 ldp d14,d15,[sp],#16
1226 ldp d12,d13,[sp],#16
1227 ldp d10,d11,[sp],#16
1228 ldp d8,d9,[sp],#16
1229 ldp x29,x30,[sp],#16
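Note: this last block of matches (chaining value copied into v7/v14/v15, result XORed with input that was loaded ahead of time) looks like the counter-mode path rather than another CBC loop, so the sketch below is hedged accordingly: the keystream is the encryption of a counter block, XORed into the input. block_encrypt() and ctr_next() are placeholders; the real code's counter increment does not appear among these matching lines.

#include <stddef.h>
#include <stdint.h>

static void ctr_xor_sketch(uint8_t *out, const uint8_t *in, size_t nblocks,
                           uint8_t counter[16],
                           void (*block_encrypt)(uint8_t dst[16],
                                                 const uint8_t src[16]),
                           void (*ctr_next)(uint8_t counter[16]))
{
    for (size_t n = 0; n < nblocks; n++) {
        uint8_t ks[16];
        block_encrypt(ks, counter);                    /* encrypt the counter  */
        for (int i = 0; i < 16; i++)
            out[16 * n + i] = in[16 * n + i] ^ ks[i];  /* eor v0, v0, v6       */
        ctr_next(counter);                             /* bump for next block  */
    }
}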