
Lines Matching full:16

122 movi v17.16b, #0x0f
149 adrp x11, Lk_mc_forward@PAGE+16
150 add x11, x11, Lk_mc_forward@PAGEOFF+16
152 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
153 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
154 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
155 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
156 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
157 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
158 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
159 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
166 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
167 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
168 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
169 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
170 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
171 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
172 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
174 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
175 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
176 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
177 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
178 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
179 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
181 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
186 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
187 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
188 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
189 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
190 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
191 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
192 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
193 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
194 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
195 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
196 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
197 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
198 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
204 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
205 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
207 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
208 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
209 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
210 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
220 stp x29,x30,[sp,#-16]!
223 ld1 {v7.16b}, [x0]
226 st1 {v0.16b}, [x1]
228 ldp x29,x30,[sp],#16
238 adrp x11, Lk_mc_forward@PAGE+16
239 add x11, x11, Lk_mc_forward@PAGEOFF+16
241 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
242 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
243 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
244 and v9.16b, v15.16b, v17.16b
245 ushr v8.16b, v15.16b, #4
246 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
247 tbl v9.16b, {v20.16b}, v9.16b
248 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
249 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
250 tbl v10.16b, {v21.16b}, v8.16b
251 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
252 eor v8.16b, v9.16b, v16.16b
253 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
254 eor v8.16b, v8.16b, v10.16b
261 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
262 tbl v12.16b, {v25.16b}, v10.16b
263 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
264 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
265 tbl v8.16b, {v24.16b}, v11.16b
266 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
267 eor v12.16b, v12.16b, v16.16b
268 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
269 tbl v13.16b, {v27.16b}, v10.16b
270 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
271 eor v8.16b, v8.16b, v12.16b
272 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
273 tbl v10.16b, {v26.16b}, v11.16b
275 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
276 tbl v11.16b, {v8.16b}, v1.16b
277 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
278 eor v10.16b, v10.16b, v13.16b
279 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
280 tbl v8.16b, {v8.16b}, v4.16b
281 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
282 eor v11.16b, v11.16b, v10.16b
283 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
284 tbl v12.16b, {v11.16b},v1.16b
285 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
286 eor v8.16b, v8.16b, v11.16b
288 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
289 eor v8.16b, v8.16b, v12.16b
294 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
295 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
296 and v9.16b, v8.16b, v17.16b
297 ushr v8.16b, v8.16b, #4
298 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
299 tbl v13.16b, {v19.16b},v9.16b
300 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
301 eor v9.16b, v9.16b, v8.16b
302 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
303 tbl v11.16b, {v18.16b},v8.16b
304 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
305 tbl v12.16b, {v18.16b},v9.16b
306 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
307 eor v11.16b, v11.16b, v13.16b
308 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
309 eor v12.16b, v12.16b, v13.16b
310 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
311 tbl v10.16b, {v18.16b},v11.16b
312 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
313 tbl v11.16b, {v18.16b},v12.16b
314 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
315 eor v10.16b, v10.16b, v9.16b
316 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
317 eor v11.16b, v11.16b, v8.16b
318 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
324 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
325 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
326 tbl v12.16b, {v22.16b}, v10.16b
328 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
329 tbl v8.16b, {v23.16b}, v11.16b
330 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
331 eor v12.16b, v12.16b, v16.16b
332 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
333 eor v8.16b, v8.16b, v12.16b
334 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
335 tbl v1.16b, {v8.16b},v1.16b
344 movi v17.16b, #0x0f
375 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
376 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
377 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
378 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
380 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
381 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
382 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
383 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
393 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
394 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
395 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
397 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
400 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
401 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
402 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
403 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
405 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
408 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
409 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
410 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
411 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
413 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
416 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
417 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
418 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
419 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
420 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
421 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
426 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
427 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
428 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
429 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
430 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
431 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
432 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
433 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
434 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
435 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
436 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
437 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
438 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
443 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
446 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
447 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
448 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
449 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
459 stp x29,x30,[sp,#-16]!
462 ld1 {v7.16b}, [x0]
465 st1 {v0.16b}, [x1]
467 ldp x29,x30,[sp],#16
489 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
490 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
491 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
492 and v9.16b, v15.16b, v17.16b
493 ushr v8.16b, v15.16b, #4
494 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
495 tbl v10.16b, {v20.16b},v9.16b
497 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
498 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
499 tbl v8.16b, {v21.16b},v8.16b
500 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
501 eor v10.16b, v10.16b, v16.16b
502 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
503 eor v8.16b, v8.16b, v10.16b
513 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
514 tbl v12.16b, {v24.16b}, v10.16b
515 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
516 tbl v9.16b, {v25.16b}, v11.16b
517 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
518 eor v8.16b, v12.16b, v16.16b
520 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
521 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
524 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
525 tbl v12.16b, {v26.16b}, v10.16b
526 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
527 tbl v8.16b, {v8.16b},v5.16b
528 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
529 tbl v9.16b, {v27.16b}, v11.16b
530 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
531 eor v8.16b, v8.16b, v12.16b
533 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
534 eor v8.16b, v8.16b, v9.16b
537 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
538 tbl v12.16b, {v28.16b}, v10.16b
539 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
540 tbl v8.16b, {v8.16b},v5.16b
541 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
542 tbl v9.16b, {v29.16b}, v11.16b
543 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
544 eor v8.16b, v8.16b, v12.16b
546 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
547 eor v8.16b, v8.16b, v9.16b
550 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
551 tbl v12.16b, {v30.16b}, v10.16b
552 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
553 tbl v8.16b, {v8.16b},v5.16b
554 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
555 tbl v9.16b, {v31.16b}, v11.16b
556 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
557 eor v8.16b, v8.16b, v12.16b
558 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
559 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
560 eor v8.16b, v8.16b, v9.16b
565 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
566 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
567 and v9.16b, v8.16b, v17.16b
568 ushr v8.16b, v8.16b, #4
569 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
570 tbl v10.16b, {v19.16b},v9.16b
571 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
572 eor v9.16b, v9.16b, v8.16b
573 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
574 tbl v11.16b, {v18.16b},v8.16b
575 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
576 tbl v12.16b, {v18.16b},v9.16b
577 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
578 eor v11.16b, v11.16b, v10.16b
579 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
580 eor v12.16b, v12.16b, v10.16b
581 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
582 tbl v10.16b, {v18.16b},v11.16b
583 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
584 tbl v11.16b, {v18.16b},v12.16b
585 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
586 eor v10.16b, v10.16b, v9.16b
587 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
588 eor v11.16b, v11.16b, v8.16b
589 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
594 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
595 tbl v12.16b, {v22.16b}, v10.16b
597 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
598 tbl v9.16b, {v23.16b}, v11.16b
600 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
601 eor v12.16b, v12.16b, v16.16b
602 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
603 eor v8.16b, v9.16b, v12.16b
604 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
605 tbl v1.16b, {v8.16b},v2.16b
618 movi v16.16b, #0x5b // Lk_s63
621 movi v17.16b, #0x0f // Lk_s0F
639 stp x29, x30, [sp,#-16]!
644 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
647 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
649 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
664 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
710 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
712 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
713 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
720 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
742 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
749 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
758 movi v4.16b, #0
759 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
760 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
762 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
789 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
793 sub x2, x2, #16 // add $-16, %rdx
794 eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
799 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
800 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
801 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
802 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
803 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
804 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
805 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
806 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
807 ldp x29, x30, [sp],#16
829 movi v1.16b, #0
833 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
834 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
835 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
836 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
863 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
864 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
865 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
866 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
870 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
877 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
878 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
879 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
882 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
883 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
884 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
885 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
886 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
887 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
888 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
889 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
890 eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
891 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
892 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
893 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
894 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
895 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
896 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
897 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
898 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
901 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
902 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
918 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
919 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
921 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
922 // vmovdqa 16(%r11), %xmm1 # hi
923 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
924 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
954 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
959 eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
960 add x2, x2, #16 // add $16, %rdx
961 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
962 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
963 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
964 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
966 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
973 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
974 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
977 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
979 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
980 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
981 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
984 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
985 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
987 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
988 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
989 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
992 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
993 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
995 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
996 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
999 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1000 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1002 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
1004 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1005 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
1007 sub x2, x2, #16 // add $-16, %rdx
1010 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1011 add x8, x8, #48 // add $-16, %r8
1023 stp x29,x30,[sp,#-16]!
1025 stp d8,d9,[sp,#-16]! // ABI spec says so
1036 ldp d8,d9,[sp],#16
1037 ldp x29,x30,[sp],#16
1048 stp x29,x30,[sp,#-16]!
1050 stp d8,d9,[sp,#-16]! // ABI spec says so
1056 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
1065 ldp d8,d9,[sp],#16
1066 ldp x29,x30,[sp],#16
1080 stp x29,x30,[sp,#-16]!
1086 ld1 {v0.16b}, [x4] // load ivec
1092 ld1 {v7.16b}, [x0],#16 // load input
1093 eor v7.16b, v7.16b, v0.16b // xor with ivec
1095 st1 {v0.16b}, [x1],#16 // save output
1096 subs x17, x17, #16
1099 st1 {v0.16b}, [x4] // write ivec
1101 ldp x29,x30,[sp],#16
1112 stp x29,x30,[sp,#-16]!
1114 stp d8,d9,[sp,#-16]! // ABI spec says so
1115 stp d10,d11,[sp,#-16]!
1116 stp d12,d13,[sp,#-16]!
1117 stp d14,d15,[sp,#-16]!
1121 ld1 {v6.16b}, [x4] // load ivec
1123 tst x17, #16
1126 ld1 {v7.16b}, [x0], #16 // load input
1128 eor v0.16b, v0.16b, v6.16b // xor with ivec
1129 orr v6.16b, v7.16b, v7.16b // next ivec value
1130 st1 {v0.16b}, [x1], #16
1131 subs x17, x17, #16
1136 ld1 {v14.16b,v15.16b}, [x0], #32
1138 eor v0.16b, v0.16b, v6.16b // xor with ivec
1139 eor v1.16b, v1.16b, v14.16b
1140 orr v6.16b, v15.16b, v15.16b
1141 st1 {v0.16b,v1.16b}, [x1], #32
1146 st1 {v6.16b}, [x4]
1148 ldp d14,d15,[sp],#16
1149 ldp d12,d13,[sp],#16
1150 ldp d10,d11,[sp],#16
1151 ldp d8,d9,[sp],#16
1152 ldp x29,x30,[sp],#16
1162 stp x29,x30,[sp,#-16]!
1164 stp d8,d9,[sp,#-16]! // ABI spec says so
1165 stp d10,d11,[sp,#-16]!
1166 stp d12,d13,[sp,#-16]!
1167 stp d14,d15,[sp,#-16]!
1178 ld1 {v7.16b}, [x4]
1187 ld1 {v6.16b}, [x0], #16 // Load input ahead of time
1189 eor v0.16b, v0.16b, v6.16b // XOR input and result
1190 st1 {v0.16b}, [x1], #16
1201 mov v15.16b, v7.16b
1202 mov v14.16b, v7.16b
1208 ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
1210 eor v0.16b, v0.16b, v6.16b // XOR input and result
1211 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
1212 st1 {v0.16b,v1.16b}, [x1], #32
1224 ldp d14,d15,[sp],#16
1225 ldp d12,d13,[sp],#16
1226 ldp d10,d11,[sp],#16
1227 ldp d8,d9,[sp],#16
1228 ldp x29,x30,[sp],#16