// AArch64 (ARMv8-A) Advanced SIMD assembly source, GNU assembler syntax.
// ---------------------------------------------------------------------------
// push_v_regs
//   Spills q8-q15 (all 128 bits of each) and x8-x30 to a 320-byte area
//   carved out below the incoming sp.  Takes no macro arguments.
//
//   Frame layout after expansion (offsets from the new sp):
//     [sp, #288] q8,  q9      [sp, #256] q10, q11
//     [sp, #224] q12, q13     [sp, #192] q14, q15
//     [sp, #176] x8,  x9      [sp, #160] x10, x11
//     [sp, #144] x12, x13     [sp, #128] x14, x15
//     [sp, #112] x16, x17     [sp, #96]  x18, x19
//     [sp, #80]  x20, x21     [sp, #64]  x22, x23
//     [sp, #48]  x24, x25     [sp, #32]  x26, x27
//     [sp, #16]  x28, x29     [sp, #0]   x30, x29
//
//   This is the same layout the pre-indexed "stp ..., [sp, #-N]!" sequence
//   produces, so pop_v_regs unwinds it unchanged.  320 is a multiple of 16,
//   so a 16-byte-aligned sp stays aligned.
//
//   NOTE(review): x29 is written to two slots, and x8-x18 are not
//   callee-saved under AAPCS64; the layout is kept as-is so both macros
//   stay in step - confirm before trimming.
//   NOTE: a commented-out "st1 {v8.2d-v11.2d}, [sp, #-64]!" alternative sat
//   in this macro; A64 ST1 has no pre-indexed form, so it is not reproduced.
// ---------------------------------------------------------------------------
.macro push_v_regs
    sub             sp, sp, #320            // reserve the whole save area first

    // SIMD/FP registers: top 128 bytes of the area
    stp             q8, q9, [sp, #288]
    stp             q10, q11, [sp, #256]
    stp             q12, q13, [sp, #224]
    stp             q14, q15, [sp, #192]

    // General-purpose registers: bottom 192 bytes of the area
    stp             x8, x9, [sp, #176]
    stp             x10, x11, [sp, #160]
    stp             x12, x13, [sp, #144]
    stp             x14, x15, [sp, #128]
    stp             x16, x17, [sp, #112]
    stp             x18, x19, [sp, #96]
    stp             x20, x21, [sp, #80]
    stp             x22, x23, [sp, #64]
    stp             x24, x25, [sp, #48]
    stp             x26, x27, [sp, #32]
    stp             x28, x29, [sp, #16]
    stp             x30, x29, [sp]          // x30 plus a second copy of x29
.endm
21
// ---------------------------------------------------------------------------
// pop_v_regs
//   Reloads x8-x30 and q8-q15 from the 320-byte area written by
//   push_v_regs, then releases the area.  Takes no macro arguments.
//
//   Every load uses a fixed offset from the current sp and sp is bumped
//   once at the end, so the slots read are the same ones the post-indexed
//   "ldp ..., [sp], #N" sequence walks through.  Loads are issued in the
//   same order as that sequence: x29 is loaded twice, and the value that
//   remains is the one from the x28/x29 slot at [sp, #16].
//
//   NOTE(review): this also rewrites x8-x18, which AAPCS64 does not require
//   a callee to preserve; kept to mirror push_v_regs - confirm before
//   trimming.
// ---------------------------------------------------------------------------
.macro pop_v_regs
    // General-purpose registers: bottom 192 bytes of the area
    ldp             x30, x29, [sp]
    ldp             x28, x29, [sp, #16]     // final x29 comes from this slot
    ldp             x26, x27, [sp, #32]
    ldp             x24, x25, [sp, #48]
    ldp             x22, x23, [sp, #64]
    ldp             x20, x21, [sp, #80]
    ldp             x18, x19, [sp, #96]
    ldp             x16, x17, [sp, #112]
    ldp             x14, x15, [sp, #128]
    ldp             x12, x13, [sp, #144]
    ldp             x10, x11, [sp, #160]
    ldp             x8, x9, [sp, #176]

    // SIMD/FP registers: top 128 bytes of the area
    ldp             q14, q15, [sp, #192]
    ldp             q12, q13, [sp, #224]
    ldp             q10, q11, [sp, #256]
    ldp             q8, q9, [sp, #288]

    add             sp, sp, #320            // release the save area last
.endm
42
43
44.text
45.p2align 2
46.global ixheaacd_post_twid_overlap_add_armv8
47
48ixheaacd_post_twid_overlap_add_armv8:
49
50    // STMFD sp!, {x4-x12}
51    push_v_regs
52    //stp x19, x20,[sp,#-16]!
53    //VPUSH           {d8 - d15}
54
55    //LDR w4,  [sp, #100]
56    //sxtw x4,w4
57    //LDR w5,  [sp, #104]
58    //sxtw x5,w5
59    //LDR w6,  [sp, #108]
60    //sxtw x6,w6
61    MOV             x16, x5
62    MOV             x17, x7
63    LSL             x9, x3, #2
64    ASR             x9, x9, #1
65    ADD             x6, x6, x9
66    SUB             x6, x6, #4
67
68    MOV             w8, #7500
69    sxtw            x8, w8
70    ADD             x2, x2, x8
71
72
73
74    movi            v18.4h, #50
75    sub             x20, x5, #15
76    neg             x9, x20
77    movi            v20.4s, #0x80, LSL #8
78    dup             v16.4s, w5
79    SUB             x5, x5, #16
80    //STR w5,  [sp, #116]
81    MOV             w25, w5
82    sxtw            x25, w25
83    MOV             x8, #1
84    LSL             x8, x8, x9
85    //STR w8,  [sp, #120]
86    MOV             w26, w8
87
88    //sxtw x8,w8
89
90
91ARM_PROLOGUE:
92
93
94    LDR             w8, [x1], #4
95    sxtw            x8, w8
96    LDR             w9, [x1], #4
97    sxtw            x9, w9
98
99    LDR             w10, [x2], #4
100    sxtw            x10, w10
101
102    AND             w19, w10, 0xFFFF
103    sxth            x19, w19
104    ASR             w10, w10, #16
105//    SMULWT          x11, x8, x10
106//
107//    SMULWB          x12, x9, x10
108//    SMULWB          x5, x8, x10
109//    SMLAWT          x7, x9, x10, x5
110
111    SMULL           x11, w8, w10
112    ASR             x11, x11, #16
113    SMULL           x12, w9, w19
114    ASR             x12, x12, #16
115    SMULL           x5, w8, w19
116    ASR             x5, x5, #16
117    SMULL           x7, w9, w10
118    ASR             x7, x7, #16
119    ADD             x7, x7, x5
120
121    SUB             x8, x12, x11
122    MVN             x5, x7
123    ADD             x5, x5, #1
124
125
126    MOV             x9, #50
127    MOV             x12, #-50
128    AND             w19, w9, 0xFFFF
129    sxth            x19, w19
130    SMULL           x10, w5, w19
131    ASR             x10, x10, #16
132    AND             w19, w12, 0xFFFF
133    sxth            x19, w19
134    SMULL           x11, w8, w19
135    ASR             x11, x11, #16
136
137    ADD             x8, x8, x10
138    ADD             x5, x5, x11
139
140    //LDR w11,  [sp, #104]
141    MOV             w11, w16
142    sxth            x11, w11
143    LDR             w10, [x6], #-32
144    sxtw            x10, w10
145
146    AND             w19, w10, 0xFFFF
147    sxth            x19, w19
148    ASR             w20, w10, #16
149
150    //SMULWB          x7, x8, x10
151    SMULL           x7, w8, w19
152    ASR             x7, x7, #16
153    MVN             x8, x8
154    ADD             x8, x8, #1
155    //SMULWT          x12, x8, x10
156    SMULL           x12, w8, w20
157    ASR             x12, x12, #16
158
159    CMP             x11, #0
160    BLT             NEXT
161
162    SUB             x9, x11, #16
163    negs            x9, x9
164
165
166
167
168    // LDR w8,  [sp, #120]
169    //sxtw x8,w8
170    MOV             v1.s[0], w26
171    MOV             v2.s[0], w5
172
173    //sQADD            w5, w5, w8
174    //ASR             w5, w5, w9
175
176    SQADD           v2.2s, v2.2s, v1.2s
177    MOV             w5, v2.s[0]
178    ASR             w5, w5, w9
179
180    SUB             x9, x11, #31
181    negs            x9, x9
182    ASR             x20, x7, x9
183    //MOV            x8, x20
184    ADDS            x8, x20, #0
185    BGE             NEXT2
186    CMN             x8, #1
187NEXT2:
188    MOV             x20, #0x80000000
189    csel            x7, x20, x7, LT
190    MOV             x20, #0x7fffffff
191    csel            x7, x20, x7, GT
192    LSL             x20, x7, x11
193    csel            x7, x20, x7, EQ
194
195    SUB             x9, x11, #31
196    negs            x9, x9
197    ASR             x20, x12, x9
198    //MOV            x8, x20
199    ADDS            x8, x20, #0
200    BGE             NEXT3
201    CMN             x8, #1
202NEXT3:
203    MOV             x20, #0x80000000
204    csel            x12, x20, x12, LT
205    MOV             x20, #0x7fffffff
206    csel            x12, x20, x12, GT
207    LSL             x20, x12, x11
208    csel            x12, x20, x12, EQ
209
210    B               NEXT1
211NEXT:
212    MVN             w11, w11
213    ADD             w11, w11, #1
214    ASR             w5, w5, w11
215    MOV             w8, #0x8000
216
217    MOV             v1.s[0], w8
218    MOV             v2.s[0], w5
219
220    //QADD            x5, x5, x8
221
222    SQADD           v2.2s, v2.2s, v1.2s
223    MOV             w5, v2.s[0]
224
225    ASR             w5, w5, #16
226    ASR             w7, w7, w11
227    ASR             w12, w12, w11
228
229NEXT1:
230    LDR             w9, [x4]
231    sxtw            x9, w9
232    MOV             w8, #0x8000
233    //sxtw x8,w8
234
235    STR             w5, [x4], #4
236    sxtw            x5, w5
237
238
239    ROR             w20, w10, #16
240    //UXTH            x5, x10, ROR #16
241    UXTH            w5, w20
242    UXTH            w10, w10
243
244
245    dup             v0.2s, w9
246    dup             v2.2s, w10
247    dup             v3.2s, w5
248    //VZIP.32         D2, D3
249    ZIP1            v28.2s, v2.2s, v3.2s
250    ZIP2            v3.2s, v2.2s, v3.2s
251    MOV             v2.8b, v28.8b
252    sMULL           v0.2d, v2.2s, v0.2s
253    Sqxtn           v8.2s, v0.2d
254
255
256    dup             v0.2s, w12
257    dup             v1.2s, w7
258
259    //VZIP.32         D0, D1
260
261    ZIP1            v28.2s, v0.2s, v1.2s
262    ZIP2            v1.2s, v0.2s, v1.2s
263    MOV             v0.8b, v28.8b
264
265    SQSUB           v8.2s, v0.2s , v8.2s
266
267
268    sQshL           v8.2s, v8.2s, #2
269    dup             v0.2s, w8
270    SQADD           v8.2s, v8.2s , v0.2s
271    sshR            v8.2s, v8.2s, #16
272
273
274
275    MOV             x7, x17
276    //sxtw x7,w7
277    LSL             x10, x7, #1
278
279    ASR             x5, x3, #1
280    //SMULBB          x5, x10, x5
281    AND             w5, w5, 0xFFFF
282    sxth            x5, w5
283    AND             w19, w10, 0xFFFF
284    sxth            x19, w19
285    SMULL           x5, w19, w5
286
287    ADD             x5, x5, x0
288    SUB             x0, x5, x10
289    MVN             x9, x10
290    ADD             x9, x9, #1
291
292    ST1             {v8.h}[2], [x0], x9
293    ST1             {v8.h}[0], [x5], x10
294
295
296    MOV             x8, x1
297    LSL             x12, x3, #2
298
299    ADD             x1, x1, x12
300
301    SUB             x1, x1, #40
302
303    MOV             x12, #-32
304
305
306
307PROLOGUE_NEON:
308
309    ASR             x3, x3, #2
310    SUB             x3, x3, #4
311    ASR             x3, x3, #2
312    SUB             x3, x3, #2
313
314    LD2             { v0.4s, v1.4s}, [x1]
315    MOV             v2.16b, v1.16b
316    ADD             x1, x1, x12
317
318    //VUZP.16         D0, D1
319    UZP1            v28.8h, v0.8h, v0.8h
320    UZP2            v29.8h, v0.8h, v0.8h
321    MOV             v0.d[0], v28.d[0]
322    MOV             v0.d[1], v29.d[0]
323
324    //VUZP.16         D2, D3
325
326    UZP1            v28.8h, v2.8h, v2.8h
327    UZP2            v29.8h, v2.8h, v2.8h
328    MOV             v2.d[0], v28.d[0]
329    MOV             v2.d[1], v29.d[0]
330
331
332    //rev64  v0.8h,  v0.8h
333    rev64           v0.8h, v0.8h
334    MOV             v1.d[0], v0.d[1]
335    rev64           v2.8h, v2.8h
336    MOV             v3.d[0], v2.d[1]
337    LD2             {v8.4h, v9.4h}, [x2]
338    ADD             x2, x2, #16
339
340    LD2             { v4.4s, v5.4s}, [x8]
341    MOV             v6.16b, v5.16b
342    ADD             x8, x8, #32
343    uMULL           v30.4s, v0.4h, v9.4h
344
345//    VUZP.16         D4, D5
346
347    UZP1            v28.8h, v4.8h, v4.8h
348    UZP2            v29.8h, v4.8h, v4.8h
349    MOV             v4.d[0], v28.d[0]
350    MOV             v5.d[0], v29.d[0]
351
352    uMULL           v28.4s, v2.4h, v8.4h
353
354//    VUZP.16         D6, D7
355    UZP1            v26.8h, v6.8h, v6.8h
356    UZP2            v27.8h, v6.8h, v6.8h
357    MOV             v6.d[0], v26.d[0]
358    MOV             v7.d[0], v27.d[0]
359
360    uMULL           v26.4s, v0.4h, v8.4h
361
362
363    uMULL           v24.4s, v2.4h, v9.4h
364
365    LD2             { v10.4s, v11.4s}, [x6]
366    MOV             v12.16b, v11.16b
367    ADD             x6, x6, x12
368    ushR            v30.4s, v30.4s, #16
369
370    //VUZP.16         D10, D11
371
372    UZP1            v22.8h, v10.8h, v10.8h
373    UZP2            v23.8h, v10.8h, v10.8h
374    MOV             v10.d[0], v22.d[0]
375    MOV             v10.d[1], v23.d[0]
376
377    ushR            v28.4s, v28.4s, #16
378
379    //VUZP.16         D12, D13
380
381    UZP1            v22.8h, v12.8h, v12.8h
382    UZP2            v23.8h, v12.8h, v12.8h
383    MOV             v12.d[0], v22.d[0]
384    MOV             v12.d[1], v23.d[0]
385
386    sMLAL           v30.4s, v1.4h, v9.4h
387
388    rev64           v10.8h, v10.8h
389    MOV             v11.d[0], v10.d[1]
390    sMLAL           v28.4s, v3.4h, v8.4h
391
392    rev64           v12.8h, v12.8h
393    MOV             v13.d[0], v12.d[1]
394    ushR            v26.4s, v26.4s, #16
395
396
397    ushR            v24.4s, v24.4s, #16
398
399    sMLAL           v26.4s, v1.4h, v8.4h
400    sMLAL           v24.4s, v3.4h, v9.4h
401
402
403
404    ADD             v30.4s, v30.4s , v28.4s
405    NEG             v30.4s, v30.4s
406
407    uMULL           v22.4s, v4.4h, v8.4h
408
409    SUB             v28.4s, v24.4s , v26.4s
410
411
412    mov             v26.16b, v30.16b
413    mov             v24.16b, v28.16b
414
415//    VUZP.16         D24, D25
416
417    UZP1            v19.8h, v24.8h, v24.8h
418    UZP2            v21.8h, v24.8h, v24.8h
419    MOV             v24.d[0], v19.d[0]
420    MOV             v25.d[0], v21.d[0]
421
422
423//    VUZP.16         D26, D27
424
425    UZP1            v19.8h, v26.8h, v26.8h
426    UZP2            v21.8h, v26.8h, v26.8h
427    MOV             v26.d[0], v19.d[0]
428    MOV             v27.d[0], v21.d[0]
429
430    uMULL           v2.4s, v24.4h, v18.4h
431
432    uMULL           v0.4s, v26.4h, v18.4h
433
434    ushR            v22.4s, v22.4s, #16
435    sMLAL           v22.4s, v5.4h, v8.4h
436
437    ushR            v2.4s, v2.4s, #16
438    ushR            v0.4s, v0.4s, #16
439    sMLAL           v2.4s, v25.4h, v18.4h
440    sMLAL           v0.4s, v27.4h, v18.4h
441
442    uMULL           v24.4s, v4.4h, v9.4h
443    uMULL           v26.4s, v6.4h, v8.4h
444
445    NEG             v2.4s, v2.4s
446    ADD             v28.4s, v28.4s , v0.4s
447    ADD             v30.4s, v30.4s , v2.4s
448
449    uMULL           v0.4s, v6.4h, v9.4h
450    sshR            v24.4s, v24.4s, #16
451    sMLAL           v24.4s, v5.4h, v9.4h
452    sshR            v26.4s, v26.4s, #16
453    sshR            v0.4s, v0.4s, #16
454    sMLAL           v26.4s, v7.4h, v8.4h
455    sMLAL           v0.4s, v7.4h, v9.4h
456
457
458
459
460    ADD             v22.4s, v22.4s , v0.4s
461    NEG             v22.4s, v22.4s
462    SUB             v24.4s, v26.4s , v24.4s
463
464
465
466    //LDR w11,  [sp, #120]
467    //sxtw x11,w11
468    MOV             w11, w26
469    dup             v14.4s, w11
470    SQADD           v28.4s, v28.4s , v14.4s
471    //LDR w11,  [sp, #116]
472    MOV             w11, w25
473    //sxtw x11,w11
474    dup             v0.4s, w11
475    sQshL           v28.4s, v28.4s, v0.4s
476
477    mov             v0.16b, v22.16b
478    mov             v14.16b, v24.16b
479
480
481//    VUZP.16         D24, D25
482
483    UZP1            v19.8h, v24.8h, v24.8h
484    UZP2            v21.8h, v24.8h, v24.8h
485    MOV             v24.d[0], v19.d[0]
486    MOV             v25.d[0], v21.d[0]
487
488
489//    VUZP.16         D22, D23
490
491    UZP1            v19.8h, v22.8h, v22.8h
492    UZP2            v21.8h, v22.8h, v22.8h
493    MOV             v22.d[0], v19.d[0]
494    MOV             v23.d[0], v21.d[0]
495
496    uMULL           v8.4s, v24.4h, v18.4h
497    uMULL           v26.4s, v22.4h, v18.4h
498
499    NEG             v2.4s, v30.4s
500//    VUZP.16         D30, D31
501
502    UZP1            v19.8h, v30.8h, v30.8h
503    UZP2            v21.8h, v30.8h, v30.8h
504    MOV             v30.d[0], v19.d[0]
505    MOV             v30.d[1], v21.d[0]
506
507//    VUZP.16         D2, D3
508
509    UZP1            v19.8h, v2.8h, v2.8h
510    UZP2            v21.8h, v2.8h, v2.8h
511    MOV             v2.d[0], v19.d[0]
512    MOV             v3.d[0], v21.d[0]
513
514    uMULL           v4.4s, v30.4h, v12.4h
515
516    uMULL           v6.4s, v2.4h, v13.4h
517
518    ushR            v8.4s, v8.4s, #16
519    ushR            v26.4s, v26.4s, #16
520
521    sMLAL           v8.4s, v25.4h, v18.4h
522    sMLAL           v26.4s, v23.4h, v18.4h
523
524    ushR            v4.4s, v4.4s, #16
525    ushR            v6.4s, v6.4s, #16
526
527    MOV             v19.d[0], v30.d[1]
528
529    sMLAL           v4.4s, v19.4h, v12.4h
530    sMLAL           v6.4s, v3.4h, v13.4h
531
532    NEG             v8.4s, v8.4s
533    ADD             v14.4s, v14.4s , v26.4s
534    ADD             v0.4s, v0.4s , v8.4s
535
536    //LDR w11,  [sp, #120]
537    //sxtw x11,w11
538    MOV             w11, w26
539    dup             v8.4s, w11
540    SQADD           v0.4s, v0.4s , v8.4s
541    //LDR w11,  [sp, #116]
542    //sxtw x11,w11
543    MOV             w11, w25
544    dup             v26.4s, w11
545    sQshL           v0.4s, v0.4s, v26.4s
546
547    mov             v26.16b, v28.16b
548
549    LD2             { v28.4s, v29.4s}, [x4]
550    MOV             v30.16b, v29.16b
551    MOV             v29.d[0], v28.d[1]
552//   VZIP.32         Q13, Q0
553
554    ZIP1            v19.4s, v26.4s, v0.4s
555    ZIP2            v0.4s, v26.4s, v0.4s
556    MOV             v26.16b, v19.16b
557
558    ST1             { v26.4s}, [x4], #16
559    ST1             { v0.4s}, [x4], #16
560
561    movi            v1.2s, #0
562    //VADDL.S16       Q0, D13, D1
563
564    SADDL           v0.4s, v13.4h, v1.4h
565    MOV             v1.d[0], v0.d[1]
566    sMULL           v26.2d, v28.2s, v0.2s
567    Sqxtn           v8.2s, v26.2d
568    sMULL           v26.2d, v29.2s, v1.2s
569    Sqxtn           v9.2s, v26.2d
570    MOV             v8.d[1], v9.d[0]
571    movi            v1.2s, #0
572//    VADDL.S16       Q0, D12, D1
573    SADDL           v0.4s, v12.4h, v1.4h
574    MOV             v1.d[0], v0.d[1]
575    sMULL           v24.2d, v28.2s, v0.2s
576    Sqxtn           v26.2s, v24.2d
577    sMULL           v24.2d, v29.2s, v1.2s
578    Sqxtn           v27.2s, v24.2d
579    MOV             v26.d[1], v27.d[0]
580
581    sQshL           v4.4s, v4.4s, v16.4s
582    sQshL           v6.4s, v6.4s, v16.4s
583
584    SQSUB           v4.4s, v4.4s , v8.4s
585    SQSUB           v6.4s, v6.4s , v26.4s
586
587    NEG             v26.4s, v14.4s
588//    VUZP.16         D14, D15
589
590
591    UZP1            v19.8h, v14.8h, v14.8h
592    UZP2            v21.8h, v14.8h, v14.8h
593    MOV             v14.d[0], v19.d[0]
594    MOV             v15.d[0], v21.d[0]
595
596//    VUZP.16         D26, D27
597
598
599    UZP1            v19.8h, v26.8h, v26.8h
600    UZP2            v21.8h, v26.8h, v26.8h
601    MOV             v26.d[0], v19.d[0]
602    MOV             v27.d[0], v21.d[0]
603
604
605    movi            v1.2s, #0
606//    VADDL.S16       Q0, D10, D1
607    SADDL           v0.4s, v10.4h, v1.4h
608    MOV             v1.d[0], v0.d[0]
609    sMULL           v22.2d, v30.2s, v0.2s
610    Sqxtn           v24.2s, v22.2d
611    sMULL2          v22.2d, v30.4s, v0.4s
612    Sqxtn           v25.2s, v22.2d
613    MOV             v24.d[1], v25.d[0]
614    movi            v1.2s, #0
615//    VADDL.S16       Q0, D11, D1
616    SADDL           v0.4s, v11.4h, v1.4h
617    MOV             v1.d[0], v0.d[1]
618
619    sMULL           v8.2d, v30.2s, v0.2s
620    Sqxtn           v22.2s, v8.2d
621    sMULL2          v8.2d, v30.4s, v0.4s
622    Sqxtn           v23.2s, v8.2d
623    MOV             v22.d[1], v23.d[0]
624    uMULL           v8.4s, v26.4h, v11.4h
625    uMULL           v30.4s, v14.4h, v10.4h
626
627    LD2             { v0.4s, v1.4s}, [x1]
628    MOV             v2.16b, v1.16b
629    ADD             x1, x1, x12
630
631//    VUZP.16         D0, D1
632
633    UZP1            v19.8h, v0.8h, v0.8h
634    UZP2            v21.8h, v0.8h, v0.8h
635    MOV             v0.d[0], v19.d[0]
636    MOV             v0.d[1], v21.d[0]
637
638//    VUZP.16         D2, D3
639
640    UZP1            v19.8h, v2.8h, v2.8h
641    UZP2            v21.8h, v2.8h, v2.8h
642    MOV             v2.d[0], v19.d[0]
643    MOV             v2.d[1], v21.d[0]
644
645    ushR            v8.4s, v8.4s, #16
646
647    rev64           v0.8h, v0.8h
648    MOV             v1.d[0], v0.d[1]
649    ushR            v30.4s, v30.4s, #16
650
651    rev64           v2.8h, v2.8h
652    MOV             v3.d[0], v2.d[1]
653    sMLAL           v8.4s, v27.4h, v11.4h
654
655    sMLAL           v30.4s, v15.4h, v10.4h
656
657    LD2             { v10.4s, v11.4s}, [x6]
658    ADD             x6, x6, x12
659    MOV             v12.16b, v11.16b
660    sQshL           v4.4s, v4.4s, #2
661
662//   VUZP.16         D10, D11
663
664    UZP1            v19.8h, v10.8h, v10.8h
665    UZP2            v21.8h, v10.8h, v10.8h
666    MOV             v10.d[0], v19.d[0]
667    MOV             v10.d[1], v21.d[0]
668
669    sQshL           v6.4s, v6.4s, #2
670
671//    VUZP.16         D12, D13
672
673    UZP1            v19.8h, v12.8h, v12.8h
674    UZP2            v21.8h, v12.8h, v12.8h
675    MOV             v12.d[0], v19.d[0]
676    MOV             v12.d[1], v21.d[0]
677
678    SQADD           v14.4s, v4.4s , v20.4s
679
680    rev64           v10.8h, v10.8h
681    MOV             v11.d[0], v10.d[1]
682    SQADD           v6.4s, v6.4s , v20.4s
683
684    rev64           v12.8h, v12.8h
685    MOV             v13.d[0], v12.d[1]
686    sshR            v14.4s, v14.4s, #16
687
688//    VUZP.16         D14, D15
689
690    UZP1            v19.8h, v14.8h, v14.8h
691    UZP2            v21.8h, v14.8h, v14.8h
692    MOV             v14.d[0], v19.d[0]
693    MOV             v15.d[0], v21.d[0]
694
695    sshR            v6.4s, v6.4s, #16
696
697//    VUZP.16         D6, D7
698
699    UZP1            v19.8h, v6.8h, v6.8h
700    UZP2            v21.8h, v6.8h, v6.8h
701    MOV             v6.d[0], v19.d[0]
702    MOV             v7.d[0], v21.d[0]
703
704    mov             v15.8b, v6.8b
705    sQshL           v8.4s, v8.4s, v16.4s
706
707    LD2             { v4.4s, v5.4s}, [x8]
708    ADD             x8, x8, #32
709    MOV             v6.16b, v5.16b
710    sQshL           v30.4s, v30.4s, v16.4s
711
712//    VUZP.16         D4, D5
713
714    UZP1            v19.8h, v4.8h, v4.8h
715    UZP2            v21.8h, v4.8h, v4.8h
716    MOV             v4.d[0], v19.d[0]
717    MOV             v5.d[0], v21.d[0]
718
719    SQSUB           v8.4s, v8.4s , v24.4s
720
721//    VUZP.16         D6, D7
722
723    UZP1            v19.8h, v6.8h, v6.8h
724    UZP2            v21.8h, v6.8h, v6.8h
725    MOV             v6.d[0], v19.d[0]
726    MOV             v7.d[0], v21.d[0]
727
728    SQSUB           v22.4s, v30.4s , v22.4s
729
730    sQshL           v30.4s, v8.4s, #2
731
732    LD2             {v8.4h, v9.4h}, [x2]
733    ADD             x2, x2, #16
734    sQshL           v22.4s, v22.4s, #2
735
736    SQADD           v30.4s, v30.4s , v20.4s
737    SQADD           v22.4s, v22.4s , v20.4s
738
739    sshR            v30.4s, v30.4s, #16
740
741//    VUZP.16         D30, D31
742
743    UZP1            v19.8h, v30.8h, v30.8h
744    UZP2            v21.8h, v30.8h, v30.8h
745    MOV             v30.d[0], v19.d[0]
746    MOV             v30.d[1], v21.d[0]
747
748    sshR            v22.4s, v22.4s, #16
749
750
751//    VUZP.16         D22, D23
752
753    UZP1            v19.8h, v22.8h, v22.8h
754    UZP2            v21.8h, v22.8h, v22.8h
755    MOV             v22.d[0], v19.d[0]
756    MOV             v23.d[0], v21.d[0]
757
758
759    mov             v23.8b, v30.8b
760
761CORE_LOOP:
762    ST1             {v14.h}[0], [x0]
763    ADD             x0, x0, x9
764    uMULL           v30.4s, v0.4h, v9.4h
765
766    ST1             {v22.h}[0], [x0]
767    ADD             x0, x0, x9
768    uMULL           v28.4s, v2.4h, v8.4h
769
770    ST1             {v14.h}[1], [x0]
771    ADD             x0, x0, x9
772    uMULL           v26.4s, v0.4h, v8.4h
773
774    ST1             {v22.h}[1], [x0]
775    ADD             x0, x0, x9
776    uMULL           v24.4s, v2.4h, v9.4h
777
778    ST1             {v14.h}[2], [x0]
779    ADD             x0, x0, x9
780    ushR            v30.4s, v30.4s, #16
781
782    ST1             {v22.h}[2], [x0]
783    ADD             x0, x0, x9
784    ushR            v28.4s, v28.4s, #16
785
786    ST1             {v14.h}[3], [x0]
787    ADD             x0, x0, x9
788    sMLAL           v30.4s, v1.4h, v9.4h
789
790    ST1             {v22.h}[3], [x0]
791    ADD             x0, x0, x9
792    sMLAL           v28.4s, v3.4h, v8.4h
793
794    ST1             {v15.h}[0], [x5]
795    ADD             x5, x5, x10
796    ushR            v26.4s, v26.4s, #16
797
798    ST1             {v23.h}[0], [x5]
799    ADD             x5, x5, x10
800    ushR            v24.4s, v24.4s, #16
801
802    ST1             {v15.h}[1], [x5]
803    ADD             x5, x5, x10
804    sMLAL           v26.4s, v1.4h, v8.4h
805
806    ST1             {v23.h}[1], [x5]
807    ADD             x5, x5, x10
808    sMLAL           v24.4s, v3.4h, v9.4h
809
810    ST1             {v15.h}[2], [x5]
811    ADD             x5, x5, x10
812    ADD             v30.4s, v30.4s , v28.4s
813
814    ST1             {v23.h}[2], [x5]
815    ADD             x5, x5, x10
816    NEG             v30.4s, v30.4s
817
818    ST1             {v15.h}[3], [x5]
819    ADD             x5, x5, x10
820
821    ST1             {v23.h}[3], [x5]
822    ADD             x5, x5, x10
823    SUB             v28.4s, v24.4s , v26.4s
824
825
826    mov             v26.16b, v30.16b
827    uMULL           v22.4s, v4.4h, v8.4h
828
829    mov             v24.16b, v28.16b
830
831//    VUZP.16         D24, D25
832
833    UZP1            v19.8h, v24.8h, v24.8h
834    UZP2            v21.8h, v24.8h, v24.8h
835    MOV             v24.d[0], v19.d[0]
836    MOV             v25.d[0], v21.d[0]
837
838
839//    VUZP.16         D26, D27
840
841    UZP1            v19.8h, v26.8h, v26.8h
842    UZP2            v21.8h, v26.8h, v26.8h
843    MOV             v26.d[0], v19.d[0]
844    MOV             v27.d[0], v21.d[0]
845
846    uMULL           v2.4s, v24.4h, v18.4h
847    uMULL           v0.4s, v26.4h, v18.4h
848
849    ushR            v22.4s, v22.4s, #16
850    sMLAL           v22.4s, v5.4h, v8.4h
851
852    ushR            v2.4s, v2.4s, #16
853    ushR            v0.4s, v0.4s, #16
854    sMLAL           v2.4s, v25.4h, v18.4h
855    sMLAL           v0.4s, v27.4h, v18.4h
856
857    uMULL           v24.4s, v4.4h, v9.4h
858    uMULL           v26.4s, v6.4h, v8.4h
859
860    NEG             v2.4s, v2.4s
861    ADD             v28.4s, v28.4s , v0.4s
862    ADD             v30.4s, v30.4s , v2.4s
863
864    uMULL           v0.4s, v6.4h, v9.4h
865    sshR            v24.4s, v24.4s, #16
866    sMLAL           v24.4s, v5.4h, v9.4h
867    sshR            v26.4s, v26.4s, #16
868    sshR            v0.4s, v0.4s, #16
869    sMLAL           v26.4s, v7.4h, v8.4h
870    sMLAL           v0.4s, v7.4h, v9.4h
871
872
873
874    ADD             v22.4s, v22.4s , v0.4s
875
876    NEG             v22.4s, v22.4s
877    SUB             v24.4s, v26.4s , v24.4s
878
879
880    //LDR w11,  [sp, #120]
881    //sxtw x11,w11
882    MOV             w11, w26
883    dup             v14.4s, w11
884    SQADD           v28.4s, v28.4s , v14.4s
885    //LDR w11,  [sp, #116]
886    //sxtw x11,w11
887    MOV             w11, w25
888    dup             v0.4s, w11
889    sQshL           v28.4s, v28.4s, v0.4s
890
891
892    mov             v0.16b, v22.16b
893    mov             v14.16b, v24.16b
894
895//    VUZP.16         D24, D25
896
897    UZP1            v19.8h, v24.8h, v24.8h
898    UZP2            v21.8h, v24.8h, v24.8h
899    MOV             v24.d[0], v19.d[0]
900    MOV             v25.d[0], v21.d[0]
901
902
903//    VUZP.16         D22, D23
904
905    UZP1            v19.8h, v22.8h, v22.8h
906    UZP2            v21.8h, v22.8h, v22.8h
907    MOV             v22.d[0], v19.d[0]
908    MOV             v23.d[0], v21.d[0]
909
910    uMULL           v8.4s, v24.4h, v18.4h
911    uMULL           v26.4s, v22.4h, v18.4h
912
913    NEG             v2.4s, v30.4s
914
915//    VUZP.16         D30, D31
916
917    UZP1            v19.8h, v30.8h, v30.8h
918    UZP2            v21.8h, v30.8h, v30.8h
919    MOV             v30.d[0], v19.d[0]
920    MOV             v30.d[1], v21.d[0]
921
922
923//    VUZP.16         D2, D3
924
925    UZP1            v19.8h, v2.8h, v2.8h
926    UZP2            v21.8h, v2.8h, v2.8h
927    MOV             v2.d[0], v19.d[0]
928    MOV             v3.d[0], v21.d[0]
929
930    uMULL           v4.4s, v30.4h, v12.4h
931    uMULL           v6.4s, v2.4h, v13.4h
932
933    ushR            v8.4s, v8.4s, #16
934    ushR            v26.4s, v26.4s, #16
935
936    sMLAL           v8.4s, v25.4h, v18.4h
937    sMLAL           v26.4s, v23.4h, v18.4h
938
939    ushR            v4.4s, v4.4s, #16
940    ushR            v6.4s, v6.4s, #16
941
942    MOV             v19.d[0], v30.d[1]
943
944    sMLAL           v4.4s, v19.4h, v12.4h
945    sMLAL           v6.4s, v3.4h, v13.4h
946
947    NEG             v8.4s, v8.4s
948    ADD             v14.4s, v14.4s , v26.4s
949    ADD             v0.4s, v0.4s , v8.4s
950
951
952
953    //LDR w11,  [sp, #120]
954    //sxtw x11,w11
955    MOV             w11, w26
956    dup             v8.4s, w11
957    SQADD           v0.4s, v0.4s , v8.4s
958    //LDR w11,  [sp, #116]
959    //sxtw x11,w11
960    MOV             w11, w25
961    dup             v26.4s, w11
962    sQshL           v0.4s, v0.4s, v26.4s
963    mov             v26.16b, v28.16b
964
965    LD2             { v28.4s, v29.4s}, [x4]
966    MOV             v30.16b, v29.16b
967    MOV             v29.d[0], v28.d[1]
968//    VZIP.32         Q13, Q0
969
970    ZIP1            v19.4s, v26.4s, v0.4s
971    ZIP2            v0.4s, v26.4s, v0.4s
972    MOV             v26.16b, v19.16b
973
974    ST1             { v26.4s}, [x4]
975    ADD             x4, x4, #16
976    ST1             { v0.4s}, [x4]
977    ADD             x4, x4, #16
978
979    movi            v1.2s, #0
980//    VADDL.S16       Q0, D13, D1
981    SADDL           v0.4s, v13.4h, v1.4h
982    MOV             v1.d[0], v0.d[1]
983
984    sMULL           v26.2d, v28.2s, v0.2s
985    Sqxtn           v8.2s, v26.2d
986    sMULL           v26.2d, v29.2s, v1.2s
987    Sqxtn           v9.2s, v26.2d
988    MOV             v8.d[1], v9.d[0]
989    movi            v1.2s, #0
990    //VADDL.S16       Q0, D12, D1
991    SADDL           v0.4s, v12.4h, v1.4h
992    MOV             v1.d[0], v0.d[1]
993
994    sMULL           v24.2d, v28.2s, v0.2s
995    Sqxtn           v26.2s, v24.2d
996    sMULL           v24.2d, v29.2s, v1.2s
997    Sqxtn           v27.2s, v24.2d
998    MOV             v26.d[1], v27.d[0]
999    sQshL           v4.4s, v4.4s, v16.4s
1000    sQshL           v6.4s, v6.4s, v16.4s
1001
1002
1003
1004    SQSUB           v4.4s, v4.4s , v8.4s
1005    SQSUB           v6.4s, v6.4s , v26.4s
1006
1007    NEG             v26.4s, v14.4s
1008//    VUZP.16         D26, D27
1009    UZP1            v19.8h, v26.8h, v26.8h
1010    UZP2            v21.8h, v26.8h, v26.8h
1011    MOV             v26.d[0], v19.d[0]
1012    MOV             v27.d[0], v21.d[0]
1013
1014    movi            v1.2s, #0
1015    //VADDL.S16       Q0, D10, D1
1016    SADDL           v0.4s, v10.4h, v1.4h
1017    MOV             v1.d[0], v0.d[1]
1018
1019    sMULL           v22.2d, v30.2s, v0.2s
1020    Sqxtn           v24.2s, v22.2d
1021    sMULL2          v22.2d, v30.4s, v0.4s
1022    Sqxtn           v25.2s, v22.2d
1023    MOV             v24.d[1], v25.d[0]
1024    movi            v1.2s, #0
1025    //VADDL.S16       Q0, D11, D1
1026    SADDL           v0.4s, v11.4h, v1.4h
1027
1028    sMULL           v8.2d, v30.2s, v0.2s
1029    Sqxtn           v22.2s, v8.2d
1030    sMULL2          v8.2d, v30.4s, v0.4s
1031    Sqxtn           v23.2s, v8.2d
1032    MOV             v22.d[1], v23.d[0]
1033
1034//    VUZP.16         D14, D15
1035
1036    UZP1            v19.8h, v14.8h, v14.8h
1037    UZP2            v21.8h, v14.8h, v14.8h
1038    MOV             v14.d[0], v19.d[0]
1039    MOV             v15.d[0], v21.d[0]
1040
1041    uMULL           v8.4s, v26.4h, v11.4h
1042    uMULL           v30.4s, v14.4h, v10.4h
1043
1044
1045    LD2             { v0.4s, v1.4s}, [x1]
1046    MOV             v2.16b, v1.16b
1047    ADD             X1, X1, x12
1048
1049//    VUZP.16         D0, D1
1050    UZP1            v19.8h, v0.8h, v0.8h
1051    UZP2            v21.8h, v0.8h, v0.8h
1052    MOV             v0.d[0], v19.d[0]
1053    MOV             v0.d[1], v21.d[0]
1054
1055//    VUZP.16         D2, D3
1056
1057    UZP1            v19.8h, v2.8h, v2.8h
1058    UZP2            v21.8h, v2.8h, v2.8h
1059    MOV             v2.d[0], v19.d[0]
1060    MOV             v2.d[1], v21.d[0]
1061
1062    ushR            v8.4s, v8.4s, #16
1063
1064    rev64           v0.8h, v0.8h
1065    MOV             v1.d[0], v0.d[1]
1066    ushR            v30.4s, v30.4s, #16
1067
1068    rev64           v2.8h, v2.8h
1069    MOV             v3.d[0], v2.d[1]
1070    sMLAL           v8.4s, v27.4h, v11.4h
1071
1072    sMLAL           v30.4s, v15.4h, v10.4h
1073
1074    LD2             { v10.4s, v11.4s}, [x6]
1075    add             X6, x6, x12
1076    MOV             v12.16b, v11.16b
1077    sQshL           v4.4s, v4.4s, #2
1078
1079    //VUZP.16         D10, D11
1080
1081    UZP1            v19.8h, v10.8h, v10.8h
1082    UZP2            v21.8h, v10.8h, v10.8h
1083    MOV             v10.d[0], v19.d[0]
1084    MOV             v10.d[1], v21.d[0]
1085
1086    sQshL           v6.4s, v6.4s, #2
1087
1088//    VUZP.16         D12, D13
1089
1090    UZP1            v19.8h, v12.8h, v12.8h
1091    UZP2            v21.8h, v12.8h, v12.8h
1092    MOV             v12.d[0], v19.d[0]
1093    MOV             v12.d[1], v21.d[0]
1094
1095
1096    SQADD           v14.4s, v4.4s , v20.4s
1097
1098    rev64           v10.8h, v10.8h
1099    MOV             v11.d[0], v10.d[1]
1100    SQADD           v6.4s, v6.4s , v20.4s
1101
1102    rev64           v12.8h, v12.8h
1103    MOV             v13.d[0], v12.d[1]
1104    sshR            v14.4s, v14.4s, #16
1105
1106//    VUZP.16         D14, D15
1107
1108    UZP1            v19.8h, v14.8h, v14.8h
1109    UZP2            v21.8h, v14.8h, v14.8h
1110    MOV             v14.d[0], v19.d[0]
1111    MOV             v15.d[0], v21.d[0]
1112
1113
1114    sshR            v6.4s, v6.4s, #16
1115
1116//    VUZP.16         D6, D7
1117
1118    UZP1            v19.8h, v6.8h, v6.8h
1119    UZP2            v21.8h, v6.8h, v6.8h
1120    MOV             v6.d[0], v19.d[0]
1121    MOV             v7.d[0], v21.d[0]
1122
1123
1124    mov             v15.8b, v6.8b
1125    sQshL           v8.4s, v8.4s, v16.4s
1126
1127    LD2             { v4.4s, v5.4s}, [x8]
1128    ADD             x8, x8, #32
1129    MOV             v6.16b, v5.16b
1130
1131    sQshL           v30.4s, v30.4s, v16.4s
1132
1133//    VUZP.16         D4, D5
1134
1135    UZP1            v19.8h, v4.8h, v4.8h
1136    UZP2            v21.8h, v4.8h, v4.8h
1137    MOV             v4.d[0], v19.d[0]
1138    MOV             v5.d[0], v21.d[0]
1139
1140
1141    SQSUB           v8.4s, v8.4s , v24.4s
1142
1143//    VUZP.16         D6, D7
1144
1145    UZP1            v19.8h, v6.8h, v6.8h
1146    UZP2            v21.8h, v6.8h, v6.8h
1147    MOV             v6.d[0], v19.d[0]
1148    MOV             v7.d[0], v21.d[0]
1149
1150
1151    SQSUB           v22.4s, v30.4s , v22.4s
1152
1153    sQshL           v30.4s, v8.4s, #2
1154
1155    LD2             {v8.4h, v9.4h}, [x2]
1156    ADD             x2, x2, #16
1157    sQshL           v22.4s, v22.4s, #2
1158
1159    SQADD           v30.4s, v30.4s , v20.4s
1160    SQADD           v22.4s, v22.4s , v20.4s
1161
1162    sshR            v30.4s, v30.4s, #16
1163
1164//   VUZP.16         D30, D31
1165
1166    UZP1            v19.8h, v30.8h, v30.8h
1167    UZP2            v21.8h, v30.8h, v30.8h
1168    MOV             v30.d[0], v19.d[0]
1169    MOV             v30.d[1], v21.d[0]
1170
1171
1172    sshR            v22.4s, v22.4s, #16
1173
1174
1175//    VUZP.16         D22, D23
1176    UZP1            v19.8h, v22.8h, v22.8h
1177    UZP2            v21.8h, v22.8h, v22.8h
1178    MOV             v22.d[0], v19.d[0]
1179    MOV             v23.d[0], v21.d[0]
1180
1181
1182    mov             v23.8b, v30.8b
1183
1184    SUBS            x3, x3, #1
1185    BNE             CORE_LOOP
1186
1187
1188
1189
1190
// EPILOGUE -- straight-line drain stage. CORE_LOOP falls through to here when
// its counter in x3 reaches zero (other entry paths, if any, are outside this
// excerpt).
//   1. Stores the packed 16-bit results still held in v14/v22 (through x0,
//      step x9) and v15/v23 (through x5, step x10), interleaved with the
//      first multiplies of the final group.
//   2. Runs one more full arithmetic pass on the operands already sitting in
//      v0-v13, rewrites eight 32-bit words at [x4] in place, and stores the
//      last eight 16-bit results through x0 and through x5.
// The only load in this stage is the LD2 from [x4].
// Recurring idioms:
//   * UZP1/UZP2 on .8h followed by two d-lane MOVs: separates the low and
//     the high 16 bits of every 32-bit lane into two 4h vectors (the
//     "VUZP.16" named in the retained comments).
//   * uMULL(low halves) / ushR #16 / sMLAL(high halves): builds
//     (unsigned_low * c >> 16) + signed_high * c for a 32-bit value and a
//     16-bit multiplier c.
// v16, v18, v20, w25 and w26 are set up before this excerpt; their meaning is
// not visible here.
1191EPILOGUE:
1192
    // Drain v14/v22 to [x0], alternating lane i of v14 with lane i of v22.
    // In between, start the four low-half products v0*v9, v2*v8, v0*v8, v2*v9.
1193    ST1             {v14.h}[0], [x0]
1194    ADD             x0, x0, x9
1195    uMULL           v30.4s, v0.4h, v9.4h
1196
1197    ST1             {v22.h}[0], [x0]
1198    ADD             x0, x0, x9
1199    uMULL           v28.4s, v2.4h, v8.4h
1200
1201    ST1             {v14.h}[1], [x0]
1202    ADD             x0, x0, x9
1203    uMULL           v26.4s, v0.4h, v8.4h
1204
1205    ST1             {v22.h}[1], [x0]
1206    ADD             x0, x0, x9
1207    uMULL           v24.4s, v2.4h, v9.4h
1208
1209    ST1             {v14.h}[2], [x0]
1210    ADD             x0, x0, x9
1211    ushR            v30.4s, v30.4s, #16
1212
1213    ST1             {v22.h}[2], [x0]
1214    ADD             x0, x0, x9
1215    ushR            v28.4s, v28.4s, #16
1216
1217    ST1             {v14.h}[3], [x0]
1218    ADD             x0, x0, x9
1219    sMLAL           v30.4s, v1.4h, v9.4h
1220
1221    ST1             {v22.h}[3], [x0]
1222    ADD             x0, x0, x9
1223    sMLAL           v28.4s, v3.4h, v8.4h
1224
    // Drain v15/v23 to [x5] the same way, stepping by x10.
1225    ST1             {v15.h}[0], [x5]
1226    ADD             x5, x5, x10
1227    ushR            v26.4s, v26.4s, #16
1228
1229    ST1             {v23.h}[0], [x5]
1230    ADD             x5, x5, x10
1231    ushR            v24.4s, v24.4s, #16
1232
1233    ST1             {v15.h}[1], [x5]
1234    ADD             x5, x5, x10
1235    sMLAL           v26.4s, v1.4h, v8.4h
1236
1237    ST1             {v23.h}[1], [x5]
1238    ADD             x5, x5, x10
1239    sMLAL           v24.4s, v3.4h, v9.4h
1240
1241    ST1             {v15.h}[2], [x5]
1242    ADD             x5, x5, x10
    // v30 = -(v30 + v28)
1243    ADD             v30.4s, v30.4s , v28.4s
1244
1245    ST1             {v23.h}[2], [x5]
1246    ADD             x5, x5, x10
1247    NEG             v30.4s, v30.4s
1248
1249    ST1             {v15.h}[3], [x5]
1250    ADD             x5, x5, x10
1251
1252
1253    ST1             {v23.h}[3], [x5]
1254    ADD             x5, x5, x10
    // v28 = v24 - v26
1255    SUB             v28.4s, v24.4s , v26.4s
1256
1257
    // Work on copies (v26 <- v30, v24 <- v28) so v30/v28 stay intact while
    // the copies are split into 16-bit halves and scaled by v18.
    // NOTE(review): the second mov pair below repeats the first with no
    // intervening write to v24/v26/v28/v30, so it appears redundant.
1258    uMULL           v22.4s, v4.4h, v8.4h
1259    mov             v26.16b, v30.16b
1260    mov             v24.16b, v28.16b
1261
1262    mov             v26.16b, v30.16b
1263    mov             v24.16b, v28.16b
1264
1265    //VUZP.16         D26, D27
1266
1267    UZP1            v19.8h, v26.8h, v26.8h
1268    UZP2            v21.8h, v26.8h, v26.8h
1269    MOV             v26.d[0], v19.d[0]
1270    MOV             v27.d[0], v21.d[0]
1271
1272//    VUZP.16         D24, D25
1273
1274    UZP1            v19.8h, v24.8h, v24.8h
1275    UZP2            v21.8h, v24.8h, v24.8h
1276    MOV             v24.d[0], v19.d[0]
1277    MOV             v25.d[0], v21.d[0]
1278
    // v2 = (copy of v28) scaled by v18, v0 = (copy of v30) scaled by v18.
1279    uMULL           v2.4s, v24.4h, v18.4h
1280    uMULL           v0.4s, v26.4h, v18.4h
1281
1282    ushR            v22.4s, v22.4s, #16
1283    sMLAL           v22.4s, v5.4h, v8.4h
1284
1285    ushR            v2.4s, v2.4s, #16
1286    ushR            v0.4s, v0.4s, #16
1287    sMLAL           v2.4s, v25.4h, v18.4h
1288    sMLAL           v0.4s, v27.4h, v18.4h
1289
1290    uMULL           v24.4s, v4.4h, v9.4h
1291    uMULL           v26.4s, v6.4h, v8.4h
1292
    // v28 += v0 ; v30 -= v2 (v2 is negated first, then added).
1293    NEG             v2.4s, v2.4s
1294    ADD             v28.4s, v28.4s , v0.4s
1295    ADD             v30.4s, v30.4s , v2.4s
1296
    // Remaining products of the v4..v7 operand pair with v8/v9.
    // NOTE(review): v24, v26 and v0 are shifted with sshR (arithmetic) after
    // a uMULL, whereas v22 above uses ushR -- confirm the asymmetry is
    // intended.
1297    uMULL           v0.4s, v6.4h, v9.4h
1298    sshR            v24.4s, v24.4s, #16
1299    sMLAL           v24.4s, v5.4h, v9.4h
1300    sshR            v26.4s, v26.4s, #16
1301    sshR            v0.4s, v0.4s, #16
1302    sMLAL           v26.4s, v7.4h, v8.4h
1303    sMLAL           v0.4s, v7.4h, v9.4h
1304
1305
1306
1307
1308
    // v22 = -(v22 + v0) ; v24 = v26 - v24
1309    ADD             v22.4s, v22.4s , v0.4s
1310    NEG             v22.4s, v22.4s
1311    SUB             v24.4s, v26.4s , v24.4s
1312
1313
1314
1315
    // v28 = SQSHL(SQADD(v28, broadcast w26), broadcast w25).
    // The commented-out LDRs suggest w26/w25 replace former stack loads --
    // presumably function arguments cached earlier; confirm in the prologue.
1316    //LDR w11,  [sp, #120]
1317    //sxtw x11,w11
1318    MOV             w11, w26
1319    dup             v14.4s, w11
1320    SQADD           v28.4s, v28.4s , v14.4s
1321    //LDR w11,  [sp, #116]
1322    //sxtw x11,w11
1323    MOV             w11, w25
1324    dup             v0.4s, w11
1325    sQshL           v28.4s, v28.4s, v0.4s
1326
1327
    // Keep full-width copies (v0 <- v22, v14 <- v24) before v22/v24 are split
    // into halves and scaled by v18.
1328    mov             v0.16b, v22.16b
1329    mov             v14.16b, v24.16b
1330
1331
1332//    VUZP.16         D22, D23
1333
1334    UZP1            v19.8h, v22.8h, v22.8h
1335    UZP2            v21.8h, v22.8h, v22.8h
1336    MOV             v22.d[0], v19.d[0]
1337    MOV             v23.d[0], v21.d[0]
1338
1339//    VUZP.16         D24, D25
1340
1341    UZP1            v19.8h, v24.8h, v24.8h
1342    UZP2            v21.8h, v24.8h, v24.8h
1343    MOV             v24.d[0], v19.d[0]
1344    MOV             v25.d[0], v21.d[0]
1345
1346    uMULL           v8.4s, v24.4h, v18.4h
1347    uMULL           v26.4s, v22.4h, v18.4h
1348
    // v2 = -v30; both v30 and v2 are then split into 16-bit halves. For v30
    // the high halves land in v30.d[1] (copied to v19 further down), for v2
    // they land in v3.
1349    NEG             v2.4s, v30.4s
1350
1351//    VUZP.16         D30, D31
1352
1353    UZP1            v19.8h, v30.8h, v30.8h
1354    UZP2            v21.8h, v30.8h, v30.8h
1355    MOV             v30.d[0], v19.d[0]
1356    MOV             v30.d[1], v21.d[0]
1357
1358//    VUZP.16         D2, D3
1359
1360    UZP1            v19.8h, v2.8h, v2.8h
1361    UZP2            v21.8h, v2.8h, v2.8h
1362    MOV             v2.d[0], v19.d[0]
1363    MOV             v3.d[0], v21.d[0]
1364
    // v4 = v30 scaled by v12, v6 = (-v30) scaled by v13.
1365    uMULL           v4.4s, v30.4h, v12.4h
1366    uMULL           v6.4s, v2.4h, v13.4h
1367
1368    ushR            v8.4s, v8.4s, #16
1369    ushR            v26.4s, v26.4s, #16
1370
1371    sMLAL           v8.4s, v25.4h, v18.4h
1372    sMLAL           v26.4s, v23.4h, v18.4h
1373
1374    ushR            v4.4s, v4.4s, #16
1375    ushR            v6.4s, v6.4s, #16
1376
    // v19 <- high halves of v30 for the accumulate step.
1377    MOV             v19.d[0], v30.d[1]
1378
1379    sMLAL           v4.4s, v19.4h, v12.4h
1380    sMLAL           v6.4s, v3.4h, v13.4h
1381
    // v14 += v26 ; v0 -= v8 (v8 is negated first, then added).
1382    NEG             v8.4s, v8.4s
1383    ADD             v14.4s, v14.4s , v26.4s
1384    ADD             v0.4s, v0.4s , v8.4s
1385
    // v0 = SQSHL(SQADD(v0, broadcast w26), broadcast w25), as for v28 above.
1386    //LDR w11,  [sp, #120]
1387    //sxtw x11,w11
1388    MOV             w11, w26
1389    dup             v8.4s, w11
1390    SQADD           v0.4s, v0.4s , v8.4s
1391    //LDR w11,  [sp, #116]
1392    //sxtw x11,w11
1393    MOV             w11, w25
1394    dup             v26.4s, w11
1395    sQshL           v0.4s, v0.4s, v26.4s
1396
1397
1398    mov             v26.16b, v28.16b
1399
    // Read eight words at [x4] de-interleaved (v28 = even-indexed words,
    // v29 = odd-indexed words; the latter is moved to v30). Then overwrite
    // the same 32 bytes with v26 and v0 interleaved word by word. x4 advances
    // by 32 through the two post-indexed stores.
1400    LD2             { v28.4s, v29.4s}, [x4]
1401    MOV             v30.16b, v29.16b
1402    MOV             v29.d[0], v28.d[1]
1403//    VZIP.32         Q13, Q0
1404
1405    ZIP1            v19.4s, v26.4s, v0.4s
1406    ZIP2            v0.4s, v26.4s, v0.4s
1407    MOV             v26.16b, v19.16b
1408
1409    ST1             { v26.4s}, [x4], #16
1410    ST1             { v0.4s}, [x4], #16
1411
    // SADDL against a zero vector sign-extends the four halfwords of v13 to
    // 32 bits. v8 = saturating-narrowed 64-bit products of v28 with them.
1412    movi            v1.2s, #0
1413//    VADDL.S16       Q0, D13, D1
1414    SADDL           v0.4s, v13.4h, v1.4h
1415    MOV             v1.d[0], v0.d[1]
1416
1417    sMULL           v26.2d, v28.2s, v0.2s
1418    Sqxtn           v8.2s, v26.2d
1419    sMULL           v26.2d, v29.2s, v1.2s
1420    Sqxtn           v9.2s, v26.2d
1421    MOV             v8.d[1], v9.d[0]
    // Same with v12: v26 = narrowed products of v28 with sign-extended v12.
1422    movi            v1.2s, #0
1423//    VADDL.S16       Q0, D12, D1
1424    SADDL           v0.4s, v12.4h, v1.4h
1425    MOV             v1.d[0], v0.d[1]
1426
1427    sMULL           v24.2d, v28.2s, v0.2s
1428    Sqxtn           v26.2s, v24.2d
1429    sMULL           v24.2d, v29.2s, v1.2s
1430    Sqxtn           v27.2s, v24.2d
1431    MOV             v26.d[1], v27.d[0]
1432
    // Shift v4/v6 by the per-lane counts in v16, then subtract the products
    // above with saturation.
1433    sQshL           v4.4s, v4.4s, v16.4s
1434    sQshL           v6.4s, v6.4s, v16.4s
1435
1436    SQSUB           v4.4s, v4.4s , v8.4s
1437    SQSUB           v6.4s, v6.4s , v26.4s
1438
    // v26 = -v14; v14 and v26 are both split into 16-bit halves.
1439    NEG             v26.4s, v14.4s
1440//    VUZP.16         D14, D15
1441
1442    UZP1            v19.8h, v14.8h, v14.8h
1443    UZP2            v21.8h, v14.8h, v14.8h
1444    MOV             v14.d[0], v19.d[0]
1445    MOV             v15.d[0], v21.d[0]
1446
1447
1448//   VUZP.16         D26, D27
1449
1450    UZP1            v19.8h, v26.8h, v26.8h
1451    UZP2            v21.8h, v26.8h, v26.8h
1452    MOV             v26.d[0], v19.d[0]
1453    MOV             v27.d[0], v21.d[0]
1454
1455
    // v24 = narrowed products of v30 (odd-indexed words read from [x4]) with
    // sign-extended v10; v22 = the same with sign-extended v11.
1456    movi            v1.2s, #0
1457    //VADDL.S16       Q0, D10, D1
1458    SADDL           v0.4s, v10.4h, v1.4h
1459    MOV             v1.d[0], v0.d[1]
1460
1461    sMULL           v22.2d, v30.2s, v0.2s
1462    Sqxtn           v24.2s, v22.2d
1463    sMULL2          v22.2d, v30.4s, v0.4s
1464    Sqxtn           v25.2s, v22.2d
1465    MOV             v24.d[1], v25.d[0]
1466    movi            v1.2s, #0
1467    //VADDL.S16       Q0, D11, D1
1468    SADDL           v0.4s, v11.4h, v1.4h
1469    MOV             v1.d[0], v0.d[1]
1470
1471    sMULL           v8.2d, v30.2s, v0.2s
1472    Sqxtn           v22.2s, v8.2d
1473    sMULL2          v8.2d, v30.4s, v0.4s
1474    Sqxtn           v23.2s, v8.2d
1475    MOV             v22.d[1], v23.d[0]
1476
    // v8 = (-v14) scaled by v11, v30 = v14 scaled by v10.
1477    uMULL           v8.4s, v26.4h, v11.4h
1478    uMULL           v30.4s, v14.4h, v10.4h
1479
1480    ushR            v8.4s, v8.4s, #16
1481
1482    ushR            v30.4s, v30.4s, #16
1483
1484    sMLAL           v8.4s, v27.4h, v11.4h
1485
1486    sMLAL           v30.4s, v15.4h, v10.4h
1487
    // Output scaling for v4/v6: saturating << 2, saturating add of v20,
    // arithmetic >> 16. v20 is presumably a rounding constant -- confirm
    // where it is initialised.
1488    sQshL           v4.4s, v4.4s, #2
1489
1490    sQshL           v6.4s, v6.4s, #2
1491
1492    SQADD           v14.4s, v4.4s , v20.4s
1493
1494    SQADD           v6.4s, v6.4s , v20.4s
1495
1496    sshR            v14.4s, v14.4s, #16
1497
1498//    VUZP.16         D14, D15
1499
    // Pack: the low halfword of each 32-bit lane becomes one 16-bit output.
1500    UZP1            v19.8h, v14.8h, v14.8h
1501    UZP2            v21.8h, v14.8h, v14.8h
1502    MOV             v14.d[0], v19.d[0]
1503    MOV             v15.d[0], v21.d[0]
1504
1505    sshR            v6.4s, v6.4s, #16
1506
1507//    VUZP.16         D6, D7
1508
1509    UZP1            v19.8h, v6.8h, v6.8h
1510    UZP2            v21.8h, v6.8h, v6.8h
1511    MOV             v6.d[0], v19.d[0]
1512    MOV             v7.d[0], v21.d[0]
1513
    // v15 <- packed v6 results (stored through x5 below).
1514    mov             v15.8b, v6.8b
1515    sQshL           v8.4s, v8.4s, v16.4s
1516
1517    sQshL           v30.4s, v30.4s, v16.4s
1518
1519    SQSUB           v8.4s, v8.4s , v24.4s
1520
1521    SQSUB           v22.4s, v30.4s , v22.4s
1522
    // Same output scaling for the second pair (results end up in v30 / v22).
1523    sQshL           v30.4s, v8.4s, #2
1524
1525    sQshL           v22.4s, v22.4s, #2
1526
1527    SQADD           v30.4s, v30.4s , v20.4s
1528    SQADD           v22.4s, v22.4s , v20.4s
1529
1530    sshR            v30.4s, v30.4s, #16
1531
1532    //VUZP.16         D30, D31
1533
1534    UZP1            v19.8h, v30.8h, v30.8h
1535    UZP2            v21.8h, v30.8h, v30.8h
1536    MOV             v30.d[0], v19.d[0]
1537    MOV             v30.d[1], v21.d[0]
1538
1539    sshR            v22.4s, v22.4s, #16
1540
1541//    VUZP.16         D22, D23
1542    UZP1            v19.8h, v22.8h, v22.8h
1543    UZP2            v21.8h, v22.8h, v22.8h
1544    MOV             v22.d[0], v19.d[0]
1545    MOV             v23.d[0], v21.d[0]
1546
    // v23 <- packed v30 results (stored through x5 below).
1547    mov             v23.8b, v30.8b
1548
1549
1550
1551
    // Final stores of this stage: eight halfwords through x0 (v14/v22
    // alternating, step x9), then eight through x5 (v15/v23, step x10).
1552    ST1             {v14.h}[0], [x0]
1553    ADD             x0, x0, x9
1554    ST1             {v22.h}[0], [x0]
1555    ADD             x0, x0, x9
1556    ST1             {v14.h}[1], [x0]
1557    ADD             x0, x0, x9
1558    ST1             {v22.h}[1], [x0]
1559    ADD             x0, x0, x9
1560    ST1             {v14.h}[2], [x0]
1561    ADD             x0, x0, x9
1562    ST1             {v22.h}[2], [x0]
1563    ADD             x0, x0, x9
1564    ST1             {v14.h}[3], [x0]
1565    ADD             x0, x0, x9
1566    ST1             {v22.h}[3], [x0]
1567    ADD             x0, x0, x9
1568    ST1             {v15.h}[0], [x5]
1569    ADD             x5, x5, x10
1570    ST1             {v23.h}[0], [x5]
1571    ADD             x5, x5, x10
1572    ST1             {v15.h}[1], [x5]
1573    ADD             x5, x5, x10
1574    ST1             {v23.h}[1], [x5]
1575    ADD             x5, x5, x10
1576    ST1             {v15.h}[2], [x5]
1577    ADD             x5, x5, x10
1578    ST1             {v23.h}[2], [x5]
1579    ADD             x5, x5, x10
1580    ST1             {v15.h}[3], [x5]
1581    ADD             x5, x5, x10
1582    ST1             {v23.h}[3], [x5]
1583    ADD             x5, x5, x10
1584
// ARM_EPILOGUE / ARM_LOOP -- final tail pass, followed by register restore
// and return. Despite the label name, this block contains no branch back to
// ARM_LOOP: it runs once and ends in pop_v_regs / ret.
// The pass handles a short group:
//   * six words are fetched from x8, and seven each from x6 and x4, by
//     assembling vectors lane by lane (unfilled lanes are zeroed);
//   * seven 32-bit words are written back over the start of the [x4]
//     region (through x6);
//   * seven 16-bit results are stored through x0 (step x9) and seven through
//     x5 (step x10).
// Recurring idioms:
//   * UZP1/UZP2 on .8h followed by two d-lane MOVs: separates the low and
//     the high 16 bits of every 32-bit lane into two 4h vectors.
//   * uMULL(low halves) / ushR #16 / sMLAL(high halves): builds
//     (unsigned_low * c >> 16) + signed_high * c for a 32-bit value and a
//     16-bit multiplier c.
// v16, v18, v20, w25 and w26 are set up before this excerpt; their meaning is
// not visible here.
1585ARM_EPILOGUE:
1586
1587ARM_LOOP:
1588
    // Eight words from [x1], de-interleaved; x1 is not advanced.
    // NOTE(review): this is a full 32-byte read even though the pass emits
    // seven results -- confirm the caller's buffer covers it.
1589    LD2             { v0.4s, v1.4s}, [x1]
1590    MOV             v2.16b, v1.16b
1591
1592    //VUZP.16         D0, D1
1593    UZP1            v19.8h, v0.8h, v0.8h
1594    UZP2            v21.8h, v0.8h, v0.8h
1595    MOV             v0.d[0], v19.d[0]
1596    MOV             v0.d[1], v21.d[0]
1597
1598    //VUZP.16         D2, D3
1599    UZP1            v19.8h, v2.8h, v2.8h
1600    UZP2            v21.8h, v2.8h, v2.8h
1601    MOV             v2.d[0], v19.d[0]
1602    MOV             v2.d[1], v21.d[0]
1603
1604
    // Reverse the halfword order inside each 64-bit half, then expose the
    // high-half vectors as v1 / v3.
1605    rev64           v0.8h, v0.8h
1606    MOV             v1.d[0], v0.d[1]
1607    rev64           v2.8h, v2.8h
1608    MOV             v3.d[0], v2.d[1]
1609
    // Four 16-bit pairs from [x2], de-interleaved into v8 / v9.
1610    LD2             {v8.4h, v9.4h}, [x2]
1611    ADD             x2, x2, #16
1612
    // Six words from [x8]: four via LD2 (v4 = even-indexed, v5 = odd-indexed,
    // the latter copied to v6), then one more word each into the zeroed
    // v5/v7, which become the upper halves of v4/v6.
    // Result: v4 = {w0, w2, w4, 0}, v6 = {w1, w3, w5, 0}.
1613    LD2             {v4.2s, v5.2s}, [x8]
1614    ADD             x8, x8, #16
1615    MOV             v6.16b, v5.16b
1616    movi            v5.2s, #0x00000000
1617    movi            v7.2s, #0x00000000
1618
1619    LD1             {v5.s}[0], [x8], #4
1620    LD1             {v7.s}[0], [x8]
1621
    // NOTE(review): x12 = 16 is never read; it is overwritten with -4 below
    // before any use, so this MOV appears to be dead.
1622    MOV             x12, #16
1623    MOV             v4.d[1], v5.d[0]
1624    MOV             v6.d[1], v7.d[0]
1625//    VUZP.16         D4, D5
1626
1627    UZP1            v19.8h, v4.8h, v4.8h
1628    UZP2            v21.8h, v4.8h, v4.8h
1629    MOV             v4.d[0], v19.d[0]
1630    MOV             v5.d[0], v21.d[0]
1631
1632//    VUZP.16         D6, D7
1633
1634    UZP1            v19.8h, v6.8h, v6.8h
1635    UZP2            v21.8h, v6.8h, v6.8h
1636    MOV             v6.d[0], v19.d[0]
1637    MOV             v7.d[0], v21.d[0]
1638
    // Seven words around x6, gathered with a descending pointer (x12 = -4):
    // LD2 takes the four words at byte offsets +16..+31 of the incoming x6,
    // then single words are read at +12, +8 and +4. After the last ADD x6 is
    // back at its incoming value.
    // Result: v10 = {0, w@+8, w@+16, w@+24}, v12 = {w@+4, w@+12, w@+20, w@+28}.
1639    ADD             x6, x6, #16
1640
1641    MOV             x12, #-4
1642    LD2             {v11.2s, v12.2s}, [x6]
1643    ADD             x6, x6, x12
1644    MOV             v13.16b, v12.16b
1645
1646
1647    movi            v10.2s, #0x00000000
1648
1649    LD1             {v12.s}[1], [x6]
1650    ADD             x6, x6, x12
1651    LD1             {v10.s}[1], [x6]
1652    ADD             x6, x6, x12
1653    LD1             {v12.s}[0], [x6]
1654    ADD             x6, x6, x12
1655
1656    MOV             v10.d[1], v11.d[0]
1657    MOV             v12.d[1], v13.d[0]
1658
1659    //VUZP.16         D10, D11
1660
1661    UZP1            v19.8h, v10.8h, v10.8h
1662    UZP2            v21.8h, v10.8h, v10.8h
1663    MOV             v10.d[0], v19.d[0]
1664    MOV             v10.d[1], v21.d[0]
1665
1666    //VUZP.16         D12, D13
1667
1668    UZP1            v19.8h, v12.8h, v12.8h
1669    UZP2            v21.8h, v12.8h, v12.8h
1670    MOV             v12.d[0], v19.d[0]
1671    MOV             v12.d[1], v21.d[0]
1672
1673
    // Reverse halfword order; v11 / v13 receive the upper 64 bits of the
    // reversed v10 / v12.
1674    rev64           v10.8h, v10.8h
1675    MOV             v11.d[0], v10.d[1]
1676    rev64           v12.8h, v12.8h
1677    MOV             v13.d[0], v12.d[1]
1678
    // Four 32x16 products of the x1 operands (v0/v1, v2/v3) with v8 and v9.
1679    uMULL           v30.4s, v0.4h, v9.4h
1680    uMULL           v28.4s, v2.4h, v8.4h
1681    uMULL           v26.4s, v0.4h, v8.4h
1682    uMULL           v24.4s, v2.4h, v9.4h
1683
1684    ushR            v30.4s, v30.4s, #16
1685    ushR            v28.4s, v28.4s, #16
1686
1687    sMLAL           v30.4s, v1.4h, v9.4h
1688    sMLAL           v28.4s, v3.4h, v8.4h
1689
1690    ushR            v26.4s, v26.4s, #16
1691    ushR            v24.4s, v24.4s, #16
1692
1693    sMLAL           v26.4s, v1.4h, v8.4h
1694    sMLAL           v24.4s, v3.4h, v9.4h
1695
    // v30 = -(v30 + v28)
1696    ADD             v30.4s, v30.4s , v28.4s
1697    NEG             v30.4s, v30.4s
1698
1699    uMULL           v22.4s, v4.4h, v8.4h
1700
    // v28 = v24 - v26
1701    SUB             v28.4s, v24.4s , v26.4s
1702
1703
    // Work on copies so v30/v28 stay intact while the copies are split into
    // halves and scaled by v18.
1704    mov             v26.16b, v30.16b
1705    mov             v24.16b, v28.16b
1706
1707//    VUZP.16         D26, D27
1708
1709    UZP1            v19.8h, v26.8h, v26.8h
1710    UZP2            v21.8h, v26.8h, v26.8h
1711    MOV             v26.d[0], v19.d[0]
1712    MOV             v27.d[0], v21.d[0]
1713
1714    //VUZP.16         D24, D25
1715
1716    UZP1            v19.8h, v24.8h, v24.8h
1717    UZP2            v21.8h, v24.8h, v24.8h
1718    MOV             v24.d[0], v19.d[0]
1719    MOV             v25.d[0], v21.d[0]
1720
1721    uMULL           v2.4s, v24.4h, v18.4h
1722    uMULL           v0.4s, v26.4h, v18.4h
1723
1724    ushR            v22.4s, v22.4s, #16
1725    sMLAL           v22.4s, v5.4h, v8.4h
1726
1727    ushR            v2.4s, v2.4s, #16
1728    ushR            v0.4s, v0.4s, #16
1729    sMLAL           v2.4s, v25.4h, v18.4h
1730    sMLAL           v0.4s, v27.4h, v18.4h
1731
1732    uMULL           v24.4s, v4.4h, v9.4h
1733    uMULL           v26.4s, v6.4h, v8.4h
1734
    // v28 += v0 ; v30 -= v2 (v2 is negated first, then added).
1735    NEG             v2.4s, v2.4s
1736    ADD             v28.4s, v28.4s , v0.4s
1737    ADD             v30.4s, v30.4s , v2.4s
1738
    // NOTE(review): v24, v26 and v0 are shifted with sshR (arithmetic) after
    // a uMULL, whereas v22 above uses ushR -- confirm the asymmetry is
    // intended.
1739    uMULL           v0.4s, v6.4h, v9.4h
1740    sshR            v24.4s, v24.4s, #16
1741    sMLAL           v24.4s, v5.4h, v9.4h
1742    sshR            v26.4s, v26.4s, #16
1743    sshR            v0.4s, v0.4s, #16
1744    sMLAL           v26.4s, v7.4h, v8.4h
1745    sMLAL           v0.4s, v7.4h, v9.4h
1746
    // v22 = -(v22 + v0) ; v24 = v26 - v24
1747    ADD             v22.4s, v22.4s , v0.4s
1748    NEG             v22.4s, v22.4s
1749    SUB             v24.4s, v26.4s , v24.4s
1750
    // v28 = SQSHL(SQADD(v28, broadcast w26), broadcast w25).
    // The commented-out LDRs suggest w26/w25 replace former stack loads --
    // presumably function arguments cached earlier; confirm in the prologue.
1751    //LDR w11,  [sp, #120]
1752    //sxtw x11,w11
1753    MOV             w11, w26
1754    dup             v14.4s, w11
1755    SQADD           v28.4s, v28.4s , v14.4s
1756    //LDR w11,  [sp, #116]
1757    //sxtw x11,w11
1758    MOV             w11, w25
1759    dup             v0.4s, w11
1760    sQshL           v28.4s, v28.4s, v0.4s
1761
    // Keep full-width copies (v0 <- v22, v14 <- v24) before the split.
1762    mov             v0.16b, v22.16b
1763    mov             v14.16b, v24.16b
1764
1765//    VUZP.16         D22, D23
1766
1767    UZP1            v19.8h, v22.8h, v22.8h
1768    UZP2            v21.8h, v22.8h, v22.8h
1769    MOV             v22.d[0], v19.d[0]
1770    MOV             v23.d[0], v21.d[0]
1771
1772//   VUZP.16         D24, D25
1773
1774    UZP1            v19.8h, v24.8h, v24.8h
1775    UZP2            v21.8h, v24.8h, v24.8h
1776    MOV             v24.d[0], v19.d[0]
1777    MOV             v25.d[0], v21.d[0]
1778
1779    uMULL           v8.4s, v24.4h, v18.4h
1780    uMULL           v26.4s, v22.4h, v18.4h
1781
    // v2 = -v30; both v30 and v2 are split into halves (v30 high halves go
    // to v30.d[1], v2 high halves to v3).
1782    NEG             v2.4s, v30.4s
1783//    VUZP.16         D30, D31
1784
1785    UZP1            v19.8h, v30.8h, v30.8h
1786    UZP2            v21.8h, v30.8h, v30.8h
1787    MOV             v30.d[0], v19.d[0]
1788    MOV             v30.d[1], v21.d[0]
1789
1790//    VUZP.16         D2, D3
1791
1792    UZP1            v19.8h, v2.8h, v2.8h
1793    UZP2            v21.8h, v2.8h, v2.8h
1794    MOV             v2.d[0], v19.d[0]
1795    MOV             v3.d[0], v21.d[0]
1796
    // v4 = v30 scaled by v12, v6 = (-v30) scaled by v13.
1797    uMULL           v4.4s, v30.4h, v12.4h
1798    uMULL           v6.4s, v2.4h, v13.4h
1799
1800    ushR            v8.4s, v8.4s, #16
1801    ushR            v26.4s, v26.4s, #16
1802
1803    sMLAL           v8.4s, v25.4h, v18.4h
1804    sMLAL           v26.4s, v23.4h, v18.4h
1805
1806    ushR            v4.4s, v4.4s, #16
1807    ushR            v6.4s, v6.4s, #16
1808
    // v19 <- high halves of v30 for the accumulate step.
1809    MOV             v19.d[0], v30.d[1]
1810
1811    sMLAL           v4.4s, v19.4h, v12.4h
1812    sMLAL           v6.4s, v3.4h, v13.4h
1813
    // v14 += v26 ; v0 -= v8 (v8 is negated first, then added).
1814    NEG             v8.4s, v8.4s
1815    ADD             v14.4s, v14.4s , v26.4s
1816    ADD             v0.4s, v0.4s , v8.4s
1817
    // v0 = SQSHL(SQADD(v0, broadcast w26), broadcast w25).
1818    //LDR w11,  [sp, #120]
1819    //sxtw x11,w11
1820    MOV             w11, w26
1821    dup             v8.4s, w11
1822    SQADD           v0.4s, v0.4s , v8.4s
1823    //LDR w11,  [sp, #116]
1824    //sxtw x11,w11
1825    MOV             w11, w25
1826    dup             v26.4s, w11
1827    sQshL           v0.4s, v0.4s, v26.4s
1828
1829    mov             v26.16b, v28.16b
1830
    // x6 is repurposed as the write pointer for the [x4] region, while x4
    // itself is used (and advanced by 28 bytes) for the reads.
1831    MOV             x6, x4
1832
    // Seven words from [x4]: four via LD1, three as single lanes, the eighth
    // lane zero. The UZP pair de-interleaves them:
    // v28 = {w0, w2, w4, w6}, v30 = {w1, w3, w5, 0}.
1833    LD1             {v28.2s, v29.2s}, [x4], #16
1834    movi            v19.2s, #0x00000000
1835    LD1             {v30.s}[0], [x4], #4
1836    LD1             {v30.s}[1], [x4], #4
1837    LD1             {v19.s}[0], [x4], #4
1838
1839    MOV             v28.d[1], v29.d[0]
1840    MOV             v30.d[1], v19.d[0]
1841
1842    //VUZP.32         Q14, Q15
1843
1844    UZP1            v19.4s, v28.4s, v30.4s
1845    UZP2            v30.4s, v28.4s, v30.4s
1846    MOV             v28.16b, v19.16b
1847    MOV             v29.d[0], v28.d[1]
1848
    // Write seven words back, alternating lanes of v26 and v0; lane 3 of v0
    // is not stored.
1849    ST1             {v26.s}[0], [x6], #4
1850    ST1             {v0.s}[0], [x6], #4
1851    ST1             {v26.s}[1], [x6], #4
1852    ST1             {v0.s}[1], [x6], #4
1853    ST1             {v26.s}[2], [x6], #4
1854    ST1             {v0.s}[2], [x6], #4
1855    ST1             {v26.s}[3], [x6], #4
1856
    // SADDL against a zero vector sign-extends the four halfwords of v13 to
    // 32 bits. v8 = saturating-narrowed 64-bit products of v28 with them.
1857    movi            v1.2s, #0
1858    //VADDL.S16       Q0, D13, D1
1859    SADDL           v0.4s, v13.4h, v1.4h
1860    MOV             v1.d[0], v0.d[1]
1861
1862    sMULL           v26.2d, v28.2s, v0.2s
1863    Sqxtn           v8.2s, v26.2d
1864    sMULL           v26.2d, v29.2s, v1.2s
1865    Sqxtn           v9.2s, v26.2d
1866    MOV             v8.d[1], v9.d[0]
    // Same with v12: v26 = narrowed products of v28 with sign-extended v12.
1867    movi            v1.2s, #0
1868    //VADDL.S16       Q0, D12, D1
1869    SADDL           v0.4s, v12.4h, v1.4h
1870    MOV             v1.d[0], v0.d[1]
1871
1872    sMULL           v24.2d, v28.2s, v0.2s
1873    Sqxtn           v26.2s, v24.2d
1874    sMULL           v24.2d, v29.2s, v1.2s
1875    Sqxtn           v27.2s, v24.2d
1876    MOV             v26.d[1], v27.d[0]
1877
    // Shift v4/v6 by the per-lane counts in v16, then subtract the products
    // above with saturation.
1878    sQshL           v4.4s, v4.4s, v16.4s
1879    sQshL           v6.4s, v6.4s, v16.4s
1880
1881    SQSUB           v4.4s, v4.4s , v8.4s
1882    SQSUB           v6.4s, v6.4s , v26.4s
1883
    // v26 = -v14; v14 and v26 are both split into 16-bit halves.
1884    NEG             v26.4s, v14.4s
1885    //VUZP.16         D14, D15
1886
1887    UZP1            v19.8h, v14.8h, v14.8h
1888    UZP2            v21.8h, v14.8h, v14.8h
1889    MOV             v14.d[0], v19.d[0]
1890    MOV             v15.d[0], v21.d[0]
1891
1892//    VUZP.16         D26, D27
1893
1894    UZP1            v19.8h, v26.8h, v26.8h
1895    UZP2            v21.8h, v26.8h, v26.8h
1896    MOV             v26.d[0], v19.d[0]
1897    MOV             v27.d[0], v21.d[0]
1898
1899
    // v24 = narrowed products of v30 (odd-indexed words read from [x4]) with
    // sign-extended v10; v22 = the same with sign-extended v11.
1900    movi            v1.2s, #0
1901    //VADDL.S16       Q0, D10, D1
1902    SADDL           v0.4s, v10.4h, v1.4h
1903    MOV             v1.d[0], v0.d[1]
1904
1905    sMULL           v22.2d, v30.2s, v0.2s
1906    Sqxtn           v24.2s, v22.2d
1907    sMULL2          v22.2d, v30.4s, v0.4s
1908    Sqxtn           v25.2s, v22.2d
1909    MOV             v24.d[1], v25.d[0]
1910
1911    movi            v1.2s, #0
1912//    VADDL.S16       Q0, D11, D1
1913    SADDL           v0.4s, v11.4h, v1.4h
1914    MOV             v1.d[0], v0.d[1]
1915
1916    sMULL           v8.2d, v30.2s, v0.2s
1917    Sqxtn           v22.2s, v8.2d
1918    sMULL2          v8.2d, v30.4s, v0.4s
1919    Sqxtn           v23.2s, v8.2d
1920    MOV             v22.d[1], v23.d[0]
1921
    // v8 = (-v14) scaled by v11, v30 = v14 scaled by v10.
1922    uMULL           v8.4s, v26.4h, v11.4h
1923    uMULL           v30.4s, v14.4h, v10.4h
1924
1925    ushR            v8.4s, v8.4s, #16
1926
1927    ushR            v30.4s, v30.4s, #16
1928
1929    sMLAL           v8.4s, v27.4h, v11.4h
1930
1931    sMLAL           v30.4s, v15.4h, v10.4h
1932
    // Output scaling for v4/v6: saturating << 2, saturating add of v20,
    // arithmetic >> 16. v20 is presumably a rounding constant -- confirm
    // where it is initialised.
1933    sQshL           v4.4s, v4.4s, #2
1934
1935    sQshL           v6.4s, v6.4s, #2
1936
1937    SQADD           v14.4s, v4.4s , v20.4s
1938
1939    SQADD           v6.4s, v6.4s , v20.4s
1940
1941    sshR            v14.4s, v14.4s, #16
1942
1943//    VUZP.16         D14, D15
1944
    // Pack: the low halfword of each 32-bit lane becomes one 16-bit output.
1945    UZP1            v19.8h, v14.8h, v14.8h
1946    UZP2            v21.8h, v14.8h, v14.8h
1947    MOV             v14.d[0], v19.d[0]
1948    MOV             v15.d[0], v21.d[0]
1949
1950    sshR            v6.4s, v6.4s, #16
1951
1952    //VUZP.16         D6, D7
1953
1954    UZP1            v19.8h, v6.8h, v6.8h
1955    UZP2            v21.8h, v6.8h, v6.8h
1956    MOV             v6.d[0], v19.d[0]
1957    MOV             v7.d[0], v21.d[0]
1958
    // v15 <- packed v6 results (stored through x5 below).
1959    mov             v15.8b, v6.8b
1960    sQshL           v8.4s, v8.4s, v16.4s
1961
1962    sQshL           v30.4s, v30.4s, v16.4s
1963
1964    SQSUB           v8.4s, v8.4s , v24.4s
1965
1966    SQSUB           v22.4s, v30.4s , v22.4s
1967
    // Same output scaling for the second pair (results end up in v30 / v22).
1968    sQshL           v30.4s, v8.4s, #2
1969
1970    sQshL           v22.4s, v22.4s, #2
1971
1972    SQADD           v30.4s, v30.4s , v20.4s
1973    SQADD           v22.4s, v22.4s , v20.4s
1974
1975    sshR            v30.4s, v30.4s, #16
1976
1977//    VUZP.16         D30, D31
1978
1979    UZP1            v19.8h, v30.8h, v30.8h
1980    UZP2            v21.8h, v30.8h, v30.8h
1981    MOV             v30.d[0], v19.d[0]
1982    MOV             v30.d[1], v21.d[0]
1983
1984    sshR            v22.4s, v22.4s, #16
1985
1986//    VUZP.16         D22, D23
1987
1988    UZP1            v19.8h, v22.8h, v22.8h
1989    UZP2            v21.8h, v22.8h, v22.8h
1990    MOV             v22.d[0], v19.d[0]
1991    MOV             v23.d[0], v21.d[0]
1992
    // v23 <- packed v30 results (stored through x5 below).
1993    mov             v23.8b, v30.8b
1994
1995
1996
1997
    // Seven halfwords through x0 (v14/v22 alternating, step x9); lane 3 of
    // v22 is not stored.
1998    ST1             {v14.h}[0], [x0]
1999    ADD             x0, x0, x9
2000    ST1             {v22.h}[0], [x0]
2001    ADD             x0, x0, x9
2002    ST1             {v14.h}[1], [x0]
2003    ADD             x0, x0, x9
2004    ST1             {v22.h}[1], [x0]
2005    ADD             x0, x0, x9
2006    ST1             {v14.h}[2], [x0]
2007    ADD             x0, x0, x9
2008    ST1             {v22.h}[2], [x0]
2009    ADD             x0, x0, x9
2010    ST1             {v14.h}[3], [x0]
2011    ADD             x0, x0, x9
2012
    // Seven halfwords through x5 (v15/v23 alternating, step x10); lane 3 of
    // v23 is not stored.
2013    ST1             {v15.h}[0], [x5]
2014    ADD             x5, x5, x10
2015    ST1             {v23.h}[0], [x5]
2016    ADD             x5, x5, x10
2017    ST1             {v15.h}[1], [x5]
2018    ADD             x5, x5, x10
2019    ST1             {v23.h}[1], [x5]
2020    ADD             x5, x5, x10
2021    ST1             {v15.h}[2], [x5]
2022    ADD             x5, x5, x10
2023    ST1             {v23.h}[2], [x5]
2024    ADD             x5, x5, x10
2025    ST1             {v15.h}[3], [x5]
2026    ADD             x5, x5, x10
2027
    // Restore the registers saved by push_v_regs at function entry (the
    // macro pops in the reverse of the push order) and return.
2028    // VPOP            {d8 - d15}
2029    // LDMFD sp!, {x4-x12}
2030    //ldp x19, x20,[sp],#16
2031    pop_v_regs
2032    ret
2033    //BX              x14
2034