• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
20
21
// push_v_regs
// Saves d8-d15, x8-x17 and x29/x30 in a 160-byte frame below the incoming sp.
// Frame layout (offsets from the new sp):
//   +0   x29, x30      +16  x16, x17      +32  x14, x15
//   +48  x12, x13      +64  x10, x11      +80  x8,  x9
//   +96  d14, d15      +112 d12, d13      +128 d10, d11
//   +144 d8,  d9
// sp is lowered first, so nothing is ever written below the stack pointer,
// and it stays 16-byte aligned (160 is a multiple of 16).
.macro push_v_regs
    stp             x29, x30, [sp, #-160]!
    stp             x16, x17, [sp, #16]
    stp             x14, x15, [sp, #32]
    stp             x12, x13, [sp, #48]
    stp             x10, x11, [sp, #64]
    stp             x8, x9, [sp, #80]
    stp             d14, d15, [sp, #96]
    stp             d12, d13, [sp, #112]
    stp             d10, d11, [sp, #128]
    stp             d8, d9, [sp, #144]
.endm
// pop_v_regs
// Restores the registers saved by push_v_regs and releases its 160-byte
// frame. Expected layout (offsets from the current sp):
//   +0   x29, x30      +16  x16, x17      +32  x14, x15
//   +48  x12, x13      +64  x10, x11      +80  x8,  x9
//   +96  d14, d15      +112 d12, d13      +128 d10, d11
//   +144 d8,  d9
// Every slot is read while it is still at or above sp; the final load
// post-increments sp by the whole frame size.
.macro pop_v_regs
    ldp             d8, d9, [sp, #144]
    ldp             d10, d11, [sp, #128]
    ldp             d12, d13, [sp, #112]
    ldp             d14, d15, [sp, #96]
    ldp             x8, x9, [sp, #80]
    ldp             x10, x11, [sp, #64]
    ldp             x12, x13, [sp, #48]
    ldp             x14, x15, [sp, #32]
    ldp             x16, x17, [sp, #16]
    ldp             x29, x30, [sp], #160
.endm
46
// swp reg1, reg2
// Exchanges the contents of \reg1 and \reg2 through x16.
// In this file the operands are 64-bit vector elements (e.g. v4.d[0]).
// Clobbers: x16.
// NOTE(review): the macro name is the same as the Armv8.1 LSE "SWP"
// mnemonic; inside this file the macro definition is what gets expanded.
.macro swp reg1, reg2
    mov             x16, \reg1          // x16  = reg1
    mov             \reg1, \reg2        // reg1 = reg2
    mov             \reg2, x16          // reg2 = old reg1
.endm
.text
.p2align 2
.global ixheaacd_sbr_imdct_using_fft

// -----------------------------------------------------------------------------
// ixheaacd_sbr_imdct_using_fft
//
// Register usage on entry, as far as this file shows:
//   x0  base of a table read as 16-bit pairs in MIDDLE_LOOP_R4
//       (presumably the twiddle factors - confirm against the caller)
//   x1  transform size; 0x10 selects the radix-4 first stage, any other
//       value falls through to the radix-8 first stage
//   x2  base of the input buffer read by the first stage
//   x3  base of the output buffer written by the first stage and then
//       updated in place by the later stages
//   x7  pointer to a byte table; each byte is scaled by 8 and used as an
//       offset into the input buffer
//   x4, x5, x6  incoming values are overwritten without being read
//
// push_v_regs / pop_v_regs preserve d8-d15, x8-x17, x29 and x30, so x29 and
// x30 can be used as scratch registers inside the body.
// -----------------------------------------------------------------------------
ixheaacd_sbr_imdct_using_fft:
    push_v_regs

    // Setup shared by both first-stage variants.
    mov             x8, #1              // pass counter consumed by OUTER_LOOP_R4
    mov             x4, x7              // x4 = byte index table

COND_6:
    // Compare the low 32 bits only: the size is handled as a 32-bit quantity
    // everywhere else (w1 / w9 arithmetic in the first stages), and the upper
    // half of a register carrying a 32-bit argument is not guaranteed to be
    // zero.
    cmp             w1, #0x10
    beq             RADIX_4_FIRST_START

COND_7:
    // Every other size falls through to the radix-8 first stage.

// -----------------------------------------------------------------------------
// Radix-8 first stage.
// In:   x1 = size, x2 = input base, x3 = output base, x4 = byte index table.
// Each loop iteration handles four butterflies at once (one per 32-bit vector
// lane), reads eight two-word input points per butterfly and writes 256 bytes
// to [x3].
// Out:  x1 halved back after the temporary doubling, x3 rewound by x1 * 8
//       bytes, x5 = 8, x4 = 32, x6 = x1 >> 5; control continues at
//       RADIX_4_FIRST_ENDS.
// -----------------------------------------------------------------------------
RADIX_8_FIRST_START:

    lsr             w9, w1, #5              // loop count = size / 32
    lsl             w1, w1, #1              // x1 = 2 * size, used as byte stride

RADIX_8_FIRST_LOOP:

    // One input cursor per lane, all starting at the input base.
    mov             x5, x2
    mov             x6, x2
    mov             x7, x2
    mov             x11, x2

    // Lane 0: points at byte offsets 0, x1, 2*x1 and 3*x1 from the indexed
    // start (v0/v1, v2/v3, v4/v5, v6/v7). Each LD2 fills one lane of a
    // register pair with two consecutive 32-bit words (presumably re/im).
    ldrb            w12, [x4]
    add             x5, x5, x12, lsl #3
    ld2             {v0.s, v1.s}[0], [x5], x1
    add             x5, x5, x1
    ld2             {v4.s, v5.s}[0], [x5], x1
    sub             x5, x5, x1, lsl #1
    ld2             {v2.s, v3.s}[0], [x5], x1
    add             x5, x5, x1
    ld2             {v6.s, v7.s}[0], [x5], x1
    sub             x5, x5, x1, lsl #2      // back to the indexed start

    // Lane 1: same pattern through x6.
    ldrb            w12, [x4, #1]
    add             x6, x6, x12, lsl #3
    ld2             {v0.s, v1.s}[1], [x6], x1
    add             x6, x6, x1
    ld2             {v4.s, v5.s}[1], [x6], x1
    sub             x6, x6, x1, lsl #1
    ld2             {v2.s, v3.s}[1], [x6], x1
    add             x6, x6, x1
    ld2             {v6.s, v7.s}[1], [x6], x1
    sub             x6, x6, x1, lsl #2

    // Lanes 2 and 3: only the v0/v1 and v4/v5 points are fetched here; the
    // remaining loads are interleaved with the arithmetic below.
    ldrb            w12, [x4, #2]
    add             x7, x7, x12, lsl #3
    ld2             {v0.s, v1.s}[2], [x7], x1
    add             x7, x7, x1
    ld2             {v4.s, v5.s}[2], [x7], x1
    sub             x7, x7, x1, lsl #1

    ldrb            w12, [x4, #3]
    add             x11, x11, x12, lsl #3
    ld2             {v0.s, v1.s}[3], [x11], x1
    add             x11, x11, x1
    ld2             {v4.s, v5.s}[3], [x11], x1
    sub             x11, x11, x1, lsl #1

    // Sums and differences of the first four points, with the outstanding
    // lane 2/3 loads scheduled in between.
    add             v8.4s, v0.4s, v4.4s
    ld2             {v2.s, v3.s}[2], [x7], x1
    add             x7, x7, x1

    sub             v9.4s, v0.4s, v4.4s
    ld2             {v6.s, v7.s}[2], [x7], x1
    sub             x7, x7, x1, lsl #2

    add             v0.4s, v1.4s, v5.4s
    ld2             {v2.s, v3.s}[3], [x11], x1
    add             x11, x11, x1

    sub             v4.4s, v1.4s, v5.4s
    ld2             {v6.s, v7.s}[3], [x11], x1
    sub             x11, x11, x1, lsl #2

    add             x4, x4, #4              // four index bytes consumed

    // Move every cursor half a stride forward: the remaining four points sit
    // at byte offsets x1/2, 3*x1/2, 5*x1/2 and 7*x1/2.
    add             x5, x5, x1, lsr #1
    add             x6, x6, x1, lsr #1
    add             x7, x7, x1, lsr #1
    add             x11, x11, x1, lsr #1

    add             v1.4s, v2.4s, v6.4s
    ld2             {v14.s, v15.s}[0], [x5], x1

    sub             v5.4s, v2.4s, v6.4s
    ld2             {v10.s, v11.s}[0], [x5], x1

    add             v2.4s, v3.4s, v7.4s
    ld2             {v12.s, v13.s}[0], [x5], x1

    sub             v6.4s, v3.4s, v7.4s
    ld2             {v14.s, v15.s}[1], [x6], x1

    add             v3.4s, v9.4s, v6.4s
    ld2             {v10.s, v11.s}[1], [x6], x1

    sub             v7.4s, v9.4s, v6.4s
    ld2             {v12.s, v13.s}[1], [x6], x1

    sub             v6.4s, v4.4s, v5.4s
    ld2             {v14.s, v15.s}[2], [x7], x1

    add             v9.4s, v4.4s, v5.4s
    ld2             {v10.s, v11.s}[2], [x7], x1

    add             v4.4s, v8.4s, v1.4s
    ld2             {v12.s, v13.s}[2], [x7], x1

    sub             v5.4s, v8.4s, v1.4s
    ld2             {v14.s, v15.s}[3], [x11], x1

    add             v8.4s, v0.4s, v2.4s
    ld2             {v10.s, v11.s}[3], [x11], x1

    sub             v0.4s, v0.4s, v2.4s
    ld2             {v12.s, v13.s}[3], [x11], x1

    // v1/v2 are free from here on (last read just above), so they receive
    // the eighth input point lane by lane.
    ld2             {v1.s, v2.s}[0], [x5], x1

    add             v17.4s, v14.4s, v12.4s

    ld2             {v1.s, v2.s}[1], [x6], x1

    sub             v16.4s, v14.4s, v12.4s

    ld2             {v1.s, v2.s}[2], [x7], x1

    add             v14.4s, v15.4s, v13.4s

    ld2             {v1.s, v2.s}[3], [x11], x1

    sub             v12.4s, v15.4s, v13.4s

    add             v15.4s, v10.4s, v1.4s
    sub             v13.4s, v10.4s, v1.4s
    add             v10.4s, v11.4s, v2.4s
    sub             v1.4s, v11.4s, v2.4s

    add             v11.4s, v17.4s, v15.4s
    sub             v2.4s, v17.4s, v15.4s
    add             v17.4s, v14.4s, v10.4s
    sub             v15.4s, v14.4s, v10.4s

    add             v14.4s, v16.4s, v12.4s
    sub             v10.4s, v16.4s, v12.4s
    add             v16.4s, v13.4s, v1.4s
    sub             v12.4s, v13.4s, v1.4s

    add             v1.4s, v14.4s, v12.4s
    sub             v13.4s, v14.4s, v12.4s
    sub             v12.4s, v16.4s, v10.4s

    // Split the four terms that need scaling into 16-bit halves: UZP1 keeps
    // the low halfword of every 32-bit lane, UZP2 the high halfword.
    uzp1            v22.8h, v1.8h, v1.8h
    uzp2            v23.8h, v1.8h, v1.8h
    add             v14.4s, v16.4s, v10.4s

    uzp1            v26.8h, v13.8h, v13.8h
    uzp2            v27.8h, v13.8h, v13.8h
    add             v16.4s, v4.4s, v11.4s

    uzp1            v24.8h, v12.8h, v12.8h
    uzp2            v25.8h, v12.8h, v12.8h
    sub             v10.4s, v4.4s, v11.4s

    uzp1            v28.8h, v14.8h, v14.8h
    uzp2            v29.8h, v14.8h, v14.8h
    add             v4.4s, v8.4s, v17.4s

    mov             w14, #0x5a82            // 23170, about 2^15 / sqrt(2)

    sub             v11.4s, v8.4s, v17.4s

    add             v8.4s, v5.4s, v15.4s
    sub             v17.4s, v5.4s, v15.4s
    sub             v5.4s, v0.4s, v2.4s
    add             v15.4s, v0.4s, v2.4s

    // 32 x 16 multiply by the constant, done in two pieces: the unsigned low
    // half is multiplied and shifted right by 15, then the signed high half
    // is accumulated with a doubling multiply.
    dup             v31.4h, w14

    umull           v19.4s, v26.4h, v31.4h
    umull           v18.4s, v28.4h, v31.4h
    sshr            v19.4s, v19.4s, #15
    sshr            v18.4s, v18.4s, #15

    sqdmlal         v19.4s, v27.4h, v31.4h
    sqdmlal         v18.4s, v29.4h, v31.4h

    umull           v13.4s, v24.4h, v31.4h
    umull           v14.4s, v22.4h, v31.4h

    add             v20.4s, v3.4s, v19.4s
    sub             v21.4s, v3.4s, v19.4s
    add             v30.4s, v6.4s, v18.4s
    sub             v6.4s, v6.4s, v18.4s

    sshr            v13.4s, v13.4s, #15
    sshr            v14.4s, v14.4s, #15

    sqdmlal         v13.4s, v25.4h, v31.4h
    sqdmlal         v14.4s, v23.4h, v31.4h

    add             v3.4s, v7.4s, v13.4s
    sub             v19.4s, v7.4s, v13.4s
    add             v1.4s, v9.4s, v14.4s
    sub             v18.4s, v9.4s, v14.4s

    // Exchange v17 <-> v8 and v4 <-> v16 (both 64-bit halves each).
    swp             v17.d[0], v8.d[0]
    swp             v17.d[1], v8.d[1]
    swp             v4.d[0], v16.d[0]
    swp             v4.d[1], v16.d[1]

    // Transpose pairs of result vectors and double every value (shl #1),
    // then swap 64-bit halves so each ST2 below stores one 32-byte row.
    trn1            v12.4s, v4.4s, v20.4s
    trn2            v22.4s, v4.4s, v20.4s

    shl             v12.4s, v12.4s, #1
    trn1            v9.4s, v17.4s, v3.4s
    trn2            v2.4s, v17.4s, v3.4s
    shl             v22.4s, v22.4s, #1

    shl             v9.4s, v9.4s, #1
    trn1            v24.4s, v10.4s, v21.4s
    trn2            v7.4s, v10.4s, v21.4s
    shl             v2.4s, v2.4s, #1

    shl             v24.4s, v24.4s, #1
    trn1            v13.4s, v16.4s, v6.4s
    trn2            v23.4s, v16.4s, v6.4s
    shl             v7.4s, v7.4s, #1

    shl             v13.4s, v13.4s, #1
    trn1            v10.4s, v5.4s, v18.4s
    trn2            v3.4s, v5.4s, v18.4s
    shl             v23.4s, v23.4s, #1

    shl             v10.4s, v10.4s, #1
    trn1            v26.4s, v8.4s, v19.4s
    trn2            v4.4s, v8.4s, v19.4s
    shl             v3.4s, v3.4s, #1

    shl             v26.4s, v26.4s, #1
    trn1            v25.4s, v11.4s, v30.4s
    trn2            v8.4s, v11.4s, v30.4s
    shl             v4.4s, v4.4s, #1

    shl             v25.4s, v25.4s, #1
    trn1            v27.4s, v15.4s, v1.4s
    trn2            v5.4s, v15.4s, v1.4s
    shl             v8.4s, v8.4s, #1

    shl             v27.4s, v27.4s, #1
    swp             v9.d[0], v12.d[1]
    shl             v5.4s, v5.4s, #1
    swp             v2.d[0], v22.d[1]

    swp             v24.d[1], v26.d[0]
    swp             v7.d[1], v4.d[0]
    swp             v10.d[0], v13.d[1]
    swp             v3.d[0], v23.d[1]
    swp             v27.d[0], v25.d[1]
    swp             v5.d[0], v8.d[1]

    // Eight interleaved 32-byte stores = 256 output bytes per iteration.
    mov             x15, #32
    st2             {v12.4s, v13.4s}, [x3], x15
    st2             {v24.4s, v25.4s}, [x3], x15
    st2             {v22.4s, v23.4s}, [x3], x15
    st2             {v7.4s, v8.4s}, [x3], x15
    st2             {v9.4s, v10.4s}, [x3], x15
    st2             {v26.4s, v27.4s}, [x3], x15
    st2             {v2.4s, v3.4s}, [x3], x15
    st2             {v4.4s, v5.4s}, [x3], x15

    subs            x9, x9, #1
    bne             RADIX_8_FIRST_LOOP

    // Undo the stride doubling and rewind the output pointer by x1 * 8 bytes.
    lsr             x1, x1, #1
    lsl             x15, x1, #3
    sub             x3, x3, x15

    // Parameters for the following radix-4 passes.
    mov             x5, #8
    mov             x4, #32
    lsr             x15, x1, #5
    mov             x6, x15
    b               RADIX_4_FIRST_ENDS

RADIX_8_FIRST_ENDS:

// -----------------------------------------------------------------------------
// Radix-4 first stage.
// In:   x1 = size, x2 = input base, x3 = output base, x4 = byte index table.
// Each loop iteration handles four butterflies at once (one per 32-bit vector
// lane), reads four two-word input points per butterfly and writes 128 bytes
// to [x3].
// Out:  x1 halved back after the temporary doubling, x3 rewound by x1 * 8
//       bytes, x5 = 4, x4 = 64, x6 = x1 >> 4; execution then falls through
//       to RADIX_4_FIRST_ENDS.
// -----------------------------------------------------------------------------
RADIX_4_FIRST_START:

    lsr             w9, w1, #4              // loop count = size / 16
    lsl             w1, w1, #1              // x1 = 2 * size, used as byte stride

RADIX_4_LOOP:

    // One input cursor per lane, all starting at the input base.
    mov             x5, x2
    mov             x6, x2
    mov             x7, x2
    mov             x11, x2

    // Lane 0: points at byte offsets 0, x1, 2*x1 and 3*x1 from the indexed
    // start land in v0/v1, v4/v5, v8/v9 and v12/v13. Each LD2 fills one lane
    // of a register pair with two consecutive 32-bit words (presumably re/im).
    ldrb            w12, [x4, #0]
    add             x5, x5, x12, lsl #3

    ld2             {v0.s, v1.s}[0], [x5], x1
    add             x5, x5, x1
    ld2             {v8.s, v9.s}[0], [x5], x1
    sub             x5, x5, x1, lsl #1
    ld2             {v4.s, v5.s}[0], [x5], x1
    add             x5, x5, x1
    ld2             {v12.s, v13.s}[0], [x5], x1

    // Lane 1: same pattern through x6.
    ldrb            w12, [x4, #1]
    add             x6, x6, x12, lsl #3
    ld2             {v0.s, v1.s}[1], [x6], x1
    add             x6, x6, x1
    ld2             {v8.s, v9.s}[1], [x6], x1
    sub             x6, x6, x1, lsl #1
    ld2             {v4.s, v5.s}[1], [x6], x1
    add             x6, x6, x1
    ld2             {v12.s, v13.s}[1], [x6], x1

    // Lanes 2 and 3: v0/v1 and v8/v9 first; the v4/v5 and v12/v13 loads are
    // interleaved with the first additions below.
    ldrb            w12, [x4, #2]
    add             x7, x7, x12, lsl #3

    ld2             {v0.s, v1.s}[2], [x7], x1
    add             x7, x7, x1
    ld2             {v8.s, v9.s}[2], [x7], x1

    ldrb            w12, [x4, #3]
    add             x11, x11, x12, lsl #3

    ld2             {v0.s, v1.s}[3], [x11], x1
    add             x11, x11, x1
    ld2             {v8.s, v9.s}[3], [x11], x1

    sub             x7, x7, x1, lsl #1
    add             v16.4s, v0.4s, v8.4s
    ld2             {v4.s, v5.s}[2], [x7], x1
    add             x7, x7, x1
    add             v18.4s, v1.4s, v9.4s
    ld2             {v12.s, v13.s}[2], [x7], x1

    sub             x11, x11, x1, lsl #1
    sub             v20.4s, v0.4s, v8.4s
    ld2             {v4.s, v5.s}[3], [x11], x1
    add             x11, x11, x1
    sub             v22.4s, v1.4s, v9.4s
    ld2             {v12.s, v13.s}[3], [x11], x1

    add             x4, x4, #4              // four index bytes consumed

    // Second half of the butterfly sums and differences.
    add             v24.4s, v4.4s, v12.4s
    add             v26.4s, v5.4s, v13.4s
    sub             v28.4s, v4.4s, v12.4s
    sub             v30.4s, v5.4s, v13.4s

    add             v17.4s, v16.4s, v24.4s
    add             v11.4s, v18.4s, v26.4s
    sub             v19.4s, v16.4s, v24.4s
    sub             v15.4s, v18.4s, v26.4s

    add             v8.4s, v20.4s, v30.4s
    sub             v9.4s, v22.4s, v28.4s
    add             v13.4s, v22.4s, v28.4s
    sub             v12.4s, v20.4s, v30.4s

    // Transpose pairs of result vectors and double every value (shl #1),
    // then swap 64-bit halves so each ST2 below stores one 32-byte row.
    trn1            v0.4s, v17.4s, v8.4s
    trn2            v8.4s, v17.4s, v8.4s

    shl             v0.4s, v0.4s, #1
    trn1            v4.4s, v19.4s, v12.4s
    trn2            v12.4s, v19.4s, v12.4s
    shl             v8.4s, v8.4s, #1

    shl             v4.4s, v4.4s, #1
    trn1            v1.4s, v11.4s, v9.4s
    trn2            v9.4s, v11.4s, v9.4s
    shl             v12.4s, v12.4s, #1

    shl             v1.4s, v1.4s, #1
    trn1            v5.4s, v15.4s, v13.4s
    trn2            v13.4s, v15.4s, v13.4s
    shl             v9.4s, v9.4s, #1

    shl             v5.4s, v5.4s, #1
    swp             v4.d[0], v0.d[1]
    shl             v13.4s, v13.4s, #1

    swp             v12.d[0], v8.d[1]

    swp             v5.d[0], v1.d[1]
    swp             v13.d[0], v9.d[1]

    // Four interleaved 32-byte stores = 128 output bytes per iteration.
    mov             x15, #32
    st2             {v0.4s, v1.4s}, [x3], x15
    st2             {v8.4s, v9.4s}, [x3], x15
    st2             {v4.4s, v5.4s}, [x3], x15
    st2             {v12.4s, v13.4s}, [x3], x15

    subs            w9, w9, #1
    bne             RADIX_4_LOOP

    // Undo the stride doubling, rewind the output pointer by x1 * 8 bytes and
    // set up the parameters for the following radix-4 passes.
    lsr             x1, x1, #1
    sub             x3, x3, x1, lsl #3
    mov             x5, #4
    mov             x4, #64
    lsr             x6, x1, #4

// -----------------------------------------------------------------------------
// Remaining radix-4 passes, performed in place on the first-stage output.
// In:   x0 = table of 16-bit pairs (presumably twiddle factors - confirm),
//       x1 = size, x3 = start of the data, x4 = table step,
//       x5 = 4 * number of middle-loop iterations, x6 = inner-loop count,
//       x8 = number of passes.
// Scratch: x2, x3, x7, x9-x12, x14, x17, x29, x30, v0-v31. x29/x30 and the
//       low halves of v8-v15 are restored by pop_v_regs before returning.
// -----------------------------------------------------------------------------
RADIX_4_FIRST_ENDS:

    mov             x30, x3                 // keep the data base; x3 becomes scratch
    lsr             x5, x5, #2

OUTER_LOOP_R4:

    mov             x14, x30                // data cursor restarts at the base

    mov             x7, x5                  // middle-loop counter
    mov             x2, #0                  // running byte offset into the x0 table
    mov             x9, x0
    lsl             x12, x5, #5             // row stride in bytes = x5 * 32
MIDDLE_LOOP_R4:

    // Gather three 16-bit pairs per lane: lane k reads at byte offsets
    // off, 2*off and 3*off from x0, and off grows by x4 * 4 from one lane to
    // the next. Results: v20/v21, v22/v23, v24/v25.
    ld2             {v20.h, v21.h}[0], [x9], x2
    ld2             {v22.h, v23.h}[0], [x9], x2
    add             x11, x2, x4, lsl #2
    ld2             {v24.h, v25.h}[0], [x9]
    add             x10, x0, x11

    ld2             {v20.h, v21.h}[1], [x10], x11
    ld2             {v22.h, v23.h}[1], [x10], x11
    add             x2, x11, x4, lsl #2
    ld2             {v24.h, v25.h}[1], [x10]
    add             x9, x0, x2

    ld2             {v20.h, v21.h}[2], [x9], x2
    ld2             {v22.h, v23.h}[2], [x9], x2
    add             x11, x2, x4, lsl #2
    ld2             {v24.h, v25.h}[2], [x9]
    add             x10, x0, x11

    ld2             {v20.h, v21.h}[3], [x10], x11
    ld2             {v22.h, v23.h}[3], [x10], x11
    add             x2, x11, x4, lsl #2
    ld2             {v24.h, v25.h}[3], [x10]
    add             x9, x0, x2

    mov             x10, x6                 // inner-loop counter
INNER_LOOP_R4:

    // Row 0 is loaded as 32-bit words and halved. Rows 1-3 are loaded with
    // LD4 on halfwords, which splits every 32-bit word into its low and high
    // 16-bit halves (v16/v18 = low halves, v17/v19 = high halves, and
    // likewise for v26-v29 and v0-v3).
    ld2             {v30.4s, v31.4s}, [x14], x12
    sshr            v30.4s, v30.4s, #1
    ld4             {v16.4h, v17.4h, v18.4h, v19.4h}, [x14], x12
    sshr            v31.4s, v31.4s, #1

    ushr            v16.4h, v16.4h, #1
    ld4             {v26.4h, v27.4h, v28.4h, v29.4h}, [x14], x12
    ushr            v18.4h, v18.4h, #1

    // Complex multiply of rows 1-3 by the table values, in two pieces: the
    // low-half products are shifted right by 15, then the high-half products
    // are accumulated on top.
    smull           v11.4s, v16.4h, v20.4h
    smlsl           v11.4s, v18.4h, v21.4h
    ld4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x14], x12
    smull           v12.4s, v16.4h, v21.4h
    smlal           v12.4s, v18.4h, v20.4h

    ushr            v26.4h, v26.4h, #1
    ushr            v28.4h, v28.4h, #1

    lsl             x29, x12, #2            // x29 = four rows in bytes
    sub             x14, x14, x12, lsl #2   // back to row 0

    ushr            v0.4h, v0.4h, #1
    ushr            v2.4h, v2.4h, #1

    smull           v13.4s, v26.4h, v22.4h
    smlsl           v13.4s, v28.4h, v23.4h

    sshr            v11.4s, v11.4s, #15

    smull           v14.4s, v26.4h, v23.4h
    smlal           v14.4s, v28.4h, v22.4h

    smull           v15.4s, v0.4h, v24.4h
    smlsl           v15.4s, v2.4h, v25.4h

    smlal           v11.4s, v17.4h, v20.4h
    smlsl           v11.4s, v19.4h, v21.4h

    sshr            v12.4s, v12.4s, #15
    sshr            v13.4s, v13.4s, #15
    sshr            v14.4s, v14.4s, #15
    sshr            v15.4s, v15.4s, #15

    smlal           v12.4s, v17.4h, v21.4h
    smlal           v12.4s, v19.4h, v20.4h

    smull           v5.4s, v0.4h, v25.4h
    smlal           v5.4s, v2.4h, v24.4h

    smlal           v13.4s, v27.4h, v22.4h
    smlsl           v13.4s, v29.4h, v23.4h

    smlal           v14.4s, v27.4h, v23.4h
    smlal           v14.4s, v29.4h, v22.4h

    smlal           v15.4s, v1.4h, v24.4h
    smlsl           v15.4s, v3.4h, v25.4h

    sshr            v5.4s, v5.4s, #15

    smlal           v5.4s, v1.4h, v25.4h
    smlal           v5.4s, v3.4h, v24.4h

    // On the first middle-loop iteration (x7 still equals x5) lane 0 of the
    // six products is replaced by the unmultiplied input word shifted right
    // by one.
    subs            x17, x7, x5
    bne             BYPASS_IF

    add             x14, x14, x12           // row 1

    ldr             w3, [x14]
    add             x14, x14, x12
    asr             w3, w3, #1
    mov             v11.s[0], w3

    ldr             w3, [x14]
    add             x14, x14, x12
    asr             w3, w3, #1
    mov             v13.s[0], w3

    ldr             w3, [x14]
    asr             w3, w3, #1
    mov             v15.s[0], w3

    sub             x14, x14, x12, lsl #1   // back to row 1 ...
    add             x14, x14, #4            // ... second word

    ldr             w3, [x14]
    add             x14, x14, x12
    asr             w3, w3, #1
    mov             v12.s[0], w3

    ldr             w3, [x14]
    add             x14, x14, x12
    asr             w3, w3, #1
    mov             v14.s[0], w3

    ldr             w3, [x14]
    add             x14, x14, x12
    asr             w3, w3, #1
    mov             v5.s[0], w3

    sub             x14, x14, #4

    sub             x14, x14, x29           // back to row 0

BYPASS_IF:

    // Radix-4 butterfly on the four rows.
    add             v6.4s, v30.4s, v13.4s
    add             v7.4s, v31.4s, v14.4s
    sub             v30.4s, v30.4s, v13.4s
    sub             v31.4s, v31.4s, v14.4s
    add             v8.4s, v11.4s, v15.4s
    add             v9.4s, v12.4s, v5.4s

    sub             v15.4s, v11.4s, v15.4s
    sub             v14.4s, v12.4s, v5.4s

    add             v10.4s, v6.4s, v8.4s
    add             v11.4s, v7.4s, v9.4s
    add             v12.4s, v30.4s, v14.4s
    sub             v13.4s, v31.4s, v15.4s

    // Results overwrite the four rows just read.
    sub             v6.4s, v6.4s, v8.4s
    st2             {v10.4s, v11.4s}, [x14], x12
    sub             v7.4s, v7.4s, v9.4s

    sub             v8.4s, v30.4s, v14.4s
    st2             {v12.4s, v13.4s}, [x14], x12
    add             v9.4s, v31.4s, v15.4s

    st2             {v6.4s, v7.4s}, [x14], x12
    st2             {v8.4s, v9.4s}, [x14], x12
    subs            x10, x10, #1
    bne             INNER_LOOP_R4

    // Rewind by x1 * 8 bytes, then step 32 bytes to the next group of four
    // columns.
    sub             x14, x14, x1, lsl #3
    add             x14, x14, #32

    subs            x7, x7, #1
    bne             MIDDLE_LOOP_R4

    // Next pass: table step / 4, middle count * 4, inner count / 4.
    lsr             x4, x4, #2
    lsl             x5, x5, #2
    lsr             x6, x6, #2
    subs            x8, x8, #1
    bne             OUTER_LOOP_R4
END_LOOPS:
    pop_v_regs
    ret
