///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/


// push_v_regs: spill q8-q15 and x21-x24 to the stack.
//
// Lowers sp by 160 bytes in one step and fills the frame with plain offset
// stores. Resulting layout (offsets from the new sp):
//   [sp, #0]   x23, x24
//   [sp, #16]  x21, x22
//   [sp, #32]  q14, q15
//   [sp, #64]  q12, q13
//   [sp, #96]  q10, q11
//   [sp, #128] q8,  q9
// sp is lowered before any store, so nothing is written below sp.
// Does not modify the condition flags.
.macro push_v_regs
    sub             sp, sp, #160
    stp             q8, q9, [sp, #128]
    stp             q10, q11, [sp, #96]
    stp             q12, q13, [sp, #64]
    stp             q14, q15, [sp, #32]
    stp             x21, x22, [sp, #16]
    stp             x23, x24, [sp]
.endm
// pop_v_regs: reload x21-x24 and q8-q15 from a 160-byte frame at sp, then
// release the frame.
//
// Expected layout (offsets from sp on entry to the macro):
//   [sp, #0]   x23, x24
//   [sp, #16]  x21, x22
//   [sp, #32]  q14, q15
//   [sp, #64]  q12, q13
//   [sp, #96]  q10, q11
//   [sp, #128] q8,  q9
// All loads complete before sp is raised, so nothing is read from below sp.
// Does not modify the condition flags.
.macro pop_v_regs
    ldp             x23, x24, [sp]
    ldp             x21, x22, [sp, #16]
    ldp             q14, q15, [sp, #32]
    ldp             q12, q13, [sp, #64]
    ldp             q10, q11, [sp, #96]
    ldp             q8, q9, [sp, #128]
    add             sp, sp, #160
.endm
// swp reg1, reg2: exchange the contents of two operands.
//
// The value of \reg1 is parked in x16 while \reg2 is copied over it, then
// x16 is written to \reg2. Both operands must therefore be something that
// `mov` can transfer to and from a 64-bit general register (this file only
// passes 64-bit vector lanes such as v14.d[0]).
// Clobbers: x16.
// NOTE(review): `swp` is also the mnemonic of an Armv8.1 LSE atomic
// instruction; this macro shadows it for the rest of the file - confirm that
// is acceptable for every assembler this file is built with.
.macro swp reg1, reg2
    mov             x16, \reg1
    mov             \reg1, \reg2
    mov             \reg2, x16
.endm
.text
.global ixheaacd_post_twiddle_armv8

//------------------------------------------------------------------------------
// ixheaacd_post_twiddle_armv8
//
// Register roles, as read from the code below:
//   x0  in : output pointer. 32-bit results are stored forwards from x0 and
//            backwards from x0 + 4*count - 4.
//   x1  in : input pointer. 32-bit words are read forwards from x1 and
//            backwards from the last 32 bytes of a 4*count byte span.
//   x2  in : table base. Coefficients are read starting at byte offset 7500
//            from it, as pairs of 16-bit values, with a stride of 4 bytes
//            (count >= 0x400) or 32 bytes (count < 0x400).
//            NOTE(review): 7500 is presumably the offset of a twiddle table
//            inside a larger tables struct - confirm against the C side.
//   w3  in : element count, treated as a signed 32-bit value.
//            NOTE(review): the C prototype is not visible in this file.
//
// Preconditions visible in the code:
//   * CORE_LOOP is entered unconditionally and only exits when its counter
//     reaches zero, so ((count - 1) >> 4) - 2 must be >= 1 on entry.
//   * The halfword de-interleaving used to split 32-bit words into low/high
//     16-bit parts assumes a little-endian data layout.
//
// Building block used throughout (32-bit x by 16-bit c, per lane):
//     ((lo16(x) * c) >> 16)   unsigned multiply, logical shift   (uMULL/ushR)
//   +  (hi16(x) * c)          signed multiply-accumulate         (sMLAL)
//
// Stack   : 160 bytes (push_v_regs / pop_v_regs).
// Saved   : q8-q15, x21-x24.
// Clobbers: x0-x12, x16 (scratch inside swp), flags, v0-v7, v16-v20, v22,
//           v24-v28, v30, v31.
//------------------------------------------------------------------------------
ixheaacd_post_twiddle_armv8:

    push_v_regs

ARM_PROLOGUE:
    // Fix: the count is used below both as w3 and as the full x3 (pointer
    // arithmetic). AAPCS64 leaves the upper 32 bits of a 32-bit argument
    // unspecified, so clear them once here. MOV between W registers zeroes
    // bits 63:32 and leaves the flags untouched; callers that already pass a
    // zero-extended value see no change.
    MOV             w3, w3

    CMP             w3, #0x400
    MOV             x21, #7500
    ADD             x2, x2, x21             // x2 -> first coefficient pair
    BLT             NEXT                    // flags still from the CMP above

    // count >= 0x400: constant +/-50, coefficient stride 4 bytes.
    MOV             w4, #50
    MOV             w5, #-50
    MOV             x6, #4
    dup             v10.4h, w4              // v10 = constant in every 16-bit lane
    B               NEXT1

NEXT:
    // count < 0x400: constant 0x192, w5 = 0xfe6e (its 16-bit negation),
    // coefficient stride 32 bytes.
    MOV             w4, #0x192
    MOV             w5, #0xfe6e
    MOV             x6, #32
    dup             v10.4h, w4

NEXT1:
    // ---- Scalar handling of the first input pair ---------------------------
    // Split the first 32-bit table word into its two 16-bit halves, each
    // moved to the top of a 32-bit register.
    LDR             w9, [x2]
    LSL             W22, W9, #16            // w22 = low  half << 16
    AND             W21, W9, #0xFFFF0000    // w21 = high half << 16

    LDR             w7, [x1], #4            // w7 = in[0]
    LDR             w8, [x1], #4            // w8 = in[1]

    ADD             x2, x2, x6              // advance to the next coefficient

    // Four 32x32 signed products, keeping the upper 32 bits of each.
    SMULL           X11, w8, w21
    ASR             X11, x11, #32
    SMULL           X10, w8, w22
    ASR             X10, x10, #32
    SMULL           X12, w7, w21
    ASR             X12, x12, #32
    SMULL           X23, w7, w22
    ASR             X23, x23, #32
    ADD             w8, w11, w23

    SUB             w10, w10, w12

    MVN             w8, w8                  // w8 = -w8 (invert, then add 1)
    ADD             w8, w8, #1

    // Correction terms using the +/- constant selected above, shifted into
    // the top half so the same "upper 32 bits of the product" form applies.
    LSL             w21, w5, #16
    LSL             w22, w4, #16
    SMULL           X23, w10, w21
    ASR             X23, x23, #32
    ADD             w9, w8, w23
    SMULL           X23, w8, w22
    ASR             X23, x23, #32
    ADD             w11, w10, w23

    // x7 = address of the last output word; store there, then step back.
    LSL             x7, x3, #2
    ADD             x7, x0, x7
    SUB             x7, x7, #4

    STR             w11, [x7], #-4

    STR             w9, [x0], #4            // first output word, step forward

    // x5 = x1 + 4*count - 40. x1 has already advanced by 8 bytes, so this is
    // 32 bytes before the end of the 4*count byte input span.
    LSL             x5, x3, #2
    ADD             x5, x1, x5
    SUB             x5, x5, #40

    // Loop counter base: (count - 1) >> 4.
    SUB             w3, w3, #1
    ASR             w3, w3, #4

    // Move x7 down so that a 32-byte vector store ends just below the word
    // that the scalar store above left x7 pointing at.
    SUB             x7, x7, #28

    MOV             x8, #-32                // post-index step for backward walks

NEON_PROLOGUE:
    // ---- First vector batch: load and compute, nothing stored yet ----------
    // LD4 with .4h lanes de-interleaves 16 halfwords: v0..v3 (v4..v7) receive
    // halfwords 0..3 of each consecutive 8-byte group. On little-endian data
    // that makes v0/v1 (v4/v5) the low/high halves of the even-indexed words
    // and v2/v3 (v6/v7) the low/high halves of the odd-indexed words.
    LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8     // tail block, walk backwards

    LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32    // head block, walk forwards

    // Gather four coefficient pairs, one per lane: first halfword -> v8,
    // second halfword -> v9.
    LD2             {v8.h, v9.h}[0], [x2], x6
    LD2             {v8.h, v9.h}[1], [x2], x6
    LD2             {v8.h, v9.h}[2], [x2], x6
    LD2             {v8.h, v9.h}[3], [x2], x6

    // Lane-reversed copies of the coefficients, used with the tail block.
    rev64           v12.4h, v8.4h
    rev64           v13.4h, v9.4h

    // Tail block: four 32x16 products (see header), low parts first.
    uMULL           v30.4s, v2.4h, v13.4h
    uMULL           v28.4s, v0.4h, v13.4h
    uMULL           v26.4s, v2.4h, v12.4h
    uMULL           v24.4s, v0.4h, v12.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v13.4h
    sMLAL           v28.4s, v1.4h, v13.4h
    sMLAL           v26.4s, v3.4h, v12.4h
    sMLAL           v24.4s, v1.4h, v12.4h

    // Head block products start here, interleaved with the tail combine.
    uMULL           v22.4s, v6.4h, v9.4h
    uMULL           v20.4s, v4.4h, v9.4h

    ADD             v28.4s, v28.4s , v26.4s
    SUB             v30.4s, v30.4s , v24.4s
    NEG             v28.4s, v28.4s

    uMULL           v18.4s, v6.4h, v8.4h
    uMULL           v16.4s, v4.4h, v8.4h

    // Split v30 into 16-bit halves: v26 = low halves, v27 = high halves.
    mov             v31.8b, v30.8b
    mov             v27.D[0], v30.D[1]
    ushR            v22.4s, v22.4s, #16

    // Same split for v28: v24 = low halves, v25 = high halves.
    mov             v24.8b, v28.8b
    mov             v25.D[0], v28.D[1]
    ushR            v20.4s, v20.4s, #16

    UZP1            v26.4h, v31.4h, v27.4h
    UZP2            v27.4h, v31.4h, v27.4h
    ushR            v18.4s, v18.4s, #16

    mov             v31.8B , v24.8B
    UZP1            v24.4h, v31.4h, v25.4h
    UZP2            v25.4h, v31.4h, v25.4h
    ushR            v16.4s, v16.4s, #16

    sMLAL           v22.4s, v7.4h, v9.4h
    sMLAL           v20.4s, v5.4h, v9.4h
    sMLAL           v18.4s, v7.4h, v8.4h
    sMLAL           v16.4s, v5.4h, v8.4h

    // v8/v9 are free now: fetch the next four coefficient pairs while the
    // products with the constant in v10 are started.
    LD2             {v8.h, v9.h}[0], [x2], x6
    uMULL           v0.4s, v26.4h, v10.4h

    LD2             {v8.h, v9.h}[1], [x2], x6
    uMULL           v2.4s, v24.4h, v10.4h

    LD2             {v8.h, v9.h}[2], [x2], x6
    ADD             v22.4s, v22.4s , v16.4s

    LD2             {v8.h, v9.h}[3], [x2], x6
    SUB             v20.4s, v18.4s , v20.4s

    rev64           v12.4h, v8.4h
    rev64           v13.4h, v9.4h
    NEG             v22.4s, v22.4s

    // Split v22 -> v18 (low halves) / v19 (high halves).
    mov             v18.8b, v22.8b
    mov             v19.D[0], v22.D[1]
    ushR            v0.4s, v0.4s, #16

    // Split v20 -> v16 (low halves) / v17 (high halves).
    mov             v16.16b, v20.16b
    mov             v17.D[0], v20.D[1]
    ushR            v2.4s, v2.4s, #16

    MOV             v31.8b, v18.8b
    UZP1            v18.4h, v31.4h, v19.4h
    UZP2            v19.4h, v31.4h, v19.4h
    sMLAL           v0.4s, v27.4h, v10.4h

    MOV             v31.8b, v16.8b
    UZP1            v16.4h, v31.4h, v17.4h
    UZP2            v17.4h, v31.4h, v17.4h
    sMLAL           v2.4s, v25.4h, v10.4h

    uMULL           v4.4s, v18.4h, v10.4h
    uMULL           v6.4s, v16.4h, v10.4h

    // Tail results: v14 = v30 + v2, v26 = v28 - v0.
    NEG             v0.4s, v0.4s
    ADD             v14.4s, v30.4s , v2.4s
    ADD             v26.4s, v28.4s , v0.4s

    // rev64 + 64-bit half swap = full reversal of the four words in v14.
    rev64           v14.4s, v14.4s
    ushR            v4.4s, v4.4s, #16

    swp             v14.D[0], v14.D[1]
    ushR            v6.4s, v6.4s, #16

    sMLAL           v4.4s, v19.4h, v10.4h
    LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8     // next tail block
    sMLAL           v6.4s, v17.4h, v10.4h

    // CORE_LOOP iteration count = ((count - 1) >> 4) - 2.
    SUB             x3, x3, #2

    // Head results: v24 = reversed (v20 + v4), v16 = v22 - v6.
    ADD             v24.4s, v20.4s , v4.4s

    rev64           v24.4s, v24.4s
    NEG             v16.4s, v6.4s

    LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32    // next head block

    swp             v24.D[0], v24.D[1]
    ADD             v16.4s, v22.4s , v16.4s

CORE_LOOP:
    // Each iteration stores the batch finished by the previous stage and
    // computes the next one with the same instruction sequence as above.
    // ST2 needs two consecutive registers, hence the copies v24 -> v25 and
    // v14 -> v15 before each store.
    uMULL           v30.4s, v2.4h, v13.4h
    MOV             v25.16B, v24.16B
    ST2             { v25.4s, v26.4s}, [x7], x8                // 8 words, walking backwards
    uMULL           v28.4s, v0.4h, v13.4h

    uMULL           v26.4s, v2.4h, v12.4h
    MOV             v15.16B, v14.16B
    ST2             { v15.4s, v16.4s}, [x0], #32               // 8 words, walking forwards
    uMULL           v24.4s, v0.4h, v12.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v13.4h
    sMLAL           v28.4s, v1.4h, v13.4h
    sMLAL           v26.4s, v3.4h, v12.4h
    sMLAL           v24.4s, v1.4h, v12.4h

    uMULL           v22.4s, v6.4h, v9.4h
    uMULL           v20.4s, v4.4h, v9.4h

    ADD             v28.4s, v28.4s , v26.4s
    SUB             v30.4s, v30.4s , v24.4s
    NEG             v28.4s, v28.4s

    uMULL           v18.4s, v6.4h, v8.4h
    uMULL           v16.4s, v4.4h, v8.4h

    // Split v30 -> v26 (low halves) / v27 (high halves).
    mov             v26.8b, v30.8b
    mov             v27.D[0], v30.D[1]
    ushR            v22.4s, v22.4s, #16

    // Split v28 -> v24 (low halves) / v25 (high halves).
    mov             v24.8b, v28.8b
    mov             v25.D[0], v28.D[1]
    ushR            v20.4s, v20.4s, #16

    MOV             v31.8b, v26.8b
    UZP1            v26.4h, v31.4h, v27.4h
    UZP2            v27.4h, v31.4h, v27.4h
    ushR            v18.4s, v18.4s, #16

    MOV             v31.8b, v24.8b
    UZP1            v24.4h, v31.4h, v25.4h
    UZP2            v25.4h, v31.4h, v25.4h
    ushR            v16.4s, v16.4s, #16

    sMLAL           v22.4s, v7.4h, v9.4h
    sMLAL           v20.4s, v5.4h, v9.4h
    sMLAL           v18.4s, v7.4h, v8.4h
    sMLAL           v16.4s, v5.4h, v8.4h

    // Coefficients for the following batch, interleaved with arithmetic.
    LD2             {v8.h, v9.h}[0], [x2], x6
    uMULL           v0.4s, v26.4h, v10.4h

    LD2             {v8.h, v9.h}[1], [x2], x6
    uMULL           v2.4s, v24.4h, v10.4h

    LD2             {v8.h, v9.h}[2], [x2], x6
    ADD             v22.4s, v22.4s , v16.4s

    LD2             {v8.h, v9.h}[3], [x2], x6
    SUB             v20.4s, v18.4s , v20.4s

    rev64           v12.4h, v8.4h
    rev64           v13.4h, v9.4h
    NEG             v22.4s, v22.4s

    // Split v22 -> v18 / v19 and v20 -> v16 / v17 (low / high halves).
    mov             v18.8b, v22.8b
    mov             v19.D[0], v22.D[1]
    ushR            v0.4s, v0.4s, #16

    mov             v16.8b, v20.8b
    mov             v17.D[0], v20.D[1]
    ushR            v2.4s, v2.4s, #16

    MOV             v31.8b, v18.8b
    UZP1            v18.4h, v31.4h, v19.4h
    UZP2            v19.4h, v31.4h, v19.4h
    sMLAL           v0.4s, v27.4h, v10.4h

    MOV             v31.8b, v16.8b
    UZP1            v16.4h, v31.4h, v17.4h
    UZP2            v17.4h, v31.4h, v17.4h
    sMLAL           v2.4s, v25.4h, v10.4h

    uMULL           v4.4s, v18.4h, v10.4h
    uMULL           v6.4s, v16.4h, v10.4h

    NEG             v0.4s, v0.4s
    ADD             v14.4s, v30.4s , v2.4s
    ADD             v26.4s, v28.4s , v0.4s

    rev64           v14.4s, v14.4s
    ushR            v4.4s, v4.4s, #16

    swp             v14.D[0], v14.D[1]
    ushR            v6.4s, v6.4s, #16

    sMLAL           v4.4s, v19.4h, v10.4h
    LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8     // next tail block
    sMLAL           v6.4s, v17.4h, v10.4h

    ADD             v24.4s, v20.4s , v4.4s

    rev64           v24.4s, v24.4s
    NEG             v16.4s, v6.4s

    LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32    // next head block

    swp             v24.D[0], v24.D[1]
    ADD             v16.4s, v22.4s , v16.4s

    SUBS            x3, x3, #1

    BNE             CORE_LOOP

NEON_EPILOGUE:
    // ---- Store the batch left pending by the loop and compute one more -----
    // Inputs and coefficients for this batch were already loaded by the last
    // loop iteration, so no loads are issued in this stage.
    uMULL           v30.4s, v2.4h, v13.4h
    MOV             v25.16B, v24.16B
    ST2             { v25.4s, v26.4s}, [x7], x8
    uMULL           v28.4s, v0.4h, v13.4h

    uMULL           v26.4s, v2.4h, v12.4h
    MOV             v15.16B, v14.16B
    ST2             { v15.4s, v16.4s}, [x0], #32
    uMULL           v24.4s, v0.4h, v12.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v13.4h
    sMLAL           v28.4s, v1.4h, v13.4h
    sMLAL           v26.4s, v3.4h, v12.4h
    sMLAL           v24.4s, v1.4h, v12.4h

    uMULL           v22.4s, v6.4h, v9.4h
    uMULL           v20.4s, v4.4h, v9.4h

    ADD             v28.4s, v28.4s , v26.4s
    SUB             v30.4s, v30.4s , v24.4s
    NEG             v28.4s, v28.4s

    uMULL           v18.4s, v6.4h, v8.4h
    uMULL           v16.4s, v4.4h, v8.4h

    // Split v30 -> v26 / v27 and v28 -> v24 / v25 (low / high halves).
    mov             v26.8b, v30.8b
    mov             v27.D[0], v30.D[1]
    ushR            v22.4s, v22.4s, #16

    mov             v24.16b, v28.16b
    mov             v25.D[0], v28.D[1]
    ushR            v20.4s, v20.4s, #16

    mov             v31.8b, v26.8b
    UZP1            v26.4h, v31.4h, v27.4h
    UZP2            v27.4h, v31.4h, v27.4h
    ushR            v18.4s, v18.4s, #16

    mov             v31.8b, v24.8b
    UZP1            v24.4h, v31.4h, v25.4h
    UZP2            v25.4h, v31.4h, v25.4h
    ushR            v16.4s, v16.4s, #16

    sMLAL           v22.4s, v7.4h, v9.4h
    sMLAL           v20.4s, v5.4h, v9.4h
    sMLAL           v18.4s, v7.4h, v8.4h
    sMLAL           v16.4s, v5.4h, v8.4h

    uMULL           v0.4s, v26.4h, v10.4h

    uMULL           v2.4s, v24.4h, v10.4h

    ADD             v22.4s, v22.4s , v16.4s

    SUB             v20.4s, v18.4s , v20.4s

    NEG             v22.4s, v22.4s

    mov             v18.16b, v22.16b
    ushR            v0.4s, v0.4s, #16

    mov             v16.16b, v20.16b
    ushR            v2.4s, v2.4s, #16

    // Split v22 -> v18 / v19 (low / high halves), via v31.
    mov             v31.16b, v18.16b
    mov             v19.d[0], v31.d[1]
    UZP1            v18.4h, v31.4h, v19.4h
    UZP2            v19.4h, v31.4h, v19.4h
    sMLAL           v0.4s, v27.4h, v10.4h

    // Split v20 -> v16 / v17 (low / high halves), via v31.
    mov             v31.16b, v16.16b
    mov             v17.d[0], v31.d[1]
    UZP1            v16.4h, v31.4h, v17.4h
    UZP2            v17.4h, v31.4h, v17.4h
    sMLAL           v2.4s, v25.4h, v10.4h

    uMULL           v4.4s, v18.4h, v10.4h
    uMULL           v6.4s, v16.4h, v10.4h

    NEG             v0.4s, v0.4s
    ADD             v14.4s, v30.4s , v2.4s
    ADD             v26.4s, v28.4s , v0.4s

    rev64           v14.4s, v14.4s
    ushR            v4.4s, v4.4s, #16

    swp             v14.D[0], v14.D[1]
    ushR            v6.4s, v6.4s, #16

    sMLAL           v4.4s, v19.4h, v10.4h

    sMLAL           v6.4s, v17.4h, v10.4h

    ADD             v24.4s, v20.4s , v4.4s

    rev64           v24.4s, v24.4s
    NEG             v16.4s, v6.4s

    swp             v24.D[0], v24.D[1]
    ADD             v16.4s, v22.4s , v16.4s

    MOV             v25.16B, v24.16B
    MOV             v15.16B, v14.16B
    ST2             { v15.4s, v16.4s}, [x0], #32
    ST2             { v25.4s, v26.4s}, [x7], x8

    // ---- Final, partial batch ----------------------------------------------
    // Tail side: a full 32-byte block is still read.
    LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8

    // Head side: only six 32-bit words are read; v6/v7 are cleared first so
    // the lanes that receive no data are zero.
    movi            v6.2s, #0x00000000
    movi            v7.2s, #0x00000000

    LD2             {v4.2s, v5.2s}, [x1], #16       // v4 = words 0,2   v5 = words 1,3
    LD2             {v6.s, v7.s}[0], [x1]           // v6[0] = word 4   v7[0] = word 5

    LD2             {v8.h, v9.h}[0], [x2], x6
    LD2             {v8.h, v9.h}[1], [x2], x6
    LD2             {v8.h, v9.h}[2], [x2], x6
    LD2             {v8.h, v9.h}[3], [x2], x6

    rev64           v12.8h, v8.8h
    rev64           v13.8h, v9.8h

    // After the swap: v5 = (word 4, 0), v6 = (word 1, word 3). The UZP pairs
    // below then rebuild the LD4-style low/high halfword layout in v4..v7
    // for words 0,2,4 and 1,3,5, with a zero fourth lane.
    swp             v5.D[0], v6.D[0]

    MOV             v30.8B, V4.8B
    UZP1            v4.4h, v30.4h, v5.4h
    UZP2            v5.4h, v30.4h, v5.4h
    MOV             v30.8B, V6.8B
    UZP1            v6.4h, v30.4h, v7.4h
    UZP2            v7.4h, v30.4h, v7.4h
    uMULL           v30.4s, v2.4h, v13.4h
    uMULL           v28.4s, v0.4h, v13.4h

    uMULL           v26.4s, v2.4h, v12.4h
    uMULL           v24.4s, v0.4h, v12.4h

    ushR            v30.4s, v30.4s, #16
    ushR            v28.4s, v28.4s, #16
    ushR            v26.4s, v26.4s, #16
    ushR            v24.4s, v24.4s, #16

    sMLAL           v30.4s, v3.4h, v13.4h
    sMLAL           v28.4s, v1.4h, v13.4h
    sMLAL           v26.4s, v3.4h, v12.4h
    sMLAL           v24.4s, v1.4h, v12.4h

    uMULL           v22.4s, v6.4h, v9.4h
    uMULL           v20.4s, v4.4h, v9.4h

    ADD             v28.4s, v28.4s , v26.4s
    SUB             v30.4s, v30.4s , v24.4s
    NEG             v28.4s, v28.4s

    uMULL           v18.4s, v6.4h, v8.4h
    uMULL           v16.4s, v4.4h, v8.4h

    // Split v30 -> v26 / v27 and v28 -> v24 / v25 (low / high halves).
    mov             v26.8b, v30.8b
    mov             v27.D[0], v30.D[1]
    ushR            v22.4s, v22.4s, #16

    mov             v24.16b, v28.16b
    mov             v25.D[0], v28.D[1]
    ushR            v20.4s, v20.4s, #16

    MOV             v31.8B, V26.8B
    UZP1            v26.4h, v31.4h, v27.4h
    UZP2            v27.4h, v31.4h, v27.4h
    ushr            v18.4s, v18.4s, #16

    MOV             v31.8B, V24.8B
    UZP1            v24.4h, v31.4h, v25.4h
    UZP2            v25.4h, v31.4h, v25.4h
    ushR            v16.4s, v16.4s, #16

    sMLAL           v22.4s, v7.4h, v9.4h
    sMLAL           v20.4s, v5.4h, v9.4h
    sMLAL           v18.4s, v7.4h, v8.4h
    sMLAL           v16.4s, v5.4h, v8.4h

    uMULL           v0.4s, v26.4h, v10.4h

    uMULL           v2.4s, v24.4h, v10.4h

    ADD             v22.4s, v22.4s , v16.4s

    SUB             v20.4s, v18.4s , v20.4s

    NEG             v22.4s, v22.4s

    // Split v22 -> v18 / v19 and v20 -> v16 / v17 (low / high halves).
    mov             v18.8B, v22.8B
    mov             v19.D[0], v22.D[1]
    ushR            v0.4s, v0.4s, #16

    mov             v16.16b, v20.16b
    mov             v17.D[0], v20.D[1]
    ushR            v2.4s, v2.4s, #16

    MOV             v31.8B, V18.8B
    UZP1            v18.4h, v31.4h, v19.4h
    UZP2            v19.4h, v31.4h, v19.4h
    sMLAL           v0.4s, v27.4h, v10.4h

    MOV             v31.8B, V16.8B
    UZP1            v16.4h, v31.4h, v17.4h
    UZP2            v17.4h, v31.4h, v17.4h
    sMLAL           v2.4s, v25.4h, v10.4h

    uMULL           v4.4s, v18.4h, v10.4h
    uMULL           v6.4s, v16.4h, v10.4h

    NEG             v0.4s, v0.4s
    ADD             v14.4s, v30.4s , v2.4s
    ADD             v26.4s, v28.4s , v0.4s

    rev64           v14.4s, v14.4s
    ushR            v4.4s, v4.4s, #16

    swp             v14.D[0], v14.D[1]
    ushR            v6.4s, v6.4s, #16

    sMLAL           v4.4s, v19.4h, v10.4h

    sMLAL           v6.4s, v17.4h, v10.4h

    ADD             v24.4s, v20.4s , v4.4s

    rev64           v24.4s, v24.4s
    NEG             v16.4s, v6.4s

    swp             v24.D[0], v24.D[1]
    ADD             v16.4s, v22.4s , v16.4s

    // Forward side: 7 words in total (4 + 2 + 1). Lane 3 of v16, which comes
    // from the zero-padded head lane, is not stored.
    MOV             v15.16B, v14.16B
    ST2             {v15.2s, v16.2s}, [x0], #16

    ST2             {v15.s, v16.s}[2], [x0], #8

    ST1             {v15.s}[3], [x0]

    // Backward side: 7 words in total (1 + 2 + 4), written in ascending
    // address order starting one word above x7. Lane 0 of v25 is not stored.
    ADD             x7, x7, #4

    ST1             {v26.s}[0], [x7], #4
    MOV             v25.16B, v24.16B
    ST2             {v25.s, v26.s}[1], [x7], #8
    // Move the upper halves of v25/v26 into the consecutive pair v26/v27 so
    // that a single two-lane ST2 can write lanes 2 and 3 interleaved.
    MOV             v27.D[0], V26.d[1]
    mov             v26.d[0], v25.d[1]
    ST2             {v26.2s, v27.2s}, [x7]

    pop_v_regs
    ret