//-----------------------------------------------------------------------------
// VOID ixheaacd_inv_dit_fft_8pt(WORD32 *y,
//                               WORD32 *real,
//                               WORD32 *imag)
//
// Exported as: ixheaacd_inv_dit_fft_8pt_armv8
// Syntax/ABI : GNU as, AArch64, AAPCS64
//
// In    : x0 = y     (16 consecutive 32-bit words are read: y[0]..y[15])
//         x1 = real  (6 consecutive 32-bit words are written)
//         x2 = imag  (6 consecutive 32-bit words are written)
// Out   : none (results are stored through x1 / x2)
// Clobb : w3, w4, x5, x6, x0, x1, x2 (pointers are post-incremented),
//         v0-v7 (v8-v14 are used too, but d8-d15 are saved and restored)
// Stack : 64 bytes, released before return; sp stays 16-byte aligned
//
// All additions/subtractions use the saturating forms (SQADD / SQSUB).
// The xN names in the comments are the intermediate values of the scalar
// reference this routine was written from; "xA_B" means lane0 = xA, lane1 = xB.
//-----------------------------------------------------------------------------

// Save / restore the callee-saved SIMD registers.
// AAPCS64 only requires the low 64 bits of v8-v15 to be preserved, and this
// routine only writes them through 64-bit (.2s) forms, so d8-d15 suffice.
.macro push_v_regs
    stp             d8, d9, [sp, #-16]!
    stp             d10, d11, [sp, #-16]!
    stp             d12, d13, [sp, #-16]!
    stp             d14, d15, [sp, #-16]!
.endm
.macro pop_v_regs
    ldp             d14, d15, [sp], #16
    ldp             d12, d13, [sp], #16
    ldp             d10, d11, [sp], #16
    ldp             d8, d9, [sp], #16
.endm


.text
.global ixheaacd_inv_dit_fft_8pt_armv8
ixheaacd_inv_dit_fft_8pt_armv8:
    push_v_regs
    // 0x5A82 placed in the upper half-word (0x5A82 / 32768 ~= 0.7071;
    // presumably 1/sqrt(2) in Q15 -- confirm against the scalar reference).
    // With the ">> 32" below this yields (a * 0x5A82) >> 16.
    MOV             w3, #0x5A820000
    DUP             v0.2s, w3
    MOV             x5, #8                  // stride: two words per step
    ADD             x6, x0, #4              // x6 walks the odd-indexed words

    // Disabled alternative kept from the original source:
    //LD2 {v1.2s,v2.2s},[x0],x5
    //LD2 {v3.2s,v4.2s},[x0],x5
    //LD2 {v5.2s,v6.2s},[x0],x5
    //LD2 {v7.2s,v8.2s},[x0],x5

    // x0 reads y[0], y[2], ... y[14]; x6 reads y[1], y[3], ... y[15].
    LD1             {v1.s}[0], [x0], x5
    LD1             {v2.s}[0], [x6], x5
    LD1             {v1.s}[1], [x0], x5
    LD1             {v2.s}[1], [x6], x5
    LD1             {v3.s}[0], [x0], x5
    LD1             {v4.s}[0], [x6], x5
    LD1             {v3.s}[1], [x0], x5
    LD1             {v4.s}[1], [x6], x5
    LD1             {v5.s}[0], [x0], x5
    LD1             {v6.s}[0], [x6], x5
    LD1             {v5.s}[1], [x0], x5
    LD1             {v6.s}[1], [x6], x5
    LD1             {v7.s}[0], [x0], x5
    LD1             {v8.s}[0], [x6], x5
    LD1             {v7.s}[1], [x0], x5
    LD1             {v8.s}[1], [x6], x5

    //v1 - y0_2
    //v2 - y1_3
    //v3 - y4_6
    //v4 - y5_7
    //v5 - y8_10
    //v6 - y9_11
    //v7 - y12_14
    //v8 - y13_15

    // ---- First butterfly stage: sums ----
    SQADD           v9.2s, v1.2s, v5.2s     //a00_v = vqadd_s32(y0_2,y8_10);
    SQADD           v10.2s, v2.2s, v6.2s    //a20_v = vqadd_s32(y1_3,y9_11);
    SQADD           v11.2s, v3.2s, v7.2s    //a10_v = vqadd_s32(y4_6,y12_14);
    SQADD           v12.2s, v4.2s, v8.2s    //a30_v = vqadd_s32(y5_7,y13_15);

    // ---- First butterfly stage: differences (input registers are recycled) ----
    SQSUB           v1.2s, v1.2s, v5.2s     //a0_v = vqsub_s32(y0_2,y8_10);
    SQSUB           v5.2s, v2.2s, v6.2s     //a3_v = vqsub_s32(y1_3,y9_11);
    SQSUB           v2.2s, v3.2s, v7.2s     //a2_v = vqsub_s32(y4_6,y12_14);
    SQSUB           v6.2s, v4.2s, v8.2s     //a1_v = vqsub_s32(y5_7,y13_15);

    // ---- Second butterfly stage ----
    SQADD           v3.2s, v9.2s, v11.2s    //x0_8 = vqadd_s32(a00_v,a10_v);
    SQADD           v7.2s, v10.2s, v12.2s   //x1_9 = vqadd_s32(a20_v,a30_v);

    SQSUB           v4.2s, v9.2s, v11.2s    //x4_12 = vqsub_s32(a00_v,a10_v);
    SQSUB           v8.2s, v10.2s, v12.2s   //x5_13 = vqsub_s32(a20_v,a30_v);

    SQADD           v9.2s, v1.2s, v6.2s     //x6_14 = vqadd_s32(a0_v,a1_v);
    SQADD           v11.2s, v5.2s, v2.2s    //x3_11 = vqadd_s32(a3_v,a2_v);
    SQSUB           v10.2s, v1.2s, v6.2s    //x2_10 = vqsub_s32(a0_v,a1_v);
    SQSUB           v13.2s, v5.2s, v2.2s    //x7_15 = vqsub_s32(a3_v,a2_v);

    // ---- Regroup lanes so each register holds a (real, imag) pair ----
    UZP1            v1.2s, v3.2s, v7.2s     //x0_1 = vuzp1_s32(x0_8,x1_9);
    UZP2            v5.2s, v3.2s, v7.2s     //x8_9 = vuzp2_s32(x0_8,x1_9);

    UZP1            v6.2s, v4.2s, v8.2s     //x4_5 = vuzp1_s32(x4_12,x5_13);
    UZP2            v7.2s, v4.2s, v8.2s     //x12_13 = vuzp2_s32(x4_12,x5_13);
    REV64           v7.2s, v7.2s            //x13_12 = vrev64_s32(x12_13);

    SQADD           v3.2s, v1.2s, v5.2s     //real_imag0 = vqadd_s32(x0_1,x8_9);
    SQSUB           v8.2s, v1.2s, v5.2s     //a00_10_v = vqsub_s32(x0_1,x8_9);

    SQADD           v12.2s, v6.2s, v7.2s    //real_imag4 = vqadd_s32(x4_5,x13_12);
    SQSUB           v14.2s, v6.2s, v7.2s    //a0_1_v = vqsub_s32(x4_5,x13_12);

    // Swap lane 1 between v12 and v14 (via w4) so that
    //   v12 = (x4 + x13, x5 - x12)   and   v14 = (x4 - x13, x5 + x12).
    MOV             w4, v12.s[1]
    MOV             v12.s[1], v14.s[1]
    MOV             v14.s[1], w4

    UZP1            v6.2s, v10.2s, v11.2s   //x2_3

    SQSUB           v1.2s, v10.2s, v11.2s   //tempr = vqsub_s32(x2_10,x3_11)
    SQADD           v5.2s, v10.2s, v11.2s   //tempi = vqadd_s32(x2_10,x3_11)

    // Scale by the constant: 32x32 -> 64-bit product, keep the high word,
    // then double it. Only the lane-1 product (x10 -/+ x11) is consumed below.
    SMULL           v7.2d, v1.2s, v0.2s
    SMULL           v10.2d, v5.2s, v0.2s

    SSHR            v7.2d, v7.2d, #32       //tempr_q
    SSHR            v10.2d, v10.2d, #32     //tempi_q

    // Shift is applied per 32-bit lane; .s[2] is the low word of .d[1].
    SHL             v7.4s, v7.4s, #1
    SHL             v10.4s, v10.4s, #1

    MOV             v1.s[0], v7.s[2]
    MOV             v1.s[1], v10.s[2]       //vr_i

    SQSUB           v7.2s, v6.2s, v1.2s     //a2_3_v = vqsub_s32(x2_3,vr_i);
    SQADD           v4.2s, v6.2s, v1.2s     //real_imag1 = vqadd_s32(x2_3,vr_i);
    SQADD           v5.2s, v14.2s, v7.2s    //real_imag2 = vqadd_s32(a0_1_v,a2_3_v);

    UZP1            v1.2s, v9.2s, v13.2s    //x6_7
    SQADD           v6.2s, v9.2s, v13.2s    //tempr = vqadd_s32(x6_14,x7_15);
    SQSUB           v14.2s, v9.2s, v13.2s   //tempi = vqsub_s32(x6_14,x7_15);

    // Same scaling as above; again only the lane-1 product is consumed.
    SMULL           v9.2d, v6.2s, v0.2s
    SMULL           v13.2d, v14.2s, v0.2s

    SSHR            v9.2d, v9.2d, #32
    SSHR            v13.2d, v13.2d, #32

    SHL             v9.4s, v9.4s, #1
    SHL             v13.4s, v13.4s, #1

    // v0 is no longer needed as the constant; reuse it for the (vr, vi) pair.
    MOV             v0.s[0], v9.s[2]
    MOV             v0.s[1], v13.s[2]

    SQSUB           v9.2s, v1.2s, v0.2s     // a20_30_v
    SQADD           v13.2s, v1.2s, v0.2s    //real_imag5

    // Swap lane 1 between v9 and v13 (via w4) so that
    //   v9 = (x6 - vr, x7 + vi)   and   v13 = (x6 + vr, x7 - vi).
    MOV             w4, v9.s[1]
    MOV             v9.s[1], v13.s[1]
    MOV             v13.s[1], w4

    SQADD           v6.2s, v9.2s, v8.2s     //real_imag3

    // Lane 0 of each result goes to real[0..5] ...
    ST1             {v3.s}[0], [x1], #4
    ST1             {v4.s}[0], [x1], #4
    ST1             {v5.s}[0], [x1], #4
    ST1             {v6.s}[0], [x1], #4
    ST1             {v12.s}[0], [x1], #4
    ST1             {v13.s}[0], [x1], #4

    // ... and lane 1 goes to imag[0..5].
    ST1             {v3.s}[1], [x2], #4
    ST1             {v4.s}[1], [x2], #4
    ST1             {v5.s}[1], [x2], #4
    ST1             {v6.s}[1], [x2], #4
    ST1             {v12.s}[1], [x2], #4
    ST1             {v13.s}[1], [x2], #4
    // Disabled alternative kept from the original source:
    //ST4 {v3.s,v4.s,v5.s,v6.s}[0],[x1],x5
    //ST4 {v3.s,v4.s,v5.s,v6.s}[1],[x2],x5
    //ST2 {v12.s,v13.s}[0],[x1]
    //ST2 {v12.s,v13.s}[1],[x2]
    pop_v_regs
    ret