@ This file was created from a .asm file
@  using the ads2gas.pl script.
.syntax unified
@
@  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
@
@  Use of this source code is governed by a BSD-style license
@  that can be found in the LICENSE file in the root of the source
@  tree. An additional intellectual property rights grant can be found
@  in the file PATENTS.  All contributing project authors may
@  be found in the AUTHORS file in the root of the source tree.
@

    .global vpx_idct4x4_16_add_neon
    .type vpx_idct4x4_16_add_neon, function
    .arm
    .eabi_attribute 24, 1 @Tag_ABI_align_needed
    .eabi_attribute 25, 1 @Tag_ABI_align_preserved

    .text
    .p2align 2

    .include "vpx_dsp/arm/idct_neon.asm.S"

    .text
    .p2align 2

@ Fixed-point transform constants used by both passes.
    .equ COSPI_8_64,  15137             @ 0x3b21
    .equ COSPI_16_64, 11585             @ 0x2d41
    .equ COSPI_24_64, 6270              @ 0x187e

@-----------------------------------------------------------------------
@ void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
@
@ 4x4 inverse transform of 'input'; the rounded result is added to the
@ 4x4 pixel block at 'dest' and written back with saturation.
@
@ In:    r0  = int16_t *input   (overwritten once the coefficients are loaded)
@        r1  = uint8_t *dest    (left pointing at row 0 on return)
@        r2  = int stride       (negated before the stores)
@ Out:   none (dest is updated in place)
@ Uses:  r0, r3, r12 as scalar scratch for the constants;
@        q1, q8, q9, d20-d24, q13-q15 as NEON scratch.
@        NOTE(review): registers touched inside LOAD_TRAN_LOW_TO_S16 are not
@        visible from this file - confirm against the included macro file.
@
@ Method: two near-identical passes. Pass 1 transposes the input so that
@ each row's samples occupy one lane across d16-d19, runs a SIMD 1-D
@ transform, then pass 2 transposes again and repeats the 1-D transform
@ for the columns.
@-----------------------------------------------------------------------
vpx_idct4x4_16_add_neon: @ PROC

    @ ---- Pass 1: rows ------------------------------------------------

    @ Fetch all 16 coefficients into d16-d19 (q8-q9).
    @ NOTE(review): LOAD_TRAN_LOW_TO_S16 is not defined in this file; it is
    @ presumably supplied by the .include above.
    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

    @ Scalar copies of the constants. r0 is free to reuse here because the
    @ input has already been loaded.
    movw            r0, #COSPI_8_64
    movw            r3, #COSPI_16_64
    movw            r12, #COSPI_24_64

    @ 4x4 transpose of 16-bit elements, step 1 of 2. Layout before:
    @   00 01 02 03   d16
    @   10 11 12 13   d17
    @   20 21 22 23   d18
    @   30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19

    @ Broadcast the constants to every 16-bit lane (interleaved with the
    @ transpose; the order of instructions is kept as in the original).
    vdup.16         d20, r0             @ d20 = cospi_8_64  x4
    vdup.16         d21, r3             @ d21 = cospi_16_64 x4

    @ Transpose step 2 of 2. Layout before:
    @   00 10 02 12   d16
    @   01 11 03 13   d17
    @   20 30 22 32   d18
    @   21 31 23 33   d19
    vtrn.32         q8, q9
    @ Layout after: d16..d19 hold input[0]..input[3] of the 1-D transform,
    @ one row per lane.
    @   00 10 20 30   d16
    @   01 11 21 31   d17
    @   02 12 22 32   d18
    @   03 13 23 33   d19

    vdup.16         d22, r12            @ d22 = cospi_24_64 x4

    @ Stage 1 (32-bit intermediates).
    vmull.s16       q15, d17, d22       @ q15 = input[1] * cospi_24_64
    vmull.s16       q1,  d17, d20       @ q1  = input[1] * cospi_8_64

    @ The even terms are formed as a sum/difference of 32-bit products:
    @   q13 = input[0] * cospi_16_64 + input[2] * cospi_16_64
    @   q14 = input[0] * cospi_16_64 - input[2] * cospi_16_64
    vmull.s16       q8,  d16, d21
    vmull.s16       q14, d18, d21
    vadd.s32        q13, q8,  q14
    vsub.s32        q14, q8,  q14

    @   q15 = input[1] * cospi_24_64 - input[3] * cospi_8_64
    @   q1  = input[1] * cospi_8_64  + input[3] * cospi_24_64
    vmlsl.s16       q15, d19, d20
    vmlal.s16       q1,  d19, d22

    @ dct_const_round_shift: rounding narrow by 14 bits.
    @ Resulting packing: q13 = {step[0], step[1]}, q14 = {step[3], step[2]}.
    vrshrn.s32      d26, q13, #14       @ step[0]
    vrshrn.s32      d27, q14, #14       @ step[1]
    vrshrn.s32      d29, q15, #14       @ step[2]
    vrshrn.s32      d28, q1,  #14       @ step[3]

    @ Stage 2:
    @   output[0] = step[0] + step[3]   -> d16
    @   output[1] = step[1] + step[2]   -> d17
    @   output[3] = step[0] - step[3]   -> d18 (before the swap)
    @   output[2] = step[1] - step[2]   -> d19 (before the swap)
    vadd.s16        q8,  q13, q14
    vsub.s16        q9,  q13, q14
    vswp            d18, d19            @ restore natural order: d18 = output[2], d19 = output[3]

    @ ---- Pass 2: columns ---------------------------------------------

    @ Transpose the pass-1 results, same two-step pattern as above.
    @   00 01 02 03   d16
    @   10 11 12 13   d17
    @   20 21 22 23   d18
    @   30 31 32 33   d19
    vtrn.16         d16, d17
    vtrn.16         d18, d19
    @   00 10 02 12   d16
    @   01 11 03 13   d17
    @   20 30 22 32   d18
    @   21 31 23 33   d19
    vtrn.32         q8, q9
    @   00 10 20 30   d16
    @   01 11 21 31   d17
    @   02 12 22 32   d18
    @   03 13 23 33   d19

    @ Stage 1. Unlike pass 1, the even terms are summed/differenced in
    @ 16 bits first and multiplied afterwards.
    vadd.s16        d23, d16, d18       @ input[0] + input[2]
    vsub.s16        d24, d16, d18       @ input[0] - input[2]

    vmull.s16       q15, d17, d22       @ q15 = input[1] * cospi_24_64
    vmull.s16       q1,  d17, d20       @ q1  = input[1] * cospi_8_64

    @   q13 = (input[0] + input[2]) * cospi_16_64
    @   q14 = (input[0] - input[2]) * cospi_16_64
    vmull.s16       q13, d23, d21
    vmull.s16       q14, d24, d21

    @   q15 = input[1] * cospi_24_64 - input[3] * cospi_8_64
    @   q1  = input[1] * cospi_8_64  + input[3] * cospi_24_64
    vmlsl.s16       q15, d19, d20
    vmlal.s16       q1,  d19, d22

    @ dct_const_round_shift, same register packing as in pass 1.
    vrshrn.s32      d26, q13, #14       @ step[0]
    vrshrn.s32      d27, q14, #14       @ step[1]
    vrshrn.s32      d29, q15, #14       @ step[2]
    vrshrn.s32      d28, q1,  #14       @ step[3]

    @ Stage 2:
    @   output[0] = step[0] + step[3]
    @   output[1] = step[1] + step[2]
    @   output[3] = step[0] - step[3]
    @   output[2] = step[1] - step[2]
    vadd.s16        q8,  q13, q14
    vsub.s16        q9,  q13, q14

    @ No vswp this time: q9 stays in swapped order, and the dest rows are
    @ loaded and stored in the matching swapped order instead.
    @   temp_out[0, 1] = d16, d17 = q8
    @   temp_out[2, 3] = d19, d18 = q9 (swapped)

    @ ---- Reconstruction ------------------------------------------------

    @ ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16       q8, q8, #4
    vrshr.s16       q9, q9, #4

    @ Load 4 bytes from each of the four dest rows, walking down by stride.
    @ Rows 2 and 3 go into d27 in reversed lane order to line up with q9.
    vld1.32         {d26[0]}, [r1], r2  @ row 0
    vld1.32         {d26[1]}, [r1], r2  @ row 1
    vld1.32         {d27[1]}, [r1], r2  @ row 2
    vld1.32         {d27[0]}, [r1]      @ row 3; no post-increment, r1 stays on row 3

    @ ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
    @ (dest bytes are widened as unsigned 8-bit values)
    vaddw.u8        q8, q8, d26
    vaddw.u8        q9, q9, d27

    @ clip_pixel: saturating narrow from signed 16-bit to unsigned 8-bit
    vqmovun.s16     d26, q8
    vqmovun.s16     d27, q9

    @ Store back bottom-up: r1 is still on row 3, so negate the stride and
    @ post-increment with it to walk back to row 0.
    rsb             r2, r2, #0          @ r2 = -stride
    vst1.32         {d27[0]}, [r1], r2  @ row 3
    vst1.32         {d27[1]}, [r1], r2  @ row 2
    vst1.32         {d26[1]}, [r1], r2  @ row 1
    vst1.32         {d26[0]}, [r1]      @ row 0; no post-increment
    bx              lr
.size vpx_idct4x4_16_add_neon, .-vpx_idct4x4_16_add_neon    @ ENDP  @ |vpx_idct4x4_16_add_neon|

    .section .note.GNU-stack,"",%progbits