@ This file was created from a .asm file
@ using the ads2gas.pl script.
    .syntax unified
@
@ Copyright (c) 2013 The WebM project authors. All Rights Reserved.
@
@ Use of this source code is governed by a BSD-style license
@ that can be found in the LICENSE file in the root of the source
@ tree. An additional intellectual property rights grant can be found
@ in the file PATENTS. All contributing project authors may
@ be found in the AUTHORS file in the root of the source tree.
@

    .global vpx_idct4x4_16_add_neon
    .type vpx_idct4x4_16_add_neon, function
    .arm
    .eabi_attribute 24, 1 @Tag_ABI_align_needed
    .eabi_attribute 25, 1 @Tag_ABI_align_preserved

.text
.p2align 2

    .include "vpx_dsp/arm/idct_neon.asm.S"

.text
.p2align 2 @ name this block of code
@void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
@
@ r0  int16_t *input
@ r1  uint8_t *dest
@ r2  int stride

_vpx_idct4x4_16_add_neon:
    vpx_idct4x4_16_add_neon: @ PROC

    @ The 2D transform is done in two passes which are nearly identical.
    @ We first transform the rows: transpose the input, apply a SIMD
    @ column transform (the columns are the transposed rows), then
    @ transpose the results back into normal/row positions. We then
    @ transform the columns with another SIMD column transform.
    @ So: two passes, each a transpose followed by a column transform.

    @ load the inputs into q8-q9, d16-d19
    LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0

    @ generate scalar constants
    @ cospi_8_64  = 15137
    movw r0, #0x3b21
    @ cospi_16_64 = 11585
    movw r3, #0x2d41
    @ cospi_24_64 = 6270
    movw r12, #0x187e

    @ transpose the input data
    @ 00 01 02 03   d16
    @ 10 11 12 13   d17
    @ 20 21 22 23   d18
    @ 30 31 32 33   d19
    vtrn.16 d16, d17
    vtrn.16 d18, d19

    @ generate constant vectors
    vdup.16 d20, r0         @ replicate cospi_8_64
    vdup.16 d21, r3         @ replicate cospi_16_64

    @ 00 10 02 12   d16
    @ 01 11 03 13   d17
    @ 20 30 22 32   d18
    @ 21 31 23 33   d19
    vtrn.32 q8, q9
    @ 00 10 20 30   d16
    @ 01 11 21 31   d17
    @ 02 12 22 32   d18
    @ 03 13 23 33   d19

    vdup.16 d22, r12        @ replicate cospi_24_64

    @ do the transform on transposed rows

    @ stage 1
    vmull.s16 q15, d17, d22 @ input[1] * cospi_24_64
    vmull.s16 q1, d17, d20  @ input[1] * cospi_8_64

    @ (input[0] + input[2]) * cospi_16_64;
    @ (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q8, d16, d21
    vmull.s16 q14, d18, d21
    vadd.s32 q13, q8, q14
    vsub.s32 q14, q8, q14

    @ input[1] * cospi_24_64 - input[3] * cospi_8_64;
    @ input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1, d19, d22

    @ dct_const_round_shift
    vrshrn.s32 d26, q13, #14
    vrshrn.s32 d27, q14, #14
    vrshrn.s32 d29, q15, #14
    vrshrn.s32 d28, q1, #14

    @ stage 2
    @ output[0] = step[0] + step[3];
    @ output[1] = step[1] + step[2];
    @ output[3] = step[0] - step[3];
    @ output[2] = step[1] - step[2];
    vadd.s16 q8, q13, q14
    vsub.s16 q9, q13, q14
    vswp d18, d19

    @ transpose the results
    @ 00 01 02 03   d16
    @ 10 11 12 13   d17
    @ 20 21 22 23   d18
    @ 30 31 32 33   d19
    vtrn.16 d16, d17
    vtrn.16 d18, d19
    @ 00 10 02 12   d16
    @ 01 11 03 13   d17
    @ 20 30 22 32   d18
    @ 21 31 23 33   d19
    vtrn.32 q8, q9
    @ 00 10 20 30   d16
    @ 01 11 21 31   d17
    @ 02 12 22 32   d18
    @ 03 13 23 33   d19
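
    @ For reference, both passes apply the same 4-point column IDCT.
    @ A minimal C sketch of that transform follows; the function and
    @ variable names are illustrative only (not libvpx identifiers),
    @ but the constants match the movw values above and the rounding
    @ matches vrshrn.s32 #14 (dct_const_round_shift):
    @
    @   static void idct4_col_sketch(const int16_t in[4], int16_t out[4]) {
    @     const int c8 = 15137, c16 = 11585, c24 = 6270;
    @     /* stage 1: butterflies with rounded 14-bit shifts */
    @     const int16_t s0 = ((in[0] + in[2]) * c16 + (1 << 13)) >> 14;
    @     const int16_t s1 = ((in[0] - in[2]) * c16 + (1 << 13)) >> 14;
    @     const int16_t s2 = (in[1] * c24 - in[3] * c8 + (1 << 13)) >> 14;
    @     const int16_t s3 = (in[1] * c8 + in[3] * c24 + (1 << 13)) >> 14;
    @     /* stage 2: output butterfly */
    @     out[0] = s0 + s3;
    @     out[1] = s1 + s2;
    @     out[2] = s1 - s2;
    @     out[3] = s0 - s3;
    @   }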

    @ do the transform on columns

    @ stage 1
    vadd.s16 d23, d16, d18  @ (input[0] + input[2])
    vsub.s16 d24, d16, d18  @ (input[0] - input[2])

    vmull.s16 q15, d17, d22 @ input[1] * cospi_24_64
    vmull.s16 q1, d17, d20  @ input[1] * cospi_8_64

    @ (input[0] + input[2]) * cospi_16_64;
    @ (input[0] - input[2]) * cospi_16_64;
    vmull.s16 q13, d23, d21
    vmull.s16 q14, d24, d21

    @ input[1] * cospi_24_64 - input[3] * cospi_8_64;
    @ input[1] * cospi_8_64  + input[3] * cospi_24_64;
    vmlsl.s16 q15, d19, d20
    vmlal.s16 q1, d19, d22

    @ dct_const_round_shift
    vrshrn.s32 d26, q13, #14
    vrshrn.s32 d27, q14, #14
    vrshrn.s32 d29, q15, #14
    vrshrn.s32 d28, q1, #14

    @ stage 2
    @ output[0] = step[0] + step[3];
    @ output[1] = step[1] + step[2];
    @ output[3] = step[0] - step[3];
    @ output[2] = step[1] - step[2];
    vadd.s16 q8, q13, q14
    vsub.s16 q9, q13, q14

    @ The results are in two registers, one of them being swapped. This is
    @ handled by loading the 'dest' values in the same swapped fashion and
    @ also storing them back in that swapped fashion.
    @ temp_out[0, 1] = d16, d17 = q8
    @ temp_out[2, 3] = d19, d18 = q9 swapped

    @ ROUND_POWER_OF_TWO(temp_out[j], 4)
    vrshr.s16 q8, q8, #4
    vrshr.s16 q9, q9, #4

    vld1.32 {d26[0]}, [r1], r2
    vld1.32 {d26[1]}, [r1], r2
    vld1.32 {d27[1]}, [r1], r2
    vld1.32 {d27[0]}, [r1]  @ no post-increment

    @ ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i]
    vaddw.u8 q8, q8, d26
    vaddw.u8 q9, q9, d27

    @ clip_pixel
    vqmovun.s16 d26, q8
    vqmovun.s16 d27, q9

    @ do the stores in reverse order with negative post-increment, by
    @ negating the stride
    rsb r2, r2, #0
    vst1.32 {d27[0]}, [r1], r2
    vst1.32 {d27[1]}, [r1], r2
    vst1.32 {d26[1]}, [r1], r2
    vst1.32 {d26[0]}, [r1]  @ no post-increment
    bx lr
    .size vpx_idct4x4_16_add_neon, .-vpx_idct4x4_16_add_neon @ ENDP  @ |vpx_idct4x4_16_add_neon|

    .section .note.GNU-stack,"",%progbits
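
@ For reference, the reconstruction sequence above (vrshr.s16 #4,
@ vaddw.u8, vqmovun.s16) corresponds to this minimal C sketch. The
@ function name is illustrative only (not a libvpx identifier), and it
@ assumes <stdint.h> types:
@
@   static void add_and_clip_sketch(const int16_t *res, uint8_t *dest,
@                                   int stride) {
@     for (int j = 0; j < 4; ++j) {
@       for (int i = 0; i < 4; ++i) {
@         /* ROUND_POWER_OF_TWO(res, 4), add the predictor, then
@          * saturate to [0, 255] as vqmovun.s16 does. */
@         const int v = ((res[j * 4 + i] + 8) >> 4) + dest[j * stride + i];
@         dest[j * stride + i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
@       }
@     }
@   }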