@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_luma_vert_qpel_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction vertical quarter pel interpolation.
@*
@* @author
@*  Mohit
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_luma_vert_qpel_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@*******************************************************************************
@*
@* @brief
@*  Quarter pel inter prediction luma filter for vertical input
@*
@* @par Description:
@*  Applies a 6 tap vertical filter. The output is clipped to 8 bits
@*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
@*
@* @param[in] dydx: x and y reference offset for qpel calculations.
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_luma_vert_qpel_a9q (
@                            UWORD8 *pu1_src,
@                            UWORD8 *pu1_dst,
@                            WORD32 src_strd,
@                            WORD32 dst_strd,
@                            WORD32 ht,
@                            WORD32 wd,
@                            UWORD8* pu1_tmp,
@                            UWORD32 dydx)

@**************Variables Vs Registers*****************************************
@   r0 => *pu1_src   (rewound to pu1_src - 2*src_strd; walks the filter rows)
@   r1 => *pu1_dst
@   r2 => src_strd
@   r3 => dst_strd
@   r5 => ht         (loaded from the stack; used as the row counter)
@   r6 => wd         (loaded from the stack; selects loop_16 / loop_8 / loop_4)
@   r7 => dydx       (loaded from the stack), then reused as the pointer to the
@                    unfiltered row that each filtered row is averaged with:
@                    pu1_src + ((dydx & 12) >> 3) * src_strd
@   q11 => 16-bit lanes holding the filter coefficient 20
@   q12 => 16-bit lanes holding the filter coefficient 5
@
@ Per output pixel, with a..f the six vertically adjacent source pixels
@ (a = two rows above, f = three rows below the output position):
@   half = CLIP_U8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5)
@   dst  = (half + unfiltered + 1) >> 1
@
@ Every pass of a loop loads nine source rows and produces four output rows.
@   wd == 8  -> loop_8
@   wd == 4  -> loop_4  (one pass for ht == 4, two passes for ht == 8)
@   otherwise-> loop_16
@ loop_16 and loop_8 leave only when the row counter reaches exactly zero, so
@ ht must be a positive multiple of 4 on those paths.
@
@ r4-r12, lr and d8-d15 are saved on entry and restored on exit.

.text
.p2align 2

    .global ih264_inter_pred_luma_vert_qpel_a9q

ih264_inter_pred_luma_vert_qpel_a9q:

    stmfd         sp!, {r4-r12, r14}    @ store register values to stack (10 regs = 40 bytes)
    vstmdb        sp!, {d8-d15}         @ push neon registers to stack (8 regs = 64 bytes)
    ldr           r5, [sp, #104]        @ Loads ht  (first stack argument, 40 + 64 bytes up)

    ldr           r6, [sp, #108]        @ Loads wd
    ldr           r7, [sp, #116]        @ Loads dydx (offset 112, pu1_tmp, is skipped)
    and           r7, r7, #12           @ Finds y-offset
    lsr           r7, r7, #3            @ dydx>>3 : 0 or 1
    mul           r7, r2, r7
    add           r7, r0, r7            @ pu1_src + (y_offset>>1)*src_strd
    vmov.u16      q11, #20              @ Filter coeff 0x14 into Q11
    sub           r0, r0, r2, lsl #1    @ pu1_src-2*src_strd
    subs          r12, r6, #8           @ if wd=8 branch to loop_8
    vmov.u16      q12, #5               @ Filter coeff 0x5 into Q12
    beq           loop_8

    subs          r12, r6, #4           @ if wd=4 branch to loop_4
    beq           loop_4

loop_16:                                @ when wd=16

    @ src[k_c] below means: k-th row loaded in this pass, starting at column c.
    @ Output row n is filtered from loaded rows n .. n+5.
    vld1.u32      {q0}, [r0], r2        @ Vector load from src[0_0]
    vld1.u32      {q1}, [r0], r2        @ Vector load from src[1_0]
    vld1.u32      {q2}, [r0], r2        @ Vector load from src[2_0]
    vld1.u32      {q3}, [r0], r2        @ Vector load from src[3_0]
    vld1.u32      {q4}, [r0], r2        @ Vector load from src[4_0]
    vaddl.u8      q6, d4, d6            @ temp1 = src[2_0] + src[3_0]
    vld1.u32      {q5}, [r0], r2        @ Vector load from src[5_0]
    vaddl.u8      q7, d0, d10           @ temp = src[0_0] + src[5_0]
    vaddl.u8      q8, d2, d8            @ temp2 = src[1_0] + src[4_0]
    vmla.u16      q7, q6, q11           @ temp += temp1 * 20
    vaddl.u8      q10, d1, d11          @ temp4 = src[0_8] + src[5_8]
    vaddl.u8      q9, d5, d7            @ temp3 = src[2_8] + src[3_8]
    vmla.u16      q10, q9, q11          @ temp4 += temp3 * 20
    vld1.u32      {q0}, [r0], r2        @ Vector load from src[6_0] (q0 is reused)
    vaddl.u8      q13, d3, d9           @ temp5 = src[1_8] + src[4_8]
    vaddl.u8      q6, d6, d8            @ row 1: src[3_0] + src[4_0]
    vmls.u16      q7, q8, q12           @ temp -= temp2 * 5
    vaddl.u8      q8, d2, d0            @ row 1: src[1_0] + src[6_0]
    vaddl.u8      q9, d4, d10           @ row 1: src[2_0] + src[5_0]
    vmla.u16      q8, q6, q11           @ row 1, cols 0-7: += centre pair * 20
    vmls.u16      q10, q13, q12         @ temp4 -= temp5 * 5
    vaddl.u8      q13, d5, d11          @ row 1: src[2_8] + src[5_8]
    vaddl.u8      q6, d7, d9            @ row 1: src[3_8] + src[4_8]
    vqrshrun.s16  d30, q7, #5           @ dst[0_0] = CLIP_U8((temp +16) >> 5)
    vaddl.u8      q7, d3, d1            @ row 1: src[1_8] + src[6_8]
    vld1.u32      {q1}, [r0], r2        @ Vector load from src[7_0] (q1 is reused)
    vmla.u16      q7, q6, q11           @ row 1, cols 8-15: += centre pair * 20
    vmls.u16      q8, q9, q12           @ row 1, cols 0-7: -= inner pair * 5
    vqrshrun.s16  d31, q10, #5          @ dst[0_8] = CLIP_U8((temp4 +16) >> 5)
    vld1.u32      {q10}, [r7], r2       @ Load for interpolation row 0
    vrhadd.u8     q15, q10, q15         @ Interpolation to obtain qpel value
    vaddl.u8      q9, d4, d2            @ row 2: src[2_0] + src[7_0]
    vaddl.u8      q6, d8, d10           @ row 2: src[4_0] + src[5_0]

    vst1.u32      {q15}, [r1], r3       @ Vector store to dst[0_0]
    vmla.u16      q9, q6, q11           @ row 2, cols 0-7: += centre pair * 20
    vaddl.u8      q10, d6, d0           @ row 2: src[3_0] + src[6_0]
    vmls.u16      q7, q13, q12          @ row 1, cols 8-15: -= inner pair * 5
    vqrshrun.s16  d30, q8, #5           @ row 1, cols 0-7 clipped to 8 bits
    vaddl.u8      q6, d9, d11           @ row 2: src[4_8] + src[5_8]
    vaddl.u8      q8, d5, d3            @ row 2: src[2_8] + src[7_8]
    vaddl.u8      q13, d7, d1           @ row 2: src[3_8] + src[6_8]
    vmla.u16      q8, q6, q11           @ row 2, cols 8-15: += centre pair * 20
    vmls.u16      q9, q10, q12          @ row 2, cols 0-7: -= inner pair * 5
    vld1.u32      {q2}, [r0], r2        @ Vector load from src[8_0] (q2 is reused)

    vqrshrun.s16  d31, q7, #5           @ row 1, cols 8-15 clipped to 8 bits
    vld1.u32      {q7}, [r7], r2        @ Load for interpolation row 1
    vaddl.u8      q6, d10, d0           @ row 3: src[5_0] + src[6_0]
    vrhadd.u8     q15, q7, q15          @ Interpolation to obtain qpel value
    vaddl.u8      q7, d6, d4            @ row 3: src[3_0] + src[8_0]
    vaddl.u8      q10, d8, d2           @ row 3: src[4_0] + src[7_0]
    vmla.u16      q7, q6, q11           @ row 3, cols 0-7: += centre pair * 20
    vmls.u16      q8, q13, q12          @ row 2, cols 8-15: -= inner pair * 5
    vst1.u32      {q15}, [r1], r3       @ store row 1
    vqrshrun.s16  d30, q9, #5           @ row 2, cols 0-7 clipped to 8 bits
    vaddl.u8      q9, d7, d5            @ row 3: src[3_8] + src[8_8]
    vaddl.u8      q6, d11, d1           @ row 3: src[5_8] + src[6_8]
    vmla.u16      q9, q6, q11           @ row 3, cols 8-15: += centre pair * 20
    vaddl.u8      q13, d9, d3           @ row 3: src[4_8] + src[7_8]
    vmls.u16      q7, q10, q12          @ row 3, cols 0-7: -= inner pair * 5
    vqrshrun.s16  d31, q8, #5           @ row 2, cols 8-15 clipped to 8 bits
    vld1.u32      {q8}, [r7], r2        @ Load for interpolation row 2
    vmls.u16      q9, q13, q12          @ row 3, cols 8-15: -= inner pair * 5
    vrhadd.u8     q15, q8, q15          @ Interpolation to obtain qpel value
    vst1.u32      {q15}, [r1], r3       @ store row 2
    vqrshrun.s16  d30, q7, #5           @ row 3, cols 0-7 clipped to 8 bits
    vqrshrun.s16  d31, q9, #5           @ row 3, cols 8-15 clipped to 8 bits
    vld1.u32      {q9}, [r7], r2        @ Load for interpolation row 3
    vrhadd.u8     q15, q9, q15          @ Interpolation to obtain qpel value
    vst1.u32      {q15}, [r1], r3       @ store row 3
    @ The sums that used to be started here for a following pass were never
    @ consumed (the loop head reloads and recomputes all of them), so they
    @ have been dropped.
    subs          r5, r5, #4            @ 4 rows processed, decrement by 4
    subne         r0, r0 , r2, lsl #2   @ nine rows were read but only four consumed:
    subne         r0, r0, r2            @ step r0 back by 5*src_strd for the next pass
    beq           end_func              @ Branch if height==4

    b             loop_16               @ looping if height = 8 or 16


loop_8:                                 @ when wd=8

    @ Processing row0 and row1
    vld1.u32      d0, [r0], r2          @ Vector load from src[0_0]
    vld1.u32      d1, [r0], r2          @ Vector load from src[1_0]
    vld1.u32      d2, [r0], r2          @ Vector load from src[2_0]
    vld1.u32      d3, [r0], r2          @ Vector load from src[3_0]
    vld1.u32      d4, [r0], r2          @ Vector load from src[4_0]
    vld1.u32      d5, [r0], r2          @ Vector load from src[5_0]

    vaddl.u8      q3, d2, d3            @ temp1 = src[2_0] + src[3_0]
    vaddl.u8      q4, d0, d5            @ temp = src[0_0] + src[5_0]
    vaddl.u8      q5, d1, d4            @ temp2 = src[1_0] + src[4_0]
    vmla.u16      q4, q3, q11           @ temp += temp1 * 20
    vld1.u32      d6, [r0], r2          @ Vector load from src[6_0] (temp1 in q3 already consumed)
    vaddl.u8      q7, d3, d4            @ row 1: src[3_0] + src[4_0]
    vaddl.u8      q8, d1, d6            @ row 1: src[1_0] + src[6_0]
    vaddl.u8      q9, d2, d5            @ row 1: src[2_0] + src[5_0]
    vmls.u16      q4, q5, q12           @ temp -= temp2 * 5
    vmla.u16      q8, q7, q11           @ row 1: += centre pair * 20
    vld1.u32      d7, [r0], r2          @ Vector load from src[7_0]
    vaddl.u8      q10, d4, d5           @ row 2: src[4_0] + src[5_0]
    vaddl.u8      q6, d2, d7            @ row 2: src[2_0] + src[7_0]
    vaddl.u8      q5, d3, d6            @ row 2: src[3_0] + src[6_0]
    vmls.u16      q8, q9, q12           @ row 1: -= inner pair * 5
    vqrshrun.s16  d26, q4, #5           @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vmla.u16      q6, q10, q11          @ row 2: += centre pair * 20
    vld1.32       d8, [r7], r2          @Load value for interpolation (row0)
    vld1.32       d9, [r7], r2          @Load value for interpolation (row1)
    vld1.u32      d0, [r0], r2          @ Vector load from src[8_0] (d0 is reused)
    vaddl.u8      q7, d5, d6            @ row 3: src[5_0] + src[6_0]
    vqrshrun.s16  d27, q8, #5           @ row 1 clipped to 8 bits
    vrhadd.u8     q13, q4, q13          @ Interpolation step for qpel calculation
    vaddl.u8      q10, d3, d0           @ row 3: src[3_0] + src[8_0]
    vmls.u16      q6, q5, q12           @ row 2: -= inner pair * 5
    vst1.u32      d26, [r1], r3         @ Vector store to dst[0_0]
    vaddl.u8      q9, d4, d7            @ row 3: src[4_0] + src[7_0]
    vmla.u16      q10, q7, q11          @ row 3: += centre pair * 20
    vst1.u32      d27, [r1], r3         @ Vector store to dst[1_0]
    vqrshrun.s16  d28, q6, #5           @ row 2 clipped to 8 bits
    vmls.u16      q10, q9, q12          @ row 3: -= inner pair * 5
    vld1.32       d12, [r7], r2         @Load value for interpolation (row2)
    vld1.32       d13, [r7], r2         @Load value for interpolation (row3)
    vqrshrun.s16  d29, q10, #5          @ row 3 clipped to 8 bits
    vrhadd.u8     q14, q6, q14          @ Interpolation step for qpel calculation
    vst1.u32      d28, [r1], r3         @store row 2
    vst1.u32      d29, [r1], r3         @store row 3

    subs          r5, r5, #4            @ 4 rows processed, decrement by 4
    subne         r0, r0 , r2, lsl #2   @ nine rows were read but only four consumed:
    subne         r0, r0, r2            @ step r0 back by 5*src_strd for the next pass
    beq           end_func              @ Branch if height==4
    b             loop_8                @looping if height == 8 or 16

loop_4:                                 @ when wd=4
@ Processing row0 and row1
@ Only 32-bit lane 0 of each register carries pixel data on this path; the
@ other lanes hold don't-care values and are never stored.

    vld1.u32      d0[0], [r0], r2       @ Vector load from src[0_0]
    vld1.u32      d1[0], [r0], r2       @ Vector load from src[1_0]
    vld1.u32      d2[0], [r0], r2       @ Vector load from src[2_0]
    vld1.u32      d3[0], [r0], r2       @ Vector load from src[3_0]
    vld1.u32      d4[0], [r0], r2       @ Vector load from src[4_0]
    vld1.u32      d5[0], [r0], r2       @ Vector load from src[5_0]

    vaddl.u8      q3, d2, d3            @ temp1 = src[2_0] + src[3_0]
    vaddl.u8      q4, d0, d5            @ temp = src[0_0] + src[5_0]
    vaddl.u8      q5, d1, d4            @ temp2 = src[1_0] + src[4_0]
    vmla.u16      q4, q3, q11           @ temp += temp1 * 20
    vld1.u32      d6[0], [r0], r2       @ Vector load from src[6_0]; lane load so that only the
                                        @ 4 pixels of the block are read, like every other row
    vaddl.u8      q7, d3, d4            @ row 1: src[3_0] + src[4_0]
    vaddl.u8      q8, d1, d6            @ row 1: src[1_0] + src[6_0]
    vaddl.u8      q9, d2, d5            @ row 1: src[2_0] + src[5_0]
    vmls.u16      q4, q5, q12           @ temp -= temp2 * 5
    vld1.u32      d7[0], [r0], r2       @ Vector load from src[7_0]
    vmla.u16      q8, q7, q11           @ row 1: += centre pair * 20
    vaddl.u8      q10, d4, d5           @ row 2: src[4_0] + src[5_0]
    vaddl.u8      q6, d2, d7            @ row 2: src[2_0] + src[7_0]
    vaddl.u8      q5, d3, d6            @ row 2: src[3_0] + src[6_0]
    vmls.u16      q8, q9, q12           @ row 1: -= inner pair * 5
    vqrshrun.s16  d26, q4, #5           @ dst[0_0] = CLIP_U8( (temp + 16) >> 5)
    vld1.u32      d8[0], [r7], r2       @Load value for interpolation - row 0
    vld1.u32      d9[0], [r7], r2       @Load value for interpolation - row 1
    vmla.u16      q6, q10, q11          @ row 2: += centre pair * 20
    vld1.u32      d0[0], [r0], r2       @ Vector load from src[8_0] (d0 is reused)
    vaddl.u8      q7, d5, d6            @ row 3: src[5_0] + src[6_0]
    vqrshrun.s16  d27, q8, #5           @ row 1 clipped to 8 bits
    vaddl.u8      q10, d3, d0           @ row 3: src[3_0] + src[8_0]
    vrhadd.u8     q13, q13, q4          @Interpolation step for qpel calculation
    vmls.u16      q6, q5, q12           @ row 2: -= inner pair * 5
    vst1.u32      d26[0], [r1], r3      @ Vector store to dst[0_0]
    vaddl.u8      q9, d4, d7            @ row 3: src[4_0] + src[7_0]
    vmla.u16      q10, q7, q11          @ row 3: += centre pair * 20
    vst1.u32      d27[0], [r1], r3      @ store row 1
    vqrshrun.s16  d28, q6, #5           @ row 2 clipped to 8 bits
    vld1.u32      d12[0], [r7], r2      @Load value for interpolation - row 2
    vld1.u32      d13[0], [r7], r2      @Load value for interpolation - row 3

    vmls.u16      q10, q9, q12          @ row 3: -= inner pair * 5
    vqrshrun.s16  d29, q10, #5          @ row 3 clipped to 8 bits
    vrhadd.u8     q14, q6, q14          @Interpolation step for qpel calculation
    vst1.u32      d28[0], [r1], r3      @store row 2
    vst1.u32      d29[0], [r1], r3      @store row 3

    subs          r5, r5, #8            @ zero only when ht was 8 on the first pass; the
                                        @ second pass (and ht == 4) goes negative and exits
    subeq         r0, r0, r2, lsl #2    @ nine rows were read but only four consumed:
    subeq         r0, r0, r2            @ step r0 back by 5*src_strd for the second pass
    beq           loop_4                @ Loop if height==8

end_func:
    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
    ldmfd         sp!, {r4-r12, pc}     @Restoring registers from stack

