@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_filters_luma_vert_w16inp.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_vert()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
@/* include reconstruction */
@

@/**
@*******************************************************************************
@*
@* @brief
@*    luma vertical filter for 16bit input.
@*
@* @par description:
@*    Applies an 8-tap vertical filter with coefficients pointed to by
@*    'pi1_coeff' to the 16-bit elements pointed to by 'pi2_src' and writes
@*    16-bit results to the location pointed to by 'pu1_dst'.  Each output
@*    sample is the 32-bit filter accumulation minus the offset 0x80000,
@*    arithmetically shifted right by 6 and narrowed to 16 bits; this
@*    w16inp_w16out variant performs NO clipping (the vqrshrun lines are
@*    intentionally commented out).  Assumptions: the function is optimized
@*    considering the fact width is a multiple of 4 and height a multiple
@*    of 2.
@*
@* @param[in] pi2_src
@*  word16 pointer to the source
@*
@* @param[out] pu1_dst
@*  pointer to the destination (16-bit output samples in this variant)
@*
@* @param[in] src_strd
@*  integer source stride (in 16-bit elements)
@*
@* @param[in] dst_strd
@*  integer destination stride (in 16-bit elements)
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the 8 filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_vert_w16inp_w16out(word16 *pi2_src,
@                                              uword8 *pu1_dst,
@                                              word32 src_strd,
@                                              word32 dst_strd,
@                                              word8 *pi1_coeff,
@                                              word32 ht,
@                                              word32 wd )
@**************variables vs registers*****************************************
@   r0 => *pi2_src          r1 => *pu1_dst
@   r2 => src_strd          r3 => dst_strd
@   stack => pi1_coeff, ht, wd (offsets below)
@
@ after the setup code:
@   r2  => source stride in bytes       r6  => destination stride in bytes
@   r5  => wd                           r7  => (wd/4)*ht - 4  block counter
@   r4  => column countdown within a 4-row band
@   r8  => src advance to the next 4-row band
@   r9  => dst advance to the next 4-row band
@   d22-d29 => the eight filter taps, widened to 16 bits
@   q15 => offset constant 0x80000 subtracted before the >>6 narrowing

@ Offsets of the stack-passed arguments:
@ 40 bytes (r4-r12,r14) + 64 bytes (d8-d15) are pushed on entry.
.equ coeff_offset, 104
.equ ht_offset, 108
.equ wd_offset, 112

.text
.align 4




.globl ihevc_inter_pred_luma_vert_w16inp_w16out_a9q

.type ihevc_inter_pred_luma_vert_w16inp_w16out_a9q, %function

ihevc_inter_pred_luma_vert_w16inp_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}      @stack stores the values of the arguments
    vpush       {d8 - d15}              @preserve callee-saved NEON registers

    ldr         r12,[sp,#coeff_offset]  @load pi1_coeff
    mov         r6,r3,lsl #1            @dst stride in bytes (16-bit output samples)
    ldr         r5,[sp,#wd_offset]      @load wd
    vld1.8      {d0},[r12]              @coeff = vld1_s8(pi1_coeff)
    mov         r2, r2, lsl #1          @src stride in bytes (16-bit input samples)
    sub         r12,r2,r2,lsl #2        @r12 = -3*src_strd (step back to the first tap row)
    @vabs.s8 d0,d0                      @vabs_s8(coeff)
    add         r0,r0,r12               @pi2_src -= 3*src_strd
    ldr         r3,[sp,#ht_offset]      @load ht
    subs        r7,r3,#0                @r7 = ht (early-out guard is commented out)
    @ble end_loops                      @end loop jump
    vmovl.s8    q0,d0                   @widen the 8 signed taps to 16 bits
    vdup.16     d22,d0[0]               @coeffabs_0 = vdup_lane_s16(coeff, 0)
    vdup.16     d23,d0[1]               @coeffabs_1 = vdup_lane_s16(coeff, 1)
    vdup.16     d24,d0[2]               @coeffabs_2 = vdup_lane_s16(coeff, 2)
    vdup.16     d25,d0[3]               @coeffabs_3 = vdup_lane_s16(coeff, 3)
    vdup.16     d26,d1[0]               @coeffabs_4 = vdup_lane_s16(coeff, 4)
    vdup.16     d27,d1[1]               @coeffabs_5 = vdup_lane_s16(coeff, 5)
    vdup.16     d28,d1[2]               @coeffabs_6 = vdup_lane_s16(coeff, 6)
    vdup.16     d29,d1[3]               @coeffabs_7 = vdup_lane_s16(coeff, 7)
    vmov.i32    q15,#0x80000            @offset removed from each sum before the >>6

    rsb         r9,r5,r6,lsl #2         @r9 = 4*dst_strd_bytes - wd
    rsb         r8,r5,r2,lsl #2         @r8 = 4*src_strd_bytes - wd
    sub         r8,r8,r5                @r8 = 4*src_strd_bytes - 2*wd (wd cols = 2*wd bytes)
    sub         r9,r9,r5                @r9 = 4*dst_strd_bytes - 2*wd
    mov         r3, r5, lsr #2          @r3 = wd/4: 4-column blocks per row band
    mul         r7, r3                  @r7 = (wd/4) * ht: total 4x1 output blocks
    sub         r7, #4                  @reserve four blocks for the epilog
    mov         r4,r5                   @r4 = wd, column countdown for this band
    @mov r2, r2, lsl #1

@ Software pipeline: 'prolog' primes four result rows (q4-q7); 'kernel_8'
@ stores finished rows while accumulating the next four; 'epilog' and
@ 'epilog_end' drain the pipeline for the final band.
prolog:

    add         r3,r0,r2                @pu1_src_tmp += src_strd
    vld1.16     {d1},[r3],r2            @src_tmp2 = vld1_s16(pu1_src_tmp)
    vld1.16     {d0},[r0]!              @src_tmp1 = vld1_s16(pu1_src_tmp)
    subs        r4,r4,#4                @4 columns of this band consumed
    vld1.16     {d2},[r3],r2            @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmull.s16   q4,d1,d23               @mul_res1 = vmull_s16(src_tmp2, coeffabs_1)
    vld1.16     {d3},[r3],r2            @src_tmp4 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q4,d0,d22               @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_0)
    vld1.16     {d4},[r3],r2            @src_tmp1 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q4,d2,d24               @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_2)
    vld1.16     {d5},[r3],r2            @src_tmp2 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q4,d3,d25               @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_3)
    vld1.16     {d6},[r3],r2            @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q4,d4,d26               @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_4)
    vld1.16     {d7},[r3],r2            @src_tmp4 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q4,d5,d27               @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeffabs_5)
    vmlal.s16   q4,d6,d28               @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_6)
    vmlal.s16   q4,d7,d29               @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_7)

    vld1.16     {d16},[r3],r2           @src_tmp1 = vld1_s16(pu1_src_tmp)

    vmull.s16   q5,d2,d23               @mul_res2 = vmull_s16(src_tmp3, coeffabs_1)
    addle       r0,r0,r8,lsl #0         @band finished (le from subs r4): next 4-row band
    vmlal.s16   q5,d1,d22               @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_0)
    movle       r4,r5                   @reload the column countdown (r5 = wd)
    vmlal.s16   q5,d3,d24               @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_2)
    vld1.16     {d17},[r3],r2           @src_tmp2 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q5,d4,d25               @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_3)
    vld1.16     {d18},[r3],r2           @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q5,d5,d26               @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_4)
    add         r3,r0,r2                @pu1_src_tmp += src_strd
    vmlal.s16   q5,d6,d27               @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeffabs_5)
    vmlal.s16   q5,d7,d28               @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_6)
    vmlal.s16   q5,d16,d29              @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_7)
    vsub.s32    q4, q4, q15             @remove the 0x80000 offset from row 0

    vld1.16     {d1},[r3],r2            @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmull.s16   q6,d3,d23               @row 2 accumulation
    vld1.16     {d0},[r0]!              @src_tmp1 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q6,d2,d22
    vld1.16     {d2},[r3],r2            @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    add         r14,r1,r6               @pu1_dst_tmp = pu1_dst + dst_strd
    vsub.s32    q5, q5, q15             @remove the offset from row 1
    vshrn.s32   d8, q4, #6              @narrow row 0: (sum - 0x80000) >> 6, 16-bit
    @vqrshrun.s16 d8,q4,#6              @sto_res = vqmovun_s16(sto_res_tmp)

    vmull.s16   q7,d4,d23               @row 3 accumulation
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vld1.16     {d3},[r3],r2            @src_tmp4 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d7,d26
    vld1.16     {d4},[r3],r2            @src_tmp1 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d16,d27
    vld1.16     {d5},[r3],r2            @src_tmp2 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d17,d28
    vld1.16     {d6},[r3],r2            @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d18,d29
    vld1.16     {d7},[r3],r2            @src_tmp4 = vld1_s16(pu1_src_tmp)

    vst1.32     {d8},[r1]!              @store row 0 (4 x 16-bit results)
    vsub.s32    q6, q6, q15             @remove the offset from row 2
    vshrn.s32   d10, q5, #6             @narrow row 1
    @vqrshrun.s16 d10,q5,#6             @sto_res = vqmovun_s16(sto_res_tmp)
    addle       r1,r1,r9                @band finished: move dst to the next 4-row band

    subs        r7,r7,#4                @four 4x1 blocks consumed


    blt         epilog_end              @jumps to epilog_end
    beq         epilog                  @jumps to epilog

kernel_8:

    vmull.s16   q4,d1,d23               @mul_res1 = vmull_s16(src_tmp2, coeffabs_1)
    subs        r4,r4,#4                @4 columns of this band consumed
    vmlal.s16   q4,d0,d22               @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_0)
    addle       r0,r0,r8,lsl #0         @band finished: move src to the next 4-row band
    vmlal.s16   q4,d2,d24               @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_2)
    vmlal.s16   q4,d3,d25               @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_3)
    vmlal.s16   q4,d4,d26               @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_4)
    vmlal.s16   q4,d5,d27               @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeffabs_5)
    vmlal.s16   q4,d6,d28               @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_6)
    vmlal.s16   q4,d7,d29               @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_7)
    vst1.32     {d10},[r14],r6          @store row 1 of the previous block

    vsub.s32    q7, q7, q15             @remove the offset from previous row 3
    vshrn.s32   d12, q6, #6             @narrow previous row 2
    @vqrshrun.s16 d12,q6,#6
    vld1.16     {d16},[r3],r2           @src_tmp1 = vld1_s16(pu1_src_tmp)

    vmull.s16   q5,d2,d23               @mul_res2 = vmull_s16(src_tmp3, coeffabs_1)
    vmlal.s16   q5,d1,d22               @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_0)
    vmlal.s16   q5,d3,d24               @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_2)
    vmlal.s16   q5,d4,d25               @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_3)
    vmlal.s16   q5,d5,d26               @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_4)
    vmlal.s16   q5,d6,d27               @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeffabs_5)
    vst1.32     {d12},[r14],r6          @store previous row 2

    vmlal.s16   q5,d7,d28               @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_6)
    vld1.16     {d17},[r3],r2           @src_tmp2 = vld1_s16(pu1_src_tmp)

    vmlal.s16   q5,d16,d29              @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_7)

    vsub.s32    q4, q4, q15             @remove the offset from row 0
    vshrn.s32   d14, q7, #6             @narrow previous row 3
    @vqrshrun.s16 d14,q7,#6

    vmull.s16   q6,d3,d23               @row 2 accumulation
    movle       r4,r5                   @reload the column countdown (r5 = wd)

    vmlal.s16   q6,d2,d22
    vld1.16     {d18},[r3],r2           @src_tmp3 = vld1_s16(pu1_src_tmp)

    vmlal.s16   q6,d4,d24
    add         r3,r0,r2                @pu1_src_tmp += src_strd

    vmlal.s16   q6,d5,d25

    vmlal.s16   q6,d6,d26
    vst1.32     {d14},[r14],r6          @store previous row 3

    vmlal.s16   q6,d7,d27
    vld1.16     {d1},[r3],r2            @src_tmp2 = vld1_s16(pu1_src_tmp)

    vmlal.s16   q6,d16,d28
    add         r14,r1,r6               @pu1_dst_tmp = pu1_dst + dst_strd

    vmlal.s16   q6,d17,d29
    vld1.16     {d0},[r0]!              @src_tmp1 = vld1_s16(pu1_src_tmp)

    vsub.s32    q5, q5, q15             @remove the offset from row 1
    vshrn.s32   d8, q4, #6              @narrow row 0
    @vqrshrun.s16 d8,q4,#6              @sto_res = vqmovun_s16(sto_res_tmp)
    vld1.16     {d2},[r3],r2            @src_tmp3 = vld1_s16(pu1_src_tmp)

    vmull.s16   q7,d4,d23               @row 3 accumulation
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vld1.16     {d3},[r3],r2            @src_tmp4 = vld1_s16(pu1_src_tmp)

    vmlal.s16   q7,d6,d25
    vld1.16     {d4},[r3],r2            @src_tmp1 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d7,d26
    vld1.16     {d5},[r3],r2            @src_tmp2 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d16,d27
    vld1.16     {d6},[r3],r2            @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d17,d28
    vld1.16     {d7},[r3],r2            @src_tmp4 = vld1_s16(pu1_src_tmp)
    vmlal.s16   q7,d18,d29
    vst1.32     {d8},[r1]!              @store row 0

    vsub.s32    q6, q6, q15             @remove the offset from row 2
    vshrn.s32   d10, q5, #6             @narrow row 1
    addle       r1,r1,r9                @band finished: move dst to the next 4-row band

    @vqrshrun.s16 d10,q5,#6             @sto_res = vqmovun_s16(sto_res_tmp)
    subs        r7,r7,#4                @four more 4x1 blocks consumed

    bgt         kernel_8                @jumps to kernel_8

epilog:

    vmull.s16   q4,d1,d23               @mul_res1 = vmull_s16(src_tmp2, coeffabs_1)
    vmlal.s16   q4,d0,d22               @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_0)
    vmlal.s16   q4,d2,d24               @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_2)
    vmlal.s16   q4,d3,d25               @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_3)
    vmlal.s16   q4,d4,d26               @mul_res1 = vmlal_s16(mul_res1, src_tmp1, coeffabs_4)
    vmlal.s16   q4,d5,d27               @mul_res1 = vmlal_s16(mul_res1, src_tmp2, coeffabs_5)
    vmlal.s16   q4,d6,d28               @mul_res1 = vmlal_s16(mul_res1, src_tmp3, coeffabs_6)
    vmlal.s16   q4,d7,d29               @mul_res1 = vmlal_s16(mul_res1, src_tmp4, coeffabs_7)
    vst1.32     {d10},[r14],r6          @store pending row 1

    vsub.s32    q7, q7, q15             @remove the offset from pending row 3
    vshrn.s32   d12, q6, #6             @narrow pending row 2
    @vqrshrun.s16 d12,q6,#6

    vld1.16     {d16},[r3],r2           @src_tmp1 = vld1_s16(pu1_src_tmp)
    vmull.s16   q5,d2,d23               @mul_res2 = vmull_s16(src_tmp3, coeffabs_1)
    vmlal.s16   q5,d1,d22               @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_0)
    vmlal.s16   q5,d3,d24               @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_2)
    vmlal.s16   q5,d4,d25               @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_3)
    vmlal.s16   q5,d5,d26               @mul_res2 = vmlal_s16(mul_res2, src_tmp2, coeffabs_4)
    vmlal.s16   q5,d6,d27               @mul_res2 = vmlal_s16(mul_res2, src_tmp3, coeffabs_5)
    vmlal.s16   q5,d7,d28               @mul_res2 = vmlal_s16(mul_res2, src_tmp4, coeffabs_6)
    vmlal.s16   q5,d16,d29              @mul_res2 = vmlal_s16(mul_res2, src_tmp1, coeffabs_7)
    vst1.32     {d12},[r14],r6          @store pending row 2

    vsub.s32    q4, q4, q15             @remove the offset from row 0
    vshrn.s32   d14, q7, #6             @narrow pending row 3
    @vqrshrun.s16 d14,q7,#6

    vld1.16     {d17},[r3],r2           @src_tmp2 = vld1_s16(pu1_src_tmp)
    vmull.s16   q6,d3,d23               @row 2 accumulation
    vmlal.s16   q6,d2,d22
    vmlal.s16   q6,d4,d24
    vmlal.s16   q6,d5,d25
    vmlal.s16   q6,d6,d26
    vmlal.s16   q6,d7,d27
    vmlal.s16   q6,d16,d28
    vmlal.s16   q6,d17,d29
    vst1.32     {d14},[r14],r6          @store pending row 3
    vsub.s32    q5, q5, q15             @remove the offset from row 1
    vshrn.s32   d8, q4, #6              @narrow row 0 of the final block
    @vqrshrun.s16 d8,q4,#6              @sto_res = vqmovun_s16(sto_res_tmp)

    vld1.16     {d18},[r3],r2           @src_tmp3 = vld1_s16(pu1_src_tmp)
    vmull.s16   q7,d4,d23               @row 3 accumulation
    vmlal.s16   q7,d3,d22
    vmlal.s16   q7,d5,d24
    vmlal.s16   q7,d6,d25
    vmlal.s16   q7,d7,d26
    vmlal.s16   q7,d16,d27
    vmlal.s16   q7,d17,d28
    vmlal.s16   q7,d18,d29
    vsub.s32    q6, q6, q15             @remove the offset from row 2
    vshrn.s32   d10, q5, #6             @narrow row 1 of the final block
    @vqrshrun.s16 d10,q5,#6             @sto_res = vqmovun_s16(sto_res_tmp)

    add         r14,r1,r6               @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.32     {d8},[r1]!              @store row 0 of the final block

epilog_end:
    vst1.32     {d10},[r14],r6          @store row 1 of the final block
    vshrn.s32   d12, q6, #6             @narrow row 2 (q6 offset already removed)
    @vqrshrun.s16 d12,q6,#6

    vst1.32     {d12},[r14],r6          @store row 2
    vsub.s32    q7, q7, q15             @remove the offset from row 3
    vshrn.s32   d14, q7, #6             @narrow row 3
    @vqrshrun.s16 d14,q7,#6

    vst1.32     {d14},[r14],r6          @store row 3


end_loops:

    vpop        {d8 - d15}              @restore callee-saved NEON registers
    ldmfd       sp!,{r4-r12,r15}        @restore registers and return (pc <- saved lr)