@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@******************************************************************************
@* @file
@*  ihevc_inter_pred_luma_horz_w16out.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon assembly and can be compiled using rvct
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*
@*  - ihevc_inter_pred_luma_horz_w16out()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  interprediction luma filter for horizontal 16bit output
@*
@* @par description:
@*  applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*  to the elements pointed to by 'pu1_src' and writes the result to the
@*  location pointed to by 'pi2_dst'. no downshifting or clipping is done,
@*  so the output can be used as an input for vertical filtering or
@*  weighted prediction.
@*
@* @par assumptions:
@*  the function is optimized assuming wd is a multiple of 4 or 8. if wd is
@*  a multiple of 4 then ht must be a multiple of 2; the width-8 path is
@*  optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
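@/**
@* reference model (not part of the build): a minimal c sketch of what the
@* assembly below computes, assuming the standard 8-tap mac with the source
@* pointer rewound by 3 and raw 16-bit sums (no rounding, shift or clip).
@* names follow the prototype below; 'row', 'col', 'k' and 'sum' are
@* illustrative only.
@*
@*  void ihevc_inter_pred_luma_horz_w16out_ref(uword8 *pu1_src,
@*                                             word16 *pi2_dst,
@*                                             word32 src_strd,
@*                                             word32 dst_strd,
@*                                             word8 *pi1_coeff,
@*                                             word32 ht,
@*                                             word32 wd)
@*  {
@*      word32 row, col, k;
@*      for(row = 0; row < ht; row++)
@*      {
@*          for(col = 0; col < wd; col++)
@*          {
@*              word32 sum = 0;
@*              for(k = 0; k < 8; k++)              /* 8 filter taps      */
@*                  sum += pi1_coeff[k] * pu1_src[col + k - 3];
@*              pi2_dst[col] = (word16)sum;         /* no >> 6, no clip   */
@*          }
@*          pu1_src += src_strd;                    /* strides in elements */
@*          pi2_dst += dst_strd;
@*      }
@*  }
@*/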

@void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
@                                       word16 *pi2_dst,
@                                       word32 src_strd,
@                                       word32 dst_strd,
@                                       word8 *pi1_coeff,
@                                       word32 ht,
@                                       word32 wd)


@r0 - free
@r1 - dst_ptr
@r2 - src_strd
@r3 - dst_strd
@r4 - src_ptr2
@r5 - inner loop counter
@r6 - dst_ptr2
@r7 - free
@r8 - dst_strd2
@r9 - src_strd1
@r10 - wd
@r11 - #1
@r12 - src_ptr1
@r14 - loop_counter

.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4
.syntax unified


.globl ihevc_inter_pred_luma_horz_w16out_a9q

.type ihevc_inter_pred_luma_horz_w16out_a9q, %function

ihevc_inter_pred_luma_horz_w16out_a9q:

    bic       r14, #1                   @clearing bit[0], so that the return stays in arm mode
    stmfd     sp!, {r4-r12, r14}        @stack stores the values of the arguments
    vpush     {d8 - d15}
    ldr       r4,[sp,#coeff_offset]     @loads pi1_coeff
    ldr       r7,[sp,#ht_offset]        @loads ht


    vld1.8    {d0},[r4]                 @coeff = vld1_s8(pi1_coeff)
    sub       r14,r7,#0                 @r14 = ht (loop counter)
    vabs.s8   d2,d0                     @vabs_s8(coeff)
    mov       r11,#1
    ldr       r10,[sp,#wd_offset]       @loads wd
    vdup.8    d24,d2[0]                 @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub       r12,r0,#3                 @pu1_src - 3
    vdup.8    d25,d2[1]                 @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add       r4,r12,r2                 @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8    d26,d2[2]                 @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    rsb       r9,r10,r2,lsl #1          @2*src_strd - wd
    vdup.8    d27,d2[3]                 @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    rsb       r8,r10,r3                 @dst_strd - wd
    vdup.8    d28,d2[4]                 @coeffabs_4 = vdup_lane_u8(coeffabs, 4)

    vdup.8    d29,d2[5]                 @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    and       r7,r14,#1                 @ht_residue = (ht & 1)
    vdup.8    d30,d2[6]                 @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    sub       r14,r14,r7                @decrement height by ht_residue (residue rows are handled separately)
    vdup.8    d31,d2[7]                 @coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    cmp       r7,#1
    beq       odd_height_decision

even_height_decision:
    mov       r7,r1
    cmp       r10,#4
    ble       outer_loop_4

    cmp       r10,#24
    moveq     r10,#16
    addeq     r8,#8
    addeq     r9,#8

    cmp       r10,#16
    bge       outer_loop_16_branch

    cmp       r10,#12
    addeq     r8,#4
    addeq     r9,#4
outer_loop_8_branch:
    b         outer_loop_8

outer_loop_16_branch:
    b         outer_loop_16


odd_height_decision:
    cmp       r10,#24
    beq       outer_loop_8_branch
    cmp       r10,#12
    beq       outer_loop_4
    b         even_height_decision

outer_loop4_residual:
    sub       r12,r0,#3                 @pu1_src - 3
    mov       r1,r7
    add       r1,#16
    mov       r10,#4
    add       r12,#8
    mov       r14,#16
    add       r8,#4
    add       r9,#4

outer_loop_4:
    add       r6,r1,r3,lsl #1           @pu1_dst + dst_strd
    add       r4,r12,r2                 @pu1_src + src_strd

    subs      r5,r10,#0                 @checks wd
    ble       end_inner_loop_4
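@ the wd-multiple-of-4 path below processes two rows per iteration. rather
@ than using vext (the commented-out alternative retained below), the eight
@ shifted source windows are loaded directly by post-incrementing the source
@ pointer by 1 (r11). after vzip.32, the low 32 bits of d0-d7 hold row n's
@ windows and the high 32 bits hold row n+1's, so each 8-lane multiply
@ produces four results for each of the two rows (d8 and d9).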
inner_loop_4:
    vld1.u32  {d0},[r12],r11            @vector load pu1_src
    vld1.u32  {d1},[r12],r11
    vld1.u32  {d2},[r12],r11
    vld1.u32  {d3},[r12],r11
    vld1.u32  {d4},[r12],r11
    vld1.u32  {d5},[r12],r11
    vld1.u32  {d6},[r12],r11
    vld1.u32  {d7},[r12],r11
    @add      r12,r12,#4                @increment the input pointer
    sub       r12,r12,#4
    @vext.u8  d2,d0,d1,#2               @vector extract of src[0_2]
    @vext.u8  d3,d0,d1,#3               @vector extract of src[0_3]
    @vext.u8  d4,d0,d1,#4               @vector extract of src[0_4]
    @vext.u8  d5,d0,d1,#5               @vector extract of src[0_5]
    @vext.u8  d6,d0,d1,#6               @vector extract of src[0_6]
    @vext.u8  d7,d0,d1,#7               @vector extract of src[0_7]
    @vext.u8  d1,d0,d1,#1               @vector extract of src[0_1]
    vld1.u32  {d12},[r4],r11            @vector load pu1_src + src_strd
    vld1.u32  {d13},[r4],r11
    vzip.32   d0,d12                    @vector zip the i and ii iterations in a single register
    vld1.u32  {d14},[r4],r11
    vzip.32   d1,d13
    vld1.u32  {d15},[r4],r11
    vzip.32   d2,d14
    vld1.u32  {d16},[r4],r11
    vzip.32   d3,d15
    vld1.u32  {d17},[r4],r11
    vzip.32   d4,d16
    vld1.u32  {d18},[r4],r11
    vzip.32   d5,d17
    vld1.u32  {d19},[r4],r11
    sub       r4,r4,#4
    @add      r4,r4,#4                  @increment the input pointer
    @vext.u8  d14,d12,d13,#2            @vector extract of src[0_2]
    @vext.u8  d15,d12,d13,#3            @vector extract of src[0_3]
    @vext.u8  d16,d12,d13,#4            @vector extract of src[0_4]
    @vext.u8  d17,d12,d13,#5            @vector extract of src[0_5]
    @vext.u8  d18,d12,d13,#6            @vector extract of src[0_6]
    @vext.u8  d19,d12,d13,#7            @vector extract of src[0_7]
    @vext.u8  d13,d12,d13,#1            @vector extract of src[0_1]

    vzip.32   d6,d18
    vzip.32   d7,d19

    vmull.u8  q4,d1,d25                 @arithmetic operations for both iterations at the same time
    vmlsl.u8  q4,d0,d24
    vmlsl.u8  q4,d2,d26
    vmlal.u8  q4,d3,d27
    vmlal.u8  q4,d4,d28
    vmlsl.u8  q4,d5,d29
    vmlal.u8  q4,d6,d30
    vmlsl.u8  q4,d7,d31

    @vqrshrun.s16 d8,q4,#6              @narrow right shift and saturate (not done for w16out)
    vst1.64   {d8},[r1]!                @store the i iteration result, which is in the lower part of the register
    vst1.64   {d9},[r6]!                @store the ii iteration result, which is in the upper part of the register
    subs      r5,r5,#4                  @decrement wd by 4
    bgt       inner_loop_4

end_inner_loop_4:
    subs      r14,r14,#2                @decrement ht by 2
    add       r12,r12,r9                @increment the input pointer by 2*src_strd-wd
    add       r1,r6,r8,lsl #1           @increment the output pointer by 2*dst_strd-wd
    bgt       outer_loop_4

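@ residue pass for odd heights: filters the single remaining row, four
@ pixels per iteration, using the same shifted-load scheme as above but
@ without the second-row interleave.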

height_residue_4:

    ldr       r7,[sp,#ht_offset]        @loads ht
    and       r7,r7,#1                  @ht_residue = (ht & 1)
    cmp       r7,#0
    beq       end_loops

outer_loop_height_residue_4:

    subs      r5,r10,#0                 @checks wd
    ble       end_inner_loop_height_residue_4

inner_loop_height_residue_4:
    vld1.u32  {d0},[r12],r11            @vector load pu1_src
    vld1.u32  {d1},[r12],r11

    @vext.u8  d2,d0,d1,#2               @vector extract of src[0_2]
    @vext.u8  d3,d0,d1,#3               @vector extract of src[0_3]
    @vext.u8  d4,d0,d1,#4               @vector extract of src[0_4]
    @add      r12,r12,#4                @increment the input pointer
    @vext.u8  d5,d0,d1,#5               @vector extract of src[0_5]
    @vext.u8  d6,d0,d1,#6               @vector extract of src[0_6]
    @vext.u8  d7,d0,d1,#7               @vector extract of src[0_7]
    @vext.u8  d1,d0,d1,#1               @vector extract of src[0_1]
    vld1.u32  {d2},[r12],r11
    vmull.u8  q4,d1,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32  {d3},[r12],r11
    vmlsl.u8  q4,d0,d24
    vld1.u32  {d4},[r12],r11
    vmlsl.u8  q4,d2,d26
    vld1.u32  {d5},[r12],r11
    vmlal.u8  q4,d3,d27
    vld1.u32  {d6},[r12],r11
    vmlal.u8  q4,d4,d28
    vld1.u32  {d7},[r12],r11
    vmlsl.u8  q4,d5,d29
    sub       r12,r12,#4
    vmlal.u8  q4,d6,d30
    vmlsl.u8  q4,d7,d31
    subs      r5,r5,#4                  @decrement wd by 4
    vst1.64   {d8},[r1]!                @store the result
    bgt       inner_loop_height_residue_4

end_inner_loop_height_residue_4:
    subs      r7,r7,#1                  @decrement ht by 1
    rsb       r9,r10,r2
    add       r12,r12,r9                @increment the input pointer by src_strd-wd
    add       r1,r1,r8                  @increment the output pointer by dst_strd-wd
    bgt       outer_loop_height_residue_4
    vpop      {d8 - d15}
    ldmfd     sp!,{r4-r12,r15}          @reload the registers from sp

outer_loop8_residual:
    sub       r12,r0,#3                 @pu1_src - 3
    mov       r1,r7
    mov       r14,#32
    add       r1,#32
    add       r12,#16
    mov       r10,#8
    add       r8,#8
    add       r9,#8

outer_loop_8:

    add       r6,r1,r3,lsl #1           @pu1_dst + dst_strd
    add       r4,r12,r2                 @pu1_src + src_strd
    subs      r5,r10,#0                 @checks wd

    ble       end_inner_loop_8
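@ the wd-multiple-of-8 path also processes two rows per pass: q4 accumulates
@ row n and q5 row n+1, with row n+1's loads interleaved between row n's
@ multiply-accumulates to hide load latency. full 16-bit q-registers are
@ stored (vst1.16), since no narrowing is done for the w16out variant.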
inner_loop_8:
    vld1.u32  {d0},[r12],r11            @vector load pu1_src
    vld1.u32  {d1},[r12],r11
    vld1.u32  {d2},[r12],r11
    vld1.u32  {d3},[r12],r11

    @vext.u8  d2,d0,d1,#2               @vector extract of src[0_2]
    @vext.u8  d3,d0,d1,#3               @vector extract of src[0_3]
    @vext.u8  d4,d0,d1,#4               @vector extract of src[0_4]
    @vext.u8  d5,d0,d1,#5               @vector extract of src[0_5]
    @vext.u8  d6,d0,d1,#6               @vector extract of src[0_6]
    @vext.u8  d7,d0,d1,#7               @vector extract of src[0_7]
    @vext.u8  d1,d0,d1,#1               @vector extract of src[0_1]
    @vext.u8  d14,d12,d13,#2            @vector extract of src[0_2]
    @vext.u8  d15,d12,d13,#3            @vector extract of src[0_3]
    @vext.u8  d16,d12,d13,#4            @vector extract of src[0_4]
    @vext.u8  d17,d12,d13,#5            @vector extract of src[0_5]
    @vext.u8  d18,d12,d13,#6            @vector extract of src[0_6]
    @vext.u8  d19,d12,d13,#7            @vector extract of src[0_7]
    @vext.u8  d13,d12,d13,#1            @vector extract of src[0_1]
    vld1.u32  {d4},[r12],r11
    vmull.u8  q4,d1,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32  {d5},[r12],r11
    vmlal.u8  q4,d3,d27                 @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)
    vld1.u32  {d6},[r12],r11
    vmlsl.u8  q4,d0,d24                 @mul_res = vmlsl_u8(mul_res, src[0_0], coeffabs_0)
    vld1.u32  {d7},[r12],r11
    vmlsl.u8  q4,d2,d26                 @mul_res = vmlsl_u8(mul_res, src[0_2], coeffabs_2)
    vld1.u32  {d12},[r4],r11            @vector load pu1_src + src_strd
    vmlal.u8  q4,d4,d28                 @mul_res = vmlal_u8(mul_res, src[0_4], coeffabs_4)
    vld1.u32  {d13},[r4],r11
    vmlsl.u8  q4,d5,d29                 @mul_res = vmlsl_u8(mul_res, src[0_5], coeffabs_5)
    vld1.u32  {d14},[r4],r11
    vmlal.u8  q4,d6,d30                 @mul_res = vmlal_u8(mul_res, src[0_6], coeffabs_6)
    vld1.u32  {d15},[r4],r11
    vmlsl.u8  q4,d7,d31                 @mul_res = vmlsl_u8(mul_res, src[0_7], coeffabs_7)
    vld1.u32  {d16},[r4],r11            @vector load pu1_src + src_strd

    vmull.u8  q5,d15,d27                @mul_res = vmull_u8(src[0_3], coeffabs_3)
    vld1.u32  {d17},[r4],r11
    vmlsl.u8  q5,d14,d26                @mul_res = vmlsl_u8(mul_res, src[0_2], coeffabs_2)
    vld1.u32  {d18},[r4],r11
    vmlal.u8  q5,d16,d28                @mul_res = vmlal_u8(mul_res, src[0_4], coeffabs_4)
    vld1.u32  {d19},[r4],r11            @vector load pu1_src + src_strd
    vmlsl.u8  q5,d17,d29                @mul_res = vmlsl_u8(mul_res, src[0_5], coeffabs_5)
    @vqrshrun.s16 d20,q4,#6             @right shift and saturating narrow result 1 (not done for w16out)
    vmlal.u8  q5,d18,d30                @mul_res = vmlal_u8(mul_res, src[0_6], coeffabs_6)
    vmlsl.u8  q5,d19,d31                @mul_res = vmlsl_u8(mul_res, src[0_7], coeffabs_7)
    vst1.16   {q4},[r1]!                @store the result pu1_dst
    vmlsl.u8  q5,d12,d24                @mul_res = vmlsl_u8(mul_res, src[0_0], coeffabs_0)
    vmlal.u8  q5,d13,d25                @mul_res = vmlal_u8(mul_res, src[0_1], coeffabs_1)

    @vqrshrun.s16 d8,q5,#6              @right shift and saturating narrow result 2 (not done for w16out)
    subs      r5,r5,#8                  @decrement the wd loop
    vst1.16   {q5},[r6]!                @store the result pu1_dst
    cmp       r5,#4
    bgt       inner_loop_8

end_inner_loop_8:
    subs      r14,r14,#2                @decrement the ht loop
    add       r12,r12,r9                @increment the src pointer by 2*src_strd-wd
    add       r1,r6,r8,lsl #1           @increment the dst pointer by 2*dst_strd-wd
    bgt       outer_loop_8


    ldr       r10,[sp,#wd_offset]       @loads wd
    cmp       r10,#12

    beq       outer_loop4_residual

    ldr       r7,[sp,#ht_offset]        @loads ht
    and       r7,r7,#1
    cmp       r7,#1
    beq       height_residue_4


    vpop      {d8 - d15}
    ldmfd     sp!,{r4-r12,r15}          @reload the registers from sp


outer_loop_16:
    str       r0, [sp, #-4]!
    str       r7, [sp, #-4]!
    add       r6,r1,r3,lsl #1           @pu1_dst + dst_strd
    add       r4,r12,r2                 @pu1_src + src_strd
    and       r0, r12, #31
    sub       r5,r10,#0                 @checks wd
    pld       [r12, r2, lsl #1]
    vld1.u32  {q0},[r12],r11            @vector load pu1_src
    pld       [r4, r2, lsl #1]
    vld1.u32  {q1},[r12],r11
    vld1.u32  {q2},[r12],r11
    vld1.u32  {q3},[r12],r11
    vld1.u32  {q6},[r12],r11
    vmull.u8  q4,d2,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32  {q7},[r12],r11
    vmlal.u8  q4,d6,d27                 @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)
    vld1.u32  {q8},[r12],r11
    vmlsl.u8  q4,d0,d24                 @mul_res = vmlsl_u8(mul_res, src[0_0], coeffabs_0)
    vld1.u32  {q9},[r12],r11
    vmlsl.u8  q4,d4,d26                 @mul_res = vmlsl_u8(mul_res, src[0_2], coeffabs_2)
    vmlal.u8  q4,d12,d28                @mul_res = vmlal_u8(mul_res, src[0_4], coeffabs_4)
    vmlsl.u8  q4,d14,d29                @mul_res = vmlsl_u8(mul_res, src[0_5], coeffabs_5)
    vmlal.u8  q4,d16,d30                @mul_res = vmlal_u8(mul_res, src[0_6], coeffabs_6)
    vmlsl.u8  q4,d18,d31                @mul_res = vmlsl_u8(mul_res, src[0_7], coeffabs_7)

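@ the 16-wide loop is software pipelined: the prologue above computes the
@ first half-row result (q4) while the loop body below overlaps the next
@ row's loads (via r4) with the remaining multiply-accumulates. the
@ eq-conditional adds advance the src/dst pointers once a row completes
@ (r5 reaching 0 in the subs at the top of the loop).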
inner_loop_16:

    subs      r5,r5,#16
    vmull.u8  q10,d3,d25

    add       r12,#8
    vmlsl.u8  q10,d1,d24

    vld1.u32  {q0},[r4],r11             @vector load pu1_src + src_strd
    vmlal.u8  q10,d7,d27

    vld1.u32  {q1},[r4],r11
    vmlsl.u8  q10,d5,d26

    vld1.u32  {q2},[r4],r11
    vmlal.u8  q10,d13,d28

    vld1.u32  {q3},[r4],r11
    vmlal.u8  q10,d17,d30

    vld1.u32  {q6},[r4],r11
    vmlsl.u8  q10,d15,d29

    vld1.u32  {q7},[r4],r11
    vmlsl.u8  q10,d19,d31

    vld1.u32  {q8},[r4],r11
    vmull.u8  q5,d2,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32  {q9},[r4],r11
    vmlal.u8  q5,d6,d27                 @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)

    add       r4,#8
    vmlsl.u8  q5,d0,d24                 @mul_res = vmlsl_u8(mul_res, src[0_0], coeffabs_0)
    pld       [r12, r2, lsl #2]
    pld       [r4, r2, lsl #2]
    vst1.8    {q4},[r1]!                @store the result pu1_dst
    vmlsl.u8  q5,d4,d26                 @mul_res = vmlsl_u8(mul_res, src[0_2], coeffabs_2)

    addeq     r12,r12,r9                @increment the src pointer by 2*src_strd-wd
    vmlal.u8  q5,d12,d28                @mul_res = vmlal_u8(mul_res, src[0_4], coeffabs_4)

    addeq     r4,r12,r2                 @pu1_src + src_strd
    vmlsl.u8  q5,d14,d29                @mul_res = vmlsl_u8(mul_res, src[0_5], coeffabs_5)

    @and      r7, r12, #31
    vmlal.u8  q5,d16,d30                @mul_res = vmlal_u8(mul_res, src[0_6], coeffabs_6)

    subeq     r14,r14,#2
    vmlsl.u8  q5,d18,d31                @mul_res = vmlsl_u8(mul_res, src[0_7], coeffabs_7)

    @cmp      r7, r0
    vmull.u8  q11,d3,d25

    @pld      [r12, r2, lsl #2]
    vmlsl.u8  q11,d1,d24

    vst1.16   {q10},[r1]!
    vmlal.u8  q11,d7,d27

    @pld      [r4, r2, lsl #2]
    vmlsl.u8  q11,d5,d26

    @mov      r0, r7
    vmlal.u8  q11,d13,d28

    cmp       r14,#0
    vmlal.u8  q11,d17,d30

    vst1.16   {q5},[r6]!
    vmlsl.u8  q11,d15,d29

    vmlsl.u8  q11,d19,d31

    beq       epilog_16

    vld1.u32  {q0},[r12],r11            @vector load pu1_src
    vld1.u32  {q1},[r12],r11
    vld1.u32  {q2},[r12],r11
    vld1.u32  {q3},[r12],r11
    vld1.u32  {q6},[r12],r11
    vmull.u8  q4,d2,d25                 @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32  {q7},[r12],r11
    vmlal.u8  q4,d6,d27                 @mul_res = vmlal_u8(mul_res, src[0_3], coeffabs_3)
    vld1.u32  {q8},[r12],r11
    vmlsl.u8  q4,d0,d24                 @mul_res = vmlsl_u8(mul_res, src[0_0], coeffabs_0)
    vld1.u32  {q9},[r12],r11
    vmlsl.u8  q4,d4,d26                 @mul_res = vmlsl_u8(mul_res, src[0_2], coeffabs_2)
    vmlal.u8  q4,d12,d28                @mul_res = vmlal_u8(mul_res, src[0_4], coeffabs_4)
    cmp       r5,#0
    vmlsl.u8  q4,d14,d29                @mul_res = vmlsl_u8(mul_res, src[0_5], coeffabs_5)
    moveq     r5,r10
    vmlal.u8  q4,d16,d30                @mul_res = vmlal_u8(mul_res, src[0_6], coeffabs_6)
    vst1.8    {q11},[r6]!               @store the result pu1_dst
    vmlsl.u8  q4,d18,d31                @mul_res = vmlsl_u8(mul_res, src[0_7], coeffabs_7)
    addeq     r1,r6,r8,lsl #1
    addeq     r6,r1,r3,lsl #1           @pu1_dst + dst_strd
    b         inner_loop_16

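@ epilog: flush the last result still in flight when the height counter hit
@ zero, then re-dispatch any residual columns (wd == 24 falls back to the
@ 8-wide path) and any residual odd row (height_residue_4).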
epilog_16:
    @vqrshrun.s16 d11,q11,#6
    vst1.8    {q11},[r6]!               @store the result pu1_dst

    ldr       r7, [sp], #4
    ldr       r0, [sp], #4
    ldr       r10,[sp,#wd_offset]
    cmp       r10,#24
    beq       outer_loop8_residual
    add       r1,r6,r8,lsl #1
    ldr       r7,[sp,#ht_offset]        @loads ht
    and       r7,r7,#1
    cmp       r7,#1
    beq       height_residue_4

end_loops:
    vpop      {d8 - d15}
    ldmfd     sp!,{r4-r12,r15}          @reload the registers from sp