@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded using neon intrinsics and can be compiled using
@*  rvct
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma interprediction filter for horizontal input
@*
@* @par description:
@*  applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
@*  to the elements pointed to by 'pu1_src' and writes the result to the
@*  location pointed to by 'pu1_dst'. the output is downshifted by 6 and
@*  clipped to 8 bits.
@*  assumptions: the function is optimized assuming the width is a multiple
@*  of 2, 4 or 8. if the width is 2, the height must be a multiple of 2.
@*  widths of 4 and 8 are optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
@                                  uword8 *pu1_dst,
@                                  word32 src_strd,
@                                  word32 dst_strd,
@                                  word8 *pi1_coeff,
@                                  word32 ht,
@                                  word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 => src_strd
@r3 => dst_strd
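@ For reference, a minimal scalar C sketch of the operation this routine
@ vectorizes (illustrative only, not part of the build; CLIP_U8 is a
@ hypothetical clip-to-[0,255] helper, and the column step of 2 reflects
@ the interleaved cb/cr layout):
@
@   for(row = 0; row < ht; row++)
@   {
@       for(col = 0; col < 2 * wd; col++)
@       {
@           word32 sum = pi1_coeff[0] * pu1_src[col - 2]
@                      + pi1_coeff[1] * pu1_src[col]
@                      + pi1_coeff[2] * pu1_src[col + 2]
@                      + pi1_coeff[3] * pu1_src[col + 4];
@           pu1_dst[col] = CLIP_U8((sum + 32) >> 6); /* round, shift, clip */
@       }
@       pu1_src += src_strd;
@       pu1_dst += dst_strd;
@   }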
.text
.align 4

.globl ihevc_inter_pred_chroma_horz_a9q

.type ihevc_inter_pred_chroma_horz_a9q, %function

ihevc_inter_pred_chroma_horz_a9q:

    stmfd sp!, {r4-r12, r14}    @stack stores the values of the arguments

    ldr r4,[sp,#40]             @loads pi1_coeff
    ldr r7,[sp,#44]             @loads ht
    ldr r10,[sp,#48]            @loads wd

    vld1.8 {d0},[r4]            @coeff = vld1_s8(pi1_coeff)
    subs r14,r7,#0              @checks for ht == 0
    vabs.s8 d2,d0               @vabs_s8(coeff)
    mov r11,#2                  @increment between two interleaved chroma samples
    ble end_loops

    vdup.8 d24,d2[0]            @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub r12,r0,#2               @pu1_src - 2
    vdup.8 d25,d2[1]            @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add r4,r12,r2               @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8 d26,d2[2]            @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst r10,#3                  @checks wd for multiples
    mov r5,r10,lsl #1           @2*wd

    vdup.8 d27,d2[3]            @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    bne outer_loop_4
    cmp r10,#12
    beq skip_16

    cmp r10,#8
    bge outer_loop_16
skip_16:
    tst r7,#3

    sub r9,r0,#2                @pu1_src - 2
    beq outer_loop_ht_4         @jumps to the ht-multiple-of-4 path

    b outer_loop_8

outer_loop_16:
    mov r10,r5                  @2wd
    mul r14,r14,r10

    rsb r6,r3,#16

    add r4,r12,r2
    mov r9,#10                  @last load post-increment: 3*2 + 10 = 16
    and r0, r12, #31
    rsb r8,r5,r3,lsl #1
    pld [r12, r2, lsl #1]

    vld1.u32 {q0},[r12],r11     @vector load pu1_src
    pld [r4, r2, lsl #1]
    vld1.u32 {q1},[r12],r11     @vector load pu1_src

    vld1.u32 {q2},[r12],r11     @vector load pu1_src

    vld1.u32 {q3},[r12],r9      @vector load pu1_src

    vmull.u8 q15,d2,d25         @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {q4},[r4],r11      @vector load pu1_src
    vmlsl.u8 q15,d0,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {q5},[r4],r11      @vector load pu1_src
    vmlal.u8 q15,d4,d26         @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vld1.u32 {q6},[r4],r11      @vector load pu1_src
    vmlsl.u8 q15,d6,d27         @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    vld1.u32 {q7},[r4],r9       @vector load pu1_src
    vmull.u8 q14,d3,d25

    vmlsl.u8 q14,d1,d24

    vmlal.u8 q14,d5,d26

    vmlsl.u8 q14,d7,d27

    cmp r14,#32
    beq epilog_end
    sub r14,#64
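@ Note on the tap arithmetic used throughout: the taps are made non-negative
@ with vabs and applied with a fixed sign pattern (vmull/vmlal for taps 1
@ and 2, vmlsl for taps 0 and 3), matching the negative-outer/positive-inner
@ sign layout of the hevc chroma filter sets. the 16-bit accumulators are
@ then narrowed with vqrshrun.s16 #6, i.e. dst = clip_u8((sum + 32) >> 6);
@ for example, a raw sum of 3470 narrows to (3470 + 32) >> 6 = 54.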
inner_loop_16:

@   bgt l_2

@   pld [r12, r2, lsl #1]
@   pld [r4, r2, lsl #1]

    subs r10,r10,#16

    vmull.u8 q11,d10,d25        @mul_res = vmull_u8(src[0_1], coeffabs_1)

    addeq r12,r12,r8
    addeq r4,r12,r2
    vmlsl.u8 q11,d8,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    pld [r12, r2, lsl #2]
    vqrshrun.s16 d30,q15,#6

    vld1.u32 {q0},[r12],r11     @vector load pu1_src
    vqrshrun.s16 d31,q14,#6

    vld1.u32 {q1},[r12],r11     @vector load pu1_src
    vmlal.u8 q11,d12,d26        @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32 {q2},[r12],r11     @vector load pu1_src
    vmlsl.u8 q11,d14,d27        @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32 {q3},[r12],r9      @vector load pu1_src
    vmull.u8 q10,d11,d25        @mul_res = vmull_u8(src[0_1], coeffabs_1)

    pld [r4, r2, lsl #2]
    vmlsl.u8 q10,d9,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vst1.16 {q15}, [r1],r3
    vmlal.u8 q10,d13,d26        @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32 {q4},[r4],r11      @vector load pu1_src
    vmlsl.u8 q10,d15,d27        @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32 {q5},[r4],r11      @vector load pu1_src
    vmull.u8 q15,d2,d25         @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {q6},[r4],r11      @vector load pu1_src
    vmlsl.u8 q15,d0,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32 {q7},[r4],r9       @vector load pu1_src
    vmlal.u8 q15,d4,d26         @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    cmp r10,#0
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vmlsl.u8 q15,d6,d27         @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    moveq r10,r5                @2wd
    vmull.u8 q14,d3,d25

    vst1.16 {q11},[r1],r6       @store the result pu1_dst
    vmlsl.u8 q14,d1,d24

    addeq r1,r1,r8
    vmlal.u8 q14,d5,d26

    subs r14,r14,#32            @decrement the ht loop
    vmlsl.u8 q14,d7,d27

@   mov r0, r7

    bgt inner_loop_16

    add r14,r14,#64
    cmp r14,#32
    beq epilog_end

epilog:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6

    vst1.16 {q15}, [r1],r3
    vmull.u8 q11,d10,d25        @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vmlsl.u8 q11,d8,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    subs r10,r10,#16            @decrement the wd loop
    vmlal.u8 q11,d12,d26        @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    addeq r12,r12,r8
    vmlsl.u8 q11,d14,d27        @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    moveq r10,r5                @2wd

    addeq r4,r12,r2
    vmull.u8 q10,d11,d25        @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {q0},[r12],r11     @vector load pu1_src
    vmlsl.u8 q10,d9,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {q1},[r12],r11     @vector load pu1_src
    vmlal.u8 q10,d13,d26        @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vld1.u32 {q2},[r12],r11     @vector load pu1_src
    vmlsl.u8 q10,d15,d27        @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    vld1.u32 {q3},[r12],r9      @vector load pu1_src
    vmull.u8 q15,d2,d25         @mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {q4},[r4],r11      @vector load pu1_src
    vmlsl.u8 q15,d0,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vld1.u32 {q5},[r4],r11      @vector load pu1_src
    vmlal.u8 q15,d4,d26         @mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vmlsl.u8 q15,d6,d27         @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32 {q6},[r4],r11      @vector load pu1_src
    vmull.u8 q14,d3,d25
    vld1.u32 {q7},[r4],r9       @vector load pu1_src
    vmlsl.u8 q14,d1,d24
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16 {q11},[r1],r6       @store the result pu1_dst
    vmlal.u8 q14,d5,d26

    vmlsl.u8 q14,d7,d27
    addeq r1,r1,r8

epilog_end:
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6

    vmull.u8 q11,d10,d25        @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8 q11,d8,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vmlal.u8 q11,d12,d26        @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8 q11,d14,d27        @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vmull.u8 q10,d11,d25        @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8 q10,d9,d24         @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    vmlal.u8 q10,d13,d26        @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8 q10,d15,d27        @mul_res = vmlsl_u8(src[0_3], coeffabs_3)
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16 {q15}, [r1],r3

    vst1.16 {q11},[r1]          @store the result pu1_dst

    b end_loops
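@ Pointer bookkeeping for the 16-wide path above, with illustrative values
@ (dst_strd = 64, 2*wd = 32): after the pair of row stores, r6 = 16 - 64 =
@ -48 steps the dst pointer from row 1 back to row 0 of the next 16-column
@ block, and r8 = 2*64 - 32 = 96 advances the dst pointer by two rows once
@ a full width has been processed. r14 counts ht * 2*wd samples and is
@ decremented by 32 per iteration (two rows of 16 samples).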
outer_loop_8:

    add r6,r1,r3                @pu1_dst + dst_strd
    mov r7,r5
    add r4,r12,r2               @pu1_src + src_strd

inner_loop_8:
    @vld1.u32 {d0,d1},[r12],r11 @vector load pu1_src
    vld1.u32 {d0},[r12],r11     @vector load pu1_src
    vld1.u32 {d1},[r12],r11     @vector load pu1_src
    vld1.u32 {d2},[r12],r11     @vector load pu1_src
    vld1.u32 {d3},[r12],r11     @vector load pu1_src

    @vext.u8 d2,d0,d1,#2        @vector extract of src[0_2]
    vmull.u8 q4,d1,d25          @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8 q4,d0,d24          @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    @vext.u8 d4,d0,d1,#4        @vector extract of src[0_4]
    @vext.u8 d6,d0,d1,#6        @vector extract of src[0_6]
    vmlal.u8 q4,d2,d26          @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8 q4,d3,d27          @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32 {d4},[r4],r11      @vector load pu1_src
    vld1.u32 {d5},[r4],r11      @vector load pu1_src
    vld1.u32 {d6},[r4],r11      @vector load pu1_src
    vld1.u32 {d7},[r4],r11      @vector load pu1_src
    @vld1.u32 {d12,d13},[r4],r11 @vector load pu1_src + src_strd
    @vext.u8 d14,d12,d13,#2     @vector extract of src[0_2]
    vmull.u8 q5,d5,d25          @mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8 q5,d4,d24          @mul_res = vmlsl_u8(src[0_0], coeffabs_0)
    @vext.u8 d16,d12,d13,#4     @vector extract of src[0_4]
    @vext.u8 d18,d12,d13,#6     @vector extract of src[0_6]
    vqrshrun.s16 d8,q4,#6       @right shift and saturating narrow result 1
    vmlal.u8 q5,d6,d26          @mul_res = vmlal_u8(src[0_2], coeffabs_2)
    vmlsl.u8 q5,d7,d27          @mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vst1.8 {d8},[r1]!           @store the result pu1_dst

    vqrshrun.s16 d10,q5,#6      @right shift and saturating narrow result 2
    subs r7,r7,#8               @decrement the wd loop
    vst1.8 {d10},[r6]!          @store the result pu1_dst
    bgt inner_loop_8

    sub r12,r12,r5
    subs r14,r14,#2             @decrement the ht loop
    sub r1,r1,r5
    add r12,r12,r2,lsl #1
    add r1,r1,r3,lsl #1
    bgt outer_loop_8
    b end_loops
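@ The ht-multiple-of-4 path below is software-pipelined four 8-column blocks
@ deep: inner_loop_ht_4 primes the loads and the first two multiply chains,
@ core_loop then stores the previous block's four rows while issuing the
@ loads and multiply-accumulates for the next block, and epilogue drains the
@ last block in flight. r8 = src_strd - 6 steps the source to the next row,
@ since the three preceding post-increments already advanced it by 6:
@ 2 + 2 + 2 + (src_strd - 6) = src_strd.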
@ht is a multiple of 4
outer_loop_ht_4:

    mov r7,r5

prologue_ht_4:

inner_loop_ht_4:

    mov r12,r9
    mov r4,r1

    sub r8, r2, #6

    vld1.u32 {d0},[r12],r11     @(1)vector load pu1_src
    vld1.u32 {d1},[r12],r11     @(1)vector load pu1_src
    vld1.u32 {d2},[r12],r11     @(1)vector load pu1_src
    @vld1.u32 {d3},[r12],r2     @(1)vector load pu1_src
    vld1.u32 {d3},[r12],r8      @(1)vector load pu1_src

    @sub r12, r12, #6           @(1)

    vld1.u32 {d4},[r12],r11     @(2)vector load pu1_src
    vld1.u32 {d5},[r12],r11     @(2)vector load pu1_src
    vld1.u32 {d6},[r12],r11     @(2)vector load pu1_src
    @vld1.u32 {d7},[r12],r2     @(2)vector load pu1_src
    vld1.u32 {d7},[r12],r8      @(2)vector load pu1_src

    @sub r12, r12, #6           @(2)

    vld1.u32 {d14},[r12],r11    @(3)vector load pu1_src
    vmull.u8 q4,d1,d25          @(1)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {d15},[r12],r11    @(3)vector load pu1_src
    vmlsl.u8 q4,d0,d24          @(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32 {d16},[r12],r11    @(3)vector load pu1_src
    vmlal.u8 q4,d2,d26          @(1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32 {d17},[r12],r2    @(3)vector load pu1_src
    vld1.u32 {d17},[r12],r8     @(3)vector load pu1_src
    vmlsl.u8 q4,d3,d27          @(1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub r12, r12, #6           @(3)
    vmull.u8 q5,d5,d25          @(2)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {d18},[r12],r11    @(4)vector load pu1_src
    vmlsl.u8 q5,d4,d24          @(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32 {d19},[r12],r11    @(4)vector load pu1_src
    vmlal.u8 q5,d6,d26          @(2)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32 {d20},[r12],r11    @(4)vector load pu1_src
    vmlsl.u8 q5,d7,d27          @(2)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vld1.u32 {d21},[r12],r2     @(4)vector load pu1_src
    vqrshrun.s16 d8,q4,#6       @(1)right shift and saturating narrow result 1

    add r9,r9,#8                @(core loop)

    subs r7,r7,#8               @(prologue)decrement the wd loop
    beq epilogue
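@ core_loop keeps four result blocks in flight by rotating through q4, q5,
@ q6 and q11: each pass narrows and stores the results computed in the
@ previous pass (the stages tagged (1)..(4) in the comments) while loading
@ and multiplying for stages (1_1) and (2_1) of the next 8 columns, so the
@ stores, loads and multiply-accumulates of adjacent iterations overlap.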
core_loop:
    mov r12,r9

    vld1.u32 {d0},[r12],r11     @(1_1)vector load pu1_src
    vmull.u8 q6,d15,d25         @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {d1},[r12],r11     @(1_1)vector load pu1_src
    vmlsl.u8 q6,d14,d24         @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32 {d2},[r12],r11     @(1_1)vector load pu1_src
    vmlal.u8 q6,d16,d26         @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32 {d3},[r12],r2     @(1_1)vector load pu1_src
    vld1.u32 {d3},[r12],r8      @(1_1)vector load pu1_src
    vmlsl.u8 q6,d17,d27         @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub r12, r12, #6           @(1_1)

    vst1.8 {d8},[r4],r3         @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6      @(2)right shift and saturating narrow result 2

    vld1.u32 {d4},[r12],r11     @(2_1)vector load pu1_src
    vmull.u8 q11,d19,d25        @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {d5},[r12],r11     @(2_1)vector load pu1_src
    vmlsl.u8 q11,d18,d24        @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32 {d6},[r12],r11     @(2_1)vector load pu1_src
    vmlal.u8 q11,d20,d26        @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32 {d7},[r12],r2     @(2_1)vector load pu1_src
    vld1.u32 {d7},[r12],r8      @(2_1)vector load pu1_src
    vmlsl.u8 q11,d21,d27        @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub r12, r12, #6           @(2_1)

    vst1.8 {d10},[r4],r3        @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6      @(3)right shift and saturating narrow result 1

    vld1.u32 {d14},[r12],r11    @(3_1)vector load pu1_src
    vmull.u8 q4,d1,d25          @(1_1)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vld1.u32 {d15},[r12],r11    @(3_1)vector load pu1_src
    vmlsl.u8 q4,d0,d24          @(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32 {d16},[r12],r11    @(3_1)vector load pu1_src
    vmlal.u8 q4,d2,d26          @(1_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    @vld1.u32 {d17},[r12],r2    @(3_1)vector load pu1_src
    vld1.u32 {d17},[r12],r8     @(3_1)vector load pu1_src
    vmlsl.u8 q4,d3,d27          @(1_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    @sub r12, r12, #6           @(3_1)

    vst1.8 {d12},[r4],r3        @(3)store the result pu1_dst
    vqrshrun.s16 d22,q11,#6     @(4)right shift and saturating narrow result 2

    add r9,r9,#8                @(core loop)

    vmull.u8 q5,d5,d25          @(2_1)mul_res = vmull_u8(src[0_1], coeffabs_1)
    vld1.u32 {d18},[r12],r11    @(4_1)vector load pu1_src

    vld1.u32 {d19},[r12],r11    @(4_1)vector load pu1_src
    vmlsl.u8 q5,d4,d24          @(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vld1.u32 {d20},[r12],r11    @(4_1)vector load pu1_src
    vmlal.u8 q5,d6,d26          @(2_1)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vld1.u32 {d21},[r12],r2     @(4_1)vector load pu1_src
    vmlsl.u8 q5,d7,d27          @(2_1)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    add r1,r1,#8                @(core loop)

    subs r7,r7,#8               @(core loop)

    vst1.8 {d22}, [r4], r3      @(4)store the result pu1_dst
    vqrshrun.s16 d8,q4,#6       @(1_1)right shift and saturating narrow result 1

    mov r4, r1                  @(core loop)

    bgt core_loop               @loopback

epilogue:
    vmull.u8 q6,d15,d25         @(3)mul_res = vmull_u8(src[0_1], coeffabs_1)

    vmlsl.u8 q6,d14,d24         @(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vmlal.u8 q6,d16,d26         @(3)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vmlsl.u8 q6,d17,d27         @(3)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vst1.8 {d8},[r4],r3         @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6      @(2)right shift and saturating narrow result 2

    vmull.u8 q11,d19,d25        @(4)mul_res = vmull_u8(src[0_1], coeffabs_1)
    vmlsl.u8 q11,d18,d24        @(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)

    vmlal.u8 q11,d20,d26        @(4)mul_res = vmlal_u8(src[0_2], coeffabs_2)

    vmlsl.u8 q11,d21,d27        @(4)mul_res = vmlsl_u8(src[0_3], coeffabs_3)

    vst1.8 {d10},[r4],r3        @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6      @(3)right shift and saturating narrow result 1

    vst1.8 {d12},[r4],r3        @(3)store the result pu1_dst

    add r1,r1,#8                @(core loop)

    vqrshrun.s16 d22,q11,#6     @(4)right shift and saturating narrow result 2

    vst1.8 {d22}, [r4], r3      @(4)store the result pu1_dst

    sub r9,r9,r5
    subs r14,r14,#4             @decrement the ht loop
    sub r1,r1,r5
    add r9,r9,r2,lsl #2
    add r1,r1,r3,lsl #2
    bgt outer_loop_ht_4
    b end_loops
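@ The 4-wide path below loads the four tap positions of two adjacent rows
@ into d0-d3 and d4-d7 and vzip.32's them, so a single vmull/vmlsl/vmlal/
@ vmlsl chain filters both rows at once; after the narrowing, row 0's four
@ results sit in the low word of d8 and row 1's in the high word, stored
@ with vst1.32 {d8[0]} and {d8[1]} respectively.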
outer_loop_4:
    add r6,r1,r3                @pu1_dst + dst_strd
    mov r7,r5
    add r4,r12,r2               @pu1_src + src_strd

inner_loop_4:
    @vld1.u32 {d0,d1},[r12]     @vector load pu1_src

    vld1.u32 {d0},[r12],r11     @vector load pu1_src
    vld1.u32 {d1},[r12],r11     @vector load pu1_src
    vld1.u32 {d2},[r12],r11     @vector load pu1_src
    vld1.u32 {d3},[r12]         @vector load pu1_src

    sub r12,r12,#2              @rewind the input pointer
    vld1.u32 {d4},[r4],r11      @vector load pu1_src
    vld1.u32 {d5},[r4],r11      @vector load pu1_src
    vld1.u32 {d6},[r4],r11      @vector load pu1_src
    vld1.u32 {d7},[r4]          @vector load pu1_src
    @vext.u8 d2,d0,d1,#2        @vector extract of src[0_2]
    @vext.u8 d4,d0,d1,#4        @vector extract of src[0_4]
    @vld1.u32 {d12,d13},[r4]    @vector load pu1_src + src_strd
    @vext.u8 d6,d0,d1,#6        @vector extract of src[0_6]

    sub r4,r4,#2                @rewind the input pointer
    @vext.u8 d14,d12,d13,#2     @vector extract of src[0_2]
    @vext.u8 d16,d12,d13,#4     @vector extract of src[0_4]
    @vext.u8 d18,d12,d13,#6     @vector extract of src[0_6]

    vzip.32 d0,d4               @vector zip the i iteration and ii iteration into a single register
    vzip.32 d1,d5
    vzip.32 d2,d6
    vzip.32 d3,d7

    vmull.u8 q4,d1,d25          @arithmetic operations for both iterations at the same time
    vmlsl.u8 q4,d0,d24
    vmlal.u8 q4,d2,d26
    vmlsl.u8 q4,d3,d27

    vqrshrun.s16 d8,q4,#6       @rounding right shift and saturating narrow of the result
    vst1.32 {d8[0]},[r1]!       @store the i iteration result, which is in the lower part of the register
    subs r7,r7,#4               @decrement the wd by 4

    vst1.32 {d8[1]},[r6]!       @store the ii iteration result, which is in the upper part of the register

    bgt inner_loop_4

    sub r12,r12,r5
    subs r14,r14,#2             @decrement the ht by 2
    sub r1,r1,r5
    add r12,r12,r2,lsl #1
    add r1,r1,r3,lsl #1
    bgt outer_loop_4

end_loops:

    ldmfd sp!,{r4-r12,r15}      @restore the registers from sp and return