@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  Contains function definitions for chroma inter-prediction interpolation.
@*  Functions are coded in ARM NEON assembly and can be compiled using
@*  GNU as or RVCT.
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  Chroma inter-prediction filter for horizontal input.
@*
@* @par description:
@*  Applies the 4-tap horizontal filter whose coefficients are pointed to by
@*  'pi1_coeff' to the elements pointed to by 'pu1_src' and writes the result
@*  to 'pu1_dst'.  The filtered value is rounded, downshifted by 6 and
@*  saturated to 8 bits.
@*  Assumptions: wd is a multiple of 2, 4 or 8; if wd == 2, ht must be a
@*  multiple of 2.  The wd == 4 and wd == 8 cases are optimized further.
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source (interleaved Cb/Cr samples)
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the 4 filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array (chroma pixel pairs; one row is 2*wd bytes)
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_horz(uword8 *pu1_src,
@                                  uword8 *pu1_dst,
@                                  word32 src_strd,
@                                  word32 dst_strd,
@                                  word8 *pi1_coeff,
@                                  word32 ht,
@                                  word32 wd)

@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pu1_dst
@r2 => src_strd
@r3 => dst_strd
@
@The prologue pushes 10 GPRs (40 bytes) and d8-d15 (64 bytes), so the
@stack-passed arguments sit at sp+104 onwards:

.equ coeff_offset, 104
.equ ht_offset, 108
.equ wd_offset, 112

.text
.align 4

.globl ihevc_inter_pred_chroma_horz_a9q

.type ihevc_inter_pred_chroma_horz_a9q, %function

ihevc_inter_pred_chroma_horz_a9q:

    stmfd sp!, {r4-r12, r14}        @stack stores the values of the arguments
    vpush {d8 - d15}                @preserve callee-saved NEON regs (AAPCS)

    ldr r4,[sp,#coeff_offset]       @loads pi1_coeff
    ldr r7,[sp,#ht_offset]          @loads ht
    ldr r10,[sp,#wd_offset]         @loads wd

    vld1.8 {d0},[r4]                @coeff = vld1_s8(pi1_coeff)
    subs r14,r7,#0                  @r14 = ht; checks for ht == 0
    vabs.s8 d2,d0                   @coeffabs = vabs_s8(coeff); the signs are
                                    @reapplied by the fixed -,+,+,- MAC pattern
                                    @(vmlsl/vmull/vmlal/vmlsl) used below
    mov r11,#2                      @load post-increment: one chroma pixel pair
    ble end_loops

    vdup.8 d24,d2[0]                @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub r12,r0,#2                   @pu1_src - 2 (left context of the 4-tap filter)
    vdup.8 d25,d2[1]                @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
    vdup.8 d26,d2[2]                @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst r10,#3                      @checks wd for multiple of 4
    mov r5,r10,lsl #1               @r5 = 2*wd = row width in bytes (Cb/Cr interleaved)

    vdup.8 d27,d2[3]                @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    bne outer_loop_4                @wd not a multiple of 4 => 4-wide path
    cmp r10,#12
    beq skip_16                     @wd == 12 is served by the 8-wide paths

    cmp r10,#8
    bge outer_loop_16               @wd >= 8 (and != 12) => 16-byte-wide path
skip_16:
    tst r7,#3                       @checks ht for multiple of 4

    sub r9,r0,#2                    @r9 = pu1_src - 2 (base for the ht_4 path)
    beq outer_loop_ht_4             @ht multiple of 4 => 4-row pipelined path

    b outer_loop_8

@-----------------------------------------------------------------------------
@16-bytes-per-row-iteration path, two rows in flight, software pipelined.
@Per output byte: acc = -c0*src[-2] + c1*src[0] + c2*src[2] - c3*src[4],
@then vqrshrun rounds, shifts right by 6 and saturates to u8.
@-----------------------------------------------------------------------------
outer_loop_16:
    mov r10,r5                      @r10 = 2wd = bytes left in the current row
    mul r14,r14,r10                 @r14 = ht * 2wd = total output bytes

    rsb r6,r3,#16                   @r6 = 16 - dst_strd (row-1 store increment;
                                    @nets +16 combined with the row-0 store)

    add r4,r12,r2                   @r4 = src row 1 (r12 = src row 0)
    mov r9,#10                      @final load increment: 16 - 3*2 = 10
    and r0, r12, #31                @NOTE(review): result appears unused (dead)
    rsb r8,r5,r3,lsl #1             @r8 = 2*dst_strd - 2wd (row-pair advance)
                                    @NOTE(review): r8 is also added to the SRC
                                    @pointer below, i.e. this path behaves as if
                                    @src_strd == dst_strd -- confirm with callers
    pld [r12, r2, lsl #1]

    vld1.u32 {q0},[r12],r11         @row0: src[-2..13]
    pld [r4, r2, lsl #1]
    vld1.u32 {q1},[r12],r11         @row0: src[0..15]

    vld1.u32 {q2},[r12],r11         @row0: src[2..17]

    vld1.u32 {q3},[r12],r9          @row0: src[4..19]; r12 ends 16 past row start

    vmull.u8 q15,d2,d25             @row0 lo: acc  = src[0]  * coeffabs_1
    vld1.u32 {q4},[r4],r11          @row1: src[-2..13]
    vmlsl.u8 q15,d0,d24             @row0 lo: acc -= src[-2] * coeffabs_0
    vld1.u32 {q5},[r4],r11          @row1: src[0..15]
    vmlal.u8 q15,d4,d26             @row0 lo: acc += src[2]  * coeffabs_2
    vld1.u32 {q6},[r4],r11          @row1: src[2..17]
    vmlsl.u8 q15,d6,d27             @row0 lo: acc -= src[4]  * coeffabs_3
    vld1.u32 {q7},[r4],r9           @row1: src[4..19]
    vmull.u8 q14,d3,d25             @row0 hi: same filter on the upper 8 bytes

    vmlsl.u8 q14,d1,d24

    vmlal.u8 q14,d5,d26

    vmlsl.u8 q14,d7,d27

    cmp r14,#32                     @exactly one row pair in total?
    beq epilog_end
    sub r14,#64                     @reserve two row pairs for epilog/epilog_end

inner_loop_16:
    @Pipelined: narrow/store of the previous iteration is interleaved with the
    @loads and MACs of the current one.
    pld [r12, r2, lsl #2]
    pld [r4, r2, lsl #2]

    subs r10,r10,#16                @16 output bytes consumed from this row

    vmull.u8 q11,d10,d25            @row1 lo: acc  = src[0]  * coeffabs_1

    addeq r12,r12,r8                @row finished: advance src to next row pair
    addeq r4,r12,r2
    vmlsl.u8 q11,d8,d24             @row1 lo: acc -= src[-2] * coeffabs_0

    vqrshrun.s16 d30,q15,#6         @row0 lo: round, >>6, saturate to u8

    vld1.u32 {q0},[r12],r11         @next row0 loads
    vqrshrun.s16 d31,q14,#6         @row0 hi

    vld1.u32 {q1},[r12],r11
    vmlal.u8 q11,d12,d26            @row1 lo: acc += src[2]  * coeffabs_2

    vld1.u32 {q2},[r12],r11
    vmlsl.u8 q11,d14,d27            @row1 lo: acc -= src[4]  * coeffabs_3

    vld1.u32 {q3},[r12],r9
    vmull.u8 q10,d11,d25            @row1 hi: same filter on the upper 8 bytes

    vmlsl.u8 q10,d9,d24

    vst1.16 {q15}, [r1],r3          @store row0 result, step dst to row1
    vmlal.u8 q10,d13,d26

    vld1.u32 {q4},[r4],r11          @next row1 loads
    vmlsl.u8 q10,d15,d27

    vld1.u32 {q5},[r4],r11
    vmull.u8 q15,d2,d25             @next iteration row0 lo

    vld1.u32 {q6},[r4],r11
    vmlsl.u8 q15,d0,d24

    vld1.u32 {q7},[r4],r9
    vmlal.u8 q15,d4,d26

    cmp r10,#0
    vqrshrun.s16 d22,q11,#6         @row1 lo
    vqrshrun.s16 d23,q10,#6         @row1 hi

    vmlsl.u8 q15,d6,d27

    moveq r10,r5                    @row finished: reload byte counter (2wd)
    vmull.u8 q14,d3,d25             @next iteration row0 hi

    vst1.16 {q11},[r1],r6           @store row1 result; +16-dst_strd nets +16
                                    @combined with the row0 store above
    vmlsl.u8 q14,d1,d24

    addeq r1,r1,r8                  @row pair finished: advance dst
    vmlal.u8 q14,d5,d26

    subs r14,r14,#32                @one row pair (2 x 16 bytes) produced
    vmlsl.u8 q14,d7,d27

    bgt inner_loop_16

    add r14,r14,#64                 @restore the two reserved row pairs
    cmp r14,#32
    beq epilog_end

epilog:
    @Pipeline drain, stage 1: one full iteration without issuing next-row0 MACs.
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6

    vst1.16 {q15}, [r1],r3
    vmull.u8 q11,d10,d25            @row1 lo: acc  = src[0]  * coeffabs_1

    vmlsl.u8 q11,d8,d24             @row1 lo: acc -= src[-2] * coeffabs_0
    subs r10,r10,#16                @decrement the wd loop
    vmlal.u8 q11,d12,d26            @row1 lo: acc += src[2]  * coeffabs_2
    addeq r12,r12,r8
    vmlsl.u8 q11,d14,d27            @row1 lo: acc -= src[4]  * coeffabs_3
    moveq r10,r5                    @2wd

    addeq r4,r12,r2
    vmull.u8 q10,d11,d25            @row1 hi
    vld1.u32 {q0},[r12],r11         @final row0 loads
    vmlsl.u8 q10,d9,d24
    vld1.u32 {q1},[r12],r11
    vmlal.u8 q10,d13,d26
    vld1.u32 {q2},[r12],r11
    vmlsl.u8 q10,d15,d27
    vld1.u32 {q3},[r12],r9
    vmull.u8 q15,d2,d25             @final row0 lo

    vld1.u32 {q4},[r4],r11          @final row1 loads
    vmlsl.u8 q15,d0,d24
    vld1.u32 {q5},[r4],r11
    vmlal.u8 q15,d4,d26

    vmlsl.u8 q15,d6,d27

    vld1.u32 {q6},[r4],r11
    vmull.u8 q14,d3,d25             @final row0 hi
    vld1.u32 {q7},[r4],r9
    vmlsl.u8 q14,d1,d24
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16 {q11},[r1],r6           @store the result pu1_dst
    vmlal.u8 q14,d5,d26

    vmlsl.u8 q14,d7,d27
    addeq r1,r1,r8

epilog_end:
    @Pipeline drain, stage 2: narrow and store the last row pair (no loads).
    vqrshrun.s16 d30,q15,#6
    vqrshrun.s16 d31,q14,#6

    vmull.u8 q11,d10,d25
    vmlsl.u8 q11,d8,d24
    vmlal.u8 q11,d12,d26
    vmlsl.u8 q11,d14,d27

    vmull.u8 q10,d11,d25
    vmlsl.u8 q10,d9,d24
    vmlal.u8 q10,d13,d26
    vmlsl.u8 q10,d15,d27
    vqrshrun.s16 d22,q11,#6
    vqrshrun.s16 d23,q10,#6

    vst1.16 {q15}, [r1],r3
    vst1.16 {q11},[r1]              @store the result pu1_dst

    b end_loops

@-----------------------------------------------------------------------------
@8-bytes-per-iteration path; two rows processed per outer iteration.
@-----------------------------------------------------------------------------
outer_loop_8:
    add r6,r1,r3                    @pu1_dst + dst_strd (row 1 output)
    mov r7,r5                       @r7 = 2wd byte counter
    add r4,r12,r2                   @pu1_src + src_strd (row 1 input)

inner_loop_8:
    vld1.u32 {d0},[r12],r11         @row0: src[-2..5]
    vld1.u32 {d1},[r12],r11         @row0: src[0..7]
    vld1.u32 {d2},[r12],r11         @row0: src[2..9]
    vld1.u32 {d3},[r12],r11         @row0: src[4..11]; r12 nets +8 per iteration

    vmull.u8 q4,d1,d25              @acc  = src[0]  * coeffabs_1
    vmlsl.u8 q4,d0,d24              @acc -= src[-2] * coeffabs_0
    vmlal.u8 q4,d2,d26              @acc += src[2]  * coeffabs_2
    vmlsl.u8 q4,d3,d27              @acc -= src[4]  * coeffabs_3

    vld1.u32 {d4},[r4],r11          @row1 loads, same staggered layout
    vld1.u32 {d5},[r4],r11
    vld1.u32 {d6},[r4],r11
    vld1.u32 {d7},[r4],r11
    vmull.u8 q5,d5,d25
    vmlsl.u8 q5,d4,d24
    vqrshrun.s16 d8,q4,#6           @row0: round, >>6, saturate to u8
    vmlal.u8 q5,d6,d26
    vmlsl.u8 q5,d7,d27

    vst1.8 {d8},[r1]!               @store row0 result

    vqrshrun.s16 d10,q5,#6          @row1: round, >>6, saturate to u8
    subs r7,r7,#8                   @8 bytes produced per row
    vst1.8 {d10},[r6]!              @store row1 result
    bgt inner_loop_8

    sub r12,r12,r5                  @rewind by row width...
    subs r14,r14,#2                 @decrement the ht loop (2 rows done)
    sub r1,r1,r5
    add r12,r12,r2,lsl #1           @...then advance by two strides
    add r1,r1,r3,lsl #1
    bgt outer_loop_8
    b end_loops

@-----------------------------------------------------------------------------
@ht multiple of 4: processes one 8-byte-wide column strip over 4 rows per
@iteration, software pipelined across strips.  Stage tags (1)..(4) denote the
@row within the current strip; (N_1) is row N of the NEXT strip.
@-----------------------------------------------------------------------------
outer_loop_ht_4:

    mov r7,r5                       @r7 = 2wd byte counter

prologue_ht_4:

inner_loop_ht_4:

    mov r12,r9                      @r9 = pu1_src - 2 for the current strip
    mov r4,r1                       @r4 = output cursor, walks down the 4 rows

    sub r8, r2, #6                  @r8 = src_strd - 6: after three +2 steps the
                                    @fourth load jumps to the next row

    vld1.u32 {d0},[r12],r11         @(1)src[-2..5]
    vld1.u32 {d1},[r12],r11         @(1)src[0..7]
    vld1.u32 {d2},[r12],r11         @(1)src[2..9]
    vld1.u32 {d3},[r12],r8          @(1)src[4..11], then step to row 2

    vld1.u32 {d4},[r12],r11         @(2)
    vld1.u32 {d5},[r12],r11         @(2)
    vld1.u32 {d6},[r12],r11         @(2)
    vld1.u32 {d7},[r12],r8          @(2)

    vld1.u32 {d14},[r12],r11        @(3)
    vmull.u8 q4,d1,d25              @(1)acc  = src[0]  * coeffabs_1

    vld1.u32 {d15},[r12],r11        @(3)
    vmlsl.u8 q4,d0,d24              @(1)acc -= src[-2] * coeffabs_0

    vld1.u32 {d16},[r12],r11        @(3)
    vmlal.u8 q4,d2,d26              @(1)acc += src[2]  * coeffabs_2

    vld1.u32 {d17},[r12],r8         @(3)
    vmlsl.u8 q4,d3,d27              @(1)acc -= src[4]  * coeffabs_3

    vmull.u8 q5,d5,d25              @(2)

    vld1.u32 {d18},[r12],r11        @(4)
    vmlsl.u8 q5,d4,d24              @(2)

    vld1.u32 {d19},[r12],r11        @(4)
    vmlal.u8 q5,d6,d26              @(2)

    vld1.u32 {d20},[r12],r11        @(4)
    vmlsl.u8 q5,d7,d27              @(2)

    vld1.u32 {d21},[r12],r2         @(4)
    vqrshrun.s16 d8,q4,#6           @(1)round, >>6, saturate to u8

    add r9,r9,#8                    @advance src base to the next strip

    subs r7,r7,#8                   @(prologue)decrement the wd loop
    beq epilogue

core_loop:
    mov r12,r9

    vld1.u32 {d0},[r12],r11         @(1_1)next strip, row 1
    vmull.u8 q6,d15,d25             @(3)

    vld1.u32 {d1},[r12],r11         @(1_1)
    vmlsl.u8 q6,d14,d24             @(3)

    vld1.u32 {d2},[r12],r11         @(1_1)
    vmlal.u8 q6,d16,d26             @(3)

    vld1.u32 {d3},[r12],r8          @(1_1)
    vmlsl.u8 q6,d17,d27             @(3)

    vst1.8 {d8},[r4],r3             @(1)store row-1 result
    vqrshrun.s16 d10,q5,#6          @(2)round, >>6, saturate to u8

    vld1.u32 {d4},[r12],r11         @(2_1)
    vmull.u8 q11,d19,d25            @(4)

    vld1.u32 {d5},[r12],r11         @(2_1)
    vmlsl.u8 q11,d18,d24            @(4)

    vld1.u32 {d6},[r12],r11         @(2_1)
    vmlal.u8 q11,d20,d26            @(4)

    vld1.u32 {d7},[r12],r8          @(2_1)
    vmlsl.u8 q11,d21,d27            @(4)

    vst1.8 {d10},[r4],r3            @(2)store row-2 result
    vqrshrun.s16 d12,q6,#6          @(3)round, >>6, saturate to u8

    vld1.u32 {d14},[r12],r11        @(3_1)
    vmull.u8 q4,d1,d25              @(1_1)

    vld1.u32 {d15},[r12],r11        @(3_1)
    vmlsl.u8 q4,d0,d24              @(1_1)

    vld1.u32 {d16},[r12],r11        @(3_1)
    vmlal.u8 q4,d2,d26              @(1_1)

    vld1.u32 {d17},[r12],r8         @(3_1)
    vmlsl.u8 q4,d3,d27              @(1_1)

    vst1.8 {d12},[r4],r3            @(3)store row-3 result
    vqrshrun.s16 d22,q11,#6         @(4)round, >>6, saturate to u8

    add r9,r9,#8                    @advance src base to the next strip

    vmull.u8 q5,d5,d25              @(2_1)
    vld1.u32 {d18},[r12],r11        @(4_1)

    vld1.u32 {d19},[r12],r11        @(4_1)
    vmlsl.u8 q5,d4,d24              @(2_1)

    vld1.u32 {d20},[r12],r11        @(4_1)
    vmlal.u8 q5,d6,d26              @(2_1)

    vld1.u32 {d21},[r12],r2         @(4_1)
    vmlsl.u8 q5,d7,d27              @(2_1)

    add r1,r1,#8                    @dst advances one strip

    subs r7,r7,#8                   @(core loop)

    vst1.8 {d22}, [r4], r3          @(4)store row-4 result
    vqrshrun.s16 d8,q4,#6           @(1_1)round, >>6, saturate to u8

    mov r4, r1                      @output cursor back to row 1 of next strip

    bgt core_loop                   @loopback

epilogue:
    @Drain: rows 3 and 4 of the last strip are still in flight.
    vmull.u8 q6,d15,d25             @(3)
    vmlsl.u8 q6,d14,d24             @(3)
    vmlal.u8 q6,d16,d26             @(3)
    vmlsl.u8 q6,d17,d27             @(3)

    vst1.8 {d8},[r4],r3             @(1)store the result pu1_dst
    vqrshrun.s16 d10,q5,#6          @(2)round, >>6, saturate to u8

    vmull.u8 q11,d19,d25            @(4)
    vmlsl.u8 q11,d18,d24            @(4)
    vmlal.u8 q11,d20,d26            @(4)
    vmlsl.u8 q11,d21,d27            @(4)

    vst1.8 {d10},[r4],r3            @(2)store the result pu1_dst
    vqrshrun.s16 d12,q6,#6          @(3)round, >>6, saturate to u8

    vst1.8 {d12},[r4],r3            @(3)store the result pu1_dst

    add r1,r1,#8

    vqrshrun.s16 d22,q11,#6         @(4)round, >>6, saturate to u8

    vst1.8 {d22}, [r4], r3          @(4)store the result pu1_dst

    sub r9,r9,r5                    @rewind src base by the row width...
    subs r14,r14,#4                 @decrement the ht loop (4 rows done)
    sub r1,r1,r5                    @...rewind dst likewise...
    add r9,r9,r2,lsl #2             @...then advance both by four strides
    add r1,r1,r3,lsl #2
    bgt outer_loop_ht_4
    b end_loops

@-----------------------------------------------------------------------------
@wd multiple of 4 (not of 8): two rows are zipped into one d-register pair so
@a single MAC chain filters both.
@-----------------------------------------------------------------------------
outer_loop_4:
    add r6,r1,r3                    @pu1_dst + dst_strd (row 1 output)
    mov r7,r5                       @r7 = 2wd byte counter
    add r4,r12,r2                   @pu1_src + src_strd (row 1 input)

inner_loop_4:
    vld1.u32 {d0},[r12],r11         @row0: src[-2..5]
    vld1.u32 {d1},[r12],r11         @row0: src[0..7]
    vld1.u32 {d2},[r12],r11         @row0: src[2..9]
    vld1.u32 {d3},[r12]             @row0: src[4..11]

    sub r12,r12,#2                  @net advance of +4 bytes for row 0
    vld1.u32 {d4},[r4],r11          @row1 loads, same staggered layout
    vld1.u32 {d5},[r4],r11
    vld1.u32 {d6},[r4],r11
    vld1.u32 {d7},[r4]

    sub r4,r4,#2                    @net advance of +4 bytes for row 1

    vzip.32 d0,d4                   @interleave: low word = row0, next = row1,
    vzip.32 d1,d5                   @so both rows share one q4 MAC chain
    vzip.32 d2,d6
    vzip.32 d3,d7

    vmull.u8 q4,d1,d25              @acc  = src[0]  * coeffabs_1 (both rows)
    vmlsl.u8 q4,d0,d24              @acc -= src[-2] * coeffabs_0
    vmlal.u8 q4,d2,d26              @acc += src[2]  * coeffabs_2
    vmlsl.u8 q4,d3,d27              @acc -= src[4]  * coeffabs_3

    vqrshrun.s16 d8,q4,#6           @round, >>6, saturate both rows to u8
    vst1.32 {d8[0]},[r1]!           @store the row-0 word
    subs r7,r7,#4                   @4 bytes produced per row
    vst1.32 {d8[1]},[r6]!           @store the row-1 word

    bgt inner_loop_4

    sub r12,r12,r5                  @rewind by row width...
    subs r14,r14,#2                 @decrement the ht by 2
    sub r1,r1,r5
    add r12,r12,r2,lsl #1           @...then advance by two strides
    add r1,r1,r3,lsl #1
    bgt outer_loop_4

end_loops:

    vpop {d8 - d15}                 @restore callee-saved NEON regs
    ldmfd sp!,{r4-r12,r15}          @restore GPRs; loading r15 (pc) returns