@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_horz_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in arm neon assembly (gnu assembler syntax, arm state)
@*
@* @author
@*  yogeswaran rs / akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_horz_w16out_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma interprediction filter to store horizontal 16bit output
@*
@* @par description:
@*  applies a horizontal 4-tap filter with coefficients pointed to by
@*  'pi1_coeff' to the elements pointed by 'pu1_src' and writes to the
@*  location pointed by 'pi2_dst'. no downshifting or clipping is done and
@*  the output is used as an input for vertical filtering or weighted
@*  prediction
@*
@*  the four taps are read from (pu1_src - 2) + 2 * k, k = 0..3, and every
@*  output is computed from the absolute values of the coefficients as
@*
@*      out = tap1 * |c1| - tap0 * |c0| + tap2 * |c2| - tap3 * |c3|
@*
@*  NOTE(review): this presumably relies on c0 and c3 never being positive
@*  and c1 and c2 never being negative - confirm against the coefficient
@*  table used by the caller
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*  word16 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
@                                          word16 *pi2_dst,
@                                          word32 src_strd,
@                                          word32 dst_strd,
@                                          word8 *pi1_coeff,
@                                          word32 ht,
@                                          word32 wd)
@**************variables vs registers*****************************************
@ arguments
@   r0                   => *pu1_src
@   r1                   => *pi2_dst
@   r2                   =>  src_strd
@   r3                   =>  dst_strd
@   [sp, #coeff_offset]  => *pi1_coeff   (offsets are valid after the prologue)
@   [sp, #ht_offset]     =>  ht
@   [sp, #wd_offset]     =>  wd
@
@ working registers
@   r5       => 2 * wd, the number of source bytes filtered per row
@   r7       => ht & 1 (recomputed in epilog_end because the 16 wide path
@               reuses r7)
@   r10      => source bytes still to be filtered in the current row(s)
@   r11      => 2, the distance in bytes between two filter taps
@   r12      => running source pointer, starts at pu1_src - 2
@   r14      => rows still to be filtered (rows * 2wd in the 16 wide path)
@   d24-d27  => |coeff[0]| .. |coeff[3]| replicated over all 8 lanes
@
@ NOTE(review): the 2 byte tap distance and the 2 * wd row length suggest
@ interleaved cb/cr samples - confirm with the caller

@ the prologue pushes 10 core registers (40 bytes) and d8-d15 (64 bytes), so
@ the first stack argument is found 104 bytes above sp
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4

.globl ihevc_inter_pred_chroma_horz_w16out_a9q

.type ihevc_inter_pred_chroma_horz_w16out_a9q, %function

ihevc_inter_pred_chroma_horz_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}          @save the core registers used below and lr
    vpush       {d8 - d15}                  @save the neon registers used below

    ldr         r4,[sp,#coeff_offset]       @loads pi1_coeff
    ldr         r6,[sp,#ht_offset]          @loads ht
    ldr         r10,[sp,#wd_offset]         @loads wd

    vld1.8      {d0},[r4]                   @loads 8 bytes from pi1_coeff, only lanes 0-3 are used
    subs        r14,r6,#0                   @r14 = ht, sets the flags for the ble below
    vabs.s8     d2,d0                       @coeffabs = vabs_s8(coeff)

    mov         r11, #2                     @tap distance in bytes (mov leaves the flags alone)

    ble         end_loops                   @nothing to do when ht <= 0

    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         r12,r0,#2                   @pu1_src - 2, address of tap 0
    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         r4,r12,r2                   @tap 0 address of the next source row
    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         r10,#3                      @checks wd for multiples of 4
    mov         r5,r10,lsl #1               @2wd

    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    and         r7,r14,#1                   @ht_residue = (ht & 1)
    sub         r14,r14,r7                  @even part of ht, the odd row is done in loop_residue

    bne         outer_loop_4                @wd is not a multiple of 4 (flags from the tst above)

    cmp         r10,#12
    beq         skip_16                     @wd == 12 is kept away from the 16 wide path

    cmp         r10,#8
    bge         outer_loop_16               @wd >= 8: 16 source bytes x 2 rows per pass
                                            @NOTE(review): that path only resets its column
                                            @counter when it reaches exactly 0, so 2wd is
                                            @presumably always a multiple of 16 here

skip_16:
    tst         r6,#3                       @checks ht for multiples of 4

    sub         r9,r0,#2                    @pu1_src - 2, source base for the ht_4 path
    beq         outer_loop_ht_4             @ht is a multiple of 4 (flags from the tst above)

    b           outer_loop_8


@-----------------------------------------------------------------------------
@ 16 source bytes of two rows per pass, software pipelined
@   q0-q3  (d0-d7)   => taps 0..3 of the first row  (even d = outputs 0-7,
@                       odd d = outputs 8-15)
@   q4-q7  (d8-d15)  => taps 0..3 of the second row
@   q15/q14          => outputs 0-7 / 8-15 of the first row
@   q11/q10          => outputs 0-7 / 8-15 of the second row
@-----------------------------------------------------------------------------
outer_loop_16:
    add         r4,r12,r2                   @second row = first row + src_strd

    pld         [r12, r2, lsl #1]           @prefetch two rows ahead

    vld1.u32    {q0},[r12],r11              @row 1 tap 0
    mov         r10,r5                      @2wd
    mul         r14,r14,r10                 @r14 = even rows * 2wd, drops by 32 per pass
    vld1.u32    {q1},[r12],r11              @row 1 tap 1
    pld         [r4, r2, lsl #1]
    mov         r9,#10                      @6 + 10 = 16, moves on to the next 16 bytes
    vld1.u32    {q2},[r12],r11              @row 1 tap 2
    rsb         r6,r3,#8                    @8 - dst_strd
    sub         r8,r3,#8                    @dst_strd - 8
    vld1.u32    {q3},[r12],r9               @row 1 tap 3

    vmull.u8    q15,d2,d25                  @row 1 lo:  tap1 * coeffabs_1
    vld1.u32    {q4},[r4],r11               @row 2 tap 0
    vmlsl.u8    q15,d0,d24                  @row 1 lo: -tap0 * coeffabs_0
    vld1.u32    {q5},[r4],r11               @row 2 tap 1
    vmlal.u8    q15,d4,d26                  @row 1 lo: +tap2 * coeffabs_2
    vld1.u32    {q6},[r4],r11               @row 2 tap 2
    vmlsl.u8    q15,d6,d27                  @row 1 lo: -tap3 * coeffabs_3
    vld1.u32    {q7},[r4],r9                @row 2 tap 3
    vmull.u8    q14,d3,d25                  @row 1 hi:  tap1 * coeffabs_1
    lsl         r6,#1                       @r6 = 2 * (8 - dst_strd) bytes, row 2 back to row 1
    rsb         r3,r5,r3,lsl #1             @r3 = 2 * dst_strd - 2wd
    vmlsl.u8    q14,d1,d24                  @row 1 hi: -tap0 * coeffabs_0
    lsl         r8,#1                       @r8 = 2 * (dst_strd - 8) bytes, row 1 on to row 2
    rsb         r7,r5,r2,lsl #1             @r7 = 2 * src_strd - 2wd (ht_residue is lost here)
    vmlal.u8    q14,d5,d26                  @row 1 hi: +tap2 * coeffabs_2

    vmlsl.u8    q14,d7,d27                  @row 1 hi: -tap3 * coeffabs_3
    cmp         r14,#32
    beq         epilog_end                  @the loaded block is the only one
    sub         r14,#64                     @bias so that the loop leaves the last blocks
                                            @to epilog and epilog_end

inner_loop_16:

    pld         [r12, r2, lsl #2]
    pld         [r4, r2, lsl #2]

    subs        r10,r10,#16                 @decrement the wd loop, the flags stay valid for
                                            @every conditional instruction up to the next subs

    vmull.u8    q11,d10,d25                 @row 2 lo:  tap1 * coeffabs_1

    addeq       r12,r12,r7                  @end of the row pair: move on by 2 * src_strd - 2wd
    addeq       r4,r12,r2

    vst1.16     {q15}, [r1]!                @store row 1 outputs 0-7
    vmlsl.u8    q11,d8,d24                  @row 2 lo: -tap0 * coeffabs_0

    vld1.u32    {q0},[r12],r11              @next block, row 1 tap 0
    vmlal.u8    q11,d12,d26                 @row 2 lo: +tap2 * coeffabs_2

    vld1.u32    {q1},[r12],r11              @next block, row 1 tap 1
    vmlsl.u8    q11,d14,d27                 @row 2 lo: -tap3 * coeffabs_3

    vld1.u32    {q2},[r12],r11              @next block, row 1 tap 2
    vmull.u8    q10,d11,d25                 @row 2 hi:  tap1 * coeffabs_1

    vst1.16     {q14}, [r1],r8              @store row 1 outputs 8-15, go to row 2
    vmlsl.u8    q10,d9,d24                  @row 2 hi: -tap0 * coeffabs_0

    vld1.u32    {q3},[r12],r9               @next block, row 1 tap 3
    vmlal.u8    q10,d13,d26                 @row 2 hi: +tap2 * coeffabs_2

    vld1.u32    {q4},[r4],r11               @next block, row 2 tap 0
    vmlsl.u8    q10,d15,d27                 @row 2 hi: -tap3 * coeffabs_3

    vld1.u32    {q5},[r4],r11               @next block, row 2 tap 1
    vmull.u8    q15,d2,d25                  @row 1 lo:  tap1 * coeffabs_1

    vld1.u32    {q6},[r4],r11               @next block, row 2 tap 2
    vmlsl.u8    q15,d0,d24                  @row 1 lo: -tap0 * coeffabs_0

    vld1.u32    {q7},[r4],r9                @next block, row 2 tap 3
    vmlal.u8    q15,d4,d26                  @row 1 lo: +tap2 * coeffabs_2

    vst1.16     {q11},[r1]!                 @store row 2 outputs 0-7
    vmlsl.u8    q15,d6,d27                  @row 1 lo: -tap3 * coeffabs_3

    moveq       r10,r5                      @end of the row pair: reload 2wd
    vmull.u8    q14,d3,d25                  @row 1 hi:  tap1 * coeffabs_1

    vmlsl.u8    q14,d1,d24                  @row 1 hi: -tap0 * coeffabs_0
    vst1.16     {q10},[r1],r6               @store row 2 outputs 8-15, back to row 1

    addeq       r1,r1,r3,lsl #1             @end of the row pair: move on to the next two dst rows
    vmlal.u8    q14,d5,d26                  @row 1 hi: +tap2 * coeffabs_2

    subs        r14,r14,#32                 @one block of two rows done
    vmlsl.u8    q14,d7,d27                  @row 1 hi: -tap3 * coeffabs_3

    bgt         inner_loop_16

    add         r14,r14,#64                 @undo the bias
    cmp         r14,#32
    beq         epilog_end

epilog:
    @same work as one pass of inner_loop_16, without prefetch and counter update

    vst1.16     {q15}, [r1]!                @store row 1 outputs 0-7
    vmull.u8    q11,d10,d25                 @row 2 lo:  tap1 * coeffabs_1
    vst1.16     {q14}, [r1],r8              @store row 1 outputs 8-15, go to row 2

    vmlsl.u8    q11,d8,d24                  @row 2 lo: -tap0 * coeffabs_0
    subs        r10,r10,#16                 @decrement the wd loop
    vmlal.u8    q11,d12,d26                 @row 2 lo: +tap2 * coeffabs_2
    addeq       r12,r12,r7                  @end of the row pair: next two source rows
    vmlsl.u8    q11,d14,d27                 @row 2 lo: -tap3 * coeffabs_3
    moveq       r10,r5                      @2wd
    addeq       r4,r12,r2
    vmull.u8    q10,d11,d25                 @row 2 hi:  tap1 * coeffabs_1
    vld1.u32    {q0},[r12],r11              @last block, row 1 tap 0
    vmlsl.u8    q10,d9,d24                  @row 2 hi: -tap0 * coeffabs_0
    vld1.u32    {q1},[r12],r11              @last block, row 1 tap 1
    vmlal.u8    q10,d13,d26                 @row 2 hi: +tap2 * coeffabs_2
    vld1.u32    {q2},[r12],r11              @last block, row 1 tap 2
    vmlsl.u8    q10,d15,d27                 @row 2 hi: -tap3 * coeffabs_3
    vld1.u32    {q3},[r12],r9               @last block, row 1 tap 3
    vmull.u8    q15,d2,d25                  @row 1 lo:  tap1 * coeffabs_1

    vld1.u32    {q4},[r4],r11               @last block, row 2 tap 0
    vmlsl.u8    q15,d0,d24                  @row 1 lo: -tap0 * coeffabs_0
    vld1.u32    {q5},[r4],r11               @last block, row 2 tap 1
    vmlal.u8    q15,d4,d26                  @row 1 lo: +tap2 * coeffabs_2

    vmlsl.u8    q15,d6,d27                  @row 1 lo: -tap3 * coeffabs_3

    vld1.u32    {q6},[r4],r11               @last block, row 2 tap 2
    vmull.u8    q14,d3,d25                  @row 1 hi:  tap1 * coeffabs_1
    vld1.u32    {q7},[r4],r9                @last block, row 2 tap 3
    vmlsl.u8    q14,d1,d24                  @row 1 hi: -tap0 * coeffabs_0
    vst1.16     {q11},[r1]!                 @store row 2 outputs 0-7
    vmlal.u8    q14,d5,d26                  @row 1 hi: +tap2 * coeffabs_2
    vst1.16     {q10},[r1],r6               @store row 2 outputs 8-15, back to row 1
    vmlsl.u8    q14,d7,d27                  @row 1 hi: -tap3 * coeffabs_3
    addeq       r1,r1,r3,lsl #1             @end of the row pair: next two dst rows


epilog_end:
    @drain the pipeline: row 1 is already in q15/q14, row 2 is still to be filtered

    vmull.u8    q11,d10,d25                 @row 2 lo:  tap1 * coeffabs_1
    vmlsl.u8    q11,d8,d24                  @row 2 lo: -tap0 * coeffabs_0
    vmlal.u8    q11,d12,d26                 @row 2 lo: +tap2 * coeffabs_2
    vmlsl.u8    q11,d14,d27                 @row 2 lo: -tap3 * coeffabs_3

    vmull.u8    q10,d11,d25                 @row 2 hi:  tap1 * coeffabs_1
    vmlsl.u8    q10,d9,d24                  @row 2 hi: -tap0 * coeffabs_0
    vmlal.u8    q10,d13,d26                 @row 2 hi: +tap2 * coeffabs_2
    vmlsl.u8    q10,d15,d27                 @row 2 hi: -tap3 * coeffabs_3

    vst1.16     {q15}, [r1]!                @store row 1 outputs 0-7
    vst1.16     {q14}, [r1],r8              @store row 1 outputs 8-15, go to row 2
    vst1.16     {q11},[r1]!                 @store row 2 outputs 0-7
    vst1.16     {q10},[r1],r6               @store row 2 outputs 8-15, back to row 1

    ldr         r6,[sp,#ht_offset]          @loads ht again, r7 was reused above

    and         r7,r6,#1                    @ht_residue = (ht & 1)

    cmp         r7,#0
    mov         r10,r5                      @2wd
    addne       r12,r12,r2,lsl #1           @odd ht: source pointer to the start of the last row
    subne       r12,r12,r5
    addne       r1,r1,r3,lsl #1             @odd ht: dst pointer to the start of the last row

    bgt         loop_residue_4              @filter the remaining odd row

    b           end_loops


@-----------------------------------------------------------------------------
@ 8 source bytes of two rows per pass
@   r12/r4 => source pointers of the two rows
@   r1/r6  => destination pointers of the two rows
@-----------------------------------------------------------------------------
outer_loop_8:

    add         r6,r1,r3,lsl #1             @pi2_dst + dst_strd (in 16 bit units)
    mov         r10,r5                      @2wd
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_8:
    vld1.u32    {d0},[r12],r11              @row 1 tap 0
    vld1.u32    {d1},[r12],r11              @row 1 tap 1
    vld1.u32    {d2},[r12],r11              @row 1 tap 2
    vld1.u32    {d3},[r12],r11              @row 1 tap 3, 4 x 2 bytes = next 8 bytes

    vmull.u8    q4,d1,d25                   @row 1:  tap1 * coeffabs_1
    vmlsl.u8    q4,d0,d24                   @row 1: -tap0 * coeffabs_0
    vmlal.u8    q4,d2,d26                   @row 1: +tap2 * coeffabs_2
    vmlsl.u8    q4,d3,d27                   @row 1: -tap3 * coeffabs_3

    vld1.u32    {d4},[r4],r11               @row 2 tap 0
    vld1.u32    {d5},[r4],r11               @row 2 tap 1
    vld1.u32    {d6},[r4],r11               @row 2 tap 2
    vld1.u32    {d7},[r4],r11               @row 2 tap 3
    vmull.u8    q5,d5,d25                   @row 2:  tap1 * coeffabs_1
    vmlsl.u8    q5,d4,d24                   @row 2: -tap0 * coeffabs_0
    vmlal.u8    q5,d6,d26                   @row 2: +tap2 * coeffabs_2
    vmlsl.u8    q5,d7,d27                   @row 2: -tap3 * coeffabs_3

    vst1.16     {d8, d9}, [r1]!             @store the 8 results of row 1

    subs        r10,r10,#8                  @decrement the wd loop
    vst1.16     {d10, d11},[r6]!            @store the 8 results of row 2
    bgt         inner_loop_8

    sub         r12,r12,r5                  @back to the start of the source row
    subs        r14,r14,#2                  @decrement the ht loop
    sub         r1,r1,r5,lsl #1             @back to the start of the dst row (2 bytes per output)
    add         r12,r12,r2,lsl #1           @two source rows down
    add         r1,r1,r3,lsl #2             @two dst rows down
    bgt         outer_loop_8

    cmp         r7,#0
    mov         r10,r5
    bgt         loop_residue_4              @odd ht: one more row, r12 and r1 already point at it

    b           end_loops


@-----------------------------------------------------------------------------
@ ht is a multiple of 4: 8 source bytes of four rows per pass, pipelined
@   (1)..(4) in the comments is the row inside the group of four
@   r9  => source address of the current 8 byte column in row (1)
@   r0  => src_strd - 6, takes the source pointer from tap 3 of one row to
@          tap 0 of the next row (pu1_src is no longer needed in r0)
@   r4  => store pointer, walks down the four dst rows with stride r8
@-----------------------------------------------------------------------------
outer_loop_ht_4:

    mov         r10,r5                      @2wd

prologue_ht_4:
    mov         r8,r3,lsl #1                @dst_strd in bytes

inner_loop_ht_4:

    mov         r12,r9
    mov         r4,r1

    sub         r0, r2, #6                  @src_strd - 6

    vld1.u32    {d0},[r12],r11              @(1) tap 0
    vld1.u32    {d1},[r12],r11              @(1) tap 1
    vld1.u32    {d2},[r12],r11              @(1) tap 2
    vld1.u32    {d3},[r12],r0               @(1) tap 3, then down one row

    vld1.u32    {d4},[r12],r11              @(2) tap 0
    vld1.u32    {d5},[r12],r11              @(2) tap 1
    vld1.u32    {d6},[r12],r11              @(2) tap 2
    vld1.u32    {d7},[r12],r0               @(2) tap 3, then down one row

    vld1.u32    {d14},[r12],r11             @(3) tap 0
    vmull.u8    q4,d1,d25                   @(1)  tap1 * coeffabs_1

    vld1.u32    {d15},[r12],r11             @(3) tap 1
    vmlsl.u8    q4,d0,d24                   @(1) -tap0 * coeffabs_0

    vld1.u32    {d16},[r12],r11             @(3) tap 2
    vmlal.u8    q4,d2,d26                   @(1) +tap2 * coeffabs_2

    vld1.u32    {d17},[r12],r0              @(3) tap 3, then down one row
    vmlsl.u8    q4,d3,d27                   @(1) -tap3 * coeffabs_3

    vld1.u32    {d18},[r12],r11             @(4) tap 0
    vmull.u8    q5,d5,d25                   @(2)  tap1 * coeffabs_1

    vld1.u32    {d19},[r12],r11             @(4) tap 1
    vmlsl.u8    q5,d4,d24                   @(2) -tap0 * coeffabs_0

    vld1.u32    {d20},[r12],r11             @(4) tap 2
    vmlal.u8    q5,d6,d26                   @(2) +tap2 * coeffabs_2

    vld1.u32    {d21},[r12],r2              @(4) tap 3, r12 is reloaded from r9 before its next use
    vmlsl.u8    q5,d7,d27                   @(2) -tap3 * coeffabs_3

    add         r9,r9,#8                    @next 8 byte column

    subs        r10,r10,#8                  @(prologue) decrement the wd loop
    beq         epilogue                    @only one column: nothing to overlap

core_loop:
    vst1.16     {d8, d9},[r4],r8            @(1) store the result, go down one dst row
    mov         r12,r9

    vld1.u32    {d0},[r12],r11              @(1_1) tap 0 of the next column
    vmull.u8    q6,d15,d25                  @(3)  tap1 * coeffabs_1

    vld1.u32    {d1},[r12],r11              @(1_1) tap 1
    vmlsl.u8    q6,d14,d24                  @(3) -tap0 * coeffabs_0

    vld1.u32    {d2},[r12],r11              @(1_1) tap 2
    vmlal.u8    q6,d16,d26                  @(3) +tap2 * coeffabs_2

    vld1.u32    {d3},[r12],r0               @(1_1) tap 3, then down one row
    vmlsl.u8    q6,d17,d27                  @(3) -tap3 * coeffabs_3

    vst1.16     {d10, d11},[r4],r8          @(2) store the result
    add         r9,r9,#8                    @(core loop) next 8 byte column

    vld1.u32    {d4},[r12],r11              @(2_1) tap 0
    vmull.u8    q11,d19,d25                 @(4)  tap1 * coeffabs_1

    vld1.u32    {d5},[r12],r11              @(2_1) tap 1
    vmlsl.u8    q11,d18,d24                 @(4) -tap0 * coeffabs_0

    vld1.u32    {d6},[r12],r11              @(2_1) tap 2
    vmlal.u8    q11,d20,d26                 @(4) +tap2 * coeffabs_2

    vld1.u32    {d7},[r12],r0               @(2_1) tap 3, then down one row
    vmlsl.u8    q11,d21,d27                 @(4) -tap3 * coeffabs_3

    vst1.16     {d12, d13},[r4],r8          @(3) store the result
    add         r1,r1,#16                   @(core loop) dst moves on by 8 outputs

    vld1.u32    {d14},[r12],r11             @(3_1) tap 0
    vmull.u8    q4,d1,d25                   @(1_1)  tap1 * coeffabs_1

    vld1.u32    {d15},[r12],r11             @(3_1) tap 1
    vmlsl.u8    q4,d0,d24                   @(1_1) -tap0 * coeffabs_0

    vld1.u32    {d16},[r12],r11             @(3_1) tap 2
    vmlal.u8    q4,d2,d26                   @(1_1) +tap2 * coeffabs_2

    vld1.u32    {d17},[r12],r0              @(3_1) tap 3, then down one row
    vmlsl.u8    q4,d3,d27                   @(1_1) -tap3 * coeffabs_3

    vst1.16     {d22, d23}, [r4], r8        @(4) store the result
    subs        r10,r10,#8                  @(core loop) decrement the wd loop

    vmull.u8    q5,d5,d25                   @(2_1)  tap1 * coeffabs_1
    vld1.u32    {d18},[r12],r11             @(4_1) tap 0

    vld1.u32    {d19},[r12],r11             @(4_1) tap 1
    vmlsl.u8    q5,d4,d24                   @(2_1) -tap0 * coeffabs_0

    vld1.u32    {d20},[r12],r11             @(4_1) tap 2
    vmlal.u8    q5,d6,d26                   @(2_1) +tap2 * coeffabs_2

    mov         r4, r1                      @(core loop) store pointer for the next column

    vld1.u32    {d21},[r12],r0              @(4_1) tap 3
    vmlsl.u8    q5,d7,d27                   @(2_1) -tap3 * coeffabs_3

    bgt         core_loop                   @loopback (flags from the subs above)

epilogue:
    vmull.u8    q6,d15,d25                  @(3)  tap1 * coeffabs_1

    vmlsl.u8    q6,d14,d24                  @(3) -tap0 * coeffabs_0

    vmlal.u8    q6,d16,d26                  @(3) +tap2 * coeffabs_2

    vmlsl.u8    q6,d17,d27                  @(3) -tap3 * coeffabs_3

    vst1.16     {d8, d9},[r4], r8           @(1) store the result

    vmull.u8    q11,d19,d25                 @(4)  tap1 * coeffabs_1
    vmlsl.u8    q11,d18,d24                 @(4) -tap0 * coeffabs_0

    vmlal.u8    q11,d20,d26                 @(4) +tap2 * coeffabs_2

    vmlsl.u8    q11,d21,d27                 @(4) -tap3 * coeffabs_3

    vst1.16     {d10, d11},[r4], r8         @(2) store the result

    vst1.16     {d12, d13},[r4], r8         @(3) store the result

    add         r1,r1,#16                   @dst moves on by 8 outputs

    vst1.16     {d22, d23},[r4], r8         @(4) store the result

    sub         r9,r9,r5                    @back to the start of the source row
    subs        r14,r14,#4                  @decrement the ht loop
    sub         r1,r1,r5,lsl #1             @back to the start of the dst row
    add         r9,r9,r2,lsl #2             @four source rows down
    add         r1,r1,r3,lsl #3             @four dst rows down
    bgt         outer_loop_ht_4

    cmp         r7,#0
    mov         r10,r5
    movgt       r12,r9
    movgt       r4,r1
    bgt         loop_residue_4

    b           end_loops


@-----------------------------------------------------------------------------
@ wd is not a multiple of 4: 4 source bytes of two rows per pass, both rows
@ are zipped into one set of d registers and filtered together
@-----------------------------------------------------------------------------
outer_loop_4:
    add         r6,r1,r3,lsl #1             @pi2_dst + dst_strd (in 16 bit units)
    mov         r10,r5                      @2wd
    add         r4,r12,r2                   @pu1_src + src_strd

inner_loop_4:
    vld1.u32    {d0},[r12],r11              @row 1 tap 0
    vld1.u32    {d1},[r12],r11              @row 1 tap 1
    vld1.u32    {d2},[r12],r11              @row 1 tap 2
    vld1.u32    {d3},[r12]                  @row 1 tap 3

    sub         r12,r12,#2                  @3 * 2 - 2: net advance of 4 source bytes

    vld1.u32    {d4},[r4],r11               @row 2 tap 0
    vld1.u32    {d5},[r4],r11               @row 2 tap 1
    vld1.u32    {d6},[r4],r11               @row 2 tap 2
    vld1.u32    {d7},[r4]                   @row 2 tap 3

    sub         r4,r4,#2                    @net advance of 4 source bytes

    vzip.32     d0,d4                       @d0..d3: low word from row 1, high word from row 2
    vzip.32     d1,d5
    vzip.32     d2,d6
    vzip.32     d3,d7

    vmull.u8    q4,d1,d25                   @both rows:  tap1 * coeffabs_1
    vmlsl.u8    q4,d0,d24                   @both rows: -tap0 * coeffabs_0
    vmlal.u8    q4,d2,d26                   @both rows: +tap2 * coeffabs_2
    vmlsl.u8    q4,d3,d27                   @both rows: -tap3 * coeffabs_3

    vst1.32     {d8},[r1]!                  @store the 4 results of row 1 (low half of q4)
    subs        r10,r10,#4                  @decrement the wd loop by 4 source bytes

    vst1.32     {d9},[r6]!                  @store the 4 results of row 2 (high half of q4)

    bgt         inner_loop_4

    sub         r12,r12,r5                  @back to the start of the source row
    subs        r14,r14,#2                  @decrement the ht by 2
    sub         r1,r1,r5,lsl #1             @back to the start of the dst row
    add         r12,r12,r2,lsl #1           @two source rows down
    add         r1,r1,r3,lsl #2             @two dst rows down
    bgt         outer_loop_4

    cmp         r7,#0
    mov         r10,r5
    beq         end_loops                   @even ht: done, otherwise fall into the residue loop


@-----------------------------------------------------------------------------
@ last row of an odd ht: 4 source bytes per pass
@ r12 and r1 must point at the start of that row on entry
@-----------------------------------------------------------------------------
loop_residue_4:

    mov         r10,r5                      @2wd

loop_residue:

    vld1.u32    {d0},[r12],r11              @tap 0
    vld1.u32    {d1},[r12],r11              @tap 1
    vld1.u32    {d2},[r12],r11              @tap 2
    vld1.u32    {d3},[r12]                  @tap 3
    sub         r12, r12, #2                @3 * 2 - 2: net advance of 4 source bytes
    vmull.u8    q4,d1,d25                   @ tap1 * coeffabs_1
    vmlsl.u8    q4,d0,d24                   @-tap0 * coeffabs_0
    vmlal.u8    q4,d2,d26                   @+tap2 * coeffabs_2
    vmlsl.u8    q4,d3,d27                   @-tap3 * coeffabs_3

    vst1.64     {d8 },[r1]                  @store 4 results, the high half of q4 is not used
    subs        r10,r10,#4                  @decrement the wd loop
    add         r1,r1,#8                    @pi2_dst + 4 outputs (8 bytes)

    bgt         loop_residue                @loop again

end_loops:

    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore the registers and return (saved lr into pc)