///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_weighted_pred_bi_default.s
//*
//* @brief
//*  contains function definitions for weighted prediction used in inter
//*  prediction
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_weighted_pred_bi_default()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
//*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
//*  assumptions : the function is optimized considering the fact that width
//*  and height are multiples of 2.
//*
//* @par description:
//*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
//*        >> shift,  where shift = 15 - bitdepth
//*
//*  this implementation hard-codes shift = 7 (rounding constant 0x40). all
//*  additions are 16-bit; the sample additions saturate (sqadd) and the final
//*  shift narrows to unsigned 8 bit with saturation (sqshrun).
//*
//* @param[in] pi2_src1
//*  pointer to source 1
//*
//* @param[in] pi2_src2
//*  pointer to source 2
//*
//* @param[out] pu1_dst
//*  pointer to destination
//*
//* @param[in] src_strd1
//*  source stride 1 (in 16-bit elements)
//*
//* @param[in] src_strd2
//*  source stride 2 (in 16-bit elements)
//*
//* @param[in] dst_strd
//*  destination stride (in bytes)
//*
//* @param[in] lvl_shift1
//*  added before shift and offset
//*
//* @param[in] lvl_shift2
//*  added before shift and offset
//*
//* @param[in] ht
//*  height of the source
//*
//* @param[in] wd
//*  width of the source
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
//                                    word16 *pi2_src2,
//                                    uword8 *pu1_dst,
//                                    word32 src_strd1,
//                                    word32 src_strd2,
//                                    word32 dst_strd,
//                                    word32 lvl_shift1,
//                                    word32 lvl_shift2,
//                                    word32 ht,
//                                    word32 wd)

//**************variables vs registers*****************************************
//    x0  => *pi2_src1
//    x1  => *pi2_src2
//    x2  => *pu1_dst
//    x3  => src_strd1   (converted to bytes on entry)
//    x4  => src_strd2   (converted to bytes on entry)
//    x5  => dst_strd
//    x6  => lvl_shift1  (reused as 2*wd, the source row length in bytes)
//    x7  => lvl_shift2  (reused as the pi2_src1 end-of-row advance)
//    x8  => ht          (loaded from [sp, #0])
//    x9  => wd          (loaded from [sp, #8])
//    x10 => pi2_src2 end-of-row advance (dst column step in the 16-wide path)
//    x11, x12 => row pointers / pi2_src1 post-increments
//    x13, x15, x19 => pi2_src2 increments (16-wide path only)
//    x14 => destination row pointer / destination end-of-row advance
//    x20 => scratch
//    v0  => lvl_shift1 + lvl_shift2 + 0x40, replicated in every 16-bit lane

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_bi_default_av8

.type ihevc_weighted_pred_bi_default_av8, %function

ihevc_weighted_pred_bi_default_av8:

    // ht and wd are read from the stack before sp is moved; the offsets
    // used here place them in consecutive 8-byte slots
    ldr         w8,[sp,#0]                  //load ht
    ldr         w9,[sp,#8]                  //load wd

    stp         x19, x20,[sp,#-16]!         //x19 and x20 are callee-saved and are modified below

    // the strides are 32-bit arguments: the upper halves of x3-x5 are not
    // guaranteed to be extended by the caller, so extend them before any
    // 64-bit address arithmetic
    sxtw        x3,w3                       //src_strd1
    sxtw        x4,w4                       //src_strd2
    sxtw        x5,w5                       //dst_strd

    lsl         x3,x3,#1                    //src_strd1 in bytes (16-bit samples)
    lsl         x4,x4,#1                    //src_strd2 in bytes (16-bit samples)
    dup         v4.8h,w6                    //lvl_shift1 in every lane
    dup         v6.8h,w7                    //lvl_shift2 in every lane
    movi        v0.8h, #0x40                //rounding term 1 << (shift - 1), shift = 7
    add         v4.8h, v4.8h,v6.8h          //lvl_shift1 + lvl_shift2
    add         v0.8h, v0.8h , v4.8h        //v0 = lvl_shift1 + lvl_shift2 + 0x40
    lsl         x6,x9,#1                    //x6 = 2*wd = source row length in bytes
    sub         x20,x6,x3,lsl #2
    neg         x7, x20                     //x7  = 4 rows of src1 (bytes) - 2*wd
    sub         x20,x6,x4,lsl #2
    neg         x10, x20                    //x10 = 4 rows of src2 (bytes) - 2*wd

    cmp         x8,#0                       //check ht == 0
    beq         end_loops                   //if equal, then end the function

// (ht | wd) == 10 selects the 8-wide, 2-row loop and (ht | wd) == 6 the
// 4-wide, 2-row loop.
// NOTE(review): the OR cannot tell e.g. ht=2,wd=8 from ht=8,wd=2; presumably
// callers never pass the latter - confirm against the callers.
chroma_decision:
    orr         x14,x8,x9
    cmp         x14,#10
    beq         outer_loop_chroma_8x2

    cmp         x14,#6
    beq         outer_loop_chroma_4x2

// wd == 24 -> 8-wide loop, wd >= 16 -> 16-wide loop, wd == 12 -> 4-wide loop,
// wd >= 8 -> 8-wide loop, anything else falls through into the 4-wide loop
luma_decision:
    cmp         x9,#24
    beq         outer_loop_8

    cmp         x9,#16
    bge         outer_loop_16

    cmp         x9,#12
    beq         outer_loop_4

    cmp         x9,#8
    bge         outer_loop_8

// 4 samples wide, 4 rows per iteration
outer_loop_4:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function

core_loop_4:
    add         x11,x0,x3                   //x11 = row 1 of src1
    add         x12,x1,x4                   //x12 = row 1 of src2
    ld1         {v6.4h},[x0],#8             //src1 row 0, advance to the next column
    add         x14,x2,x5                   //x14 = row 1 of dst
    ld1         {v7.4h},[x1],#8             //src2 row 0, advance to the next column
    ld1         {v1.4h},[x11],x3            //src1 row 1
    sqadd       v18.4h,v6.4h,v7.4h          //row 0: src1 + src2 (saturating)
    sqadd       v18.4h,v18.4h,v0.4h         //row 0: + level shifts + rounding
    ld1         {v3.4h},[x12],x4            //src2 row 1
    sqadd       v20.4h,v1.4h,v3.4h          //row 1: src1 + src2
    sqadd       v19.4h,v20.4h,v0.4h         //row 1: + level shifts + rounding
    mov         v18.d[1],v19.d[0]           //pack rows 0 and 1 into one vector
    sqshrun     v20.8b, v18.8h,#7           //>> 7 and saturate to u8
    ld1         {v22.4h},[x11],x3           //src1 row 2
    ld1         {v23.4h},[x12],x4           //src2 row 2
    sqadd       v30.4h,v22.4h,v23.4h        //row 2: src1 + src2
    sqadd       v30.4h,v30.4h,v0.4h         //row 2: + level shifts + rounding
    ld1         {v24.4h},[x11],x3           //src1 row 3
    ld1         {v25.4h},[x12],x4           //src2 row 3
    sqadd       v18.4h,v24.4h,v25.4h        //row 3: src1 + src2
    sqadd       v31.4h,v18.4h,v0.4h         //row 3: + level shifts + rounding
    mov         v30.d[1],v31.d[0]           //pack rows 2 and 3 into one vector
    st1         {v20.s}[0],[x2],#4          //store dst row 0, advance to the next column
    st1         {v20.s}[1],[x14],x5         //store dst row 1
    sqshrun     v30.8b, v30.8h,#7           //>> 7 and saturate to u8
    st1         {v30.s}[0],[x14],x5         //store dst row 2
    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
    st1         {v30.s}[1],[x14],x5         //store dst row 3
    bgt         core_loop_4                 //more columns left in this row group

end_core_loop_4:

    subs        x8,x8,#4                    //decrement ht by 4; flags are consumed by the bgt below

    add         x0,x0,x7                    //pi2_src1 += 4 rows - row length
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //pi2_src2 += 4 rows - row length
    sub         x20,x9,x5,lsl #2
    neg         x14, x20                    //4*dst_strd - wd
    add         x2,x2,x14                   //pu1_dst += 4*dst_strd - wd
    bgt         core_loop_4                 //if ht is greater than 0 process the next 4 rows

    b           end_loops


// 4 samples wide, 2 rows per iteration; selected when (ht | wd) == 6
outer_loop_chroma_4x2:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    sub         x20,x6,x3,lsl #1
    neg         x7, x20                     //x7  = 2 rows of src1 (bytes) - 2*wd
    sub         x20,x6,x4,lsl #1
    neg         x10, x20                    //x10 = 2 rows of src2 (bytes) - 2*wd
core_loop_chroma_4x2:
    add         x11,x0,x3                   //x11 = row 1 of src1
    add         x12,x1,x4                   //x12 = row 1 of src2
    ld1         {v6.4h},[x0],#8             //src1 row 0, advance to the next column
    add         x14,x2,x5                   //x14 = row 1 of dst
    ld1         {v7.4h},[x1],#8             //src2 row 0, advance to the next column
    ld1         {v1.4h},[x11],x3            //src1 row 1
    sqadd       v18.4h,v6.4h,v7.4h          //row 0: src1 + src2 (saturating)
    sqadd       v18.4h,v18.4h,v0.4h         //row 0: + level shifts + rounding
    ld1         {v3.4h},[x12],x4            //src2 row 1
    sqadd       v20.4h,v1.4h,v3.4h          //row 1: src1 + src2
    sqadd       v19.4h,v20.4h,v0.4h         //row 1: + level shifts + rounding
    mov         v18.d[1],v19.d[0]           //pack rows 0 and 1 into one vector
    sqshrun     v20.8b, v18.8h,#7           //>> 7 and saturate to u8
    st1         {v20.s}[0],[x2],#4          //store dst row 0, advance to the next column
    st1         {v20.s}[1],[x14],x5         //store dst row 1

    subs        x9,x9,#4                    //decrement wd by 4 and check for 0

    bgt         core_loop_chroma_4x2        //more columns left in this row pair

end_core_loop_chroma_4x2:

    subs        x8,x8,#2                    //decrement ht by 2; flags are consumed by the bgt below

    add         x0,x0,x7                    //pi2_src1 += 2 rows - row length
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //pi2_src2 += 2 rows - row length
    sub         x20,x9,x5,lsl #1
    neg         x14, x20                    //2*dst_strd - wd
    add         x2,x2,x14                   //pu1_dst += 2*dst_strd - wd
    bgt         core_loop_chroma_4x2        //if ht is greater than 0 process the next 2 rows

    b           end_loops


// 8 samples wide, 4 rows per iteration
outer_loop_8:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    add         x11,x0,x3                   //x11 = row 1 of src1
    add         x12,x1,x4                   //x12 = row 1 of src2
core_loop_8:

    ld1         { v24.8h},[x0],#16          //src1 row 0, advance to the next column
    add         x14,x2,x5                   //x14 = row 1 of dst
    ld1         { v26.8h},[x1],#16          //src2 row 0, advance to the next column
    sqadd       v24.8h,v24.8h,v26.8h        //row 0: src1 + src2 (saturating)
    ld1         { v28.8h},[x11],x3          //src1 row 1
    sqadd       v24.8h,v24.8h,v0.8h         //row 0: + level shifts + rounding
    ld1         { v30.8h},[x12],x4          //src2 row 1
    ld1         { v16.8h},[x11],x3          //src1 row 2
    sqadd       v22.8h,v28.8h,v30.8h        //row 1: src1 + src2
    ld1         { v18.8h},[x12],x4          //src2 row 2
    sqadd       v22.8h,v22.8h,v0.8h         //row 1: + level shifts + rounding
    sqshrun     v20.8b, v24.8h,#7           //row 0: >> 7 and saturate to u8
    ld1         { v17.8h},[x11],x3          //src1 row 3
    sqadd       v30.8h,v16.8h,v18.8h        //row 2: src1 + src2
    sqshrun     v21.8b, v22.8h,#7           //row 1: >> 7 and saturate to u8
    ld1         { v29.8h},[x12],x4          //src2 row 3
    sqadd       v30.8h,v30.8h,v0.8h         //row 2: + level shifts + rounding
    st1         {v20.2s},[x2],#8            //store dst row 0, advance to the next column
    sqadd       v1.8h,v17.8h,v29.8h         //row 3: src1 + src2
    st1         {v21.2s},[x14],x5           //store dst row 1
    sqadd       v1.8h,v1.8h,v0.8h           //row 3: + level shifts + rounding
    sqshrun     v30.8b, v30.8h,#7           //row 2: >> 7 and saturate to u8
    sqshrun     v31.8b, v1.8h,#7            //row 3: >> 7 and saturate to u8
    add         x11,x0,x3                   //x11 = row 1 of src1 for the next column
    add         x12,x1,x4                   //x12 = row 1 of src2 for the next column
    st1         {v30.2s},[x14],x5           //store dst row 2
    subs        x9,x9,#8                    //decrement wd by 8 and check for 0
    st1         {v31.2s},[x14],x5           //store dst row 3
    bgt         core_loop_8                 //more columns left in this row group

end_core_loop_8:

    subs        x8,x8,#4                    //decrement ht by 4; flags are consumed by the bgt below

    add         x0,x0,x7                    //pi2_src1 += 4 rows - row length
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //pi2_src2 += 4 rows - row length
    sub         x20,x9,x5,lsl #2
    neg         x14, x20                    //4*dst_strd - wd
    add         x2,x2,x14                   //pu1_dst += 4*dst_strd - wd
    add         x11,x0,x3                   //x11 = row 1 of src1 for the next row group
    add         x12,x1,x4                   //x12 = row 1 of src2 for the next row group

    bgt         core_loop_8                 //if ht is greater than 0 process the next 4 rows
    b           end_loops


// 8 samples wide, 2 rows per iteration; selected when (ht | wd) == 10
outer_loop_chroma_8x2:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    add         x11,x0,x3                   //x11 = row 1 of src1
    add         x12,x1,x4                   //x12 = row 1 of src2
    sub         x20,x6,x3,lsl #1
    neg         x7, x20                     //x7  = 2 rows of src1 (bytes) - 2*wd
    sub         x20,x6,x4,lsl #1
    neg         x10, x20                    //x10 = 2 rows of src2 (bytes) - 2*wd
core_loop_chroma_8x2:

    ld1         { v24.8h},[x0],#16          //src1 row 0, advance to the next column
    add         x14,x2,x5                   //x14 = row 1 of dst
    ld1         { v26.8h},[x1],#16          //src2 row 0, advance to the next column
    sqadd       v24.8h,v24.8h,v26.8h        //row 0: src1 + src2 (saturating)
    ld1         { v28.8h},[x11],x3          //src1 row 1
    sqadd       v24.8h,v24.8h,v0.8h         //row 0: + level shifts + rounding
    ld1         { v30.8h},[x12],x4          //src2 row 1
    // only two rows are processed per iteration, so no third source row is
    // loaded here (it would be unused and lies outside the row pair)
    sqadd       v22.8h,v28.8h,v30.8h        //row 1: src1 + src2
    sqadd       v22.8h,v22.8h,v0.8h         //row 1: + level shifts + rounding
    sqshrun     v20.8b, v24.8h,#7           //row 0: >> 7 and saturate to u8
    sqshrun     v21.8b, v22.8h,#7           //row 1: >> 7 and saturate to u8
    st1         {v20.2s},[x2],#8            //store dst row 0, advance to the next column
    st1         {v21.2s},[x14],x5           //store dst row 1

    add         x11,x0,x3                   //x11 = row 1 of src1 for the next column
    add         x12,x1,x4                   //x12 = row 1 of src2 for the next column
    subs        x9,x9,#8                    //decrement wd by 8 and check for 0

    bgt         core_loop_chroma_8x2        //more columns left in this row pair

end_core_loop_chroma_8x2:

    subs        x8,x8,#2                    //decrement ht by 2; flags are consumed by the bgt below

    add         x0,x0,x7                    //pi2_src1 += 2 rows - row length
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //pi2_src2 += 2 rows - row length
    sub         x20,x9,x5,lsl #1
    neg         x14, x20                    //2*dst_strd - wd
    add         x2,x2,x14                   //pu1_dst += 2*dst_strd - wd
    add         x11,x0,x3                   //x11 = row 1 of src1 for the next row pair
    add         x12,x1,x4                   //x12 = row 1 of src2 for the next row pair

    bgt         core_loop_chroma_8x2        //if ht is greater than 0 process the next 2 rows

    b           end_loops


// 16 samples wide, 2 rows per iteration, software pipelined: the loads for
// the next 16x2 tile are issued while the current tile is being finished.
// x0/x1 walk row 0 (2 x 16 bytes), hop to row 1, then hop back to row 0 of
// the next column.
outer_loop_16:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    sub         x20,x6,x3,lsl #1
    neg         x7, x20                     //x7  = 2 rows of src1 (bytes) - 2*wd
    sub         x20,x6,x4,lsl #1
    neg         x13, x20                    //x13 = 2 rows of src2 (bytes) - 2*wd
    mov         x14,#16
    sub         x10,x14,x5                  //x10 = 16 - dst_strd   : dst row 1 -> row 0, next column
    sub         x11,x3,x14                  //x11 = src1 row - 16   : src1 row 0 -> row 1
    sub         x12,x14,x3                  //x12 = 16 - src1 row   : src1 row 1 -> row 0, next column
    sub         x15,x4,x14                  //x15 = src2 row - 16   : src2 row 0 -> row 1
    sub         x19,x14,x4                  //x19 = 16 - src2 row   : src2 row 1 -> row 0, next column

    sub         x20,x9,x5,lsl #1
    neg         x14, x20                    //x14 = 2*dst_strd - wd


prolog_16:

    ld1         { v2.8h},[x0],#16           //src1 row 0, samples 0-7
    ld1         { v4.8h},[x1],#16           //src2 row 0, samples 0-7
    ld1         { v5.8h},[x0],x11           //src1 row 0, samples 8-15; move to row 1
    ld1         { v17.8h},[x1],x15          //src2 row 0, samples 8-15; move to row 1
    ld1         { v6.8h},[x0],#16           //src1 row 1, samples 0-7
    subs        x9,x9,#16                   //one tile of this row pair consumed; eq => row pair complete
    ld1         { v1.8h},[x1],#16           //src2 row 1, samples 0-7
    sub         x20,x8,#2
    csel        x8, x20, x8,eq              //row pair complete: ht -= 2
    sqadd       v22.8h,v2.8h,v4.8h          //row 0, samples 0-7: src1 + src2 (saturating)
    ld1         { v29.8h},[x0],x12          //src1 row 1, samples 8-15; back to row 0, next column
    sqadd       v28.8h,v5.8h,v17.8h         //row 0, samples 8-15: src1 + src2
    ld1         { v16.8h},[x1],x19          //src2 row 1, samples 8-15; back to row 0, next column
    add         x20,x0,x7
    csel        x0, x20, x0,eq              //row pair complete: pi2_src1 to the next row pair
    add         x20,x1,x13
    csel        x1, x20, x1,eq              //row pair complete: pi2_src2 to the next row pair
    sqadd       v24.8h,v6.8h,v1.8h          //row 1, samples 0-7: src1 + src2
    sqadd       v26.8h,v29.8h,v16.8h        //row 1, samples 8-15: src1 + src2
    cmp         x8,#0                       //single 16x2 tile: nothing further to load
    beq         epilog_16

    // loads for the second tile are issued only after the ht check above so
    // that no data beyond the last row pair is read
    ld1         { v2.8h},[x0],#16           //src1 row 0, samples 0-7
    ld1         { v4.8h},[x1],#16           //src2 row 0, samples 0-7
    sqadd       v22.8h,v22.8h,v0.8h         //+ level shifts + rounding
    ld1         { v5.8h},[x0],x11           //src1 row 0, samples 8-15; move to row 1
    sqadd       v28.8h,v28.8h,v0.8h
    ld1         { v17.8h},[x1],x15          //src2 row 0, samples 8-15; move to row 1
    sqadd       v24.8h,v24.8h,v0.8h
    ld1         { v6.8h},[x0],#16           //src1 row 1, samples 0-7
    sqadd       v30.8h,v26.8h,v0.8h
    sqshrun     v20.8b, v22.8h,#7           //row 0, samples 0-7: >> 7 and saturate to u8
    ld1         { v1.8h},[x1],#16           //src2 row 1, samples 0-7
    sqshrun     v21.8b, v28.8h,#7           //row 0, samples 8-15
    ld1         { v29.8h},[x0],x12          //src1 row 1, samples 8-15; back to row 0, next column
    sqshrun     v26.8b, v24.8h,#7           //row 1, samples 0-7
    ld1         { v16.8h},[x1],x19          //src2 row 1, samples 8-15; back to row 0, next column
    sqshrun     v27.8b, v30.8h,#7           //row 1, samples 8-15


// on entry: v20/v21 and v26/v27 hold the finished tile that is about to be
// stored; v2,v4,v5,v17,v6,v1,v29,v16 hold the raw samples of the next tile
core_loop_16:

    cmp         x9,#0                       //eq => the tile being stored ends its row pair
    sqadd       v22.8h,v2.8h,v4.8h          //next tile, row 0, samples 0-7: src1 + src2
    asr         x20,x6,#1
    csel        x9,x20,x9,eq                //row pair ended: reload wd
    mov         v20.d[1],v21.d[0]           //assemble 16 bytes of dst row 0
    mov         v26.d[1],v27.d[0]           //assemble 16 bytes of dst row 1
    st1         { v20.4s},[x2],x5           //store dst row 0; move to row 1
    sqadd       v28.8h,v5.8h,v17.8h         //next tile, row 0, samples 8-15
    st1         { v26.4s},[x2],x10          //store dst row 1; back to row 0, next column
    add         x20,x2,x14
    csel        x2, x20, x2,eq              //row pair ended: pu1_dst to the next row pair
    sqadd       v24.8h,v6.8h,v1.8h          //next tile, row 1, samples 0-7
    subs        x9,x9,#16                   //account for the tile already loaded; eq => it ends its row pair
    add         x20,x0,x7
    csel        x0, x20, x0,eq              //pi2_src1 to the next row pair
    sqadd       v26.8h,v29.8h,v16.8h        //next tile, row 1, samples 8-15

    add         x20,x1,x13
    csel        x1, x20, x1,eq              //pi2_src2 to the next row pair
    sub         x20,x8,#2
    csel        x8,x20,x8,eq                //ht -= 2 when a row pair has been fully loaded
    cmp         x8,#0
    beq         epilog_16                   //all rows loaded: finish the pending tile


    sqadd       v22.8h,v22.8h,v0.8h         //+ level shifts + rounding
    ld1         { v2.8h},[x0],#16           //src1 row 0, samples 0-7
    sqadd       v28.8h,v28.8h,v0.8h
    ld1         { v4.8h},[x1],#16           //src2 row 0, samples 0-7
    sqadd       v24.8h,v24.8h,v0.8h
    ld1         { v5.8h},[x0],x11           //src1 row 0, samples 8-15; move to row 1
    sqadd       v30.8h,v26.8h,v0.8h
    ld1         { v17.8h},[x1],x15          //src2 row 0, samples 8-15; move to row 1
    sqshrun     v20.8b, v22.8h,#7           //row 0, samples 0-7: >> 7 and saturate to u8
    ld1         { v6.8h},[x0],#16           //src1 row 1, samples 0-7
    sqshrun     v21.8b, v28.8h,#7           //row 0, samples 8-15
    ld1         { v1.8h},[x1],#16           //src2 row 1, samples 0-7
    sqshrun     v26.8b, v24.8h,#7           //row 1, samples 0-7
    ld1         { v29.8h},[x0],x12          //src1 row 1, samples 8-15; back to row 0, next column
    sqshrun     v27.8b, v30.8h,#7           //row 1, samples 8-15
    ld1         { v16.8h},[x1],x19          //src2 row 1, samples 8-15; back to row 0, next column


    b           core_loop_16


// finish and store the last tile; v22, v28, v24, v26 hold its src1 + src2 sums
epilog_16:

    sqadd       v22.8h,v22.8h,v0.8h         //+ level shifts + rounding
    sqadd       v28.8h,v28.8h,v0.8h
    sqadd       v24.8h,v24.8h,v0.8h
    sqadd       v30.8h,v26.8h,v0.8h
    sqshrun     v20.8b, v22.8h,#7           //>> 7 and saturate to u8
    sqshrun     v21.8b, v28.8h,#7
    sqshrun     v26.8b, v24.8h,#7
    sqshrun     v27.8b, v30.8h,#7
    mov         v20.d[1],v21.d[0]           //assemble 16 bytes of dst row 0
    mov         v26.d[1],v27.d[0]           //assemble 16 bytes of dst row 1
    st1         { v20.4s},[x2],x5           //store dst row 0; move to row 1
    st1         { v26.4s},[x2]              //store dst row 1


end_core_loop_16:


end_loops:
    ldp         x19, x20,[sp],#16           //restore the callee-saved registers

    ret