//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_vert_qpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction vertical quarter pel interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_vert_qpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
///**
//*******************************************************************************
//*
//* @brief
//*     Quarter pel interprediction luma filter for vertical input
//*
//* @par Description:
//*  Applies the 6 tap filter (1, -5, 20, 20, -5, 1) vertically, i.e. over the
//*  six source rows pu1_src - 2*src_strd .. pu1_src + 3*src_strd for the first
//*  output row. The sum is rounded, shifted right by 5 and clipped to 8 bits.
//*  Each filtered sample is then averaged (with rounding) with the sample at
//*  the same column of the row starting at
//*      pu1_src + ((dydx & 12) >> 3) * src_strd
//*  to give the quarter pel value.
//*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array. The 16 wide path produces 8 rows, or 16 rows
//*  when ht == 16. The 8 wide path produces 4 rows when ht == 4, 16 rows when
//*  ht == 16 and 8 rows otherwise. The 4 wide path produces 4 rows when
//*  ht == 4 and 8 rows otherwise.
//*
//* @param[in] wd
//*  integer width of the array. wd == 8 and wd == 4 select the 8 and 4 wide
//*  paths; every other value is handled by the 16 wide path.
//*
//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
//*
//* @param[in] dydx: x and y reference offset for qpel calculations.
//*  Only bits 2 and 3 are read, and only bit 3 survives the shift.
//*
//* @returns
//*
//* @remarks
//*  NOTE(review): the 8 wide and 4 wide paths load the next source row before
//*  testing whether more output rows are required, so they read one source
//*  row more than the filter needs. Presumably callers supply padded reference
//*  buffers - confirm.
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_vert_qpel_av8 (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd,
//                            UWORD8* pu1_tmp,
//                            UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ht
//    w5 => wd
//    x6 => *pu1_tmp (never read)
//    w7 => dydx
//
//    x7  (after setup) => pointer to the row averaged with the filter output
//    x14 => loop bookkeeping derived from ht
//    v22 => 20 in every halfword lane, v24 => 5 in every halfword lane
//
// Notation used in the comments below:
//    rN    N-th source row of the current 8 row pass; r0 is the row at
//          pu1_src - 2*src_strd for the first pass
//    outK  K-th output row of the current pass, built from rK .. rK+5 as
//          (rK + rK+5) + 20*(rK+2 + rK+3) - 5*(rK+1 + rK+4)
//    lo/hi pixels 0-7 / pixels 8-15 of a 16 wide row

.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_vert_qpel_av8

ih264_inter_pred_luma_vert_qpel_av8:

    push_v_regs                             // macro from ih264_neon_macros.s
    stp       x19, x20, [sp, #-16]!         // restored at end_func; not used in between
    sxtw      x2, w2                        // sign extend the 32 bit arguments
    sxtw      x3, w3
    sxtw      x4, w4
    sxtw      x5, w5


    and       x7, x7, #12                   // keep the y-offset bits of dydx
    lsr       x7, x7, #3                    // (dydx & 12) >> 3 : 0 or 1
    mul       x7, x2, x7
    add       x7, x0, x7                    // pu1_src + (y_offset>>1)*src_strd
    sub       x14, x4, #16                  // x14 = ht - 16
    movi      v22.8h, #20                   // filter coeff 20 (0x14)
    sub       x0, x0, x2, lsl #1            // pu1_src - 2*src_strd
    subs      x12, x5, #8                   // if wd == 8 branch to loop_8_start
    movi      v24.8h, #5                    // filter coeff 5 (movi leaves the flags intact)
    beq       loop_8_start

    subs      x12, x5, #4                   // if wd == 4 branch to loop_4_start
    beq       loop_4_start


    ld1       {v0.2s, v1.2s}, [x0], x2      // r0
    ld1       {v2.2s, v3.2s}, [x0], x2      // r1
    ld1       {v4.2s, v5.2s}, [x0], x2      // r2
    ld1       {v6.2s, v7.2s}, [x0], x2      // r3
    add       x14, x14, #1                  // x14 = ht - 15, for checking loop
    ld1       {v8.2s, v9.2s}, [x0], x2      // r4
    uaddl     v12.8h, v4.8b, v6.8b          // out0 lo: r2 + r3
    ld1       {v10.2s, v11.2s}, [x0], x2    // r5

loop_16:                                    // when wd=16

    uaddl     v14.8h, v0.8b, v10.8b         // out0 lo: r0 + r5
    uaddl     v16.8h, v2.8b, v8.8b          // out0 lo: r1 + r4
    mla       v14.8h, v12.8h , v22.8h       // out0 lo += 20 * (r2 + r3)
    uaddl     v20.8h, v1.8b, v11.8b         // out0 hi: r0 + r5
    uaddl     v18.8h, v5.8b, v7.8b          // out0 hi: r2 + r3
    mla       v20.8h, v18.8h , v22.8h       // out0 hi += 20 * (r2 + r3)
    ld1       {v0.2s, v1.2s}, [x0], x2      // r6
    uaddl     v26.8h, v3.8b, v9.8b          // out0 hi: r1 + r4
    uaddl     v12.8h, v6.8b, v8.8b          // out1 lo: r3 + r4
    mls       v14.8h, v16.8h , v24.8h       // out0 lo -= 5 * (r1 + r4)
    uaddl     v16.8h, v2.8b, v0.8b          // out1 lo: r1 + r6
    uaddl     v18.8h, v4.8b, v10.8b         // out1 lo: r2 + r5
    mla       v16.8h, v12.8h , v22.8h       // out1 lo += 20 * (r3 + r4)
    mls       v20.8h, v26.8h , v24.8h       // out0 hi -= 5 * (r1 + r4)
    uaddl     v26.8h, v5.8b, v11.8b         // out1 hi: r2 + r5
    uaddl     v12.8h, v7.8b, v9.8b          // out1 hi: r3 + r4
    sqrshrun  v30.8b, v14.8h, #5            // out0 lo = CLIP_U8((sum + 16) >> 5)
    uaddl     v14.8h, v3.8b, v1.8b          // out1 hi: r1 + r6
    ld1       {v2.2s, v3.2s}, [x0], x2      // r7
    mla       v14.8h, v12.8h , v22.8h       // out1 hi += 20 * (r3 + r4)
    mls       v16.8h, v18.8h , v24.8h       // out1 lo -= 5 * (r2 + r5)
    sqrshrun  v31.8b, v20.8h, #5            // out0 hi = CLIP_U8((sum + 16) >> 5)
    ld1       {v20.2s, v21.2s}, [x7], x2    // Load for interpolation row 0
    urhadd    v30.16b, v20.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v21.16b , v31.16b    // Interpolation to obtain qpel value
    uaddl     v18.8h, v4.8b, v2.8b          // out2 lo: r2 + r7
    uaddl     v12.8h, v8.8b, v10.8b         // out2 lo: r4 + r5
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 0
    mla       v18.8h, v12.8h , v22.8h       // out2 lo += 20 * (r4 + r5)
    uaddl     v20.8h, v6.8b, v0.8b          // out2 lo: r3 + r6
    mls       v14.8h, v26.8h , v24.8h       // out1 hi -= 5 * (r2 + r5)
    sqrshrun  v30.8b, v16.8h, #5            // out1 lo
    uaddl     v12.8h, v9.8b, v11.8b         // out2 hi: r4 + r5
    uaddl     v16.8h, v5.8b, v3.8b          // out2 hi: r2 + r7
    uaddl     v26.8h, v7.8b, v1.8b          // out2 hi: r3 + r6
    mla       v16.8h, v12.8h , v22.8h       // out2 hi += 20 * (r4 + r5)
    mls       v18.8h, v20.8h , v24.8h       // out2 lo -= 5 * (r3 + r6)
    ld1       {v4.2s, v5.2s}, [x0], x2      // r8
    sqrshrun  v31.8b, v14.8h, #5            // out1 hi
    ld1       {v14.2s, v15.2s}, [x7], x2    // Load for interpolation row 1
    uaddl     v12.8h, v10.8b, v0.8b         // out3 lo: r5 + r6
    urhadd    v30.16b, v14.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v15.16b , v31.16b    // Interpolation to obtain qpel value
    uaddl     v14.8h, v6.8b, v4.8b          // out3 lo: r3 + r8
    uaddl     v20.8h, v8.8b, v2.8b          // out3 lo: r4 + r7
    mla       v14.8h, v12.8h , v22.8h       // out3 lo += 20 * (r5 + r6)
    mls       v16.8h, v26.8h , v24.8h       // out2 hi -= 5 * (r3 + r6)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 1
    sqrshrun  v30.8b, v18.8h, #5            // out2 lo
    uaddl     v18.8h, v7.8b, v5.8b          // out3 hi: r3 + r8
    uaddl     v12.8h, v11.8b, v1.8b         // out3 hi: r5 + r6
    mla       v18.8h, v12.8h , v22.8h       // out3 hi += 20 * (r5 + r6)
    uaddl     v26.8h, v9.8b, v3.8b          // out3 hi: r4 + r7
    mls       v14.8h, v20.8h , v24.8h       // out3 lo -= 5 * (r4 + r7)
    ld1       {v6.2s, v7.2s}, [x0], x2      // r9
    sqrshrun  v31.8b, v16.8h, #5            // out2 hi
    ld1       {v16.2s, v17.2s}, [x7], x2    // Load for interpolation row 2
    mls       v18.8h, v26.8h , v24.8h       // out3 hi -= 5 * (r4 + r7)
    urhadd    v30.16b, v16.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v17.16b , v31.16b    // Interpolation to obtain qpel value
    uaddl     v12.8h, v0.8b, v2.8b          // out4 lo: r6 + r7
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 2
    uaddl     v16.8h, v10.8b, v4.8b         // out4 lo: r5 + r8
    uaddl     v20.8h, v9.8b, v7.8b          // out4 hi: r4 + r9
    sqrshrun  v30.8b, v14.8h, #5            // out3 lo
    uaddl     v26.8h, v5.8b, v11.8b         // out4 hi: r8 + r5
    uaddl     v14.8h, v8.8b, v6.8b          // out4 lo: r4 + r9
    sqrshrun  v31.8b, v18.8h, #5            // out3 hi
    ld1       {v18.2s, v19.2s}, [x7], x2    // Load for interpolation row 3
    mla       v14.8h, v12.8h , v22.8h       // out4 lo += 20 * (r6 + r7)
    urhadd    v30.16b, v18.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v19.16b , v31.16b    // Interpolation to obtain qpel value
    uaddl     v18.8h, v1.8b, v3.8b          // out4 hi: r6 + r7
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 3
    // 4 rows processed
    mla       v20.8h, v18.8h , v22.8h       // out4 hi += 20 * (r6 + r7)
    ld1       {v8.2s, v9.2s}, [x0], x2      // r10
    uaddl     v12.8h, v2.8b, v4.8b          // out5 lo: r7 + r8
    uaddl     v18.8h, v3.8b, v5.8b          // out5 hi: r7 + r8
    mls       v14.8h, v16.8h , v24.8h       // out4 lo -= 5 * (r5 + r8)
    uaddl     v28.8h, v9.8b, v11.8b         // out5 hi: r10 + r5
    uaddl     v16.8h, v6.8b, v0.8b          // out5 lo: r9 + r6
    mla       v28.8h, v18.8h , v22.8h       // out5 hi += 20 * (r7 + r8)
    mls       v20.8h, v26.8h , v24.8h       // out4 hi -= 5 * (r5 + r8)
    uaddl     v26.8h, v1.8b, v7.8b          // out5 hi: r6 + r9
    uaddl     v18.8h, v5.8b, v7.8b          // out6 hi: r8 + r9
    sqrshrun  v30.8b, v14.8h, #5            // out4 lo
    uaddl     v14.8h, v8.8b, v10.8b         // out5 lo: r10 + r5
    sqrshrun  v31.8b, v20.8h, #5            // out4 hi
    ld1       {v20.2s, v21.2s}, [x7], x2    // Load for interpolation row 4
    ld1       {v10.2s, v11.2s}, [x0], x2    // r11
    urhadd    v30.16b, v20.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v21.16b , v31.16b    // Interpolation to obtain qpel value
    mls       v28.8h, v26.8h , v24.8h       // out5 hi -= 5 * (r6 + r9)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 4
    mla       v14.8h, v12.8h , v22.8h       // out5 lo += 20 * (r7 + r8)
    uaddl     v20.8h, v11.8b, v1.8b         // out6 hi: r11 + r6
    uaddl     v26.8h, v3.8b, v9.8b          // out6 hi: r7 + r10
    mla       v20.8h, v18.8h , v22.8h       // out6 hi += 20 * (r8 + r9)
    uaddl     v12.8h, v6.8b, v4.8b          // out6 lo: r9 + r8
    uaddl     v18.8h, v7.8b, v9.8b          // out7 hi: r9 + r10
    sqrshrun  v31.8b, v28.8h, #5            // out5 hi
    mls       v14.8h, v16.8h , v24.8h       // out5 lo -= 5 * (r6 + r9)
    uaddl     v16.8h, v8.8b, v2.8b          // out6 lo: r10 + r7
    sqrshrun  v30.8b, v14.8h, #5            // out5 lo
    ld1       {v14.2s, v15.2s}, [x7], x2    // Load for interpolation row 5
    mls       v20.8h, v26.8h , v24.8h       // out6 hi -= 5 * (r7 + r10)
    urhadd    v30.16b, v14.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v15.16b , v31.16b    // Interpolation to obtain qpel value
    uaddl     v14.8h, v10.8b, v0.8b         // out6 lo: r11 + r6
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 5
    mla       v14.8h, v12.8h , v22.8h       // out6 lo += 20 * (r8 + r9)
    ld1       {v0.2s, v1.2s}, [x0], x2      // r12
    uaddl     v26.8h, v5.8b, v11.8b         // out7 hi: r8 + r11
    uaddl     v12.8h, v8.8b, v6.8b          // out7 lo: r10 + r9
    uaddl     v28.8h, v0.8b, v2.8b          // out7 lo: r12 + r7
    sqrshrun  v31.8b, v20.8h, #5            // out6 hi
    mla       v28.8h, v12.8h , v22.8h       // out7 lo += 20 * (r9 + r10)
    uaddl     v20.8h, v1.8b, v3.8b          // out7 hi: r12 + r7
    mls       v14.8h, v16.8h , v24.8h       // out6 lo -= 5 * (r7 + r10)
    mla       v20.8h, v18.8h , v22.8h       // out7 hi += 20 * (r9 + r10)
    uaddl     v16.8h, v10.8b, v4.8b         // out7 lo: r11 + r8
    sqrshrun  v30.8b, v14.8h, #5            // out6 lo
    ld1       {v14.2s, v15.2s}, [x7], x2    // Load for interpolation row 6
    mov       v2.8b, v6.8b                  // v2/v3 = r9 (r1 of the next pass)
    mov       v3.8b, v7.8b
    urhadd    v30.16b, v14.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v15.16b , v31.16b    // Interpolation to obtain qpel value

    mls       v28.8h, v16.8h , v24.8h       // out7 lo -= 5 * (r8 + r11)
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 6
    sqrshrun  v30.8b, v28.8h, #5            // out7 lo
    // swp is a macro from ih264_neon_macros.s; presumably it exchanges its
    // two operands - the register notes below rely on that.
    swp       v0.8b, v4.8b                  // v0 = r8, v4 = r12
    swp       v1.8b, v5.8b

    mls       v20.8h, v26.8h , v24.8h       // out7 hi -= 5 * (r8 + r11)
    mov       v6.8b, v10.8b                 // v6/v7 = r11 (r3 of the next pass)
    mov       v7.8b, v11.8b
    subs      x12, x14, #1                  // first pass: ht - 16, zero only if ht == 16
    swp       v4.8b, v8.8b                  // v4 = r10, v8 = r12
    swp       v5.8b, v9.8b
    sqrshrun  v31.8b, v20.8h, #5            // out7 hi
    ld1       {v20.2s, v21.2s}, [x7], x2    // Load for interpolation row 7
    urhadd    v30.16b, v20.16b , v30.16b    // Interpolation to obtain qpel value
    urhadd    v31.16b, v21.16b , v31.16b    // Interpolation to obtain qpel value
    st1       {v30.2s, v31.2s}, [x1], x3    // store row 7
    bne       end_func                      // done unless ht == 16 on the first pass
    add       x14, x14, #1                  // makes the subs above non-zero on the second pass
    ld1       {v10.2s, v11.2s}, [x0], x2    // r13 (r5 of the next pass)
    uaddl     v12.8h, v4.8b, v6.8b          // next out0 lo: r2 + r3 of the next pass

    b         loop_16                       // second pass for ht == 16

loop_8_start:
//// 8 wide path

    ld1       {v0.2s}, [x0], x2             // r0
    ld1       {v1.2s}, [x0], x2             // r1
    ld1       {v2.2s}, [x0], x2             // r2
    ld1       {v3.2s}, [x0], x2             // r3
    add       x14, x14, #1                  // x14 = ht - 15, for checking loop
    ld1       {v4.2s}, [x0], x2             // r4
    ld1       {v5.2s}, [x0], x2             // r5

loop_8:
    uaddl     v6.8h, v2.8b, v3.8b           // out0: r2 + r3
    uaddl     v8.8h, v0.8b, v5.8b           // out0: r0 + r5
    uaddl     v10.8h, v1.8b, v4.8b          // out0: r1 + r4
    mla       v8.8h, v6.8h , v22.8h         // out0 += 20 * (r2 + r3)
    ld1       {v6.2s}, [x0], x2             // r6
    uaddl     v14.8h, v3.8b, v4.8b          // out1: r3 + r4
    uaddl     v16.8h, v1.8b, v6.8b          // out1: r1 + r6
    uaddl     v18.8h, v2.8b, v5.8b          // out1: r2 + r5
    mls       v8.8h, v10.8h , v24.8h        // out0 -= 5 * (r1 + r4)
    mla       v16.8h, v14.8h , v22.8h       // out1 += 20 * (r3 + r4)
    ld1       {v7.2s}, [x0], x2             // r7
    uaddl     v20.8h, v4.8b, v5.8b          // out2: r4 + r5
    uaddl     v12.8h, v2.8b, v7.8b          // out2: r2 + r7
    uaddl     v10.8h, v3.8b, v6.8b          // out2: r3 + r6
    mls       v16.8h, v18.8h , v24.8h       // out1 -= 5 * (r2 + r5)
    sqrshrun  v26.8b, v8.8h, #5             // out0 = CLIP_U8((sum + 16) >> 5)
    mla       v12.8h, v20.8h , v22.8h       // out2 += 20 * (r4 + r5)
    ld1       {v8.2s}, [x7], x2             // Load value for interpolation (row0)
    ld1       {v9.2s}, [x7], x2             // Load value for interpolation (row1)
    ld1       {v0.2s}, [x0], x2             // r8
    uaddl     v14.8h, v5.8b, v6.8b          // out3: r5 + r6
    sqrshrun  v27.8b, v16.8h, #5            // out1
    urhadd    v26.16b, v8.16b , v26.16b     // Interpolation step for qpel calculation
    urhadd    v27.16b, v9.16b , v27.16b     // Interpolation step for qpel calculation

    uaddl     v20.8h, v3.8b, v0.8b          // out3: r3 + r8
    mls       v12.8h, v10.8h , v24.8h       // out2 -= 5 * (r3 + r6)
    st1       {v26.2s}, [x1], x3            // store row 0
    uaddl     v18.8h, v4.8b, v7.8b          // out3: r4 + r7
    mla       v20.8h, v14.8h , v22.8h       // out3 += 20 * (r5 + r6)
    st1       {v27.2s}, [x1], x3            // store row 1
    sqrshrun  v28.8b, v12.8h, #5            // out2
    mls       v20.8h, v18.8h , v24.8h       // out3 -= 5 * (r4 + r7)
    ld1       {v12.2s}, [x7], x2            // Load value for interpolation (row2)
    ld1       {v13.2s}, [x7], x2            // Load value for interpolation (row3)
    ld1       {v1.2s}, [x0], x2             // r9 (loaded before the ht == 4 check)
    sqrshrun  v29.8b, v20.8h, #5            // out3
    subs      x9, x4, #4                    // ht == 4 ?
    urhadd    v28.16b, v12.16b , v28.16b    // Interpolation step for qpel calculation
    urhadd    v29.16b, v13.16b , v29.16b    // Interpolation step for qpel calculation
    st1       {v28.2s}, [x1], x3            // store row 2
    st1       {v29.2s}, [x1], x3            // store row 3
    beq       end_func                      // Branch if height==4
    uaddl     v14.8h, v6.8b, v7.8b          // out4: r6 + r7
    uaddl     v16.8h, v0.8b, v5.8b          // out4: r8 + r5
    uaddl     v18.8h, v1.8b, v4.8b          // out4: r9 + r4 (accumulator)
    mla       v18.8h, v14.8h , v22.8h       // out4 += 20 * (r6 + r7)
    ld1       {v2.2s}, [x0], x2             // r10
    mls       v18.8h, v16.8h , v24.8h       // out4 -= 5 * (r5 + r8)
    uaddl     v8.8h, v0.8b, v7.8b           // out5: r8 + r7
    uaddl     v10.8h, v1.8b, v6.8b          // out5: r9 + r6
    uaddl     v12.8h, v2.8b, v5.8b          // out5: r10 + r5 (accumulator)
    sqrshrun  v26.8b, v18.8h, #5            // out4
    mla       v12.8h, v8.8h , v22.8h        // out5 += 20 * (r7 + r8)
    ld1       {v18.2s}, [x7], x2            // Load value for interpolation (row4)
    ld1       {v19.2s}, [x7], x2            // Load value for interpolation (row5)
    ld1       {v3.2s}, [x0], x2             // r11
    mls       v12.8h, v10.8h , v24.8h       // out5 -= 5 * (r6 + r9)
    sqrshrun  v27.8b, v12.8h, #5            // out5
    urhadd    v26.16b, v18.16b , v26.16b    // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b , v27.16b    // Interpolation step for qpel calculation

    st1       {v26.2s}, [x1], x3            // store row 4
    st1       {v27.2s}, [x1], x3            // store row 5
    uaddl     v14.8h, v0.8b, v1.8b          // out6: r8 + r9
    uaddl     v16.8h, v2.8b, v7.8b          // out6: r10 + r7
    uaddl     v18.8h, v3.8b, v6.8b          // out6: r11 + r6 (accumulator)
    mla       v18.8h, v14.8h , v22.8h       // out6 += 20 * (r8 + r9)
    ld1       {v4.2s}, [x0], x2             // r12
    mls       v18.8h, v16.8h , v24.8h       // out6 -= 5 * (r7 + r10)
    uaddl     v8.8h, v2.8b, v1.8b           // out7: r10 + r9
    uaddl     v10.8h, v3.8b, v0.8b          // out7: r11 + r8
    uaddl     v12.8h, v4.8b, v7.8b          // out7: r12 + r7 (accumulator)
    sqrshrun  v26.8b, v18.8h, #5            // out6
    mla       v12.8h, v8.8h , v22.8h        // out7 += 20 * (r9 + r10)
    ld1       {v18.2s}, [x7], x2            // Load value for interpolation (row6)
    ld1       {v19.2s}, [x7], x2            // Load value for interpolation (row7)
    ld1       {v5.2s}, [x0], x2             // r13 (r5 of the next pass)
    mls       v12.8h, v10.8h , v24.8h       // out7 -= 5 * (r8 + r11)
    sqrshrun  v27.8b, v12.8h, #5            // out7
    urhadd    v26.16b, v18.16b , v26.16b    // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b , v27.16b    // Interpolation step for qpel calculation

    subs      x12, x14, #1                  // first pass: ht - 16, zero only if ht == 16
    st1       {v26.2s}, [x1], x3            // store row 6
    st1       {v27.2s}, [x1], x3            // store row 7
    add       x14, x14, #1                  // add does not touch the flags set by subs
    beq       loop_8                        // second pass for ht == 16; v0-v5 already hold its r0-r5

    b         end_func


loop_4_start:
//// 4 wide path: every source row is loaded as one 32 bit lane. The upper
//// lanes of the registers hold stale data, which is harmless because only
//// lane s[0] of each result is stored.


    ld1       {v0.s}[0], [x0], x2           // r0
    ld1       {v1.s}[0], [x0], x2           // r1
    ld1       {v2.s}[0], [x0], x2           // r2
    ld1       {v3.s}[0], [x0], x2           // r3
    ld1       {v4.s}[0], [x0], x2           // r4
    ld1       {v5.s}[0], [x0], x2           // r5

    uaddl     v6.8h, v2.8b, v3.8b           // out0: r2 + r3
    uaddl     v8.8h, v0.8b, v5.8b           // out0: r0 + r5
    uaddl     v10.8h, v1.8b, v4.8b          // out0: r1 + r4
    mla       v8.8h, v6.8h , v22.8h         // out0 += 20 * (r2 + r3)
    // Lane load like every other row of this path, so that no bytes beyond
    // the 4 pixel width are read (an 8 byte load here would read 4 bytes
    // past the block).
    ld1       {v6.s}[0], [x0], x2           // r6
    uaddl     v14.8h, v3.8b, v4.8b          // out1: r3 + r4
    uaddl     v16.8h, v1.8b, v6.8b          // out1: r1 + r6
    uaddl     v18.8h, v2.8b, v5.8b          // out1: r2 + r5
    mls       v8.8h, v10.8h , v24.8h        // out0 -= 5 * (r1 + r4)
    ld1       {v7.s}[0], [x0], x2           // r7
    mla       v16.8h, v14.8h , v22.8h       // out1 += 20 * (r3 + r4)
    uaddl     v20.8h, v4.8b, v5.8b          // out2: r4 + r5
    uaddl     v12.8h, v2.8b, v7.8b          // out2: r2 + r7
    uaddl     v10.8h, v3.8b, v6.8b          // out2: r3 + r6
    mls       v16.8h, v18.8h , v24.8h       // out1 -= 5 * (r2 + r5)
    sqrshrun  v26.8b, v8.8h, #5             // out0 = CLIP_U8((sum + 16) >> 5)
    ld1       {v8.s}[0], [x7], x2           // Load value for interpolation - row 0
    ld1       {v9.s}[0], [x7], x2           // Load value for interpolation - row 1
    mla       v12.8h, v20.8h , v22.8h       // out2 += 20 * (r4 + r5)
    ld1       {v0.s}[0], [x0], x2           // r8
    uaddl     v14.8h, v5.8b, v6.8b          // out3: r5 + r6
    sqrshrun  v27.8b, v16.8h, #5            // out1
    uaddl     v20.8h, v3.8b, v0.8b          // out3: r3 + r8
    urhadd    v26.16b, v26.16b , v8.16b     // Interpolation step for qpel calculation
    urhadd    v27.16b, v27.16b , v9.16b     // Interpolation step for qpel calculation

    mls       v12.8h, v10.8h , v24.8h       // out2 -= 5 * (r3 + r6)
    st1       {v26.s}[0], [x1], x3          // store row 0
    uaddl     v18.8h, v4.8b, v7.8b          // out3: r4 + r7
    mla       v20.8h, v14.8h , v22.8h       // out3 += 20 * (r5 + r6)
    st1       {v27.s}[0], [x1], x3          // store row 1
    sqrshrun  v28.8b, v12.8h, #5            // out2
    ld1       {v12.s}[0], [x7], x2          // Load value for interpolation - row 2
    ld1       {v13.s}[0], [x7], x2          // Load value for interpolation - row 3

    mls       v20.8h, v18.8h , v24.8h       // out3 -= 5 * (r4 + r7)
    ld1       {v1.s}[0], [x0], x2           // r9 (loaded before the ht == 4 check)
    sqrshrun  v29.8b, v20.8h, #5            // out3
    urhadd    v28.16b, v12.16b , v28.16b    // Interpolation step for qpel calculation
    urhadd    v29.16b, v13.16b , v29.16b    // Interpolation step for qpel calculation

    st1       {v28.s}[0], [x1], x3          // store row 2
    st1       {v29.s}[0], [x1], x3          // store row 3

    subs      x9, x4, #4                    // ht == 4 ?
    beq       end_func                      // Branch if height==4


    uaddl     v14.8h, v6.8b, v7.8b          // out4: r6 + r7
    uaddl     v16.8h, v0.8b, v5.8b          // out4: r8 + r5
    uaddl     v18.8h, v1.8b, v4.8b          // out4: r9 + r4 (accumulator)
    mla       v18.8h, v14.8h , v22.8h       // out4 += 20 * (r6 + r7)
    ld1       {v2.s}[0], [x0], x2           // r10
    mls       v18.8h, v16.8h , v24.8h       // out4 -= 5 * (r5 + r8)
    uaddl     v8.8h, v0.8b, v7.8b           // out5: r8 + r7
    uaddl     v10.8h, v1.8b, v6.8b          // out5: r9 + r6
    uaddl     v12.8h, v2.8b, v5.8b          // out5: r10 + r5 (accumulator)
    sqrshrun  v26.8b, v18.8h, #5            // out4
    ld1       {v18.s}[0], [x7], x2          // Load value for interpolation - row 4
    ld1       {v19.s}[0], [x7], x2          // Load value for interpolation - row 5
    mla       v12.8h, v8.8h , v22.8h        // out5 += 20 * (r7 + r8)
    ld1       {v3.s}[0], [x0], x2           // r11
    mls       v12.8h, v10.8h , v24.8h       // out5 -= 5 * (r6 + r9)
    sqrshrun  v27.8b, v12.8h, #5            // out5
    urhadd    v26.16b, v18.16b , v26.16b    // Interpolation step for qpel calculation
    urhadd    v27.16b, v27.16b , v19.16b    // Interpolation step for qpel calculation

    st1       {v26.s}[0], [x1], x3          // store row 4
    st1       {v27.s}[0], [x1], x3          // store row 5
    uaddl     v14.8h, v0.8b, v1.8b          // out6: r8 + r9
    uaddl     v16.8h, v2.8b, v7.8b          // out6: r10 + r7
    uaddl     v18.8h, v3.8b, v6.8b          // out6: r11 + r6 (accumulator)
    mla       v18.8h, v14.8h , v22.8h       // out6 += 20 * (r8 + r9)
    ld1       {v4.s}[0], [x0], x2           // r12
    mls       v18.8h, v16.8h , v24.8h       // out6 -= 5 * (r7 + r10)
    uaddl     v8.8h, v2.8b, v1.8b           // out7: r10 + r9
    uaddl     v10.8h, v3.8b, v0.8b          // out7: r11 + r8
    uaddl     v12.8h, v4.8b, v7.8b          // out7: r12 + r7 (accumulator)
    sqrshrun  v26.8b, v18.8h, #5            // out6
    ld1       {v18.s}[0], [x7], x2          // Load value for interpolation - row 6
    ld1       {v19.s}[0], [x7], x2          // Load value for interpolation - row 7
    mla       v12.8h, v8.8h , v22.8h        // out7 += 20 * (r9 + r10)
    ld1       {v5.s}[0], [x0], x2           // r13 (not used by any output row of this path)
    mls       v12.8h, v10.8h , v24.8h       // out7 -= 5 * (r8 + r11)
    sqrshrun  v27.8b, v12.8h, #5            // out7
    urhadd    v26.16b, v18.16b , v26.16b    // Interpolation step for qpel calculation
    urhadd    v27.16b, v19.16b , v27.16b    // Interpolation step for qpel calculation

    st1       {v26.s}[0], [x1], x3          // store row 6
    st1       {v27.s}[0], [x1], x3          // store row 7


end_func:
    // Undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs                              // macro from ih264_neon_macros.s
    ret


