//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_luma_horz_qpel_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction horizontal quarter pel interpolation.
//*
//* @author
//*  Mohit
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_luma_horz_qpel_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

// All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
//*******************************************************************************
//*
//* @brief
//*  Quarter pel inter prediction luma filter for horizontal input
//*
//* @par Description:
//*  Applies a 6 tap horizontal filter (taps 1, -5, 20, 20, -5, 1) to six
//*  consecutive source bytes starting at pu1_src - 2, rounds and shifts the sum
//*  right by 5 and saturates it to 8 bits. The result is then averaged (with
//*  rounding) with the source byte at pu1_src + ((dydx & 3) >> 1) to form the
//*  quarter pel sample.
//*  sec 8.4.2.2.1 titled "Luma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array. wd == 8 and wd == 4 take dedicated paths;
//*  every other value falls through to the 16 wide path.
//*
//* @param[in] pu1_tmp
//*  temporary buffer: UNUSED in this function
//*
//* @param[in] dydx
//*  x and y reference offset for qpel calculations. Only bits [1:0] (the
//*  x offset) are read here.
//*
//* @returns
//*  None
//*
//* @remarks
//*  - The 16 and 8 wide paths produce 8 rows per pass and run a second pass
//*    only when ht == 16. The 8 wide path leaves after 4 rows when ht == 4.
//*    The 4 wide path produces 4 rows per pass and runs a second pass only
//*    when ht == 8.
//*  - Source rows are fetched ahead of use: the 8 wide path has already loaded
//*    rows 4 and 5 when it takes the ht == 4 exit.
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_luma_horz_qpel_av8 (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd,
//                            UWORD8* pu1_tmp,
//                            UWORD32 dydx)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src   (rewound by 2 so that a row load starts at tap a0)
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ht
//    w5 => wd
//    x6 => *pu1_tmp   (never referenced)
//    w7 => dydx       (reused as the pointer to the full-pel samples)
//
//    v0 => 16 x 5  (weight of a1 and a4, subtracted)
//    v1 => 16 x 20 (weight of a2 and a3, added)
//
// Notation used in the comments below:
//    aK       : source byte K positions to the right of a0; "ext ..., #K"
//               builds the vector of aK for 8 neighbouring output pixels
//    rN       : row N of the current pass
//    lo / hi  : output pixels 0-7 / 8-15 of a row (16 wide path only)

.text
.p2align 2
.include "ih264_neon_macros.s"




    .global ih264_inter_pred_luma_horz_qpel_av8

ih264_inter_pred_luma_horz_qpel_av8:


    push_v_regs
    stp         x19, x20, [sp, #-16]!           // NOTE(review): x19/x20 are not used below
    sxtw        x2, w2                          // widen src_strd to 64 bit
    sxtw        x3, w3                          // widen dst_strd to 64 bit
    sxtw        x4, w4                          // widen ht to 64 bit
    sxtw        x5, w5                          // widen wd to 64 bit


    and         x7, x7, #3                      // x_offset = dydx & 3
    add         x7, x0, x7, lsr #1              // pu1_src + (x_offset >> 1)
    sub         x0, x0, #2                      // pu1_src - 2
    sub         x14, x4, #16                    // ht - 16, pass counter for loop_16 / loop_8
    movi        v0.16b, #5                      // filter coeff
    subs        x12, x5, #8                     // if wd == 8 branch to loop_8
    movi        v1.16b, #20                     // filter coeff

    beq         loop_8

    subs        x12, x5, #4                     // if wd == 4 branch to loop_4
    beq         loop_4

loop_16:                                        // when wd == 16
    // ---- Processing row0 and row1 ----
    ld1         {v2.8b, v3.8b, v4.8b}, [x0], x2 // Load row0
    add         x14, x14, #1                    // for checking loop
    ext         v31.8b, v2.8b, v3.8b, #5        // r0 lo a5
    ld1         {v5.8b, v6.8b, v7.8b}, [x0], x2 // Load row1
    ext         v30.8b, v3.8b, v4.8b, #5        // r0 hi a5
    uaddl       v8.8h, v31.8b, v2.8b            // r0 lo: a0 + a5
    ext         v28.8b, v5.8b, v6.8b, #5        // r1 lo a5
    uaddl       v10.8h, v30.8b, v3.8b           // r0 hi: a0 + a5
    ext         v27.8b, v6.8b, v7.8b, #5        // r1 hi a5
    uaddl       v14.8h, v28.8b, v5.8b           // r1 lo: a0 + a5
    ext         v31.8b, v2.8b, v3.8b, #2        // r0 lo a2
    uaddl       v16.8h, v27.8b, v6.8b           // r1 hi: a0 + a5
    ext         v30.8b, v3.8b, v4.8b, #2        // r0 hi a2
    umlal       v8.8h, v31.8b, v1.8b            // r0 lo: + 20a2
    ext         v28.8b, v5.8b, v6.8b, #2        // r1 lo a2
    umlal       v10.8h, v30.8b, v1.8b           // r0 hi: + 20a2
    ext         v27.8b, v6.8b, v7.8b, #2        // r1 hi a2
    umlal       v14.8h, v28.8b, v1.8b           // r1 lo: + 20a2
    ext         v31.8b, v2.8b, v3.8b, #3        // r0 lo a3
    umlal       v16.8h, v27.8b, v1.8b           // r1 hi: + 20a2
    ext         v30.8b, v3.8b, v4.8b, #3        // r0 hi a3
    umlal       v8.8h, v31.8b, v1.8b            // r0 lo: + 20a3
    ext         v28.8b, v5.8b, v6.8b, #3        // r1 lo a3
    umlal       v10.8h, v30.8b, v1.8b           // r0 hi: + 20a3
    ext         v27.8b, v6.8b, v7.8b, #3        // r1 hi a3
    umlal       v14.8h, v28.8b, v1.8b           // r1 lo: + 20a3
    ext         v31.8b, v2.8b, v3.8b, #1        // r0 lo a1
    umlal       v16.8h, v27.8b, v1.8b           // r1 hi: + 20a3
    ext         v30.8b, v3.8b, v4.8b, #1        // r0 hi a1
    umlsl       v8.8h, v31.8b, v0.8b            // r0 lo: - 5a1
    ext         v28.8b, v5.8b, v6.8b, #1        // r1 lo a1
    umlsl       v10.8h, v30.8b, v0.8b           // r0 hi: - 5a1
    ext         v27.8b, v6.8b, v7.8b, #1        // r1 hi a1
    umlsl       v14.8h, v28.8b, v0.8b           // r1 lo: - 5a1
    ext         v31.8b, v2.8b, v3.8b, #4        // r0 lo a4
    umlsl       v16.8h, v27.8b, v0.8b           // r1 hi: - 5a1
    ext         v30.8b, v3.8b, v4.8b, #4        // r0 hi a4
    umlsl       v8.8h, v31.8b, v0.8b            // r0 lo: - 5a4
    ext         v28.8b, v5.8b, v6.8b, #4        // r1 lo a4
    umlsl       v10.8h, v30.8b, v0.8b           // r0 hi: - 5a4
    ext         v27.8b, v6.8b, v7.8b, #4        // r1 hi a4
    umlsl       v14.8h, v28.8b, v0.8b           // r1 lo: - 5a4
    ld1         {v2.8b, v3.8b, v4.8b}, [x0], x2 // Load row2
    umlsl       v16.8h, v27.8b, v0.8b           // r1 hi: - 5a4

    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row0
    sqrshrun    v20.8b, v8.8h, #5               // r0 lo: (sum + 16) >> 5, saturated to 8 bits
    ld1         {v5.8b, v6.8b, v7.8b}, [x0], x2 // Load row3
    sqrshrun    v21.8b, v10.8h, #5              // r0 hi: (sum + 16) >> 5, saturated to 8 bits
    ext         v31.8b, v2.8b, v3.8b, #5        // r2 lo a5
    urhadd      v20.16b, v12.16b, v20.16b       // r0 lo: rounded average with full-pel sample
    urhadd      v21.16b, v13.16b, v21.16b       // r0 hi: rounded average with full-pel sample

    sqrshrun    v18.8b, v14.8h, #5              // r1 lo: (sum + 16) >> 5, saturated to 8 bits
    st1         {v20.8b, v21.8b}, [x1], x3      // Store dest row0
    ext         v30.8b, v3.8b, v4.8b, #5        // r2 hi a5
    sqrshrun    v19.8b, v16.8h, #5              // r1 hi: (sum + 16) >> 5, saturated to 8 bits



    // ---- Processing row2 and row3 ----
    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row1
    ext         v28.8b, v5.8b, v6.8b, #5        // r3 lo a5
    urhadd      v18.16b, v12.16b, v18.16b       // r1 lo: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r1 hi: rounded average with full-pel sample

    uaddl       v8.8h, v31.8b, v2.8b            // r2 lo: a0 + a5
    st1         {v18.8b, v19.8b}, [x1], x3      // Store dest row1
    uaddl       v10.8h, v30.8b, v3.8b           // r2 hi: a0 + a5
    ext         v27.8b, v6.8b, v7.8b, #5        // r3 hi a5
    uaddl       v14.8h, v28.8b, v5.8b           // r3 lo: a0 + a5
    ext         v31.8b, v2.8b, v3.8b, #2        // r2 lo a2
    uaddl       v16.8h, v27.8b, v6.8b           // r3 hi: a0 + a5
    ext         v30.8b, v3.8b, v4.8b, #2        // r2 hi a2
    umlal       v8.8h, v31.8b, v1.8b            // r2 lo: + 20a2
    ext         v27.8b, v6.8b, v7.8b, #2        // r3 hi a2
    umlal       v10.8h, v30.8b, v1.8b           // r2 hi: + 20a2
    ext         v28.8b, v5.8b, v6.8b, #2        // r3 lo a2
    umlal       v14.8h, v28.8b, v1.8b           // r3 lo: + 20a2
    ext         v31.8b, v2.8b, v3.8b, #3        // r2 lo a3
    umlal       v16.8h, v27.8b, v1.8b           // r3 hi: + 20a2
    ext         v30.8b, v3.8b, v4.8b, #3        // r2 hi a3
    umlal       v8.8h, v31.8b, v1.8b            // r2 lo: + 20a3
    ext         v28.8b, v5.8b, v6.8b, #3        // r3 lo a3
    umlal       v10.8h, v30.8b, v1.8b           // r2 hi: + 20a3
    ext         v27.8b, v6.8b, v7.8b, #3        // r3 hi a3
    umlal       v14.8h, v28.8b, v1.8b           // r3 lo: + 20a3
    ext         v31.8b, v2.8b, v3.8b, #1        // r2 lo a1
    umlal       v16.8h, v27.8b, v1.8b           // r3 hi: + 20a3
    ext         v30.8b, v3.8b, v4.8b, #1        // r2 hi a1
    umlsl       v8.8h, v31.8b, v0.8b            // r2 lo: - 5a1
    ext         v28.8b, v5.8b, v6.8b, #1        // r3 lo a1
    umlsl       v10.8h, v30.8b, v0.8b           // r2 hi: - 5a1
    ext         v27.8b, v6.8b, v7.8b, #1        // r3 hi a1
    umlsl       v14.8h, v28.8b, v0.8b           // r3 lo: - 5a1
    ext         v31.8b, v2.8b, v3.8b, #4        // r2 lo a4
    umlsl       v16.8h, v27.8b, v0.8b           // r3 hi: - 5a1
    ext         v30.8b, v3.8b, v4.8b, #4        // r2 hi a4
    umlsl       v8.8h, v31.8b, v0.8b            // r2 lo: - 5a4
    ext         v28.8b, v5.8b, v6.8b, #4        // r3 lo a4
    umlsl       v10.8h, v30.8b, v0.8b           // r2 hi: - 5a4
    ext         v27.8b, v6.8b, v7.8b, #4        // r3 hi a4
    umlsl       v14.8h, v28.8b, v0.8b           // r3 lo: - 5a4
    ld1         {v2.8b, v3.8b, v4.8b}, [x0], x2 // Load row4
    umlsl       v16.8h, v27.8b, v0.8b           // r3 hi: - 5a4

    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row2
    sqrshrun    v20.8b, v8.8h, #5               // r2 lo: (sum + 16) >> 5, saturated to 8 bits
    ld1         {v5.8b, v6.8b, v7.8b}, [x0], x2 // Load row5
    sqrshrun    v21.8b, v10.8h, #5              // r2 hi: (sum + 16) >> 5, saturated to 8 bits
    ext         v31.8b, v2.8b, v3.8b, #5        // r4 lo a5
    urhadd      v20.16b, v12.16b, v20.16b       // r2 lo: rounded average with full-pel sample
    urhadd      v21.16b, v13.16b, v21.16b       // r2 hi: rounded average with full-pel sample

    sqrshrun    v18.8b, v14.8h, #5              // r3 lo: (sum + 16) >> 5, saturated to 8 bits
    ext         v30.8b, v3.8b, v4.8b, #5        // r4 hi a5
    st1         {v20.8b, v21.8b}, [x1], x3      // Store dest row2
    sqrshrun    v19.8b, v16.8h, #5              // r3 hi: (sum + 16) >> 5, saturated to 8 bits
    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row3

    // ---- Processing row4 and row5 ----
    ext         v28.8b, v5.8b, v6.8b, #5        // r5 lo a5
    urhadd      v18.16b, v12.16b, v18.16b       // r3 lo: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r3 hi: rounded average with full-pel sample

    uaddl       v8.8h, v31.8b, v2.8b            // r4 lo: a0 + a5
    st1         {v18.8b, v19.8b}, [x1], x3      // Store dest row3
    uaddl       v10.8h, v30.8b, v3.8b           // r4 hi: a0 + a5
    ext         v27.8b, v6.8b, v7.8b, #5        // r5 hi a5
    uaddl       v14.8h, v28.8b, v5.8b           // r5 lo: a0 + a5
    ext         v31.8b, v2.8b, v3.8b, #2        // r4 lo a2
    uaddl       v16.8h, v27.8b, v6.8b           // r5 hi: a0 + a5
    ext         v30.8b, v3.8b, v4.8b, #2        // r4 hi a2
    umlal       v8.8h, v31.8b, v1.8b            // r4 lo: + 20a2
    ext         v27.8b, v6.8b, v7.8b, #2        // r5 hi a2
    umlal       v10.8h, v30.8b, v1.8b           // r4 hi: + 20a2
    ext         v28.8b, v5.8b, v6.8b, #2        // r5 lo a2
    umlal       v14.8h, v28.8b, v1.8b           // r5 lo: + 20a2
    ext         v31.8b, v2.8b, v3.8b, #3        // r4 lo a3
    umlal       v16.8h, v27.8b, v1.8b           // r5 hi: + 20a2
    ext         v30.8b, v3.8b, v4.8b, #3        // r4 hi a3
    umlal       v8.8h, v31.8b, v1.8b            // r4 lo: + 20a3
    ext         v28.8b, v5.8b, v6.8b, #3        // r5 lo a3
    umlal       v10.8h, v30.8b, v1.8b           // r4 hi: + 20a3
    ext         v27.8b, v6.8b, v7.8b, #3        // r5 hi a3
    umlal       v14.8h, v28.8b, v1.8b           // r5 lo: + 20a3
    ext         v31.8b, v2.8b, v3.8b, #1        // r4 lo a1
    umlal       v16.8h, v27.8b, v1.8b           // r5 hi: + 20a3
    ext         v30.8b, v3.8b, v4.8b, #1        // r4 hi a1
    umlsl       v8.8h, v31.8b, v0.8b            // r4 lo: - 5a1
    ext         v28.8b, v5.8b, v6.8b, #1        // r5 lo a1
    umlsl       v10.8h, v30.8b, v0.8b           // r4 hi: - 5a1
    ext         v27.8b, v6.8b, v7.8b, #1        // r5 hi a1
    umlsl       v14.8h, v28.8b, v0.8b           // r5 lo: - 5a1
    ext         v31.8b, v2.8b, v3.8b, #4        // r4 lo a4
    umlsl       v16.8h, v27.8b, v0.8b           // r5 hi: - 5a1
    ext         v30.8b, v3.8b, v4.8b, #4        // r4 hi a4
    umlsl       v8.8h, v31.8b, v0.8b            // r4 lo: - 5a4
    ext         v28.8b, v5.8b, v6.8b, #4        // r5 lo a4
    umlsl       v10.8h, v30.8b, v0.8b           // r4 hi: - 5a4
    ext         v27.8b, v6.8b, v7.8b, #4        // r5 hi a4
    umlsl       v14.8h, v28.8b, v0.8b           // r5 lo: - 5a4
    ld1         {v2.8b, v3.8b, v4.8b}, [x0], x2 // Load row6
    umlsl       v16.8h, v27.8b, v0.8b           // r5 hi: - 5a4
    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row4
    sqrshrun    v20.8b, v8.8h, #5               // r4 lo: (sum + 16) >> 5, saturated to 8 bits
    ld1         {v5.8b, v6.8b, v7.8b}, [x0], x2 // Load row7
    sqrshrun    v21.8b, v10.8h, #5              // r4 hi: (sum + 16) >> 5, saturated to 8 bits
    ext         v31.8b, v2.8b, v3.8b, #5        // r6 lo a5
    urhadd      v20.16b, v12.16b, v20.16b       // r4 lo: rounded average with full-pel sample
    urhadd      v21.16b, v13.16b, v21.16b       // r4 hi: rounded average with full-pel sample

    sqrshrun    v18.8b, v14.8h, #5              // r5 lo: (sum + 16) >> 5, saturated to 8 bits
    st1         {v20.8b, v21.8b}, [x1], x3      // Store dest row4
    ext         v30.8b, v3.8b, v4.8b, #5        // r6 hi a5
    sqrshrun    v19.8b, v16.8h, #5              // r5 hi: (sum + 16) >> 5, saturated to 8 bits
    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row5


    // ---- Processing row6 and row7 ----

    ext         v28.8b, v5.8b, v6.8b, #5        // r7 lo a5
    urhadd      v18.16b, v12.16b, v18.16b       // r5 lo: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r5 hi: rounded average with full-pel sample

    uaddl       v8.8h, v31.8b, v2.8b            // r6 lo: a0 + a5
    st1         {v18.8b, v19.8b}, [x1], x3      // Store dest row5
    uaddl       v10.8h, v30.8b, v3.8b           // r6 hi: a0 + a5
    ext         v27.8b, v6.8b, v7.8b, #5        // r7 hi a5
    uaddl       v14.8h, v28.8b, v5.8b           // r7 lo: a0 + a5
    ext         v31.8b, v2.8b, v3.8b, #2        // r6 lo a2
    uaddl       v16.8h, v27.8b, v6.8b           // r7 hi: a0 + a5
    ext         v30.8b, v3.8b, v4.8b, #2        // r6 hi a2
    umlal       v8.8h, v31.8b, v1.8b            // r6 lo: + 20a2
    ext         v27.8b, v6.8b, v7.8b, #2        // r7 hi a2
    umlal       v10.8h, v30.8b, v1.8b           // r6 hi: + 20a2
    ext         v28.8b, v5.8b, v6.8b, #2        // r7 lo a2
    umlal       v14.8h, v28.8b, v1.8b           // r7 lo: + 20a2
    ext         v31.8b, v2.8b, v3.8b, #3        // r6 lo a3
    umlal       v16.8h, v27.8b, v1.8b           // r7 hi: + 20a2
    ext         v30.8b, v3.8b, v4.8b, #3        // r6 hi a3
    umlal       v8.8h, v31.8b, v1.8b            // r6 lo: + 20a3
    ext         v28.8b, v5.8b, v6.8b, #3        // r7 lo a3
    umlal       v10.8h, v30.8b, v1.8b           // r6 hi: + 20a3
    ext         v27.8b, v6.8b, v7.8b, #3        // r7 hi a3
    umlal       v14.8h, v28.8b, v1.8b           // r7 lo: + 20a3
    ext         v31.8b, v2.8b, v3.8b, #1        // r6 lo a1
    umlal       v16.8h, v27.8b, v1.8b           // r7 hi: + 20a3
    ext         v30.8b, v3.8b, v4.8b, #1        // r6 hi a1
    umlsl       v8.8h, v31.8b, v0.8b            // r6 lo: - 5a1
    ext         v28.8b, v5.8b, v6.8b, #1        // r7 lo a1
    umlsl       v10.8h, v30.8b, v0.8b           // r6 hi: - 5a1
    ext         v27.8b, v6.8b, v7.8b, #1        // r7 hi a1
    umlsl       v14.8h, v28.8b, v0.8b           // r7 lo: - 5a1
    ext         v31.8b, v2.8b, v3.8b, #4        // r6 lo a4
    umlsl       v16.8h, v27.8b, v0.8b           // r7 hi: - 5a1
    ext         v30.8b, v3.8b, v4.8b, #4        // r6 hi a4
    umlsl       v8.8h, v31.8b, v0.8b            // r6 lo: - 5a4
    ext         v28.8b, v5.8b, v6.8b, #4        // r7 lo a4
    umlsl       v10.8h, v30.8b, v0.8b           // r6 hi: - 5a4
    ext         v27.8b, v6.8b, v7.8b, #4        // r7 hi a4
    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row6
    sqrshrun    v20.8b, v8.8h, #5               // r6 lo: (sum + 16) >> 5, saturated to 8 bits
    umlsl       v14.8h, v28.8b, v0.8b           // r7 lo: - 5a4
    sqrshrun    v21.8b, v10.8h, #5              // r6 hi: (sum + 16) >> 5, saturated to 8 bits
    umlsl       v16.8h, v27.8b, v0.8b           // r7 hi: - 5a4
    urhadd      v20.16b, v12.16b, v20.16b       // r6 lo: rounded average with full-pel sample
    urhadd      v21.16b, v13.16b, v21.16b       // r6 hi: rounded average with full-pel sample

    ld1         {v12.2s, v13.2s}, [x7], x2      // full-pel samples for row7
    sqrshrun    v18.8b, v14.8h, #5              // r7 lo: (sum + 16) >> 5, saturated to 8 bits
    st1         {v20.8b, v21.8b}, [x1], x3      // Store dest row6
    sqrshrun    v19.8b, v16.8h, #5              // r7 hi: (sum + 16) >> 5, saturated to 8 bits
    urhadd      v18.16b, v12.16b, v18.16b       // r7 lo: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r7 hi: rounded average with full-pel sample

    subs        x12, x14, #1                    // zero only after the first pass when ht == 16
    st1         {v18.8b, v19.8b}, [x1], x3      // Store dest row7



    beq         loop_16                         // second pass for rows 8-15
    b           end_func

loop_8:                                         // when wd == 8
    // ---- Processing row0 and row1 ----
    // Only in this first pair v14 accumulates the even row (row0) and v8 the
    // odd row (row1); from row2 onwards v8 holds the even row and v14 the odd.

    ld1         {v5.8b, v6.8b}, [x0], x2        // Load row0
    add         x14, x14, #1                    // for checking loop
    ext         v28.8b, v5.8b, v6.8b, #5        // r0 a5
    ld1         {v2.8b, v3.8b}, [x0], x2        // Load row1
    ext         v25.8b, v5.8b, v6.8b, #2        // r0 a2
    ext         v31.8b, v2.8b, v3.8b, #5        // r1 a5
    ext         v24.8b, v5.8b, v6.8b, #3        // r0 a3
    ext         v23.8b, v5.8b, v6.8b, #1        // r0 a1
    ext         v22.8b, v5.8b, v6.8b, #4        // r0 a4
    uaddl       v14.8h, v28.8b, v5.8b           // r0: a0 + a5
    ext         v29.8b, v2.8b, v3.8b, #3        // r1 a3
    umlal       v14.8h, v25.8b, v1.8b           // r0: + 20a2
    umlal       v14.8h, v24.8b, v1.8b           // r0: + 20a3
    umlsl       v14.8h, v23.8b, v0.8b           // r0: - 5a1
    umlsl       v14.8h, v22.8b, v0.8b           // r0: - 5a4
    ext         v30.8b, v2.8b, v3.8b, #2        // r1 a2
    uaddl       v8.8h, v31.8b, v2.8b            // r1: a0 + a5
    ext         v27.8b, v2.8b, v3.8b, #1        // r1 a1
    ext         v26.8b, v2.8b, v3.8b, #4        // r1 a4
    ld1         {v2.8b, v3.8b}, [x0], x2        // Load row2
    umlal       v8.8h, v29.8b, v1.8b            // r1: + 20a3
    umlal       v8.8h, v30.8b, v1.8b            // r1: + 20a2
    umlsl       v8.8h, v27.8b, v0.8b            // r1: - 5a1
    umlsl       v8.8h, v26.8b, v0.8b            // r1: - 5a4
    ld1         {v5.8b, v6.8b}, [x0], x2        // Load row3
    sqrshrun    v18.8b, v14.8h, #5              // r0: (sum + 16) >> 5, saturated to 8 bits

    // ---- Processing row2 and row3 ----
    ext         v28.8b, v5.8b, v6.8b, #5        // r3 a5
    ext         v25.8b, v5.8b, v6.8b, #2        // r3 a2
    ext         v31.8b, v2.8b, v3.8b, #5        // r2 a5
    uaddl       v14.8h, v28.8b, v5.8b           // r3: a0 + a5
    ld1         {v12.2s}, [x7], x2              // full-pel samples for row0
    ld1         {v13.2s}, [x7], x2              // full-pel samples for row1
    ext         v24.8b, v5.8b, v6.8b, #3        // r3 a3
    ext         v23.8b, v5.8b, v6.8b, #1        // r3 a1
    sqrshrun    v19.8b, v8.8h, #5               // r1: (sum + 16) >> 5, saturated to 8 bits
    ext         v22.8b, v5.8b, v6.8b, #4        // r3 a4
    ext         v29.8b, v2.8b, v3.8b, #3        // r2 a3
    umlal       v14.8h, v25.8b, v1.8b           // r3: + 20a2
    umlal       v14.8h, v24.8b, v1.8b           // r3: + 20a3
    umlsl       v14.8h, v23.8b, v0.8b           // r3: - 5a1
    umlsl       v14.8h, v22.8b, v0.8b           // r3: - 5a4
    urhadd      v18.16b, v12.16b, v18.16b       // r0: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r1: rounded average with full-pel sample

    st1         {v18.8b}, [x1], x3              // Store dest row0
    st1         {v19.8b}, [x1], x3              // Store dest row1
    uaddl       v8.8h, v31.8b, v2.8b            // r2: a0 + a5
    ext         v30.8b, v2.8b, v3.8b, #2        // r2 a2
    ext         v27.8b, v2.8b, v3.8b, #1        // r2 a1
    ext         v26.8b, v2.8b, v3.8b, #4        // r2 a4
    ld1         {v2.8b, v3.8b}, [x0], x2        // Load row4
    umlal       v8.8h, v29.8b, v1.8b            // r2: + 20a3
    umlal       v8.8h, v30.8b, v1.8b            // r2: + 20a2
    umlsl       v8.8h, v27.8b, v0.8b            // r2: - 5a1
    umlsl       v8.8h, v26.8b, v0.8b            // r2: - 5a4
    ld1         {v5.8b, v6.8b}, [x0], x2        // Load row5
    subs        x9, x4, #4                      // flags for the ht == 4 exit below
    sqrshrun    v19.8b, v14.8h, #5              // r3: (sum + 16) >> 5, saturated to 8 bits
    ld1         {v12.2s}, [x7], x2              // full-pel samples for row2
    ld1         {v13.2s}, [x7], x2              // full-pel samples for row3
    ext         v28.8b, v5.8b, v6.8b, #5        // r5 a5
    ext         v25.8b, v5.8b, v6.8b, #2        // r5 a2
    ext         v31.8b, v2.8b, v3.8b, #5        // r4 a5
    uaddl       v14.8h, v28.8b, v5.8b           // r5: a0 + a5
    ext         v24.8b, v5.8b, v6.8b, #3        // r5 a3
    sqrshrun    v18.8b, v8.8h, #5               // r2: (sum + 16) >> 5, saturated to 8 bits
    ext         v22.8b, v5.8b, v6.8b, #4        // r5 a4
    ext         v29.8b, v2.8b, v3.8b, #3        // r4 a3
    urhadd      v18.16b, v12.16b, v18.16b       // r2: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r3: rounded average with full-pel sample

    st1         {v18.8b}, [x1], x3              // Store dest row2
    ext         v30.8b, v2.8b, v3.8b, #2        // r4 a2
    uaddl       v8.8h, v31.8b, v2.8b            // r4: a0 + a5
    st1         {v19.8b}, [x1], x3              // Store dest row3
    beq         end_func                        // Branch if height == 4 (flags from subs x9)

    // ---- Processing row4 and row5 ----
    ext         v23.8b, v5.8b, v6.8b, #1        // r5 a1
    umlal       v14.8h, v25.8b, v1.8b           // r5: + 20a2
    umlal       v14.8h, v24.8b, v1.8b           // r5: + 20a3
    umlsl       v14.8h, v23.8b, v0.8b           // r5: - 5a1
    umlsl       v14.8h, v22.8b, v0.8b           // r5: - 5a4
    ext         v27.8b, v2.8b, v3.8b, #1        // r4 a1
    ext         v26.8b, v2.8b, v3.8b, #4        // r4 a4
    ld1         {v2.8b, v3.8b}, [x0], x2        // Load row6
    umlal       v8.8h, v29.8b, v1.8b            // r4: + 20a3
    umlal       v8.8h, v30.8b, v1.8b            // r4: + 20a2
    umlsl       v8.8h, v27.8b, v0.8b            // r4: - 5a1
    umlsl       v8.8h, v26.8b, v0.8b            // r4: - 5a4
    sqrshrun    v19.8b, v14.8h, #5              // r5: (sum + 16) >> 5, saturated to 8 bits
    ld1         {v5.8b, v6.8b}, [x0], x2        // Load row7
    ext         v31.8b, v2.8b, v3.8b, #5        // r6 a5
    ext         v28.8b, v5.8b, v6.8b, #5        // r7 a5
    ld1         {v12.2s}, [x7], x2              // full-pel samples for row4
    ld1         {v13.2s}, [x7], x2              // full-pel samples for row5
    ext         v25.8b, v5.8b, v6.8b, #2        // r7 a2
    uaddl       v14.8h, v28.8b, v5.8b           // r7: a0 + a5
    ext         v24.8b, v5.8b, v6.8b, #3        // r7 a3
    ext         v22.8b, v5.8b, v6.8b, #4        // r7 a4
    sqrshrun    v18.8b, v8.8h, #5               // r4: (sum + 16) >> 5, saturated to 8 bits
    ext         v29.8b, v2.8b, v3.8b, #3        // r6 a3
    ext         v30.8b, v2.8b, v3.8b, #2        // r6 a2
    urhadd      v18.16b, v12.16b, v18.16b       // r4: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r5: rounded average with full-pel sample

    st1         {v18.8b}, [x1], x3              // Store dest row4
    ext         v27.8b, v2.8b, v3.8b, #1        // r6 a1
    uaddl       v8.8h, v31.8b, v2.8b            // r6: a0 + a5
    ext         v26.8b, v2.8b, v3.8b, #4        // r6 a4
    umlal       v8.8h, v29.8b, v1.8b            // r6: + 20a3
    umlal       v8.8h, v30.8b, v1.8b            // r6: + 20a2
    umlsl       v8.8h, v27.8b, v0.8b            // r6: - 5a1
    umlsl       v8.8h, v26.8b, v0.8b            // r6: - 5a4
    // ---- Processing row6 and row7 ----
    st1         {v19.8b}, [x1], x3              // Store dest row5
    ext         v23.8b, v5.8b, v6.8b, #1        // r7 a1
    umlal       v14.8h, v25.8b, v1.8b           // r7: + 20a2
    umlal       v14.8h, v24.8b, v1.8b           // r7: + 20a3
    umlsl       v14.8h, v23.8b, v0.8b           // r7: - 5a1
    umlsl       v14.8h, v22.8b, v0.8b           // r7: - 5a4
    ld1         {v12.2s}, [x7], x2              // full-pel samples for row6
    ld1         {v13.2s}, [x7], x2              // full-pel samples for row7
    sqrshrun    v18.8b, v8.8h, #5               // r6: (sum + 16) >> 5, saturated to 8 bits
    subs        x12, x14, #1                    // zero only after the first pass when ht == 16
    sqrshrun    v19.8b, v14.8h, #5              // r7: (sum + 16) >> 5, saturated to 8 bits
    urhadd      v18.16b, v12.16b, v18.16b       // r6: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r7: rounded average with full-pel sample

    st1         {v18.8b}, [x1], x3              // Store dest row6
    st1         {v19.8b}, [x1], x3              // Store dest row7

    beq         loop_8                          // looping if height == 16

    b           end_func

loop_4:                                         // when wd == 4
    // Eight pixels are filtered per row but only the first four bytes of each
    // result are stored. As in loop_8, v14 holds row0 and v8 row1 for the
    // first pair, then v8 holds row2 and v14 row3.
    ld1         {v5.8b, v6.8b}, [x0], x2        // Load row0
    ext         v28.8b, v5.8b, v6.8b, #5        // r0 a5
    ld1         {v2.8b, v3.8b}, [x0], x2        // Load row1
    ext         v25.8b, v5.8b, v6.8b, #2        // r0 a2
    ext         v31.8b, v2.8b, v3.8b, #5        // r1 a5
    uaddl       v14.8h, v28.8b, v5.8b           // r0: a0 + a5
    ext         v24.8b, v5.8b, v6.8b, #3        // r0 a3
    ext         v23.8b, v5.8b, v6.8b, #1        // r0 a1
    ext         v22.8b, v5.8b, v6.8b, #4        // r0 a4
    ext         v29.8b, v2.8b, v3.8b, #3        // r1 a3
    umlal       v14.8h, v25.8b, v1.8b           // r0: + 20a2
    umlal       v14.8h, v24.8b, v1.8b           // r0: + 20a3
    umlsl       v14.8h, v23.8b, v0.8b           // r0: - 5a1
    umlsl       v14.8h, v22.8b, v0.8b           // r0: - 5a4
    uaddl       v8.8h, v31.8b, v2.8b            // r1: a0 + a5
    ext         v30.8b, v2.8b, v3.8b, #2        // r1 a2
    ld1         {v12.2s}, [x7], x2              // full-pel samples for row0
    ld1         {v13.2s}, [x7], x2              // full-pel samples for row1
    ext         v27.8b, v2.8b, v3.8b, #1        // r1 a1
    ext         v26.8b, v2.8b, v3.8b, #4        // r1 a4
    ld1         {v2.8b, v3.8b}, [x0], x2        // Load row2
    umlal       v8.8h, v29.8b, v1.8b            // r1: + 20a3
    umlal       v8.8h, v30.8b, v1.8b            // r1: + 20a2
    umlsl       v8.8h, v27.8b, v0.8b            // r1: - 5a1
    umlsl       v8.8h, v26.8b, v0.8b            // r1: - 5a4
    ld1         {v5.8b, v6.8b}, [x0], x2        // Load row3
    ext         v28.8b, v5.8b, v6.8b, #5        // r3 a5
    ext         v25.8b, v5.8b, v6.8b, #2        // r3 a2
    sqrshrun    v18.8b, v14.8h, #5              // r0: (sum + 16) >> 5, saturated to 8 bits
    ext         v31.8b, v2.8b, v3.8b, #5        // r2 a5
    ext         v24.8b, v5.8b, v6.8b, #3        // r3 a3

    ext         v23.8b, v5.8b, v6.8b, #1        // r3 a1
    ext         v22.8b, v5.8b, v6.8b, #4        // r3 a4
    ext         v29.8b, v2.8b, v3.8b, #3        // r2 a3
    sqrshrun    v19.8b, v8.8h, #5               // r1: (sum + 16) >> 5, saturated to 8 bits
    ext         v30.8b, v2.8b, v3.8b, #2        // r2 a2
    ext         v27.8b, v2.8b, v3.8b, #1        // r2 a1

    // ---- Processing row2 and row3 ----
    urhadd      v18.16b, v12.16b, v18.16b       // r0: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r1: rounded average with full-pel sample

    st1         {v18.s}[0], [x1], x3            // Store dest row0 (4 bytes)
    st1         {v19.s}[0], [x1], x3            // Store dest row1 (4 bytes)
    uaddl       v14.8h, v28.8b, v5.8b           // r3: a0 + a5
    ext         v26.8b, v2.8b, v3.8b, #4        // r2 a4
    ld1         {v12.2s}, [x7], x2              // full-pel samples for row2
    ld1         {v13.2s}, [x7], x2              // full-pel samples for row3

    umlal       v14.8h, v25.8b, v1.8b           // r3: + 20a2
    umlal       v14.8h, v24.8b, v1.8b           // r3: + 20a3
    umlsl       v14.8h, v23.8b, v0.8b           // r3: - 5a1
    umlsl       v14.8h, v22.8b, v0.8b           // r3: - 5a4
    uaddl       v8.8h, v31.8b, v2.8b            // r2: a0 + a5
    umlal       v8.8h, v29.8b, v1.8b            // r2: + 20a3
    umlal       v8.8h, v30.8b, v1.8b            // r2: + 20a2
    umlsl       v8.8h, v27.8b, v0.8b            // r2: - 5a1
    umlsl       v8.8h, v26.8b, v0.8b            // r2: - 5a4
    sqrshrun    v19.8b, v14.8h, #5              // r3: (sum + 16) >> 5, saturated to 8 bits
    sqrshrun    v18.8b, v8.8h, #5               // r2: (sum + 16) >> 5, saturated to 8 bits
    urhadd      v18.16b, v12.16b, v18.16b       // r2: rounded average with full-pel sample
    urhadd      v19.16b, v13.16b, v19.16b       // r3: rounded average with full-pel sample

    st1         {v18.s}[0], [x1], x3            // Store dest row2 (4 bytes)
    subs        x4, x4, #8                      // zero only after the first pass when ht == 8
    st1         {v19.s}[0], [x1], x3            // Store dest row3 (4 bytes)

    beq         loop_4                          // Loop if height == 8

end_func:

    ldp         x19, x20, [sp], #16             // undo the prologue in reverse order
    pop_v_regs
    ret


