//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_chroma_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
///**
///**
//
///**
//*******************************************************************************
//*
//* @brief
//*    Inter prediction chroma filter
//*
//* @par Description:
//*   Applies filtering to chroma samples as mentioned in
//*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u1_dx
//*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
//*
//* @param[in] u1_dy
//*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_chroma_av8(UWORD8 *pu1_src,
//                                 UWORD8 *pu1_dst,
//                                 WORD32 src_strd,
//                                 WORD32 dst_strd,
//                                 WORD32 u1_dx,
//                                 WORD32 u1_dy,
//                                 WORD32 ht,
//                                 WORD32 wd)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  u1_dx
//    w5 =>  u1_dy
//    w6 =>  height
//    w7 =>  width
//
.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_chroma_av8

// ih264_inter_pred_chroma_av8
//
// Bilinear chroma interpolation on interleaved U/V samples.
//
// Register arguments on entry:
//   x0 = pu1_src   source pointer (alternating U and V bytes)
//   x1 = pu1_dst   destination pointer
//   w2 = src_strd  source stride      (sign-extended to x2 below)
//   w3 = dst_strd  destination stride (sign-extended to x3 below)
//   w4 = u1_dx     horizontal fractional offset
//   w5 = u1_dy     vertical fractional offset
//   w6 = ht        block height in rows
//   w7 = wd        block width in chroma sample pairs
//
// For every output byte the code evaluates
//   out = sat_u8((A*p[i] + B*p[i+2] + C*q[i] + D*q[i+2] + 32) >> 6)
// where p is the current source row, q the next source row, and
//   A = (8-dx)*(8-dy)   held in v28
//   B = dx*(8-dy)       held in v29
//   C = (8-dx)*dy       held in v30
//   D = dx*dy           held in v31
// The "+2" byte offset (produced with ext #2) selects the next sample of the
// same component, because U and V are interleaved. The "+32" rounding and the
// saturation come from sqrshrun #6.
//
// Three fully unrolled paths are selected on wd:
//   wd == 2       -> loop_2 : 4 bytes per row; 2 rows if ht == 2, else 4 rows
//   wd == 4       -> loop_4 : 8 bytes per row; 2 rows if ht == 2, 4 rows if
//                             ht == 4, else 8 rows
//   anything else -> loop_8 : 16 bytes per row; 4 rows if ht == 4, else 8 rows
// Each path reads one more source row than it writes, for the vertical tap.
//
// General-purpose registers written: x0-x11 only (arguments and caller-saved
// temporaries), so no general-purpose register is spilled.
// SIMD registers v8-v14 are used as temporaries; push_v_regs / pop_v_regs are
// macros from ih264_neon_macros.s (not visible in this file).
// NOTE(review): presumably they save and restore the callee-saved d8-d15 and
// keep sp 16-byte aligned - confirm against the macro definitions.
ih264_inter_pred_chroma_av8:

    push_v_regs
    sxtw      x2, w2
    sxtw      x3, w3
    sxtw      x4, w4
    sxtw      x5, w5
    sxtw      x6, w6
    sxtw      x7, w7

    // Build the four bilinear weights and broadcast each to 8 byte lanes.
    mov       x8, #8
    sub       x8, x8, x4                  // x8 = 8 - u1_dx
    mov       x9, #8
    sub       x9, x9, x5                  // x9 = 8 - u1_dy
    mul       x10, x8, x9                 // A = (8-dx)*(8-dy)
    mul       x11, x4, x9                 // B = dx*(8-dy)

    dup       v28.8b, w10
    dup       v29.8b, w11

    mul       x10, x8, x5                 // C = (8-dx)*dy
    mul       x11, x4, x5                 // D = dx*dy

    dup       v30.8b, w10
    dup       v31.8b, w11

    cmp       x7, #2                      // wd == 2 -> loop_2
    beq       loop_2
    cmp       x7, #4                      // wd == 4 -> loop_4
    beq       loop_4
                                          // otherwise fall through to loop_8

// ---------------------------------------------------------------------------
// 16 output bytes per row. Each source row is loaded as three 8-byte
// registers; the third is only used as the high operand of ext #2, so just
// its first two bytes contribute to the result.
// NOTE(review): this reads 24 bytes per source row although only 18 are
// consumed - confirm the caller guarantees the extra bytes are readable.
// ---------------------------------------------------------------------------
loop_8:
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load src row 0
    ext       v3.8b, v0.8b, v1.8b, #2
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load src row 1
    umull     v20.8h, v0.8b, v28.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v20.8h, v3.8b, v29.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    umlal     v20.8h, v5.8b, v30.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    umlal     v20.8h, v8.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6
    umull     v22.8h, v1.8b, v28.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load src row 2
    umlal     v22.8h, v4.8b, v29.8b
    ext       v13.8b, v10.8b, v11.8b, #2
    umlal     v22.8h, v6.8b, v30.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    umlal     v22.8h, v9.8b, v31.8b
    sqrshrun  v27.8b, v22.8h, #6
    umull     v24.8h, v5.8b, v28.8b
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 0
    umlal     v24.8h, v8.8b, v29.8b
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load src row 3
    umlal     v24.8h, v10.8b, v30.8b
    ext       v3.8b, v0.8b, v1.8b, #2
    umlal     v24.8h, v13.8b, v31.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row 1
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load src row 4
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v24.8h, v1.8b, v30.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    umlal     v24.8h, v4.8b, v31.8b
    umull     v20.8h, v0.8b, v28.8b
    sqrshrun  v27.8b, v24.8h, #6
    umlal     v20.8h, v3.8b, v29.8b
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 2
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    sqrshrun  v26.8b, v20.8h, #6
    umlal     v22.8h, v9.8b, v31.8b
    cmp       x6, #4                                // flags consumed by beq below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 3

    beq       end_func                              // done if ht == 4

    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load src row 5
    ext       v13.8b, v10.8b, v11.8b, #2
    umull     v24.8h, v5.8b, v28.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load src row 6
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    ext       v3.8b, v0.8b, v1.8b, #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row 4
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load src row 7
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v24.8h, v1.8b, v30.8b
    umlal     v24.8h, v4.8b, v31.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    sqrshrun  v27.8b, v24.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 5
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v3.8b, v29.8b
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load src row 8
    sqrshrun  v26.8b, v20.8h, #6
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    ext       v13.8b, v10.8b, v11.8b, #2
    umlal     v22.8h, v9.8b, v31.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row 6
    umull     v24.8h, v5.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row 7
    b         end_func

// ---------------------------------------------------------------------------
// 8 output bytes per row. Each source row is loaded as two 8-byte registers;
// the second supplies the two bytes shifted in by ext #2.
// ---------------------------------------------------------------------------
loop_4:
    ld1       {v0.8b, v1.8b}, [x0], x2              // load src row 0
    ext       v2.8b, v0.8b, v1.8b, #2
    ld1       {v3.8b, v4.8b}, [x0], x2              // load src row 1
    ext       v5.8b, v3.8b, v4.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2              // load src row 2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v7.8b, #2
    st1       {v26.8b}, [x1], x3                    // store dst row 0
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    cmp       x6, #2                                // flags consumed by beq below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.8b}, [x1], x3                    // store dst row 1
    beq       end_func                              // done if ht == 2

    ld1       {v9.8b, v10.8b}, [x0], x2             // load src row 3
    ext       v11.8b, v9.8b, v10.8b, #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2              // load src row 4
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v1.8b, #2
    st1       {v16.8b}, [x1], x3                    // store dst row 2
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    cmp       x6, #4                                // flags consumed by beq below
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3                    // store dst row 3
    beq       end_func                              // done if ht == 4

    ld1       {v3.8b, v4.8b}, [x0], x2              // load src row 5
    ext       v5.8b, v3.8b, v4.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2              // load src row 6
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v7.8b, #2
    st1       {v26.8b}, [x1], x3                    // store dst row 4
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    ld1       {v9.8b, v10.8b}, [x0], x2             // load src row 7
    sqrshrun  v27.8b, v22.8h, #6
    ext       v11.8b, v9.8b, v10.8b, #2
    st1       {v27.8b}, [x1], x3                    // store dst row 5
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2              // load src row 8
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v1.8b, #2
    st1       {v16.8b}, [x1], x3                    // store dst row 6
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3                    // store dst row 7
    b         end_func

// ---------------------------------------------------------------------------
// 4 output bytes per row. One 8-byte load per source row; ext with the same
// register as both operands rotates it by 2 bytes. Only lanes 0-3 of each
// result are stored (st1 {vN.s}[0]), and those lanes depend on source bytes
// 0-5 only, so the wrapped-around bytes never reach memory.
// ---------------------------------------------------------------------------
loop_2:
    ld1       {v0.8b}, [x0], x2                     // load src row 0
    ext       v2.8b, v0.8b, v0.8b, #2
    ld1       {v3.8b}, [x0], x2                     // load src row 1
    ext       v5.8b, v3.8b, v3.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b}, [x0], x2                     // load src row 2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v6.8b, #2
    st1       {v26.s}[0], [x1], x3                  // store dst row 0
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    cmp       x6, #2                                // flags consumed by beq below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.s}[0], [x1], x3                  // store dst row 1
    beq       end_func                              // done if ht == 2

    ld1       {v9.8b}, [x0], x2                     // load src row 3
    ext       v11.8b, v9.8b, v9.8b, #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b}, [x0], x2                     // load src row 4
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v0.8b, #2
    st1       {v16.s}[0], [x1], x3                  // store dst row 2
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.s}[0], [x1], x3                  // store dst row 3
                                                    // fall through to end_func

end_func:
    pop_v_regs
    ret
