1//****************************************************************************** 2//* 3//* Copyright (C) 2015 The Android Open Source Project 4//* 5//* Licensed under the Apache License, Version 2.0 (the "License"); 6//* you may not use this file except in compliance with the License. 7//* You may obtain a copy of the License at: 8//* 9//* http://www.apache.org/licenses/LICENSE-2.0 10//* 11//* Unless required by applicable law or agreed to in writing, software 12//* distributed under the License is distributed on an "AS IS" BASIS, 13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14//* See the License for the specific language governing permissions and 15//* limitations under the License. 16//* 17//***************************************************************************** 18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19//*/ 20///** 21//****************************************************************************** 22//* @file 23//* ih264_inter_pred_chroma_av8.s 24//* 25//* @brief 26//* Contains function definitions for inter prediction interpolation. 
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_inter_pred_filters.c
//

///**
///**
///**
//
///**
//*******************************************************************************
//*
//* @brief
//*    Inter prediction chroma filter
//*
//* @par Description:
//*   Applies filtering to chroma samples as mentioned in
//*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u1_dx
//*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
//*
//* @param[in] u1_dy
//*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None (void)
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_chroma_av8(UWORD8 *pu1_src,
//                                 UWORD8 *pu1_dst,
//                                 WORD32 src_strd,
//                                 WORD32 dst_strd,
//                                 UWORD8 u1_dx,
//                                 UWORD8 u1_dy,
//                                 WORD32 ht,
//                                 WORD32 wd)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  u1_dx
//    x5 =>  u1_dy
//    x6 =>  height
//    x7 =>  width
//
.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_chroma_av8

ih264_inter_pred_chroma_av8:

    // Bilinear chroma interpolation on a block of interleaved U/V samples.
    //
    // Arguments on entry:
    //   x0 = pu1_src    x1 = pu1_dst
    //   w2 = src_strd   w3 = dst_strd   (32-bit signed)
    //   w4 = u1_dx      w5 = u1_dy
    //   w6 = ht         w7 = wd         (32-bit signed)
    //
    // With s0 the current source row and s1 the row below it, every output
    // byte is
    //   (A*s0[i] + B*s0[i+2] + C*s1[i] + D*s1[i+2] + 32) >> 6
    // with A=(8-dx)*(8-dy), B=dx*(8-dy), C=(8-dx)*dy, D=dx*dy.  The "+2" byte
    // offset (ext ..., #2) selects the next sample of the same plane because
    // U and V alternate in memory.
    //
    // Dispatch on wd: 2 -> loop_2, 4 -> loop_4, anything else -> loop_8.
    // Each path is fully unrolled and reads one more source row than it
    // writes:
    //   loop_8: 24 bytes loaded / 16 stored per row, ht == 4 else 8 rows
    //   loop_4: 16 bytes loaded /  8 stored per row, ht == 2, 4 else 8 rows
    //   loop_2:  8 bytes loaded /  4 stored per row, ht == 2 else 4 rows

    // Save registers.  push_v_regs/pop_v_regs come from ih264_neon_macros.s;
    // presumably they preserve the callee-saved SIMD registers (v8-v14 are
    // clobbered below) -- confirm against the macro definition.
    push_v_regs
    stp       x19, x20, [sp, #-16]!

    // The strides are 32-bit ints.  AAPCS64 leaves the upper half of x2/x3
    // unspecified, so sign-extend them before they are used as 64-bit
    // post-index increments.
    sxtw      x2, w2
    sxtw      x3, w3

    // Build the four filter weights.  Only the low byte of each product is
    // broadcast (dup .8b) and that byte depends only on the low bytes of
    // x4/x5, so dx/dy need no explicit extension.
    sub       x20, x4, #8               //x20 = u1_dx - 8
    neg       x8, x20                   //x8  = 8 - u1_dx
    sub       x20, x5, #8               //x20 = u1_dy - 8
    neg       x9, x20                   //x9  = 8 - u1_dy
    mul       x10, x8, x9               //A = (8 - dx) * (8 - dy)
    mul       x11, x4, x9               //B = dx * (8 - dy)

    dup       v28.8b, w10               //v28 = A in every lane
    dup       v29.8b, w11               //v29 = B in every lane

    mul       x10, x8, x5               //C = (8 - dx) * dy
    mul       x11, x4, x5               //D = dx * dy

    dup       v30.8b, w10               //v30 = C in every lane
    dup       v31.8b, w11               //v31 = D in every lane

    // wd is a 32-bit int: compare the w view so unspecified upper bits of x7
    // cannot defeat the dispatch.
    cmp       w7, #2                    //if wd=2 branch to loop_2
    beq       loop_2
    cmp       w7, #4                    //if wd=4 branch to loop_4
    beq       loop_4
                                        //otherwise fall through to loop_8

loop_8:
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2 //// Load row0 ;
    ext       v3.8b, v0.8b , v1.8b , #2
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1;
    umull     v20.8h, v0.8b, v28.8b
    ext       v8.8b, v5.8b , v6.8b , #2
    umlal     v20.8h, v3.8b, v29.8b
    ext       v9.8b, v6.8b , v7.8b , #2
    umlal     v20.8h, v5.8b, v30.8b
    ext       v4.8b, v1.8b , v2.8b , #2
    umlal     v20.8h, v8.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6
    umull     v22.8h, v1.8b, v28.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2 //// Load row2 ;
    umlal     v22.8h, v4.8b, v29.8b
    ext       v13.8b, v10.8b , v11.8b , #2
    umlal     v22.8h, v6.8b, v30.8b
    ext       v14.8b, v11.8b , v12.8b , #2
    umlal     v22.8h, v9.8b, v31.8b
    sqrshrun  v27.8b, v22.8h, #6
    umull     v24.8h, v5.8b, v28.8b
    st1       { v26.8b, v27.8b}, [x1], x3 ////Store dest row0
    umlal     v24.8h, v8.8b, v29.8b
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2 //// Load row3 ;
    umlal     v24.8h, v10.8b, v30.8b
    ext       v3.8b, v0.8b , v1.8b , #2
    umlal     v24.8h, v13.8b, v31.8b
    ext       v4.8b, v1.8b , v2.8b , #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row1
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row4;
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b , v6.8b , #2
    umlal     v24.8h, v1.8b, v30.8b
    ext       v9.8b, v6.8b , v7.8b , #2
    umlal     v24.8h, v4.8b, v31.8b
    umull     v20.8h, v0.8b, v28.8b
    sqrshrun  v27.8b, v24.8h, #6
    umlal     v20.8h, v3.8b, v29.8b
    st1       { v26.8b, v27.8b}, [x1], x3 ////Store dest row2
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    sqrshrun  v26.8b, v20.8h, #6
    umlal     v22.8h, v9.8b, v31.8b
    cmp       w6, #4                    //flags for the ht test below (NEON ops leave them intact)
    sqrshrun  v27.8b, v22.8h, #6
    st1       { v26.8b, v27.8b}, [x1], x3 ////Store dest row3

    beq       end_func                  //If ht=4

    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2 //// Load row5
    ext       v13.8b, v10.8b , v11.8b , #2
    umull     v24.8h, v5.8b, v28.8b
    ext       v14.8b, v11.8b , v12.8b , #2
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2 //// Load row6;
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    ext       v3.8b, v0.8b , v1.8b , #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    ext       v4.8b, v1.8b , v2.8b , #2
    sqrshrun  v19.8b, v16.8h, #6
    st1       { v18.8b, v19.8b}, [x1], x3 ////Store dest row4
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7;
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b , v6.8b , #2
    umlal     v24.8h, v1.8b, v30.8b
    umlal     v24.8h, v4.8b, v31.8b
    ext       v9.8b, v6.8b , v7.8b , #2
    sqrshrun  v27.8b, v24.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3 ////Store dest row5
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v3.8b, v29.8b
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2 //// Load row8 ;
    sqrshrun  v26.8b, v20.8h, #6
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    ext       v13.8b, v10.8b , v11.8b , #2
    umlal     v22.8h, v9.8b, v31.8b
    ext       v14.8b, v11.8b , v12.8b , #2
    sqrshrun  v27.8b, v22.8h, #6
    st1       { v26.8b, v27.8b}, [x1], x3 ////Store dest row6
    umull     v24.8h, v5.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       { v18.8b, v19.8b}, [x1], x3 ////Store dest row7
    b         end_func

loop_4:
    ld1       {v0.8b, v1.8b}, [x0], x2  //// Load row0 ;
    ext       v2.8b, v0.8b , v1.8b , #2
    ld1       {v3.8b, v4.8b}, [x0], x2  //// Load row1;
    ext       v5.8b, v3.8b , v4.8b , #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2  //// Load row2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b , v7.8b , #2
    st1       {v26.8b}, [x1], x3        ////Store dest row0
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    cmp       w6, #2                    //flags for the ht test below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.8b}, [x1], x3        ////Store dest row1
    beq       end_func                  //If ht=2

    ld1       {v9.8b, v10.8b}, [x0], x2 //// Load row3;
    ext       v11.8b, v9.8b , v10.8b , #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2  //// Load row4 ;
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b , v1.8b , #2
    st1       {v16.8b}, [x1], x3        ////Store dest row2
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    cmp       w6, #4                    //flags for the ht test below
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3        ////Store dest row3
    beq       end_func                  //If ht=4

    ld1       {v3.8b, v4.8b}, [x0], x2  //// Load row5;
    ext       v5.8b, v3.8b , v4.8b , #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2  //// Load row6 ;
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b , v7.8b , #2
    st1       {v26.8b}, [x1], x3        ////Store dest row4
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    ld1       {v9.8b, v10.8b}, [x0], x2 //// Load row7;
    sqrshrun  v27.8b, v22.8h, #6
    ext       v11.8b, v9.8b , v10.8b , #2
    st1       {v27.8b}, [x1], x3        ////Store dest row5
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2  //// Load row8;
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b , v1.8b , #2
    st1       {v16.8b}, [x1], x3        ////Store dest row6
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3        ////Store dest row7
    b         end_func

loop_2:
    // Only lanes 0-3 of each result are stored here; the lanes that ext
    // wraps around from the start of the same register are never written out.
    ld1       {v0.8b}, [x0], x2         //// Load row0 ;
    ext       v2.8b, v0.8b , v0.8b , #2
    ld1       {v3.8b}, [x0], x2         //// Load row1;
    ext       v5.8b, v3.8b , v3.8b , #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b}, [x0], x2         //// Load row2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b , v6.8b , #2
    st1       {v26.s}[0], [x1], x3      ////Store dest row0
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    cmp       w6, #2                    //flags for the ht test below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.s}[0], [x1], x3      ////Store dest row1
    beq       end_func                  //If ht=2

    ld1       {v9.8b}, [x0], x2         //// Load row3;
    ext       v11.8b, v9.8b , v9.8b , #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b}, [x0], x2         //// Load row4 ;
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b , v0.8b , #2
    st1       {v16.s}[0], [x1], x3      ////Store dest row2
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.s}[0], [x1], x3      ////Store dest row3


end_func:
    // Restore the registers saved in the prologue, in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret


