///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_copy_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are hand-written AArch64 (ARMv8) NEON assembly in GNU
//*  assembler syntax
//*
//* //author
//*  yogeswaran rs
//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_copy_w16out_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  chroma inter prediction "copy" with 16-bit output
//*
//* //par description:
//*  reads a block of (2 * 'wd') bytes by 'ht' rows from 'pu1_src', zero-extends
//*  every byte to 16 bits, shifts it left by 6 and stores the result to
//*  'pi2_dst'
//*
//* //param[in] pu1_src
//*  uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride, in bytes
//*
//* //param[in] dst_strd
//*  integer destination stride, in 16-bit elements (it is doubled to obtain
//*  the byte stride)
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this function)
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array; each row holds (2 * wd) source bytes
//*  (presumably interleaved Cb/Cr samples - confirm against the caller)
//*
//* //returns
//*  none; returns immediately when ht <= 0 or wd <= 0
//*
//* //remarks
//*  - rows are processed four at a time; a remainder of the height is handled
//*    by paths that always process exactly two rows
//*  - x20 is the only callee-saved register used; x19 is saved with it so the
//*    stack pointer stays 16-byte aligned
//*  - only v0-v7 and v16-v26 are written, so no callee-saved vector register
//*    needs to be preserved
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
//                                         word16 *pi2_dst,
//                                         word32 src_strd,
//                                         word32 dst_strd,
//                                         word8 *pi1_coeff,
//                                         word32 ht,
//                                         word32 wd)
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 => src_strd
//x3 => dst_strd
//x4 => *pi1_coeff (unused)
//x5 => ht
//x6 => wd

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_copy_w16out_av8

.type ihevc_inter_pred_chroma_copy_w16out_av8, %function

ihevc_inter_pred_chroma_copy_w16out_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is used as a scratch register below

    //the word32 arguments arrive in w registers; the upper halves of the
    //corresponding x registers are unspecified, so sign-extend before any
    //64-bit arithmetic or addressing
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x7,w5                       //ht
    sxtw        x12,w6                      //wd

    lsl         x12,x12,#1                  //2*wd = number of source bytes per row
    cmp         x7,#0                       //nothing to do when ht <= 0
    ble         end_loops
    cmp         x12,#0                      //nothing to do when wd <= 0
    ble         end_loops
    and         x8,x7,#3                    //x8 = ht % 4 (rows left after the 4-row passes)
    sub         x9,x7,x8                    //x9 = ht rounded down to a multiple of 4
    and         x11,x7,#6
    cmp         x11,#6                      //(ht & 6) == 6 (e.g. ht == 6) always uses the 4-byte path
    beq         loop_ht_6
    tst         x12,#7                      //row length a multiple of 8 bytes?
    beq         core_loop_wd_8

//------------------------------------------------------------------------------
// 4-byte-wide path: each iteration handles 4 source bytes of 4 rows
//   x11 = 2*wd - 4 (used to rewind the pointers to the start of the row group)
//   x6  = destination stride in bytes
//------------------------------------------------------------------------------
loop_ht_6:
    sub         x11,x12,#4
    lsl         x6, x3,#1
    cmp         x9,#0                       //fewer than 4 rows: go straight to the 2-row tail
    beq         outer_loop_wd_4_ht_2

outer_loop_wd_4:
    mov         x4,x12                      //x4 = bytes left in this row group (> 0 here)

inner_loop_wd_4:
    //NOTE(review): each ld1 fetches 8 bytes although only the first 4 are
    //widened and stored (st1 of one 64-bit lane = four 16-bit results)
    ld1         {v0.8b},[x0]                //row 0
    add         x5,x0,x2                    //x5 = pu1_src + src_strd
    uxtl        v0.8h, v0.8b                //widen u8 -> u16
    add         x10,x1,x6                   //x10 = pi2_dst + dst stride (bytes)
    subs        x4,x4,#4                    //4 source bytes consumed; flags feed the bgt below
    shl         v0.2d, v0.2d,#6             //<< 6 (values <= 255, so no carry between 16-bit lanes)
    ld1         {v22.8b},[x5],x2            //row 1
    add         x0,x0,#4                    //pu1_src += 4
    st1         {v0.1d},[x1]                //store four 16-bit results of row 0
    add         x1,x1,#8                    //pi2_dst += 4 elements
    uxtl        v22.8h, v22.8b
    ld1         {v24.8b},[x5],x2            //row 2
    shl         v22.2d, v22.2d,#6
    uxtl        v24.8h, v24.8b
    st1         {v22.1d},[x10],x6           //row 1
    shl         v24.2d, v24.2d,#6
    ld1         {v26.8b},[x5],x2            //row 3
    st1         {v24.1d},[x10],x6           //row 2
    uxtl        v26.8h, v26.8b
    shl         v26.2d, v26.2d,#6
    st1         {v26.1d},[x10],x6           //row 3
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        x9,x9,#4                    //4 rows done
    sub         x0,x5,x11                   //pu1_src -> start of the next 4-row group
    sub         x1,x10,x11,lsl #1           //pi2_dst -> start of the next 4-row group
    bgt         outer_loop_wd_4
    cmp         x8,#0                       //rows left over after the 4-row groups?
    bgt         outer_loop_wd_4_ht_2


end_loops:
    ldp         x19, x20,[sp],#16

    ret


//------------------------------------------------------------------------------
// 4-byte-wide tail: processes exactly two rows, then returns
//------------------------------------------------------------------------------
outer_loop_wd_4_ht_2:
    mov         x4,x12                      //x4 = bytes left in the row (> 0 here)

inner_loop_wd_4_ht_2:
    ld1         {v0.8b},[x0]                //row 0
    add         x5,x0,x2                    //x5 = pu1_src + src_strd
    uxtl        v0.8h, v0.8b
    add         x10,x1,x6                   //x10 = pi2_dst + dst stride (bytes)
    subs        x4,x4,#4                    //4 source bytes consumed; flags feed the bgt below
    shl         v0.2d, v0.2d,#6
    ld1         {v22.8b},[x5]               //row 1 (no third row is read: only two are stored)
    add         x0,x0,#4                    //pu1_src += 4
    st1         {v0.1d},[x1]                //row 0
    add         x1,x1,#8                    //pi2_dst += 4 elements
    uxtl        v22.8h, v22.8b
    shl         v22.2d, v22.2d,#6
    st1         {v22.1d},[x10]              //row 1
    bgt         inner_loop_wd_4_ht_2
    b           end_loops


//------------------------------------------------------------------------------
// 8-byte-wide path: software-pipelined over tiles of 8 bytes x 4 rows
//   x5  = destination stride in bytes
//   x11 = 4*dst_strd - 2*wd (elements) : end of row group -> next row group
//   x8  = 4*src_strd - 2*wd (bytes)    : end of row group -> next row group
//   x7  = 4 * (number of tiles) - 4    : pipeline counter
//   x4  = bytes left in the current row group
//------------------------------------------------------------------------------
core_loop_wd_8:
    lsl         x5, x3,#1
    sub         x20,x12,x3, lsl #2
    neg         x11, x20                    //x11 = (dst_strd * 4) - 2*wd
    sub         x20,x12,x2,lsl #2
    neg         x8, x20                     //x8 = (src_strd * 4) - 2*wd
    lsr         x4, x12, #3                 //tiles per row group
    mov         x7,x9
    mul         x7, x7, x4                  //4 * total number of tiles
    mov         x4,x12                      //bytes left in the first row group
    sub         x7,x7,#4                    //one tile is finished by the epilog
    cmp         x9,#0                       //fewer than 4 rows: use the 2-row loop
    beq         core_loop_wd_8_ht_2

prolog:
    add         x6,x0,x2                    //x6 = pu1_src + src_strd
    add         x10,x1,x5                   //x10 = pi2_dst + dst stride (bytes)
    ld1         {v1.8b},[x0],#8             //rows 0..3 of the first tile
    ld1         {v3.8b},[x6],x2
    ld1         {v5.8b},[x6],x2
    ld1         {v7.8b},[x6],x2
    uxtl        v16.8h, v1.8b               //widen u8 -> u16
    uxtl        v18.8h, v3.8b
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    subs        x4,x4,#8                    //flags are consumed by the csel instructions below
    shl         v0.8h, v16.8h,#6            //<< 6
    shl         v2.8h, v18.8h,#6
    shl         v4.8h, v20.8h,#6
    shl         v6.8h, v22.8h,#6
    add         x20,x0,x8
    csel        x0, x20, x0,le              //row group finished: move src to the next group
    add         x6,x0,x2                    //x6 = pu1_src + src_strd
    //NOTE(review): when the block is a single tile these loads fetch rows
    //past the block; the values are discarded (blt epilog_end below)
    ld1         {v1.8b},[x0],#8             //rows 0..3 of the next tile
    ld1         {v3.8b},[x6],x2
    ld1         {v5.8b},[x6],x2
    ld1         {v7.8b},[x6],x2

    st1         {v0.8h},[x1],#16            //row 0 of the current tile
    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le              //row group finished: move dst to the next group
    csel        x4, x12, x4,le              //row group finished: reload the byte count

    subs        x7,x7,#4                    //one tile accounted for

    blt         epilog_end                  //single tile: only rows 1..3 remain to be stored
    beq         epilog                      //two tiles: the loaded tile is the last one



outer_loop_wd_8:

    st1         {v2.8h},[x10],x5            //rows 1..3 of the previous tile
    uxtl        v16.8h, v1.8b

    st1         {v4.8h},[x10],x5
    uxtl        v18.8h, v3.8b

    st1         {v6.8h},[x10],x5
    uxtl        v20.8h, v5.8b

    uxtl        v22.8h, v7.8b

    subs        x4,x4,#8                    //flags are consumed by the csel instructions below
    add         x20,x0,x8
    csel        x0, x20, x0,le              //row group finished: move src to the next group

    add         x6,x0,x2                    //x6 = pu1_src + src_strd

    ld1         {v1.8b},[x0],#8             //rows 0..3 of the next tile
    shl         v0.8h, v16.8h,#6

    ld1         {v3.8b},[x6],x2
    shl         v2.8h, v18.8h,#6

    ld1         {v5.8b},[x6],x2
    shl         v4.8h, v20.8h,#6

    ld1         {v7.8b},[x6],x2
    add         x10,x1,x5                   //x10 = pi2_dst + dst stride (bytes)

    shl         v6.8h, v22.8h,#6

    st1         {v0.8h},[x1],#16            //row 0 of the current tile

    add         x20,x1,x11,lsl #1
    csel        x1, x20, x1,le              //row group finished: move dst to the next group
    csel        x4, x12, x4,le              //row group finished: reload the byte count

    subs        x7,x7,#4                    //one tile accounted for
    bgt         outer_loop_wd_8

epilog:
    st1         {v2.8h},[x10],x5            //rows 1..3 of the previous tile
    uxtl        v16.8h, v1.8b

    st1         {v4.8h},[x10],x5
    uxtl        v18.8h, v3.8b

    st1         {v6.8h},[x10],x5
    uxtl        v20.8h, v5.8b

    uxtl        v22.8h, v7.8b

    shl         v0.8h, v16.8h,#6
    shl         v2.8h, v18.8h,#6
    shl         v4.8h, v20.8h,#6
    add         x10,x1,x5                   //x10 = pi2_dst + dst stride (bytes)
    shl         v6.8h, v22.8h,#6

    st1         {v0.8h},[x1],#16            //row 0 of the last tile
epilog_end:
    st1         {v2.8h},[x10],x5            //rows 1..3 of the last tile
    st1         {v4.8h},[x10],x5
    st1         {v6.8h},[x10],x5
    b           end_loops

//------------------------------------------------------------------------------
// 8-byte-wide, two-row loop: x12 counts the bytes left in the row
//------------------------------------------------------------------------------
core_loop_wd_8_ht_2:
    add         x6,x0,x2                    //x6 = pu1_src + src_strd
    add         x10,x1,x5                   //x10 = pi2_dst + dst stride (bytes)
    ld1         {v1.8b},[x0],#8             //row 0
    ld1         {v3.8b},[x6],x2             //row 1
    uxtl        v16.8h, v1.8b
    uxtl        v18.8h, v3.8b
    subs        x12,x12,#8                  //8 source bytes consumed; flags feed the bgt below
    shl         v0.8h, v16.8h,#6
    shl         v2.8h, v18.8h,#6
    st1         {v0.8h},[x1],#16            //row 0
    st1         {v2.8h},[x10],x5            //row 1
    bgt         core_loop_wd_8_ht_2

    ldp         x19, x20,[sp],#16

    ret