///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///**
//*******************************************************************************
//*
//* //brief
//*     inter prediction luma function for copy
//*
//* //par description:
//*     copies the array of width 'wd' and height 'ht' from the location pointed
//*     by 'src' to the location pointed by 'dst'
//*
//* //param[in] pu1_src
//*     uword8 pointer to the source
//*
//* //param[out] pi2_dst
//*     word16 pointer to the destination
//*
//* //param[in] src_strd
//*     integer source stride
//*
//* //param[in] dst_strd
//*     integer destination stride
//*
//* //param[in] pi1_coeff
//*     word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*     integer height of the array
//*
//* //param[in] wd
//*     integer width of the array
//*
//* //returns
//*
//* //remarks
//*     none
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_luma_copy_w16out (
//                                uword8 *pu1_src,
//                                word16 *pi2_dst,
//                                word32 src_strd,
//                                word32 dst_strd,
//                                word8 *pi1_coeff,
//                                word32 ht,
//                                word32 wd )

//**************variables vs registers*****************************************
//  x0 => *pu1_src
//  x1 => *pi2_dst
//  x2 => src_strd
//  x3 => dst_strd
//  x7 => ht
//  x12 => wd
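
//**************c equivalent (illustrative)*************************************
// A minimal C sketch of the behaviour implemented below, kept here as a
// comment so the file still assembles. It is illustrative only, not the
// library's reference code; the typedef names simply mirror the
// uword8/word16/word32 naming used in the comments above. Every source byte
// is widened to 16 bits and shifted left by 6, and pi1_coeff is not used.
//
//  typedef unsigned char UWORD8;
//  typedef signed char   WORD8;
//  typedef short         WORD16;
//  typedef int           WORD32;
//
//  void ihevc_inter_pred_luma_copy_w16out(UWORD8 *pu1_src, WORD16 *pi2_dst,
//                                         WORD32 src_strd, WORD32 dst_strd,
//                                         WORD8 *pi1_coeff, WORD32 ht,
//                                         WORD32 wd)
//  {
//      WORD32 row, col;
//      (void)pi1_coeff;                          // unused for a plain copy
//      for(row = 0; row < ht; row++)
//      {
//          for(col = 0; col < wd; col++)
//              pi2_dst[col] = (WORD16)pu1_src[col] << 6;  // widen and scale
//          pu1_src += src_strd;                  // strides are in elements
//          pi2_dst += dst_strd;
//      }
//  }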

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_w16out_av8

.type ihevc_inter_pred_luma_copy_w16out_av8, %function

ihevc_inter_pred_luma_copy_w16out_av8:

    // stmfd sp!, {x4-x12, x14}     //stack stores the values of the arguments

    stp     x19, x20,[sp,#-16]!

    mov     x15,x4                  // pi1_coeff
    mov     x16,x5                  // ht
    mov     x17,x6                  // wd

    mov     x12,x17                 //loads wd
    mov     x7,x16                  //loads ht
    cmp     x7,#0                   //check ht
    ble     end_loops               //skip everything if ht <= 0
    tst     x12,#7                  //check whether wd is a multiple of 8
    beq     core_loop_wd_8
    sub     x11,x12,#4
    lsl     x6, x3,#1
    adds    x6, x6,#0

//wd is not a multiple of 8: process 4 columns x 4 rows per pass, widening
//each byte to 16 bits and shifting it left by 6
outer_loop_wd_4:
    subs    x4,x12,#0               //wd conditional subtract
    ble     end_inner_loop_wd_4

inner_loop_wd_4:
    ld1     {v0.8b},[x0]            //vld1_u8(pu1_src_tmp)
    add     x5,x0,x2                //pu1_src + src_strd
    uxtl    v0.8h, v0.8b            //vmovl_u8(vld1_u8(pu1_src_tmp))
    add     x10,x1,x6
    subs    x4,x4,#4                //wd - 4
    shl     v0.2d, v0.2d,#6         //vshlq_n_s64(temp, 6)
    ld1     {v22.8b},[x5],x2        //vld1_u8(pu1_src_tmp)
    add     x0,x0,#4                //pu1_src += 4
    st1     {v0.d}[0],[x1]          //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add     x1,x1,#8
    uxtl    v22.8h, v22.8b          //vmovl_u8(vld1_u8(pu1_src_tmp))
    ld1     {v24.8b},[x5],x2        //vld1_u8(pu1_src_tmp)
    shl     v22.2d, v22.2d,#6       //vshlq_n_s64(temp, 6)
    uxtl    v24.8h, v24.8b          //vmovl_u8(vld1_u8(pu1_src_tmp))
    st1     {v22.d}[0],[x10],x6     //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    shl     v24.2d, v24.2d,#6       //vshlq_n_s64(temp, 6)
    ld1     {v26.8b},[x5],x2        //vld1_u8(pu1_src_tmp)
    st1     {v24.d}[0],[x10],x6     //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    uxtl    v26.8h, v26.8b          //vmovl_u8(vld1_u8(pu1_src_tmp))
    shl     v26.2d, v26.2d,#6       //vshlq_n_s64(temp, 6)
    st1     {v26.d}[0],[x10],x6     //vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt     inner_loop_wd_4

end_inner_loop_wd_4:
    subs    x7,x7,#4                //ht - 4
    sub     x0,x5,x11
    sub     x1,x10,x11,lsl #1
    bgt     outer_loop_wd_4

end_loops:
    // ldmfd sp!,{x4-x12,x15}       //reload the registers from sp
    ldp     x19, x20,[sp], #16

    ret

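
//**************core loop notes (illustrative)**********************************
// The path below handles widths that are a multiple of 8. Each pass works on
// an 8-column x 4-row block: bytes are widened to 16 bits and shifted left by
// 6 before being stored. The loop is software pipelined: the prolog issues
// the first loads, the kernel shifts and stores the block loaded on the
// previous pass while loading the next one, and the epilog drains the final
// block. x7 holds ht * (wd / 8), with one pass peeled off for the epilog.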

core_loop_wd_8:
    //sub   x11,x12,#8
    lsl     x5, x3,#1
    adds    x5, x5,#0
    sub     x20,x12,x3, lsl #2      //x11 = (dst_strd * 4) - width
    neg     x11, x20
    sub     x20,x12,x2,lsl #2       //x8 = (src_strd * 4) - width
    neg     x8, x20
    lsr     x4, x12, #3             //wd divided by 8
    mul     x7, x7, x4              //x7 = ht * (wd / 8)
    sub     x4,x12,#0               //wd conditional check
    sub     x7,x7,#4                //subtract one pass for the epilog

prolog:
    add     x6,x0,x2                //pu1_src_tmp += src_strd
    add     x10,x1,x5
    ld1     {v1.8b},[x0],#8         //vld1_u8(pu1_src_tmp)
    ld1     {v3.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    ld1     {v5.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    ld1     {v7.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    uxtl    v16.8h, v1.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))
    uxtl    v18.8h, v3.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))
    uxtl    v20.8h, v5.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))
    uxtl    v22.8h, v7.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))
    subs    x4,x4,#8                //wd decrements by 8
    shl     v0.8h, v16.8h,#6        //vshlq_n_s16(tmp, 6)
    shl     v2.8h, v18.8h,#6        //vshlq_n_s16(tmp, 6)
    shl     v4.8h, v20.8h,#6        //vshlq_n_s16(tmp, 6)
    shl     v6.8h, v22.8h,#6        //vshlq_n_s16(tmp, 6)
    add     x20,x0,x8
    csel    x0, x20, x0,le
    add     x6,x0,x2                //pu1_src_tmp += src_strd
    ld1     {v1.8b},[x0],#8         //vld1_u8(pu1_src_tmp)
    ld1     {v3.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    ld1     {v5.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    ld1     {v7.8b},[x6],x2         //vld1_u8(pu1_src_tmp)

    st1     {v0.8h},[x1],#16        //vst1q_s16(pi2_dst_tmp, tmp)
    add     x20,x1,x11,lsl #1
    csel    x1, x20, x1,le
    sub     x20,x12,#0              //wd conditional check
    csel    x4, x20, x4,le

    subs    x7,x7,#4                //ht - 4

    blt     epilog_end              //jumps to epilog_end
    beq     epilog                  //jumps to epilog


outer_loop_wd_8:

    st1     {v2.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl    v16.8h, v1.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))

    st1     {v4.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl    v18.8h, v3.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))

    st1     {v6.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl    v20.8h, v5.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))

    uxtl    v22.8h, v7.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))

    subs    x4,x4,#8                //wd decrements by 8
    add     x20,x0,x8
    csel    x0, x20, x0,le

    add     x6,x0,x2                //pu1_src_tmp += src_strd

    ld1     {v1.8b},[x0],#8         //vld1_u8(pu1_src_tmp)
    shl     v0.8h, v16.8h,#6        //vshlq_n_s16(tmp, 6)

    ld1     {v3.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    shl     v2.8h, v18.8h,#6        //vshlq_n_s16(tmp, 6)

    ld1     {v5.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    shl     v4.8h, v20.8h,#6        //vshlq_n_s16(tmp, 6)

    ld1     {v7.8b},[x6],x2         //vld1_u8(pu1_src_tmp)
    add     x10,x1,x5

    shl     v6.8h, v22.8h,#6        //vshlq_n_s16(tmp, 6)

    st1     {v0.8h},[x1],#16        //vst1q_s16(pi2_dst_tmp, tmp)

    add     x20,x1,x11,lsl #1
    csel    x1, x20, x1,le
    sub     x20,x12,#0              //wd conditional check
    csel    x4, x20, x4,le

    subs    x7,x7,#4                //ht - 4
    bgt     outer_loop_wd_8

epilog:
    st1     {v2.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl    v16.8h, v1.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))

    st1     {v4.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl    v18.8h, v3.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))

    st1     {v6.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    uxtl    v20.8h, v5.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))

    uxtl    v22.8h, v7.8b           //vmovl_u8(vld1_u8(pu1_src_tmp))
    //add   x6,x0,x2                //pu1_src_tmp += src_strd

    shl     v0.8h, v16.8h,#6        //vshlq_n_s16(tmp, 6)
    shl     v2.8h, v18.8h,#6        //vshlq_n_s16(tmp, 6)
    shl     v4.8h, v20.8h,#6        //vshlq_n_s16(tmp, 6)
    add     x10,x1,x5
    shl     v6.8h, v22.8h,#6        //vshlq_n_s16(tmp, 6)

    st1     {v0.8h},[x1],#16        //vst1q_s16(pi2_dst_tmp, tmp)

epilog_end:
    st1     {v2.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    st1     {v4.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)
    st1     {v6.8h},[x10],x5        //vst1q_s16(pi2_dst_tmp, tmp)

    // ldmfd sp!,{x4-x12,x15}       //reload the registers from sp
    ldp     x19, x20,[sp], #16

    ret