1@/***************************************************************************** 2@* 3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4@* 5@* Licensed under the Apache License, Version 2.0 (the "License"); 6@* you may not use this file except in compliance with the License. 7@* You may obtain a copy of the License at: 8@* 9@* http://www.apache.org/licenses/LICENSE-2.0 10@* 11@* Unless required by applicable law or agreed to in writing, software 12@* distributed under the License is distributed on an "AS IS" BASIS, 13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14@* See the License for the specific language governing permissions and 15@* limitations under the License. 16@* 17@*****************************************************************************/ 18@/** 19@******************************************************************************* 20@* @file 21@* ihevc_inter_pred_chroma_copy_w16out_neon.s 22@* 23@* @brief 24@* contains function definitions for inter prediction interpolation. 
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* @author
@* yogeswaran rs
@*
@* @par list of functions:
@*
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@* chroma interprediction filter for copy
@*
@* @par description:
@* copies the array of width 'wd' and height 'ht' from the location pointed
@* by 'src' to the location pointed by 'dst'. each u8 source sample is
@* widened to s16 and left-shifted by 6 (q6 fixed point) on the way out.
@*
@* @param[in] pu1_src
@* uword8 pointer to the source
@*
@* @param[out] pu1_dst
@* uword8 pointer to the destination
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] pi1_coeff
@* word8 pointer to the filter coefficients (unused by this copy kernel)
@*
@* @param[in] ht
@* integer height of the array
@*
@* @param[in] wd
@* integer width of the array
@*
@* @returns
@*
@* @remarks
@* none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
@                                         word16 *pi2_dst,
@                                         word32 src_strd,
@                                         word32 dst_strd,
@                                         word8 *pi1_coeff,
@                                         word32 ht,
@                                         word32 wd)
@**************variables vs registers*****************************************
@r0 => *pu1_src
@r1 => *pi2_dst
@r2 => src_strd
@r3 => dst_strd
@r4 => *pi1_coeff
@r5 => ht
@r6 => wd

@ Stack offsets of the 5th/6th/7th C arguments after the prologue below has
@ pushed 10 core registers (40 bytes) and d8-d15 (64 bytes): 40 + 64 = 104.
.equ coeff_offset, 104
.equ ht_offset, 108
.equ wd_offset, 112


.text
.align 4




.globl ihevc_inter_pred_chroma_copy_w16out_a9q

.type ihevc_inter_pred_chroma_copy_w16out_a9q, %function

ihevc_inter_pred_chroma_copy_w16out_a9q:

    stmfd sp!, {r4-r12, r14}        @stack stores the values of the arguments
    vpush {d8 - d15}                @d8-d15 are callee-saved under AAPCS VFP rules

    ldr r12,[sp,#wd_offset]         @loads wd
    lsl r12,r12,#1                  @2*wd (chroma is interleaved cb/cr, so the
                                    @byte width is twice the sample width)
    ldr r7,[sp,#ht_offset]          @loads ht
    cmp r7,#0                       @ht condition(ht == 0)
    ble end_loops                   @nothing to copy
    and r8,r7,#3                    @r8 = ht % 4 (leftover rows after 4-row tiles)
    sub r9,r7,r8                    @r9 = ht rounded down to a multiple of 4
    and r11,r7,#6                   @inspect bits 1-2 of ht; when both are set
    cmp r11,#6                      @(e.g. ht % 8 in {6,7}) take the 4-wide path
    beq loop_ht_6                   @NOTE(review): routing looks ht-shape driven - confirm
    tst r12,#7                      @conditional check for wd (multiples of 8 bytes)
    beq core_loop_wd_8

@ --- Path for widths that are not a multiple of 8 bytes: process 4 source
@ --- bytes x 4 rows per inner-loop iteration.
loop_ht_6:
    sub r11,r12,#4                  @r11 = byte-width - 4, used to rewind src/dst
    lsls r6,r3,#1                   @r6 = dst_strd in bytes (dst is 16-bit)
    cmp r9,#0
    beq outer_loop_wd_4_ht_2        @no full 4-row tiles: go handle 2-row tail

outer_loop_wd_4:
    subs r4,r12,#0                  @wd conditional subtract (r4 = remaining bytes)
    ble end_inner_loop_wd_4

inner_loop_wd_4:
    @ Widening trick used throughout: vmovl.u8 zero-extends u8 -> u16, so the
    @ top 8 bits of every 16-bit lane are clear and shifting the whole 64-bit
    @ lane left by 6 (vshl.i64 ... #6) cannot leak bits across lane boundaries.
    vld1.8 {d0},[r0]                @vld1_u8(pu1_src_tmp) - row 0, 8 bytes (4 used)
    add r5,r0,r2                    @pu1_src +src_strd (row 1 pointer)
    vmovl.u8 q0,d0                  @vmovl_u8(vld1_u8(pu1_src_tmp)
    add r10,r1,r6                   @pi2_dst_tmp = pi2_dst + dst_strd (row 1)
    subs r4,r4,#4                   @wd - 4
    vshl.i64 q0,q0,#6               @vshlq_n_s64(temp, 6)
    vld1.8 {d22},[r5],r2            @vld1_u8(pu1_src_tmp) - row 1
    add r0,r0,#4                    @pu1_src += 4
    vst1.64 {d0},[r1]               @vst1q_lane_s64(pi2_dst_tmp, temp, 0) - row 0
    add r1,r1,#8                    @pi2_dst += 4 samples (8 bytes)
    vmovl.u8 q11,d22                @vmovl_u8(vld1_u8(pu1_src_tmp)
    vld1.8 {d24},[r5],r2            @vld1_u8(pu1_src_tmp) - row 2
    vshl.i64 q11,q11,#6             @vshlq_n_s64(temp, 6)
    vmovl.u8 q12,d24                @vmovl_u8(vld1_u8(pu1_src_tmp)
    vst1.64 {d22},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0) - row 1
    vshl.i64 q12,q12,#6             @vshlq_n_s64(temp, 6)
    vld1.8 {d26},[r5],r2            @vld1_u8(pu1_src_tmp) - row 3
    vst1.64 {d24},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0) - row 2
    vmovl.u8 q13,d26                @vmovl_u8(vld1_u8(pu1_src_tmp)
    vshl.i64 q13,q13,#6             @vshlq_n_s64(temp, 6)
    vst1.64 {d26},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0) - row 3
    bgt inner_loop_wd_4             @more 4-byte columns in this row band

end_inner_loop_wd_4:
    subs r9,r9,#4                   @ht - 4
    sub r0,r5,r11                   @rewind src to column 0 of the next row band
    sub r1,r10,r11,lsl #1           @rewind dst likewise (16-bit elements)
    bgt outer_loop_wd_4
    cmp r8,#0                       @any leftover rows (ht % 4)?
    bgt outer_loop_wd_4_ht_2


end_loops:
    vpop {d8 - d15}                 @restore callee-saved NEON registers
    ldmfd sp!,{r4-r12,r15}          @reload the registers from sp (pop pc = return)


@ --- 4-wide tail: same as inner_loop_wd_4 but only 2 rows deep.
outer_loop_wd_4_ht_2:
    subs r4,r12,#0                  @wd conditional subtract
    ble end_inner_loop_wd_4

inner_loop_wd_4_ht_2:
    vld1.8 {d0},[r0]                @vld1_u8(pu1_src_tmp) - row 0
    add r5,r0,r2                    @pu1_src +src_strd
    vmovl.u8 q0,d0                  @vmovl_u8(vld1_u8(pu1_src_tmp)
    add r10,r1,r6                   @dst row 1 pointer
    subs r4,r4,#4                   @wd - 4
    vshl.i64 q0,q0,#6               @vshlq_n_s64(temp, 6)
    vld1.8 {d22},[r5],r2            @vld1_u8(pu1_src_tmp) - row 1
    add r0,r0,#4                    @pu1_src += 4
    vst1.64 {d0},[r1]               @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add r1,r1,#8                    @pi2_dst += 4 samples
    vmovl.u8 q11,d22                @vmovl_u8(vld1_u8(pu1_src_tmp)
    vld1.8 {d24},[r5],r2            @vld1_u8(pu1_src_tmp) - loaded but unused here
    vshl.i64 q11,q11,#6             @vshlq_n_s64(temp, 6)
    vmovl.u8 q12,d24                @vmovl_u8(vld1_u8(pu1_src_tmp)
    vst1.64 {d22},[r10],r6          @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt inner_loop_wd_4_ht_2
    b end_loops


@ --- Path for widths that are a multiple of 8 bytes: software-pipelined,
@ --- 8 bytes x 4 rows per iteration (prolog / steady state / epilog).
core_loop_wd_8:
    @sub r11,r12,#8
    lsls r5,r3,#1                   @r5 = dst_strd in bytes
    rsb r11,r12,r3, lsl #2          @ r11 = (dst_strd * 4) - width
    rsb r8,r12,r2,lsl #2            @r2->src_strd: r8 = (src_strd * 4) - width
    mov r4,r12, lsr #3              @ divide by 8: columns of 8 bytes per band
    mov r7,r9
    mul r7, r4                      @r7 = total 8-byte tiles in the 4-row bands
    sub r4,r12,#0                   @wd conditional check
    sub r7,r7,#4                    @subtract one for epilog
    cmp r9,#0
    beq core_loop_wd_8_ht_2         @no full 4-row band: 2-row variant

prolog:
    @ Prime the pipeline: load+widen+shift the first 4 rows, start the next
    @ loads, and store only row 0; rows 1-3 are stored by the steady state.
    add r6,r0,r2                    @pu1_src_tmp += src_strd
    add r10,r1,r5                   @dst row-1 pointer
    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp) - row 0
    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp) - row 1
    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp) - row 2
    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp) - row 3
    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp)
    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp)
    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp)
    subs r4,r4,#8                   @wd decrements by 8
    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)
    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)
    addle r0,r0,r8                  @row band done: advance src to next band
    add r6,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp) - next tile, row 0
    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)

    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp) - row 0
    addle r1,r1,r11,lsl #1          @row band done: advance dst to next band
    suble r4,r12,#0                 @wd conditional check (reset column counter)

    subs r7,r7,#4                   @ht - 4

    blt epilog_end                  @jumps to epilog_end
    beq epilog                      @jumps to epilog



outer_loop_wd_8:
    @ Steady state: store rows 1-3 of the previous tile while widening the
    @ current tile and loading the next one.
    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp)

    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp)

    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp)

    subs r4,r4,#8                   @wd decrements by 8
    addle r0,r0,r8                  @row band done: advance src to next band

    add r6,r0,r2                    @pu1_src_tmp += src_strd

    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp)
    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)

    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp)
    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)

    vld1.8 {d12},[r6],r2            @vld1_u8(pu1_src_tmp)
    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)

    vld1.8 {d14},[r6],r2            @vld1_u8(pu1_src_tmp)
    add r10,r1,r5                   @dst row-1 pointer for the next iteration

    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)

    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp) - row 0

    addle r1,r1,r11,lsl #1          @row band done: advance dst to next band
    suble r4,r12,#0                 @wd conditional check

    subs r7,r7,#4                   @ht - 4
    bgt outer_loop_wd_8

epilog:
    @ Drain the pipeline: finish the in-flight tile (loads already issued).
    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp)

    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8 q10,d12                @vmovl_u8(vld1_u8(pu1_src_tmp)

    vmovl.u8 q11,d14                @vmovl_u8(vld1_u8(pu1_src_tmp)
    @add r6,r0,r2                   @pu1_src_tmp += src_strd

    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q2,q10,#6              @vshlq_n_s16(tmp, 6)
    add r10,r1,r5                   @dst row-1 pointer for the final stores
    vshl.i16 q3,q11,#6              @vshlq_n_s16(tmp, 6)

    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp) - row 0
epilog_end:
    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp) - row 1
    vst1.16 {d4,d5},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp) - row 2
    vst1.16 {d6,d7},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp) - row 3
    b end_loops

@ --- 8-wide, 2-row variant used when there is no full 4-row band.
core_loop_wd_8_ht_2:
    add r6,r0,r2                    @pu1_src_tmp += src_strd
    add r10,r1,r5                   @dst row-1 pointer
    vld1.8 {d8},[r0]!               @vld1_u8(pu1_src_tmp) - row 0
    vld1.8 {d10},[r6],r2            @vld1_u8(pu1_src_tmp) - row 1
    vmovl.u8 q8,d8                  @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8 q9,d10                 @vmovl_u8(vld1_u8(pu1_src_tmp)
    subs r12,r12,#8                 @wd decrements by 8
    vshl.i16 q0,q8,#6               @vshlq_n_s16(tmp, 6)
    vshl.i16 q1,q9,#6               @vshlq_n_s16(tmp, 6)
    vst1.16 {d0,d1},[r1]!           @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16 {d2,d3},[r10],r5        @vst1q_s16(pi2_dst_tmp, tmp)
    bgt core_loop_wd_8_ht_2

    vpop {d8 - d15}                 @restore callee-saved NEON registers
    ldmfd sp!,{r4-r12,r15}          @reload the registers from sp (pop pc = return)