@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     inter prediction luma function for copy with 16-bit output
@*
@* @par description:
@*     copies the array of width 'wd' and height 'ht' from the location
@*     pointed to by 'src' to the location pointed to by 'dst', left-shifting
@*     each 8-bit sample by 6 into a 16-bit destination
@*
@* @param[in] pu1_src
@*     uword8 pointer to the source
@*
@* @param[out] pi2_dst
@*     word16 pointer to the destination
@*
@* @param[in] src_strd
@*     integer source stride
@*
@* @param[in] dst_strd
@*     integer destination stride
@*
@* @param[in] pi1_coeff
@*     word8 pointer to the filter coefficients (not used by this copy variant)
@*
@* @param[in] ht
@*     integer height of the array
@*
@* @param[in] wd
@*     integer width of the array
@*
@* @returns
@*     none
@*
@* @remarks
@*     none
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_luma_copy_w16out (
@    uword8 *pu1_src,
@    word16 *pi2_dst,
@    word32 src_strd,
@    word32 dst_strd,
@    word8 *pi1_coeff,
@    word32 ht,
@    word32 wd )

@**************variables vs registers*****************************************
@    r0  => *pu1_src
@    r1  => *pi2_dst
@    r2  => src_strd
@    r3  => dst_strd
@    r7  => ht
@    r12 => wd

@ stack arguments: stmfd saves 10 registers (40 bytes) and vpush saves
@ d8-d15 (64 bytes), so the first stack argument sits at sp + 104
.equ    coeff_offset,   104
.equ    ht_offset,      108
.equ    wd_offset,      112

.text
.align 4




.globl ihevc_inter_pred_luma_copy_w16out_a9q

.type ihevc_inter_pred_luma_copy_w16out_a9q, %function

ihevc_inter_pred_luma_copy_w16out_a9q:

    stmfd       sp!, {r4-r12, r14}  @stack stores the values of the arguments
    vpush       {d8 - d15}
    ldr         r12,[sp,#wd_offset] @loads wd
    ldr         r7,[sp,#ht_offset]  @loads ht
    cmp         r7,#0               @ht condition(ht == 0)
    ble         end_loops           @exit if ht <= 0
    tst         r12,#7              @check whether wd is a multiple of 8
    beq         core_loop_wd_8
    sub         r11,r12,#4
    lsls        r6,r3,#1            @r6 = dst_strd * 2 (byte stride of 16-bit output)

outer_loop_wd_4:
    subs        r4,r12,#0           @r4 = wd; set flags for the column loop
    ble         end_inner_loop_wd_4

inner_loop_wd_4:
    vld1.8      {d0},[r0]           @vld1_u8(pu1_src_tmp)
    add         r5,r0,r2            @pu1_src + src_strd
    vmovl.u8    q0,d0               @vmovl_u8(vld1_u8(pu1_src_tmp))
    add         r10,r1,r6
    subs        r4,r4,#4            @wd - 4
    vshl.i64    q0,q0,#6            @vshlq_n_s64(temp, 6)
    vld1.8      {d22},[r5],r2       @vld1_u8(pu1_src_tmp)
    add         r0,r0,#4            @pu1_src += 4
    vst1.64     {d0},[r1]           @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    add         r1,r1,#8
    vmovl.u8    q11,d22             @vmovl_u8(vld1_u8(pu1_src_tmp))
    vld1.8      {d24},[r5],r2       @vld1_u8(pu1_src_tmp)
    vshl.i64    q11,q11,#6          @vshlq_n_s64(temp, 6)
    vmovl.u8    q12,d24             @vmovl_u8(vld1_u8(pu1_src_tmp))
    vst1.64     {d22},[r10],r6      @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vshl.i64    q12,q12,#6          @vshlq_n_s64(temp, 6)
    vld1.8      {d26},[r5],r2       @vld1_u8(pu1_src_tmp)
    vst1.64     {d24},[r10],r6      @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    vmovl.u8    q13,d26             @vmovl_u8(vld1_u8(pu1_src_tmp))
    vshl.i64    q13,q13,#6          @vshlq_n_s64(temp, 6)
    vst1.64     {d26},[r10],r6      @vst1q_lane_s64(pi2_dst_tmp, temp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4            @ht - 4
    sub         r0,r5,r11
    sub         r1,r10,r11,lsl #1
    bgt         outer_loop_wd_4

end_loops:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}    @reload the registers from sp


core_loop_wd_8:
    @sub        r11,r12,#8
    lsls        r5,r3,#1            @r5 = dst_strd * 2 (byte stride of 16-bit output)
    rsb         r11,r12,r3, lsl #2  @r11 = (dst_strd * 4) - wd
    rsb         r8,r12,r2,lsl #2    @r8 = (src_strd * 4) - wd
    mov         r4,r12, lsr #3      @divide wd by 8
    mul         r7, r4              @r7 = ht * (wd >> 3)
    sub         r4,r12,#0           @wd conditional check
    sub         r7,r7,#4            @subtract one iteration (4 rows) for the epilog

prolog:
    add         r6,r0,r2            @pu1_src_tmp += src_strd
    add         r10,r1,r5
    vld1.8      {d8},[r0]!          @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2       @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2       @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2       @vld1_u8(pu1_src_tmp)
    vmovl.u8    q8,d8               @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q9,d10              @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q10,d12             @vmovl_u8(vld1_u8(pu1_src_tmp))
    vmovl.u8    q11,d14             @vmovl_u8(vld1_u8(pu1_src_tmp))
    subs        r4,r4,#8            @wd decrements by 8
    vshl.i16    q0,q8,#6            @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6            @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6           @vshlq_n_s16(tmp, 6)
    vshl.i16    q3,q11,#6           @vshlq_n_s16(tmp, 6)
    addle       r0,r0,r8
    add         r6,r0,r2            @pu1_src_tmp += src_strd
    vld1.8      {d8},[r0]!          @vld1_u8(pu1_src_tmp)
    vld1.8      {d10},[r6],r2       @vld1_u8(pu1_src_tmp)
    vld1.8      {d12},[r6],r2       @vld1_u8(pu1_src_tmp)
    vld1.8      {d14},[r6],r2       @vld1_u8(pu1_src_tmp)

    vst1.16     {d0,d1},[r1]!       @vst1q_s16(pi2_dst_tmp, tmp)
    addle       r1,r1,r11,lsl #1
    suble       r4,r12,#0           @wd conditional check

    subs        r7,r7,#4            @ht - 4

    blt         epilog_end          @jumps to epilog_end
    beq         epilog              @jumps to epilog



outer_loop_wd_8:

    vst1.16     {d2,d3},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8               @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10              @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12             @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14             @vmovl_u8(vld1_u8(pu1_src_tmp))

    subs        r4,r4,#8            @wd decrements by 8
    addle       r0,r0,r8

    add         r6,r0,r2            @pu1_src_tmp += src_strd

    vld1.8      {d8},[r0]!          @vld1_u8(pu1_src_tmp)
    vshl.i16    q0,q8,#6            @vshlq_n_s16(tmp, 6)

    vld1.8      {d10},[r6],r2       @vld1_u8(pu1_src_tmp)
    vshl.i16    q1,q9,#6            @vshlq_n_s16(tmp, 6)

    vld1.8      {d12},[r6],r2       @vld1_u8(pu1_src_tmp)
    vshl.i16    q2,q10,#6           @vshlq_n_s16(tmp, 6)

    vld1.8      {d14},[r6],r2       @vld1_u8(pu1_src_tmp)
    add         r10,r1,r5

    vshl.i16    q3,q11,#6           @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!       @vst1q_s16(pi2_dst_tmp, tmp)

    addle       r1,r1,r11,lsl #1
    suble       r4,r12,#0           @wd conditional check

    subs        r7,r7,#4            @ht - 4
    bgt         outer_loop_wd_8

epilog:
    vst1.16     {d2,d3},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q8,d8               @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d4,d5},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q9,d10              @vmovl_u8(vld1_u8(pu1_src_tmp))

    vst1.16     {d6,d7},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vmovl.u8    q10,d12             @vmovl_u8(vld1_u8(pu1_src_tmp))

    vmovl.u8    q11,d14             @vmovl_u8(vld1_u8(pu1_src_tmp))
    @add        r6,r0,r2            @pu1_src_tmp += src_strd

    vshl.i16    q0,q8,#6            @vshlq_n_s16(tmp, 6)
    vshl.i16    q1,q9,#6            @vshlq_n_s16(tmp, 6)
    vshl.i16    q2,q10,#6           @vshlq_n_s16(tmp, 6)
    add         r10,r1,r5
    vshl.i16    q3,q11,#6           @vshlq_n_s16(tmp, 6)

    vst1.16     {d0,d1},[r1]!       @vst1q_s16(pi2_dst_tmp, tmp)
epilog_end:
    vst1.16     {d2,d3},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d4,d5},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)
    vst1.16     {d6,d7},[r10],r5    @vst1q_s16(pi2_dst_tmp, tmp)


    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}    @reload the registers from sp
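
@/*****************************************************************************
@* Reference: a minimal C sketch of the operation implemented above, for
@* readers following the NEON code. It is inferred from the prototype and the
@* shift-by-6 in the kernels, not copied from the library's C reference, so
@* treat it as an illustration rather than the canonical implementation.
@*
@* void ihevc_inter_pred_luma_copy_w16out(uword8 *pu1_src, word16 *pi2_dst,
@*                                        word32 src_strd, word32 dst_strd,
@*                                        word8 *pi1_coeff, word32 ht, word32 wd)
@* {
@*     word32 row, col;
@*     (void)pi1_coeff;               /* accepted but never read here */
@*     for(row = 0; row < ht; row++)
@*     {
@*         for(col = 0; col < wd; col++)
@*             pi2_dst[col] = (word16)(pu1_src[col] << 6);  /* 8-bit -> 16-bit */
@*         pu1_src += src_strd;       /* strides are in elements, as in the */
@*         pi2_dst += dst_strd;       /* assembly (dst advances by 2 bytes) */
@*     }
@* }
@*****************************************************************************/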