@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_inter_pred_chroma_copy_neon.s
@*
@* @brief
@*  contains function definitions for inter prediction interpolation.
@*  functions are coded in arm neon assembly; the inline comments show the
@*  equivalent neon intrinsics
@*
@*  NOTE(review): the original header said "can be compiled using rvct", but
@*  the directives used below (.equ/.globl/.type, '@' comments) are gnu
@*  assembler syntax - confirm the supported toolchain
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_inter_pred_chroma_copy_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma interprediction filter for copy
@*
@* @par description:
@*  copies the array of width 'wd' and height 'ht' from the location pointed
@*  by 'src' to the location pointed by 'dst'
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride (added to the source pointer to reach the next row)
@*
@* @param[in] dst_strd
@*  integer destination stride (added to the destination pointer to reach the
@*  next row)
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients (never read by this function)
@*
@* @param[in] ht
@*  integer height of the array; nothing is copied when ht <= 0
@*
@* @param[in] wd
@*  integer width of the array; it is doubled on entry, so 2 * wd bytes are
@*  copied per row (presumably interleaved cb/cr samples - confirm with caller)
@*
@* @returns
@*  none
@*
@* @remarks
@*  - rows are copied four at a time; when (ht & 3) is non-zero exactly two
@*    more rows are copied afterwards, so ht is expected to be even
@*  - the row length in bytes (2 * wd) selects the path: multiple of 16 ->
@*    16-byte steps, else multiple of 8 -> 8-byte steps, else 4-byte steps.
@*    the 4-byte path assumes 2 * wd is a multiple of 4
@*  - only neon registers d0-d7 (q0-q3) are written
@*
@*******************************************************************************
@*/

@void ihevc_inter_pred_chroma_copy( uword8 *pu1_src,
@                                   uword8 *pu1_dst,
@                                   word32 src_strd,
@                                   word32 dst_strd,
@                                   word8 *pi1_coeff,
@                                   word32 ht,
@                                   word32 wd)
@**************variables vs registers*****************************************
@   r0  => *pu1_src   (advanced as columns / rows are consumed)
@   r1  => *pu1_dst   (advanced as columns / rows are consumed)
@   r2  => src_strd
@   r3  => dst_strd
@   r4  => bytes still to copy in the current group of rows
@   r5  => source pointer walking down the rows of the current column
@   r6  => destination pointer walking down the rows of the current column
@   r7  => rows still to copy in groups of four (ht - (ht & 3))
@   r8  => ht & 3 (non-zero => run the two-row tail)
@   r11 => (2 * wd) - step, used to rewind r5/r6 to the start of the next row
@   r12 => 2 * wd (bytes per row)
@
@   ht and wd are read from the stack: the prologue pushes ten registers
@   (40 bytes), which is where the offsets below come from

.equ    ht_offset,      44
.equ    wd_offset,      48

.text
.align 4




.globl ihevc_inter_pred_chroma_copy_a9q

.type ihevc_inter_pred_chroma_copy_a9q, %function

ihevc_inter_pred_chroma_copy_a9q:
    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr
    ldr         r12,[sp,#wd_offset]         @r12 = wd
    lsl         r12,r12,#1                  @r12 = 2 * wd = bytes per row
    ldr         r7,[sp,#ht_offset]          @r7 = ht
    cmp         r7,#0                       @nothing to do when ht <= 0
    ble         end_loops
    and         r8,r7,#3                    @r8 = ht % 4 (rows left for the two-row tail)
    sub         r7,r7,r8                    @r7 = ht rounded down to a multiple of 4
    tst         r12,#15                     @row bytes a multiple of 16?
    beq         core_loop_wd_16
    tst         r12,#7                      @row bytes a multiple of 8?
    beq         core_loop_wd_8

    @ ---------------- 4 bytes per step ----------------
    sub         r11,r12,#4                  @rewind amount: last column offset
    cmp         r7,#0                       @fewer than 4 rows => only the tail
    beq         outer_loop_wd_4_ht_2

outer_loop_wd_4:
    subs        r4,r12,#0                   @r4 = bytes per row, checks wd == 0
    ble         end_inner_loop_wd_4

inner_loop_wd_4:                            @copies a 4-byte x 4-row tile
    vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    subs        r4,r4,#4                    @(wd -4)
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r1,r1,#4                    @pu1_dst += 4
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        r7,r7,#4                    @ht - 4
    sub         r0,r5,r11                   @pu1_src = start of the next group of rows
    sub         r1,r6,r11                   @pu1_dst = start of the next group of rows
    bgt         outer_loop_wd_4
    cmp         r8,#0                       @any rows left over?
    bgt         outer_loop_wd_4_ht_2

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return (pc <- saved lr)


outer_loop_wd_4_ht_2:
    subs        r4,r12,#0                   @r4 = bytes per row, checks wd == 0
    ble         end_loops

inner_loop_wd_4_ht_2:                       @copies a 4-byte x 2-row tile
    vld1.32     {d0[0]},[r0]                @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.32     {d0[0]},[r1]                @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    vld1.32     {d0[0]},[r5],r2             @vld1_lane_u32((uint32_t *)pu1_src_tmp, src_tmp, 0)
    add         r0,r0,#4                    @pu1_src += 4
    vst1.32     {d0[0]},[r6],r3             @vst1_lane_u32((uint32_t *)pu1_dst_tmp, src_tmp, 0)
    subs        r4,r4,#4                    @(wd -4)
    add         r1,r1,#4                    @pu1_dst += 4
    bgt         inner_loop_wd_4_ht_2
    b           end_loops

    @ ---------------- 8 bytes per step ----------------
core_loop_wd_8:
    sub         r11,r12,#8                  @rewind amount: last column offset
    cmp         r7,#0                       @fewer than 4 rows => only the tail
    beq         outer_loop_wd_8_ht_2

outer_loop_wd_8:
    subs        r4,r12,#0                   @r4 = bytes per row, checks wd
    ble         end_inner_loop_wd_8

inner_loop_wd_8:                            @copies an 8-byte x 4-row tile
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    subs        r4,r4,#8                    @wd - 8(loop condition)
    vld1.8      {d2},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d2},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {d3},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d3},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    bgt         inner_loop_wd_8

end_inner_loop_wd_8:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @pu1_src = start of the next group of rows
    sub         r1,r6,r11                   @pu1_dst = start of the next group of rows
    bgt         outer_loop_wd_8
    cmp         r8,#0                       @any rows left over?
    bgt         outer_loop_wd_8_ht_2
    b           end_loops

outer_loop_wd_8_ht_2:
    subs        r4,r12,#0                   @r4 = bytes per row, checks wd
    ble         end_loops

inner_loop_wd_8_ht_2:                       @copies an 8-byte x 2-row tile
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {d0},[r0]!                  @vld1_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {d0},[r1]!                  @vst1_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {d1},[r5],r2                @vld1_u8(pu1_src_tmp)
    vst1.8      {d1},[r6],r3                @vst1_u8(pu1_dst_tmp, tmp_src)
    @the column loop below was previously commented out, so rows wider than
    @8 bytes had only their first 8 bytes copied in the two-row tail
    subs        r4,r4,#8                    @wd - 8(loop condition)
    bgt         inner_loop_wd_8_ht_2
    b           end_loops

    @ ---------------- 16 bytes per step ----------------
core_loop_wd_16:
    sub         r11,r12,#16                 @rewind amount: last column offset
    cmp         r7,#0                       @fewer than 4 rows => only the tail
    beq         outer_loop_wd_16_ht_2

outer_loop_wd_16:
    subs        r4,r12,#0                   @r4 = bytes per row, checks wd
    ble         end_inner_loop_wd_16

inner_loop_wd_16:                           @copies a 16-byte x 4-row tile
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {q0},[r0]!                  @vld1q_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {q0},[r1]!                  @vst1q_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {q1},[r5],r2                @vld1q_u8(pu1_src_tmp)
    vst1.8      {q1},[r6],r3                @vst1q_u8(pu1_dst_tmp, tmp_src)
    subs        r4,r4,#16                   @wd - 16(loop condition)
    vld1.8      {q2},[r5],r2                @vld1q_u8(pu1_src_tmp)
    vst1.8      {q2},[r6],r3                @vst1q_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {q3},[r5],r2                @vld1q_u8(pu1_src_tmp)
    vst1.8      {q3},[r6],r3                @vst1q_u8(pu1_dst_tmp, tmp_src)
    bgt         inner_loop_wd_16

end_inner_loop_wd_16:
    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @pu1_src = start of the next group of rows
    sub         r1,r6,r11                   @pu1_dst = start of the next group of rows
    bgt         outer_loop_wd_16
    cmp         r8,#0                       @any rows left over?
    bgt         outer_loop_wd_16_ht_2
    b           end_loops

outer_loop_wd_16_ht_2:
    subs        r4,r12,#0                   @r4 = bytes per row, checks wd
    ble         end_loops

inner_loop_wd_16_ht_2:                      @copies a 16-byte x 2-row tile
    add         r5,r0,r2                    @pu1_src_tmp += src_strd
    vld1.8      {q0},[r0]!                  @vld1q_u8(pu1_src_tmp)
    add         r6,r1,r3                    @pu1_dst_tmp += dst_strd
    vst1.8      {q0},[r1]!                  @vst1q_u8(pu1_dst_tmp, tmp_src)
    vld1.8      {q1},[r5],r2                @vld1q_u8(pu1_src_tmp)
    vst1.8      {q1},[r6],r3                @vst1q_u8(pu1_dst_tmp, tmp_src)
    @the column loop below was previously commented out, so rows wider than
    @16 bytes had only their first 16 bytes copied in the two-row tail
    subs        r4,r4,#16                   @wd - 16(loop condition)
    bgt         inner_loop_wd_16_ht_2

    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return (pc <- saved lr)