@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_chroma_dc_neon.s
@*
@* @brief
@*  contains function definitions for intra prediction dc filtering.
@*  functions are coded in arm neon assembly, written with gnu assembler
@*  directives and '@' line comments
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_dc_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  chroma intra prediction, dc mode, for byte-interleaved two-component data
@*
@* @par description:
@*  for each of the two interleaved components (even bytes / odd bytes of
@*  pu1_ref) the routine sums 2*nt reference samples:
@*      nt samples starting at byte offset 2*nt       of pu1_ref, and
@*      nt samples starting at byte offset 4*nt + 2   of pu1_ref,
@*  computes  dc = (sum + nt) >> (32 - clz(nt))  and fills an nt x nt block
@*  of interleaved pairs at pu1_dst with the two dc values.
@*  (32 - clz(nt)) equals log2(nt) + 1 when nt is a power of two.
@*
@*  three code paths exist:
@*      nt == 4   : 4 rows of 8 bytes
@*      nt == 8   : 8 rows of 16 bytes
@*      otherwise : 32 samples per component are summed and 16 rows of
@*                  32 bytes are written, i.e. the path matches nt == 16.
@*  NOTE(review): assumes callers only pass nt in {4, 8, 16} - confirm.
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the reference samples (r0)
@*
@* @param[in] src_strd
@*  integer source stride (r1); the incoming value is never read, r1 is
@*  reused as scratch
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination (r2)
@*
@* @param[in] dst_strd
@*  integer destination stride in bytes (r3)
@*
@* @param[in] nt
@*  size of transform block (first stack argument)
@*
@* @param[in] mode
@*  type of filtering (second stack argument); not read by this routine
@*
@* @returns
@*  nothing
@*
@* @remarks
@*  only d0, d2, d3, d16-d18 and d26-d31 are used, so no neon register is
@*  saved; core registers r4-r12 and lr are pushed on entry and restored on
@*  exit.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
@                                word32 src_strd,
@                                uword8 *pu1_dst,
@                                word32 dst_strd,
@                                word32 nt,
@                                word32 mode)
@
@**************variables vs registers*****************************************
@r0  => *pu1_ref
@r1  => src_strd   (overwritten, used as scratch)
@r2  => *pu1_dst
@r3  => dst_strd
@r4  => nt         (loaded from the stack)
@r12 => shift amount, 32 - clz(nt)
@
@stack contents, relative to sp after the 10-register push (10 * 4 = 40 bytes)
@   #40  nt
@   #44  mode      (unused)

.equ nt_offset, 40

.text
.p2align 4                                  @ 16-byte alignment

.globl ihevc_intra_pred_chroma_dc_a9q

.type ihevc_intra_pred_chroma_dc_a9q, %function

ihevc_intra_pred_chroma_dc_a9q:

    push        {r4-r12, lr}                @ save callee-saved core regs + return address

    ldr         r4, [sp, #nt_offset]        @ r4 = nt
    mov         r9, #0
    vmov        d17, r9, r9                 @ d17 = 0 : accumulator, even-byte component

    clz         r5, r4

    add         r6, r0, r4, lsl #1          @ r6 = pu1_ref + 2*nt      (bytes)
    vmov        d18, r9, r9                 @ d18 = 0 : accumulator, odd-byte component
    rsb         r5, r5, #32                 @ r5 = 32 - clz(nt)
    add         r7, r0, r4, lsl #2          @ r7 = pu1_ref + 4*nt      (bytes)
    mov         r12, r5                     @ r12 = shift used for the final average
    add         r8, r7, #2                  @ r8 = pu1_ref + 4*nt + 2  (bytes)

    cmp         r4, #4
    beq         .Lchroma_dc_nt4             @ nt == 4 has its own path


@ ---------------------------------------------------------------------------
@ nt != 4 : sum the first 8 pairs from each of the two reference runs
@ ---------------------------------------------------------------------------
.Lchroma_dc_sum_first:
    vld2.8      {d30, d31}, [r6]!           @ de-interleave 8 pairs from the first run
    mov         r10, r4, lsl #1             @ r10 = 2*nt

    vpaddl.u8   d2, d30
    subs        r10, r10, #16               @ Z set when 2*nt == 16 (nt == 8)

    vld2.8      {d26, d27}, [r8]!           @ de-interleave 8 pairs from the second run

    vpaddl.u8   d3, d31
    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @ even-byte sum += first run

    vpadal.u32  d18, d3                     @ odd-byte sum  += first run

    vpaddl.u8   d2, d26
    vpaddl.u8   d3, d27

    vpaddl.u16  d2, d2
    vpaddl.u16  d3, d3

    vpadal.u32  d17, d2                     @ even-byte sum += second run
    vpadal.u32  d18, d3                     @ odd-byte sum  += second run

    beq         .Lchroma_dc_average         @ nt == 8 : all 2*nt samples consumed

@ second (and last) batch of 8 pairs per run; executed once, no loop back
.Lchroma_dc_sum_second:
    vld2.8      {d30, d31}, [r6]!
    vpaddl.u8   d28, d30
    vpaddl.u8   d3, d31

    vld2.8      {d26, d27}, [r8]!

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3
    vpadal.u32  d17, d29

    vpaddl.u8   d3, d27
    vpaddl.u8   d28, d26

    vpaddl.u16  d3, d3
    vpaddl.u16  d29, d28

    vpadal.u32  d18, d3
    vpadal.u32  d17, d29


@ ---------------------------------------------------------------------------
@ dc = (sum + nt) >> r12, then broadcast each dc value across a d register
@ ---------------------------------------------------------------------------
.Lchroma_dc_average:

    vmov.32     r1, d18[0]                  @ r1  = odd-byte sum
    vmov.32     r11, d17[0]                 @ r11 = even-byte sum

    add         r1, r1, r4                  @ rounding term: + nt
    add         r11, r11, r4

    mov         r1, r1, lsr r12
    mov         r11, r11, lsr r12

    vdup.8      d17, r1                     @ d17 = odd-byte dc in every lane
    vdup.8      d16, r11                    @ d16 = even-byte dc in every lane

@ ---------------------------------------------------------------------------
@ block fill : four row pointers (r2, r5, r8, r10) are advanced in lock step;
@ every vst2.8 {d16,d17} writes 16 interleaved bytes
@ ---------------------------------------------------------------------------
.Lchroma_dc_fill_setup:

    add         r5, r2, r3                  @ r5  = row 1
    subs        r9, r4, #8                  @ Z set when nt == 8; flags stay live below
    mov         r6, r3, lsl #2              @ r6  = 4 * dst_strd
    moveq       r11, r6                     @ nt == 8 : epilogue step = 4 rows
    add         r8, r5, r3                  @ r8  = row 2
    add         r10, r8, r3                 @ r10 = row 3

    beq         .Lchroma_dc_fill_tail       @ nt == 8 : only the tail stores are needed

    vst2.8      {d16, d17}, [r2]!           @ rows 0-3, first 16 bytes
    sub         r6, r6, #16                 @ r6 = 4*dst_strd - 16 : from 2nd half to next row group

    vst2.8      {d16, d17}, [r5]!
    vst2.8      {d16, d17}, [r8]!
    movne       r11, #16                    @ nt != 8 : epilogue step = second half of the row
    vst2.8      {d16, d17}, [r10]!


    vst2.8      {d16, d17}, [r2], r6        @ rows 0-3, second 16 bytes, then down 4 rows
    vst2.8      {d16, d17}, [r5], r6
    vst2.8      {d16, d17}, [r8], r6
    vst2.8      {d16, d17}, [r10], r6

@ two more groups of four 32-byte rows (rows 4-7 and 8-11)
.Lchroma_dc_fill_body:
    vst2.8      {d16, d17}, [r2]!
    vst2.8      {d16, d17}, [r5]!
    vst2.8      {d16, d17}, [r8]!
    vst2.8      {d16, d17}, [r10]!

    vst2.8      {d16, d17}, [r2], r6
    vst2.8      {d16, d17}, [r5], r6
    vst2.8      {d16, d17}, [r8], r6
    vst2.8      {d16, d17}, [r10], r6

    vst2.8      {d16, d17}, [r2]!
    vst2.8      {d16, d17}, [r5]!
    vst2.8      {d16, d17}, [r8]!
    vst2.8      {d16, d17}, [r10]!

    vst2.8      {d16, d17}, [r2], r6
    vst2.8      {d16, d17}, [r5], r6
    vst2.8      {d16, d17}, [r8], r6
    vst2.8      {d16, d17}, [r10], r6

@ nt == 8 : r11 = 4*dst_strd, so this writes rows 0-3 and then rows 4-7
@ nt != 8 : r11 = 16, so this writes both halves of the last four rows
.Lchroma_dc_fill_tail:
    vst2.8      {d16, d17}, [r2], r11
    vst2.8      {d16, d17}, [r5], r11
    vst2.8      {d16, d17}, [r8], r11
    vst2.8      {d16, d17}, [r10], r11

    vst2.8      {d16, d17}, [r2]
    vst2.8      {d16, d17}, [r5]
    vst2.8      {d16, d17}, [r8]
    vst2.8      {d16, d17}, [r10]
    b           .Lchroma_dc_return

@ ---------------------------------------------------------------------------
@ nt == 4 : each vld2 still fetches 16 bytes, but only the first four samples
@ of each component take part: the 64-bit left shift by 32 moves lanes 0-3
@ to the top half and discards lanes 4-7
@ ---------------------------------------------------------------------------
.Lchroma_dc_nt4:
    vld2.8      {d30, d31}, [r6]            @ first run, de-interleaved
    vshl.i64    d3, d30, #32                @ keep even-byte samples 0-3

    vld2.8      {d26, d27}, [r8]            @ second run, de-interleaved
    vshl.i64    d2, d31, #32                @ keep odd-byte samples 0-3

    vpaddl.u8   d3, d3
    vpaddl.u8   d2, d2
    vpaddl.u16  d3, d3
    vpaddl.u16  d2, d2
    vpadal.u32  d17, d3                     @ even-byte sum += first run
    vpadal.u32  d18, d2                     @ odd-byte sum  += first run

    vshl.i64    d3, d26, #32
    vshl.i64    d2, d27, #32
    vpaddl.u8   d3, d3
    vpaddl.u8   d2, d2
    vpaddl.u16  d3, d3
    vpaddl.u16  d2, d2
    vpadal.u32  d17, d3                     @ even-byte sum += second run
    vpadal.u32  d18, d2                     @ odd-byte sum  += second run

    vmov.32     r10, d17[0]                 @ r10 = even-byte sum
    vmov.32     r11, d18[0]                 @ r11 = odd-byte sum

    add         r10, r10, r4                @ rounding term: + nt
    add         r11, r11, r4
    mov         r10, r10, lsr r12
    mov         r11, r11, lsr r12
    orr         r10, r10, r11, lsl #8       @ pack as (odd dc << 8) | even dc
    vdup.16     d0, r10                     @ d0 = four identical interleaved pairs

    vst1.8      {d0}, [r2], r3              @ 4 rows of 8 bytes, dst_strd apart
    vst1.8      {d0}, [r2], r3
    vst1.8      {d0}, [r2], r3
    vst1.8      {d0}, [r2]

.Lchroma_dc_return:
    pop         {r4-r12, pc}                @ restore registers and return