@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_luma_mode2_neon.s
@*
@* @brief
@*  contains function definitions for intra prediction dc filtering.
@* functions are coded using neon intrinsics and can be compiled using
@* rvct
@*
@* NOTE(review): the only symbol defined below is
@* ihevc_intra_pred_chroma_mode2_a9q; the "luma" file name and the
@* "dc filtering" wording in this header do not match it - confirm and correct.
@*
@* @author
@*  yogeswaran rs
@*
@* @par list of functions:
@*  - ihevc_intra_pred_chroma_mode2_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  mode 2 intra prediction over a block of 2-byte sample pairs
@*  (entry point: ihevc_intra_pred_chroma_mode2_a9q)
@*
@* @par description:
@*  writes nt destination rows of 2*nt bytes each. every row is a run of
@*  2-byte pairs copied from pu1_ref with the order of the pairs reversed
@*  (vld2.8 de-interleaves, vrev64.8 reverses, vst2.8 / vzip.8 re-interleaves).
@*  the first pair of row 0 comes from pu1_ref + 4*nt - 4, and every following
@*  row starts one pair (2 bytes) lower in the source.
@*  nt == 4 takes a straight-line path; any other nt is processed in tiles of
@*  8 rows x 16 bytes.
@*  NOTE(review): the tile counters only work out for nt that is a multiple
@*  of 8 - presumably the caller guarantees nt is 4, 8, 16 or 32; confirm.
@*  NOTE(review): the pairs are presumably interleaved cb/cr samples - confirm.
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source (r0)
@*
@* @param[in] src_strd
@*  integer source stride (r1). never read: r1 is reused as the tile counter
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination (r2)
@*
@* @param[in] dst_strd
@*  integer destination stride, added to the byte pointer once per row (r3)
@*
@* @param[in] nt
@*  size of transform block, loaded from [sp,#nt_offset]
@*
@* @param[in] mode
@*  type of filtering. never loaded by this routine
@*
@* @returns
@*  nothing (no value is placed in r0)
@*
@* @remarks
@*  an earlier revision of this header also listed a pi1_coeff parameter; the
@*  prototype below has no such argument and the code never loads one.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_chroma_mode2_a9q(uword8 *pu1_ref,
@                                       word32 src_strd,
@                                       uword8 *pu1_dst,
@                                       word32 dst_strd,
@                                       word32 nt,
@                                       word32 mode)
@
@**************variables vs registers*****************************************
@r0  => *pu1_ref   (source pointer for even rows of a tile)
@r1  => src_strd   (overwritten without being read; becomes the tile counter)
@r2  => *pu1_dst
@r3  => dst_strd
@r4  => nt
@r5  => 4 * dst_strd (post-index step of the tile stores)
@r6, r7, r9, r14 => destination pointers of four consecutive rows
@r8  => -4 (post-index step of every source load)
@r10 => source pointer for odd rows of a tile (r0 - 2)
@r11 => store-side countdown across one band of 8 rows
@r12 => load-side countdown across one band of 8 rows

@stack contents after the two pushes below (40 + 64 = 104 bytes)
@ [sp,#104] nt
@ mode would follow nt according to the prototype; it is never loaded here

.equ nt_offset, 104                         @ 10 core registers * 4 + 8 d-registers * 8

.text
.align 4

.globl ihevc_intra_pred_chroma_mode2_a9q

.type ihevc_intra_pred_chroma_mode2_a9q, %function

ihevc_intra_pred_chroma_mode2_a9q:

    stmfd       sp!, {r4-r12, r14}          @ save r4-r12 and lr (40 bytes)
    vpush       {d8 - d15}                  @ save d8-d15 (64 bytes); they are used as load targets below

    ldr         r4,[sp,#nt_offset]          @ r4 = nt, the first stack argument
    mov         r8,#-4                      @ r8 = -4: every source load steps its pointer back 2 pairs

    cmp         r4,#4
    beq         mode2_4                     @ nt == 4 is handled by the straight-line path

    add         r0,r0,r4,lsl #2             @ r0 = pu1_ref + 4*nt

    sub         r0,r0,#0x12                 @ r0 = pu1_ref + 4*nt - 18: the 16 source bytes of row 0
    add         r10,r0,#-2                  @ r10 = r0 - 2: the 16 source bytes of row 1

prologue_cpy_32:
    @ load and reverse the first tile. r0 feeds rows 0,2,4,6 and r10 feeds
    @ rows 1,3,5,7; the scalar set-up is interleaved between the vector ops.

    vld2.8      {d0,d1},[r0],r8             @ row 0: even bytes -> d0, odd bytes -> d1, r0 -= 4

    mov         r11,r4                      @ r11 = nt (store-side countdown, 8 per tile)
    vrev64.8    d16,d0                      @ reverse the 8 bytes so the pairs come out in reverse order
    vrev64.8    d17,d1

    vld2.8      {d2,d3},[r10],r8            @ row 1
    mov         r6, r2                      @ r6 = destination of row 0

    vld2.8      {d4,d5},[r0],r8             @ row 2
    vld2.8      {d6,d7},[r10],r8            @ row 3
    lsr         r1, r4, #3                  @ r1 = nt >> 3

    vld2.8      {d8,d9},[r0],r8             @ row 4
    vld2.8      {d10,d11},[r10],r8          @ row 5
    vld2.8      {d12,d13},[r0],r8           @ row 6
    mul         r1, r4, r1                  @ r1 = nt * (nt >> 3): tile counter, decremented by 8 per tile

    vld2.8      {d14,d15},[r10],r8          @ row 7
    add         r7,r6,r3                    @ r7 = destination of row 1

    vrev64.8    d18,d2
    vrev64.8    d19,d3
    lsl         r5, r3, #2                  @ r5 = 4 * dst_strd: each store pointer is reused 4 rows later

    vrev64.8    d20,d4
    vrev64.8    d21,d5
    add         r9,r7,r3                    @ r9 = destination of row 2

    vrev64.8    d22,d6
    vrev64.8    d23,d7

    vrev64.8    d24,d8
    vrev64.8    d25,d9

    vrev64.8    d26,d10
    subs        r1,r1,#8                    @ count the tile just loaded; Z is set if it was the only one

    vrev64.8    d27,d11

    vrev64.8    d28,d12
    vrev64.8    d29,d13

    vrev64.8    d30,d14
    add         r14,r9,r3                   @ r14 = destination of row 3
    vrev64.8    d31,d15

    beq         epilogue_mode2              @ uses the flags of "subs r1" above: single tile, just store it

    sub         r12,r4,#8                   @ r12 = nt - 8 (load-side countdown; one tile is already loaded)

kernel_mode2:
    @ each pass stores the tile held in d16-d31 while loading and reversing
    @ the next one. three flag-setting subs instructions steer the conditional
    @ instructions that follow them, so the instruction order matters.

    vst2.8      {d16,d17},[r6],r5           @ row 0 (re-interleaved), r6 -> row 4
    vst2.8      {d18,d19},[r7],r5           @ row 1, r7 -> row 5
    subs        r11,r11,#8                  @ flags: gt = more tiles to the right in this band, le = band done
    vst2.8      {d20,d21},[r9],r5           @ row 2, r9 -> row 6
    vst2.8      {d22,d23},[r14],r5          @ row 3, r14 -> row 7
    vst2.8      {d24,d25},[r6],r5           @ row 4, r6 -> row 8 (start of the next band)
    addgt       r2,r2,#16                   @ gt: next tile sits 16 bytes to the right
    vst2.8      {d26,d27},[r7],r5           @ row 5
    vst2.8      {d28,d29},[r9],r5           @ row 6
    vst2.8      {d30,d31},[r14],r5          @ row 7

    vld2.8      {d0,d1},[r0],r8             @ next tile, row 0
    movle       r11,r4                      @ le: restart the store-side countdown

    vld2.8      {d2,d3},[r10],r8            @ next tile, row 1
    vld2.8      {d4,d5},[r0],r8             @ next tile, row 2
    addle       r2, r2, r3, lsl #2          @ NOTE(review): overwritten by the suble below (same condition, flags untouched in between)
    vld2.8      {d6,d7},[r10],r8            @ next tile, row 3
    vrev64.8    d16,d0

    vld2.8      {d8,d9},[r0],r8             @ next tile, row 4
    vld2.8      {d10,d11},[r10],r8          @ next tile, row 5
    suble       r2, r6,#16                  @ le: r6 is 8 rows below the tile just stored; step back 16 bytes to the band's first column
    vld2.8      {d12,d13},[r0],r8           @ next tile, row 6
    vrev64.8    d17,d1
    vld2.8      {d14,d15},[r10],r8          @ next tile, row 7

    subs        r12,r12,#8                  @ flags: le = the tile just loaded was the last one of its band
    mov         r6, r2                      @ r6 = destination of row 0 of the next tile
    addle       r0, r0, r4,lsl #1           @ le: r0 += 2*nt ...
    add         r7, r6, r3

    vrev64.8    d18,d2
    suble       r0, r0, #16                 @ ... - 16: source of the first tile of the next band
    vrev64.8    d19,d3

    vrev64.8    d20,d4
    movle       r12,r4                      @ le: restart the load-side countdown
    vrev64.8    d21,d5

    vrev64.8    d22,d6
    add         r9, r7, r3
    vrev64.8    d23,d7

    vrev64.8    d24,d8
    add         r10,r0,#-2                  @ odd-row source pointer follows r0 again
    vrev64.8    d25,d9

    vrev64.8    d26,d10
    subs        r1, r1, #8                  @ count the tile just loaded; Z set when it was the last one
    vrev64.8    d27,d11

    vrev64.8    d28,d12
    vrev64.8    d29,d13

    vrev64.8    d30,d14
    add         r14, r9, r3
    vrev64.8    d31,d15

    bne         kernel_mode2                @ uses the flags of "subs r1" above

epilogue_mode2:
    @ store the last tile that was loaded and reversed

    vst2.8      {d16,d17},[r6],r5
    vst2.8      {d18,d19},[r7],r5
    vst2.8      {d20,d21},[r9],r5
    vst2.8      {d22,d23},[r14],r5
    vst2.8      {d24,d25},[r6],r5
    vst2.8      {d26,d27},[r7],r5
    vst2.8      {d28,d29},[r9],r5
    vst2.8      {d30,d31},[r14],r5

    b           end_func

mode2_4:
    @ nt == 4: four rows of 8 bytes. every vld2.8 still reads 16 bytes, but
    @ only the first 4 pairs are kept (vshl.i64 #32 discards the rest).

    lsl         r12,r4,#1                   @ r12 = 2*nt
    add         r0,r0,r12
    sub         r0,r0,#2                    @ r0 = pu1_ref + 2*nt - 2: source of row 0

    vld2.8      {d12,d13},[r0],r8           @ row 0: even bytes -> d12, odd bytes -> d13, r0 -= 4
    vshl.i64    d0,d12,#32                  @ move the low 4 bytes to the high half, zero the rest
    add         r10,r0,#2                   @ r10 = row 0 source - 2: source of row 1
    vshl.i64    d1,d13,#32

    vrev64.8    d0,d0                       @ the 4 kept bytes land in the low half in reverse order
    vld2.8      {d14,d15},[r10],r8          @ row 1, r10 -= 4
    vshl.i64    d2,d14,#32

    vrev64.8    d1,d1
    vshl.i64    d3,d15,#32
    vzip.8      d0,d1                       @ re-interleave: d0 = the 4 pairs in reverse order
    vst1.8      {d0},[r2],r3                @ store row 0 (8 bytes), pu1_dst += dst_strd

    vrev64.8    d2,d2
    vld2.8      {d16,d17},[r0],r8           @ row 2 (r0 is now row 0 source - 4)
    vshl.i64    d4,d16,#32
    vrev64.8    d3,d3
    vshl.i64    d5,d17,#32
    vzip.8      d2,d3
    vrev64.8    d4,d4
    vrev64.8    d5,d5
    vst1.8      {d2},[r2],r3                @ store row 1

    vld2.8      {d18,d19},[r10],r8          @ row 3 (r10 is now row 0 source - 6)
    vshl.i64    d6,d18,#32

    vzip.8      d4,d5
    vshl.i64    d7,d19,#32
    vrev64.8    d6,d6
    vst1.8      {d4},[r2],r3                @ store row 2

    vrev64.8    d7,d7
    vzip.8      d6,d7
    vst1.8      {d6},[r2],r3                @ store row 3

end_func:
    vpop        {d8 - d15}                  @ restore d8-d15
    ldmfd       sp!,{r4-r12,r15}            @ restore r4-r12 and return by loading the saved lr into pc