///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_dc_neon.s
//*
//* @brief
//*  contains function definitions for intra prediction dc filtering.
//*  functions are coded in aarch64 neon assembly (gnu assembler directives
//*  are used)
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_dc_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  chroma intra prediction filter for dc mode
//*
//* @par description:
//*  the reference buffer is read as byte-interleaved pairs (ld2). for each of
//*  the two interleaved components (presumably cb/cr - confirm with caller),
//*  2*nt reference bytes are summed: nt taken from the region starting at
//*  byte offset 2*nt of pu1_ref and nt taken from the region starting at byte
//*  offset 4*nt+2. per component, dc = (sum + nt) >> (32 - clz(nt)), i.e.
//*  (sum + nt) >> (log2(nt) + 1) for power-of-two nt. the resulting pair of
//*  bytes is written to every position of a block of nt rows, 2*nt bytes per
//*  row, consecutive rows being dst_strd bytes apart.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this routine)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride, in bytes
//*
//* @param[in] nt
//*  size of transform block. dedicated code paths exist for nt = 4, 8 and 16
//*  only; the straight-line (non nt=4, non nt=8) path always sums 32 bytes
//*  per region and writes 16 rows of 32 bytes.
//*
//* @param[in] mode
//*  type of filtering (not read by this routine)
//*
//* @returns
//*  none
//*
//* @remarks
//*  - dst_strd and nt are word32 arguments. they are sign-extended to 64 bits
//*    on entry because the routine uses them as x-registers for addressing
//*    and comparisons, and the upper half of a 32-bit argument register is
//*    not guaranteed by the procedure call standard.
//*  - the nt = 4 path loads 16 bytes from each reference region although only
//*    the first 8 of them contribute to the sums.
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_chroma_dc(uword8 *pu1_ref,
//                                word32 src_strd,
//                                uword8 *pu1_dst,
//                                word32 dst_strd,
//                                word32 nt,
//                                word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//w1 => src_strd   (unused; x1 is reused as scratch)
//x2 => *pu1_dst
//w3 => dst_strd   (sign-extended into x3 on entry)
//w4 => nt         (sign-extended into x4 on entry)
//w5 => mode       (unused; x5 is reused as scratch)
//
//scratch: x1, x5-x12, x20, v0, v2, v3, v16-v18, v26-v31

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_chroma_dc_av8

.type ihevc_intra_pred_chroma_dc_av8, %function

ihevc_intra_pred_chroma_dc_av8:

    push_v_regs                             //macro from ihevc_neon_macros.s (presumably saves callee-saved simd regs - confirm there)
    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below

    sxtw        x3, w3                      //dst_strd: do not rely on the upper 32 bits of the argument register
    sxtw        x4, w4                      //nt:       same, it feeds address arithmetic and 64-bit compares

    mov         x9, #0
    mov         v17.s[0], w9                //low 64 bits of v17 = 0 (running sum, first component)
    mov         v17.s[1], w9

    clz         w5,w4                       //counts leading zeros of nt

    add         x6, x0, x4,lsl #1           //x6 = pu1_ref + 2*nt   (first reference region)
    mov         v18.s[0], w9                //low 64 bits of v18 = 0 (running sum, second component)
    mov         v18.s[1], w9
    sub         x20, x5, #32                //clz(nt) - 32
    neg         x5, x20                     //32 - clz(nt) = log2(nt) + 1 for power-of-two nt
    add         x7, x0, x4, lsl #2          //pu1_ref + 4*nt
    mov         x12,x5                      //x12 = final right-shift amount
    add         x8, x7, #2                  //x8 = pu1_ref + 4*nt + 2 (second reference region)

    cmp         x4, #4
    beq         dc_4                        //nt=4 has its own path


//first 16 bytes (8 pairs) of each reference region
add_loop:
    ld2         {v30.8b, v31.8b}, [x6], #16 //de-interleave 16 bytes of region 1: v30 = even bytes, v31 = odd bytes
    lsl         x10,x4,#1                   //2nt

    uaddlp      v2.4h,  v30.8b
    subs        x10, x10,#0x10              //eq when 2*nt == 16, i.e. nt == 8: nothing left to sum

    ld2         {v26.8b, v27.8b}, [x8],#16  //de-interleave 16 bytes of region 2

    uaddlp      v3.4h,  v31.8b
    uaddlp      v2.2s,  v2.4h
    uaddlp      v3.2s,  v3.4h

    uadalp      v17.1d,  v2.2s              //v17 += sum(v30)

    uadalp      v18.1d,  v3.2s              //v18 += sum(v31)

    uaddlp      v2.4h,  v26.8b
    uaddlp      v3.4h,  v27.8b

    uaddlp      v2.2s,  v2.4h
    uaddlp      v3.2s,  v3.4h

    uadalp      v17.1d,  v2.2s              //v17 += sum(v26)
    uadalp      v18.1d,  v3.2s              //v18 += sum(v27)

    beq         epil_add_loop               //flags still from the subs above (vector ops do not touch them)

//second 16 bytes of each reference region; executed once, there is no
//branch back, so at most 32 bytes per region are summed
core_loop_add:
    ld2         {v30.8b, v31.8b}, [x6],#16  //next 16 bytes of region 1
    uaddlp      v28.4h,  v30.8b
    uaddlp      v3.4h,  v31.8b

    ld2         {v26.8b, v27.8b}, [x8],#16  //next 16 bytes of region 2

    uaddlp      v3.2s,  v3.4h
    uaddlp      v29.2s,  v28.4h

    uadalp      v18.1d,  v3.2s
    uadalp      v17.1d,  v29.2s

    uaddlp      v3.4h,  v27.8b
    uaddlp      v28.4h,  v26.8b

    uaddlp      v3.2s,  v3.4h
    uaddlp      v29.2s,  v28.4h

    uadalp      v18.1d,  v3.2s
    uadalp      v17.1d,  v29.2s


//turn the two sums into the two dc bytes
epil_add_loop:

    smov        x1, v18.s[0]                //sum of second component
    smov        x11, v17.s[0]               //sum of first component

    add         x1,x1,x4                    //+ nt (rounding term)
    add         x11,x11,x4

    lsr         x1,x1,x12                   //>> (log2(nt) + 1)
    lsr         x11,x11,x12

    dup         v17.8b,w1                   //v17 = second-component dc in every byte
    dup         v16.8b,w11                  //v16 = first-component dc in every byte

//fill the destination; x2/x5/x8/x10 walk four consecutive rows, each st2
//writes 16 interleaved bytes (8 pairs)
prologue_cpy_32:

    add         x5, x2, x3                  //row 1
    subs        x9, x4, #8                  //eq when nt == 8 (result itself is unused)
    lsl         x6, x3, #2                  //4 * dst_strd
    csel        x11, x6, x11,eq             //nt == 8: epilogue step = 4 rows down
    add         x8, x5, x3                  //row 2
    add         x10, x8, x3                 //row 3

    beq         epilogue_copy               //nt == 8: rows are 16 bytes wide, epilogue alone covers 8 rows

    st2         {v16.8b, v17.8b}, [x2],#16  //rows 0-3, first 16 bytes
    sub         x6, x6, #16                 //from the second half of a row to the start of the row 4 lines below

    st2         {v16.8b, v17.8b}, [x5],#16
    st2         {v16.8b, v17.8b}, [x8],#16
    mov         x20,#16
    csel        x11, x20, x11,ne            //nt != 8: epilogue step = 16 bytes (second half of the same row)
    st2         {v16.8b, v17.8b}, [x10],#16


    st2         {v16.8b, v17.8b}, [x2], x6  //rows 0-3, second 16 bytes, then move to rows 4-7
    st2         {v16.8b, v17.8b}, [x5], x6
    st2         {v16.8b, v17.8b}, [x8], x6
    st2         {v16.8b, v17.8b}, [x10], x6

kernel_copy:
    st2         {v16.8b, v17.8b}, [x2],#16  //rows 4-7, first 16 bytes
    st2         {v16.8b, v17.8b}, [x5],#16
    st2         {v16.8b, v17.8b}, [x8],#16
    st2         {v16.8b, v17.8b}, [x10],#16

    st2         {v16.8b, v17.8b}, [x2], x6  //rows 4-7, second 16 bytes, then move to rows 8-11
    st2         {v16.8b, v17.8b}, [x5], x6
    st2         {v16.8b, v17.8b}, [x8], x6
    st2         {v16.8b, v17.8b}, [x10], x6

    st2         {v16.8b, v17.8b}, [x2],#16  //rows 8-11, first 16 bytes
    st2         {v16.8b, v17.8b}, [x5],#16
    st2         {v16.8b, v17.8b}, [x8],#16
    st2         {v16.8b, v17.8b}, [x10],#16

    st2         {v16.8b, v17.8b}, [x2], x6  //rows 8-11, second 16 bytes, then move to rows 12-15
    st2         {v16.8b, v17.8b}, [x5], x6
    st2         {v16.8b, v17.8b}, [x8], x6
    st2         {v16.8b, v17.8b}, [x10], x6

//nt == 8 : x11 = 4*dst_strd, the two groups of stores cover rows 0-3 and 4-7
//otherwise: x11 = 16, the two groups cover both halves of the last 4 rows
epilogue_copy:
    st2         {v16.8b, v17.8b}, [x2],x11
    st2         {v16.8b, v17.8b}, [x5],x11
    st2         {v16.8b, v17.8b}, [x8],x11
    st2         {v16.8b, v17.8b}, [x10],x11

    st2         {v16.8b, v17.8b}, [x2]
    st2         {v16.8b, v17.8b}, [x5]
    st2         {v16.8b, v17.8b}, [x8]
    st2         {v16.8b, v17.8b}, [x10]
    b           end_func

//nt == 4: 4 pairs (8 bytes) per reference region, 4 rows of 8 bytes out
dc_4:
    ld2         {v30.8b, v31.8b},[x6]       //region 1 (loads 16 bytes, only the first 4 pairs are used)
    shl         d3, d30,#32                 //keep first-component bytes 0-3, drop bytes 4-7

    ld2         {v26.8b, v27.8b},[x8]       //region 2 (loads 16 bytes, only the first 4 pairs are used)
    shl         d2, d31,#32                 //keep second-component bytes 0-3, drop bytes 4-7

    uaddlp      v3.4h,  v3.8b
    uaddlp      v2.4h,  v2.8b
    uaddlp      v3.2s,  v3.4h
    uaddlp      v2.2s,  v2.4h
    uadalp      v17.1d,  v3.2s              //v17 += first-component sum of region 1
    uadalp      v18.1d,  v2.2s              //v18 += second-component sum of region 1

    shl         d3, d26,#32
    shl         d2, d27,#32
    uaddlp      v3.4h,  v3.8b
    uaddlp      v2.4h,  v2.8b
    uaddlp      v3.2s,  v3.4h
    uaddlp      v2.2s,  v2.4h
    uadalp      v17.1d,  v3.2s              //v17 += first-component sum of region 2
    uadalp      v18.1d,  v2.2s              //v18 += second-component sum of region 2

    smov        x10, v17.s[0]               //first-component sum
    smov        x11, v18.s[0]               //second-component sum

    add         x10,x10,x4                  //+ nt (rounding term)
    add         x11,x11,x4
    lsr         x10,x10,x12                 //>> (log2(nt) + 1)
    lsr         x11,x11,x12
    orr         x10,x10,x11,lsl #8          //halfword = first | (second << 8): first byte comes first in memory
    dup         v0.4h,w10                   //4 identical pairs = one 8-byte row

    st1         {v0.8b},[x2],x3
    st1         {v0.8b},[x2],x3
    st1         {v0.8b},[x2],x3
    st1         {v0.8b},[x2]

end_func:
    ldp         x19, x20,[sp],#16           //undo the entry stp
    pop_v_regs                              //macro from ihevc_neon_macros.s, counterpart of push_v_regs
    ret



