///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode2_neon.s
//*
//* @brief
//*  contains the function definition for chroma intra prediction, mode 2.
//*  the function is written in armv8 (aarch64) neon assembly, gnu assembler
//*  syntax.
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_mode2_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  chroma intra prediction for mode 2 on interleaved (2 bytes per sample
//*  position) data
//*
//* @par description:
//*  every destination row is a copy of a run of 2-byte pairs taken from
//*  pu1_ref with the order of the pairs reversed (the two bytes inside a pair
//*  keep their order). row 0 starts at pu1_ref[4 * nt - 4] and every following
//*  row starts one pair (2 bytes) lower:
//*
//*      pu1_dst[row * dst_strd + 2 * col + k] =
//*              pu1_ref[4 * nt - 4 - 2 * row - 2 * col + k],    k = 0, 1
//*
//* @param[in] pu1_ref   (x0)
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd  (x1)
//*  integer source stride; never read, x1 is reused as a scratch counter
//*
//* @param[out] pu1_dst  (x2)
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd  (x3)
//*  integer destination stride in bytes
//*
//* @param[in] nt        (x4)
//*  size of transform block. nt == 4 takes the dedicated 4x4 path; any other
//*  value takes the path that works on blocks of 8 rows x 8 pairs
//*
//* @param[in] mode      (x5)
//*  prediction mode; never read, x5 is reused to hold 4 * dst_strd
//*
//* @returns
//*  none
//*
//* @remarks
//*  - all six arguments arrive in registers (x0 - x5); nothing is read from
//*    the stack.
//*  - NOTE(review): on the 8x8-block path the destination rewind at the end
//*    of a row of blocks is hard-coded to 16 bytes, which matches nt == 16
//*    (nt == 8 never enters the loop). larger nt would need a different
//*    rewind - confirm that callers never pass nt > 16.
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_chroma_mode2_av8(uword8 *pu1_ref,
//                                       word32 src_strd,
//                                       uword8 *pu1_dst,
//                                       word32 dst_strd,
//                                       word32 nt,
//                                       word32 mode)
//
//**************variables vs registers*****************************************
//x0  => *pu1_ref, then running source pointer for rows 0, 2, 4, 6 of a block
//x1  => src_strd on entry (unused); then count of remaining work (nt * nt / 8,
//       decremented by 8 per 8x8 block)
//x2  => *pu1_dst, then destination of the next 8x8 block
//x3  => dst_strd (sign-extended on entry)
//x4  => nt (sign-extended on entry)
//x5  => mode on entry (unused); then 4 * dst_strd
//x6, x7, x9, x14 => destination pointers for rows 0/4, 1/5, 2/6, 3/7
//x8  => -4, post-index step for the source loads
//x10 => running source pointer for rows 1, 3, 5, 7 of a block (x0 - 2)
//x11 => columns left in the current row of blocks (destination side)
//x12 => columns left in the current row of blocks (source side)
//x20 => scratch for the conditional pointer updates (callee-saved, so it is
//       spilled together with x19 to keep sp 16-byte aligned)

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_chroma_mode2_av8

.type ihevc_intra_pred_chroma_mode2_av8, %function

ihevc_intra_pred_chroma_mode2_av8:

    // v8 - v15 are written below, so the callee-saved vector registers must be
    // preserved. push_v_regs / pop_v_regs come from ihevc_neon_macros.s
    // (presumably they spill / reload d8 - d15 - confirm against the macro file)
    push_v_regs
    stp         x19, x20, [sp, #-16]!

    // nt and dst_strd are word32 arguments. aapcs64 leaves bits [63:32] of the
    // argument registers unspecified, so widen them before any 64-bit use
    sxtw        x4, w4                      // x4 = (word64)nt
    sxtw        x3, w3                      // x3 = (word64)dst_strd

    mov         x8, #-4                     // source post-index step: 2 rows down = 2 pairs

    cmp         x4, #4
    beq         mode2_4

    add         x0, x0, x4, lsl #2          // x0 = pu1_ref + 4 * nt

    sub         x0, x0, #0x12               // 8 pairs ending at pu1_ref[4 * nt - 3] (row 0)
    sub         x10, x0, #2                 // same for row 1, one pair lower

prologue_cpy_32:

    // load and reverse the first 8x8 block; scalar set-up is interleaved with
    // the vector work
    ld2         {v0.8b, v1.8b}, [x0], x8

    mov         x11, x4                     // x11 = nt, destination column counter
    rev64       v16.8b, v0.8b
    rev64       v17.8b, v1.8b

    ld2         {v2.8b, v3.8b}, [x10], x8
    mov         x6, x2                      // row 0 destination

    ld2         {v4.8b, v5.8b}, [x0], x8
    ld2         {v6.8b, v7.8b}, [x10], x8
    lsr         x1, x4, #3

    ld2         {v8.8b, v9.8b}, [x0], x8
    ld2         {v10.8b, v11.8b}, [x10], x8
    ld2         {v12.8b, v13.8b}, [x0], x8
    mul         x1, x4, x1                  // x1 = nt * (nt >> 3)

    ld2         {v14.8b, v15.8b}, [x10], x8
    add         x7, x6, x3                  // row 1 destination

    rev64       v18.8b, v2.8b
    rev64       v19.8b, v3.8b
    lsl         x5, x3, #2                  // x5 = 4 * dst_strd, store post-index step

    rev64       v20.8b, v4.8b
    rev64       v21.8b, v5.8b
    add         x9, x7, x3                  // row 2 destination

    rev64       v22.8b, v6.8b
    rev64       v23.8b, v7.8b

    rev64       v24.8b, v8.8b
    rev64       v25.8b, v9.8b

    rev64       v26.8b, v10.8b
    subs        x1, x1, #8                  // one block accounted for; sets flags for beq

    rev64       v27.8b, v11.8b

    rev64       v28.8b, v12.8b
    rev64       v29.8b, v13.8b

    rev64       v30.8b, v14.8b
    add         x14, x9, x3                 // row 3 destination
    rev64       v31.8b, v15.8b

    beq         epilogue_mode2              // only one block (nt == 8): just store it

    sub         x12, x4, #8                 // source column counter, one block ahead of x11

kernel_mode2:

    // store the block prepared by the previous iteration while loading and
    // reversing the next one. the csel instructions depend on the flags of the
    // closest preceding subs, so the instruction order must not be changed
    st2         {v16.8b, v17.8b}, [x6], x5
    st2         {v18.8b, v19.8b}, [x7], x5
    subs        x11, x11, #8                // gt: more columns in this row of blocks
    st2         {v20.8b, v21.8b}, [x9], x5
    st2         {v22.8b, v23.8b}, [x14], x5
    st2         {v24.8b, v25.8b}, [x6], x5
    add         x20, x2, #16
    csel        x2, x20, x2, gt             // gt: next block is 16 bytes to the right
    st2         {v26.8b, v27.8b}, [x7], x5
    st2         {v28.8b, v29.8b}, [x9], x5
    st2         {v30.8b, v31.8b}, [x14], x5

    ld2         {v0.8b, v1.8b}, [x0], x8
    csel        x11, x4, x11, le            // le: row of blocks finished, reload counter

    ld2         {v2.8b, v3.8b}, [x10], x8
    ld2         {v4.8b, v5.8b}, [x0], x8
    // NOTE(review): the result of this add/csel pair is overwritten by the
    // "sub x20, x6, #16 / csel" pair below whenever le holds
    add         x20, x2, x3, lsl #2
    csel        x2, x20, x2, le
    ld2         {v6.8b, v7.8b}, [x10], x8
    rev64       v16.8b, v0.8b

    ld2         {v8.8b, v9.8b}, [x0], x8
    ld2         {v10.8b, v11.8b}, [x10], x8
    sub         x20, x6, #16                // x6 is now 8 rows below the block just stored
    csel        x2, x20, x2, le             // le: next block = 8 rows down, 16 bytes left
    ld2         {v12.8b, v13.8b}, [x0], x8
    rev64       v17.8b, v1.8b
    ld2         {v14.8b, v15.8b}, [x10], x8

    subs        x12, x12, #8                // le: the block after the one just loaded
                                            //     starts a new row of blocks
    mov         x6, x2
    add         x20, x0, x4, lsl #1
    csel        x0, x20, x0, le             // le: x0 += 2 * nt ...
    add         x7, x6, x3

    rev64       v18.8b, v2.8b
    sub         x20, x0, #16
    csel        x0, x20, x0, le             // le: ... then x0 -= 16
    rev64       v19.8b, v3.8b

    rev64       v20.8b, v4.8b
    csel        x12, x4, x12, le            // le: reload the source column counter
    rev64       v21.8b, v5.8b

    rev64       v22.8b, v6.8b
    add         x9, x7, x3
    rev64       v23.8b, v7.8b

    rev64       v24.8b, v8.8b
    sub         x10, x0, #2                 // odd-row source pointer follows x0
    rev64       v25.8b, v9.8b

    rev64       v26.8b, v10.8b
    subs        x1, x1, #8                  // blocks left; sets flags for bne
    rev64       v27.8b, v11.8b

    rev64       v28.8b, v12.8b
    rev64       v29.8b, v13.8b

    rev64       v30.8b, v14.8b
    add         x14, x9, x3
    rev64       v31.8b, v15.8b

    bne         kernel_mode2

epilogue_mode2:

    // store the last prepared block
    st2         {v16.8b, v17.8b}, [x6], x5
    st2         {v18.8b, v19.8b}, [x7], x5
    st2         {v20.8b, v21.8b}, [x9], x5
    st2         {v22.8b, v23.8b}, [x14], x5
    st2         {v24.8b, v25.8b}, [x6], x5
    st2         {v26.8b, v27.8b}, [x7], x5
    st2         {v28.8b, v29.8b}, [x9], x5
    st2         {v30.8b, v31.8b}, [x14], x5

    b           end_func

mode2_4:

    // 4x4 block: each row is 4 pairs = 8 bytes.
    // per row: ld2 de-interleaves 8 pairs, shl #32 moves the 4 lowest bytes of
    // each plane to the top of the 64-bit lane, rev64 reverses them into the
    // low 4 bytes, zip1 re-interleaves those low 4 bytes into 8 output bytes
    lsl         x12, x4, #1
    add         x0, x0, x12
    sub         x0, x0, #2                  // x0 = pu1_ref + 2 * nt - 2 (row 0 source)

    ld2         {v12.8b, v13.8b}, [x0], x8
    shl         d0, d12, #32
    add         x10, x0, #2                 // x0 already stepped by -4: row 1 source
    shl         d1, d13, #32

    rev64       v0.8b, v0.8b
    ld2         {v14.8b, v15.8b}, [x10], x8
    shl         d2, d14, #32

    rev64       v1.8b, v1.8b
    shl         d3, d15, #32
    zip1        v0.8b, v0.8b, v1.8b
    st1         {v0.8b}, [x2], x3           // row 0

    rev64       v2.8b, v2.8b
    ld2         {v16.8b, v17.8b}, [x0], x8
    shl         d4, d16, #32
    rev64       v3.8b, v3.8b
    shl         d5, d17, #32
    zip1        v2.8b, v2.8b, v3.8b
    rev64       v4.8b, v4.8b
    rev64       v5.8b, v5.8b
    st1         {v2.8b}, [x2], x3           // row 1


    ld2         {v18.8b, v19.8b}, [x10], x8
    shl         d6, d18, #32

    zip1        v4.8b, v4.8b, v5.8b
    shl         d7, d19, #32
    rev64       v6.8b, v6.8b
    st1         {v4.8b}, [x2], x3           // row 2

    rev64       v7.8b, v7.8b
    zip1        v6.8b, v6.8b, v7.8b
    st1         {v6.8b}, [x2], x3           // row 3

end_func:
    // restore in the reverse order of the entry sequence
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret
