///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_luma_mode2_neon.s
//*
//* @brief
//*  contains the function definition for luma intra prediction, mode 2.
//*  the function is hand written aarch64 neon assembly (gnu assembler
//*  syntax).
//*
//* @author
//*  yogeswaran rs
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_mode2_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  luma intra prediction for mode 2
//*
//* @par description:
//*  every output sample is a plain copy of one reference sample:
//*
//*      pu1_dst[row * dst_strd + col] = pu1_ref[2 * nt - 2 - row - col]
//*
//*  for row, col in [0, nt). no filtering or arithmetic is applied to the
//*  sample values.
//*
//*  nt == 4 is handled by a dedicated path. every other size is processed
//*  as 8x8 tiles, left to right inside a band of 8 rows, bands top to
//*  bottom, so nt is assumed to be 4 or a multiple of 8; other values are
//*  not handled.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the reference samples
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride, in bytes
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  prediction mode (not read by this function)
//*
//* @returns
//*  none
//*
//* @remarks
//*  the nt == 4 path loads 8 bytes per row although only 4 are stored; the
//*  lowest address it reads is pu1_ref - 4.
//*  NOTE(review): callers presumably guarantee those bytes are readable -
//*  confirm against the reference buffer layout.
//*
//*******************************************************************************
//*/

//void ihevc_intra_pred_luma_mode2(uword8 *pu1_ref,
//                                 word32 src_strd,
//                                 uword8 *pu1_dst,
//                                 word32 dst_strd,
//                                 word32 nt,
//                                 word32 mode)
//
//**************variables vs registers*****************************************
//x0  => *pu1_ref  (later: source pointer for even rows of a tile)
//x1  => src_strd  (value unused; register reused as remaining-work counter)
//x2  => *pu1_dst  (later: destination of the next tile)
//x3  => dst_strd
//x4  => nt
//x5  => mode      (value unused; register reused as 4 * dst_strd / row 1 ptr)
//
//x6, x7, x9, x14  => destination pointers for rows 0..3 of the current tile
//x8               => -2, post-index step of the source pointers
//x10              => source pointer for odd rows of a tile (x0 - 1)
//x11, x12         => per-band column counters (store side / load side)
//x20              => scratch for the conditional selects (callee saved)

.text
.align 4
.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_luma_mode2_av8

.type ihevc_intra_pred_luma_mode2_av8, %function

ihevc_intra_pred_luma_mode2_av8:

    // x20 is used as scratch below and is callee saved, so the pair is
    // pushed here and popped at end_func
    stp         x19, x20,[sp,#-16]!

    // dst_strd and nt are 32 bit arguments. aapcs64 leaves bits [63:32] of
    // their registers unspecified, and both are used in 64 bit arithmetic
    // below, so widen them first.
    sxtw        x3,w3
    sxtw        x4,w4

    mov         x8,#-2                      //each source pointer steps back 2 bytes per load

    cmp         x4,#4
    beq         mode2_4

    add         x0,x0,x4,lsl #1             //pu1_ref + 2 * nt

    sub         x0,x0,#9                    //&pu1_ref[2 * nt - 9]: the 8 bytes ending at ref[2nt - 2] (row 0)
    sub         x10,x0,#1                   //same window one byte lower (row 1)

prologue_cpy_32:

    // load and byte-reverse the first 8x8 tile while setting up the
    // destination pointers and counters. x0 feeds rows 0,2,4,6 and x10
    // feeds rows 1,3,5,7.
    ld1         {v0.8b},[x0],x8
    mov         x11,x4                      //x11 = columns left in the current band (store side)

    ld1         {v1.8b},[x10],x8
    mov         x6, x2                      //row 0 of the tile

    ld1         {v2.8b},[x0],x8
    ld1         {v3.8b},[x10],x8
    lsr         x1, x4, #3                  //nt / 8

    ld1         {v4.8b},[x0],x8
    ld1         {v5.8b},[x10],x8
    ld1         {v6.8b},[x0],x8
    mul         x1, x4, x1                  //x1 = nt * nt / 8 = 8 * number of tiles

    ld1         {v7.8b},[x10],x8
    add         x7,x6,x3                    //row 1 of the tile

    rev64       v16.8b, v0.8b               //reverse so ref[2nt - 2] lands in column 0
    rev64       v17.8b, v1.8b
    lsl         x5, x3, #2                  //x5 = 4 * dst_strd, post-index step of the stores

    rev64       v18.8b, v2.8b
    rev64       v19.8b, v3.8b
    add         x9,x7,x3                    //row 2 of the tile

    rev64       v20.8b, v4.8b
    subs        x1,x1,#8                    //one tile accounted for

    rev64       v21.8b, v5.8b
    rev64       v22.8b, v6.8b
    rev64       v23.8b, v7.8b
    add         x14,x9,x3                   //row 3 of the tile

    beq         epilogue_mode2              //nt == 8: a single tile, just store it

    sub         x12,x4,#8                   //x12 = columns left in the band on the load side (one tile ahead)

kernel_mode2:

    // software pipelined: store the tile held in v16-v23 while loading and
    // reversing the next one. the flags set by "subs x11" stay live until
    // "subs x12", those until "subs x1"; none of the st1/ld1/add/sub/mov/
    // csel/rev64 in between write the flags.
    st1         {v16.8b},[x6],x5            //row 0, x6 -> row 4
    st1         {v17.8b},[x7],x5            //row 1, x7 -> row 5
    subs        x11,x11,#8                  //gt: more tiles in this band, le: band finished

    st1         {v18.8b},[x9],x5            //row 2, x9 -> row 6
    add         x20,x2,#8
    csel        x2, x20, x2,gt              //gt: next tile is 8 columns to the right

    st1         {v19.8b},[x14],x5           //row 3, x14 -> row 7
    st1         {v20.8b},[x6],x5            //row 4, x6 -> 8 rows below the tile start
    csel        x11, x4, x11,le             //le: reload the column counter for the next band

    st1         {v21.8b},[x7],x5            //row 5
    st1         {v22.8b},[x9],x5            //row 6
    add         x20, x2, x3, lsl #2
    csel        x2, x20, x2,le              //le: overwritten by the select on x6 - x14 below

    st1         {v23.8b},[x14],x5           //row 7, last use of x14 as a pointer for this tile
    ld1         {v0.8b},[x0],x8
    sub         x14,x4,#8                   //x14 = nt - 8 (scratch until recomputed below)

    ld1         {v1.8b},[x10],x8
    ld1         {v2.8b},[x0],x8
    add         x20, x2, #8
    csel        x2, x20, x2,le              //le: overwritten by the next select

    ld1         {v3.8b},[x10],x8
    ld1         {v4.8b},[x0],x8
    sub         x20, x6, x14                //x6 is 8 rows below the last tile of the band
    csel        x2, x20, x2,le              //le: x2 = first column of the next band of 8 rows

    ld1         {v5.8b},[x10],x8
    subs        x12,x12,#8                  //le: the tile being loaded is the last one of its band

    ld1         {v6.8b},[x0],x8
    mov         x6, x2                      //row 0 of the next tile

    ld1         {v7.8b},[x10],x8
    add         x20, x0, x4
    csel        x0, x20, x0,le              //le: rewind the source by nt ...

    rev64       v16.8b, v0.8b
    add         x7, x6, x3                  //row 1 of the next tile

    rev64       v17.8b, v1.8b
    sub         x20, x0, #8
    csel        x0, x20, x0,le              //... minus 8: the next band starts 8 bytes lower than this one did

    rev64       v18.8b, v2.8b
    csel        x12, x4, x12,le             //le: reload the load-side column counter

    rev64       v19.8b, v3.8b
    add         x9, x7, x3                  //row 2 of the next tile

    rev64       v20.8b, v4.8b
    sub         x10,x0,#1                   //odd-row source pointer follows x0

    rev64       v21.8b, v5.8b
    subs        x1, x1, #8                  //one more tile accounted for

    rev64       v22.8b, v6.8b
    add         x14, x9, x3                 //row 3 of the next tile

    rev64       v23.8b, v7.8b

    bne         kernel_mode2

epilogue_mode2:

    // store the last tile; nothing further is loaded
    st1         {v16.8b},[x6],x5
    st1         {v17.8b},[x7],x5
    st1         {v18.8b},[x9],x5
    st1         {v19.8b},[x14],x5
    st1         {v20.8b},[x6],x5
    st1         {v21.8b},[x7],x5
    st1         {v22.8b},[x9],x5
    st1         {v23.8b},[x14],x5

    b           end_func

mode2_4:

    // nt == 4: x0 still holds pu1_ref here. each row loads 8 bytes, reverses
    // them and stores only the low 4 bytes of the reversed vector.
    mov         x8,#-2                      //already -2 on entry; repeated so this path reads standalone
    sub         x0,x0,#1                    //&pu1_ref[-1]: reversed, lane 0 is ref[6] (row 0)
    sub         x10,x0,#1                   //&pu1_ref[-2]: reversed, lane 0 is ref[5] (row 1)

    ld1         {v0.8b},[x0],x8
    add         x5,x2,x3                    //row 1 of pu1_dst
    ld1         {v2.8b},[x10],x8
    add         x6,x5,x3                    //row 2 of pu1_dst
    ld1         {v4.8b},[x0]                //from &pu1_ref[-3] (row 2)
    add         x7,x6,x3                    //row 3 of pu1_dst
    ld1         {v6.8b},[x10]               //from &pu1_ref[-4] (row 3)

    rev64       v1.8b, v0.8b
    rev64       v3.8b, v2.8b



    st1         {v1.s}[0],[x2]
    rev64       v5.8b, v4.8b
    st1         {v3.s}[0],[x5]
    rev64       v7.8b, v6.8b
    st1         {v5.s}[0],[x6]
    st1         {v7.s}[0],[x7]

end_func:
    // restore the callee saved pair pushed on entry
    ldp         x19, x20,[sp],#16

    ret