//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
///**
//*******************************************************************************
//*
//* @brief
//*  Interprediction luma function for copy
//*
//* @par Description:
//*  Copies the array of width 'wd' and height 'ht' from the location pointed
//*  by 'src' to the location pointed by 'dst'
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_inter_pred_luma_copy (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd   )

//**************Variables Vs
// Registers *******************************************************************
//  x0 => *pu1_src
//  x1 => *pu1_dst
//  w2 => src_strd
//  w3 => dst_strd
//  w4 => ht
//  w5 => wd
//
// ih264_inter_pred_luma_copy_av8 (AArch64, GNU as syntax)
//  Copies a block of wd x ht bytes from pu1_src to pu1_dst.
//
// Working registers:
//  x7      rows still to be copied (drops by 4 per band of rows)
//  x12     wd
//  x11     wd minus the column width; rewinds the pointers to the next band
//  x4      bytes still to be copied across the current band
//  x5, x6  source / destination pointers walking rows 1..3 of a column
//  v0-v6   pixel data
//  Condition flags are modified.
//
// Behaviour as implemented:
//  - ht <= 0 returns without copying anything; wd <= 0 never enters an
//    inner loop, so nothing is copied either.
//  - Rows are handled in bands of four, so ht is effectively rounded up to
//    a multiple of 4.
//  - wd % 16 == 0 selects 16-byte columns, otherwise wd % 8 == 0 selects
//    8-byte columns, otherwise 4-byte columns are used and wd is effectively
//    rounded up to a multiple of 4.
//  NOTE(review): callers presumably pass only wd/ht that are multiples of 4;
//  confirm, because anything else writes past the requested block.
//  NOTE(review): push_v_regs/pop_v_regs are defined in ih264_neon_macros.s,
//  which is not visible here. This routine writes only v0-v6, so if the
//  macros merely spill callee-saved SIMD registers they could be dropped;
//  confirm before removing.
//
// x19/x20 used to be spilled here but are never referenced, so the spill has
// been removed. All exits now share a single epilogue.

.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_inter_pred_luma_copy_av8

ih264_inter_pred_luma_copy_av8:

    push_v_regs
    sxtw      x2, w2                    // widen the 32-bit strides and sizes
    sxtw      x3, w3
    sxtw      x4, w4
    sxtw      x5, w5

    mov       x12, x5                   // x12 = wd (x5 is reused as a pointer)
    mov       x7, x4                    // x7  = ht (x4 is reused as a counter)
    cmp       x7, #0                    // nothing to do when ht <= 0
    b.le      .Lluma_copy_done
    tst       x12, #15                  // wd a multiple of 16?
    b.eq      .Lluma_copy_wd16
    tst       x12, #7                   // wd a multiple of 8?
    b.eq      .Lluma_copy_wd8
    sub       x11, x12, #4              // rewind amount for 4-byte columns

// ---- 4-byte columns, four rows per pass ------------------------------------
.Lluma_copy_wd4_band:
    subs      x4, x12, #0               // x4 = wd; skip the band when wd <= 0
    b.le      .Lluma_copy_wd4_band_end

.Lluma_copy_wd4_col:
    ld1       {v0.s}[0], [x0]           // row 0: load 4 bytes
    add       x5, x0, x2                // x5 = source, row 1 of this column
    add       x6, x1, x3                // x6 = destination, row 1 of this column
    st1       {v0.s}[0], [x1]           // row 0: store
    ld1       {v0.s}[0], [x5], x2       // row 1
    add       x0, x0, #4                // pu1_src += 4 (next column)
    st1       {v0.s}[0], [x6], x3
    ld1       {v0.s}[0], [x5], x2       // row 2
    subs      x4, x4, #4                // 4 fewer bytes left in this band
    st1       {v0.s}[0], [x6], x3
    ld1       {v0.s}[0], [x5], x2       // row 3
    add       x1, x1, #4                // pu1_dst += 4 (next column)
    st1       {v0.s}[0], [x6], x3

    b.gt      .Lluma_copy_wd4_col       // flags still from the subs above

.Lluma_copy_wd4_band_end:
    subs      x7, x7, #4                // four rows done
    sub       x0, x5, x11               // x5/x6 sit 4 rows down in the last
    sub       x1, x6, x11               // column: step back to column 0
    b.gt      .Lluma_copy_wd4_band

.Lluma_copy_done:
    pop_v_regs
    ret


// ---- 8-byte columns, four rows per pass ------------------------------------
.Lluma_copy_wd8:
    sub       x11, x12, #8              // rewind amount for 8-byte columns

.Lluma_copy_wd8_band:
    subs      x4, x12, #0               // x4 = wd; skip the band when wd <= 0
    b.le      .Lluma_copy_wd8_band_end

.Lluma_copy_wd8_col:
    add       x5, x0, x2                // x5 = source, row 1 of this column
    ld1       {v0.8b}, [x0], #8         // row 0: load, advance to next column
    add       x6, x1, x3                // x6 = destination, row 1 of this column
    st1       {v0.8b}, [x1], #8         // row 0: store, advance to next column
    ld1       {v1.8b}, [x5], x2         // row 1
    st1       {v1.8b}, [x6], x3
    subs      x4, x4, #8                // 8 fewer bytes left in this band
    ld1       {v2.8b}, [x5], x2         // row 2
    st1       {v2.8b}, [x6], x3
    ld1       {v3.8b}, [x5], x2         // row 3
    st1       {v3.8b}, [x6], x3
    b.gt      .Lluma_copy_wd8_col       // flags still from the subs above

.Lluma_copy_wd8_band_end:
    subs      x7, x7, #4                // four rows done
    sub       x0, x5, x11               // back to column 0, four rows down
    sub       x1, x6, x11
    b.gt      .Lluma_copy_wd8_band
    b         .Lluma_copy_done


// ---- 16-byte columns, four rows per pass -----------------------------------
.Lluma_copy_wd16:
    sub       x11, x12, #16             // rewind amount for 16-byte columns

.Lluma_copy_wd16_band:
    subs      x4, x12, #0               // x4 = wd; skip the band when wd <= 0
    b.le      .Lluma_copy_wd16_band_end

.Lluma_copy_wd16_col:
    add       x5, x0, x2                // x5 = source, row 1 of this column
    ld1       {v0.16b}, [x0], #16       // row 0: load, advance to next column
    add       x6, x1, x3                // x6 = destination, row 1 of this column
    st1       {v0.16b}, [x1], #16       // row 0: store, advance to next column
    ld1       {v2.16b}, [x5], x2        // row 1
    st1       {v2.16b}, [x6], x3
    subs      x4, x4, #16               // 16 fewer bytes left in this band
    ld1       {v4.16b}, [x5], x2        // row 2
    st1       {v4.16b}, [x6], x3
    ld1       {v6.16b}, [x5], x2        // row 3
    st1       {v6.16b}, [x6], x3
    b.gt      .Lluma_copy_wd16_col      // flags still from the subs above

.Lluma_copy_wd16_band_end:
    subs      x7, x7, #4                // four rows done
    sub       x0, x5, x11               // back to column 0, four rows down
    sub       x1, x6, x11
    b.gt      .Lluma_copy_wd16_band
    b         .Lluma_copy_done


// *******************************************************************************
// *
// * @brief Copies a 4x4 block into an interleaved destination
// *
// * @par Description:
// *  Reads four rows of 8 bytes from the source and four rows of 8 bytes from
// *  the destination. In every row the bytes at even offsets (0, 2, 4, 6) of
// *  the destination are replaced by the source bytes at the same offsets;
// *  the bytes at odd offsets keep their previous destination value. The
// *  original author describes both buffers as interleaved.
// *
// * @param[in] pi2_src
// *  Source
// *
// * @param[in,out] pu1_out
// *  Output pointer (read, merged, then written back)
// *
// * @param[in] pred_strd
// *  Prediction (source) buffer stride; applied as a byte offset between rows
// *
// * @param[in] out_strd
// *  Output buffer stride; applied as a byte offset between rows
// *
// * @returns none
// *
// * @remarks
// *  wd and ht are not read: a 4x4 block is always copied.
// *
// *******************************************************************************
// void ih264_interleave_copy(WORD16 *pi2_src,
//                            UWORD8 *pu1_out,
//                            WORD32 pred_strd,
//                            WORD32 out_strd,
//                            WORD32 wd,
//                            WORD32 ht)
//
// Register usage
//  x0 : pi2_src on entry; reused as the store pointer after the source reads
//  x1 : pu1_out; advanced while the destination rows are loaded
//  w2 : pred_strd (sign-extended into x2)
//  w3 : out_strd  (sign-extended into x3)
//  v2-v5   : source rows, packed two rows per 128-bit register in v2 / v4
//  v18-v21 : destination rows, packed two rows per register in v18 / v20
//  v30     : 0x00ff in every halfword; selects the even-offset byte of a pair
//
// NOTE(review): the original header said "No need for pushing arm and neon
// registers", yet the routine calls push_v_regs/pop_v_regs (defined in
// ih264_neon_macros.s, not visible here). Only v2-v5, v18-v21 and v30 are
// written; confirm whether the macro pair is actually required.

    .global ih264_interleave_copy_av8
ih264_interleave_copy_av8:
    push_v_regs
    movi      v30.8h, #0x00ff           // mask: take the low byte of each pair
    sxtw      x2, w2                    // widen the strides to 64 bits
    sxtw      x3, w3

    // Four source rows, 8 bytes each.
    ld1       {v2.8b}, [x0], x2
    ld1       {v3.8b}, [x0], x2
    ld1       {v4.8b}, [x0], x2
    ld1       {v5.8b}, [x0], x2

    mov       x0, x1                    // remember pu1_out for the stores

    // Four destination rows, 8 bytes each.
    ld1       {v18.8b}, [x1], x3
    ld1       {v19.8b}, [x1], x3
    ld1       {v20.8b}, [x1], x3
    ld1       {v21.8b}, [x1], x3

    // Pack row pairs: rows 0|1 in v2 / v18, rows 2|3 in v4 / v20.
    ins       v2.d[1], v3.d[0]
    ins       v4.d[1], v5.d[0]
    ins       v18.d[1], v19.d[0]
    ins       v20.d[1], v21.d[0]

    // Where the mask is set take the source bits, elsewhere keep the output.
    bit       v18.16b, v2.16b, v30.16b
    bit       v20.16b, v4.16b, v30.16b

    // Write the merged rows back, one 8-byte row per store.
    st1       {v18.8b}, [x0], x3        // row 0
    st1       {v18.d}[1], [x0], x3      // row 1
    st1       {v20.8b}, [x0], x3        // row 2
    st1       {v20.d}[1], [x0], x3      // row 3

    pop_v_regs
    ret

