@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@**
@*******************************************************************************
@*
@* @brief
@*     Inter prediction luma function for copy
@*
@* @par Description:
@*     Copies the array of width 'wd' and height 'ht' from the location pointed
@*     to by 'src' to the location pointed to by 'dst'.
@*
@*     The copy works on bands of 4 rows. Within a band the row is walked in
@*     chunks of 16 bytes when wd is a multiple of 16, in chunks of 8 bytes
@*     when wd is a multiple of 8, and in chunks of 4 bytes otherwise.
@*     Returns without copying anything when ht <= 0 or wd <= 0.
@*
@* @param[in] pu1_src
@*     UWORD8 pointer to the source
@*
@* @param[out] pu1_dst
@*     UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*     integer source stride
@*
@* @param[in] dst_strd
@*     integer destination stride
@*
@* @param[in] ht
@*     integer height of the array (consumed 4 rows at a time)
@*
@* @param[in] wd
@*     integer width of the array (consumed 4, 8 or 16 bytes at a time)
@*
@* @returns
@*     None
@*
@* @remarks
@*     Only q0-q3 (d0-d7) are written, so the NEON registers d8-d15 are left
@*     untouched and need no save/restore.
@*
@*******************************************************************************
@*
@void ih264_inter_pred_luma_copy (
@            UWORD8 *pu1_src,
@            UWORD8 *pu1_dst,
@            WORD32 src_strd,
@            WORD32 dst_strd,
@            WORD32 ht,
@            WORD32 wd   )

@**************Variables Vs Registers*****************************************
@   r0  => *pu1_src
@   r1  => *pu1_dst
@   r2  => src_strd
@   r3  => dst_strd
@   r4  => bytes of the current row band still to be copied
@   r5  => pu1_src_tmp (walks down the 4 rows of the current chunk)
@   r6  => pu1_dst_tmp (walks down the 4 rows of the current chunk)
@   r7  => ht
@   r11 => wd - chunk size (used to rewind to the start of the next band)
@   r12 => wd

.text
.p2align 2

    .global ih264_inter_pred_luma_copy_a9q

ih264_inter_pred_luma_copy_a9q:
    stmfd         sp!, {r4-r12, r14}    @save ARM registers (10 words = 40 bytes)
    ldr           r12, [sp, #44]        @loads wd (second stack argument, above the 40 saved bytes)
    ldr           r7, [sp, #40]         @loads ht (first stack argument)
    cmp           r7, #0                @checks ht <= 0
    ble           end_loops
    cmp           r12, #0               @checks wd <= 0, nothing to copy
    ble           end_loops
    tst           r12, #15              @checks wd for a multiple of 16
    beq           core_loop_wd_16
    tst           r12, #7               @checks wd for a multiple of 8
    beq           core_loop_wd_8
    sub           r11, r12, #4          @r11 = wd - 4

outer_loop_wd_4:
    mov           r4, r12               @r4 = wd, bytes left in this band (wd > 0 checked on entry)

inner_loop_wd_4:
    vld1.32       {d0[0]}, [r0]         @row 0: load 4 bytes
    add           r5, r0, r2            @pu1_src_tmp = pu1_src + src_strd
    add           r6, r1, r3            @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.32       {d0[0]}, [r1]         @row 0: store 4 bytes
    vld1.32       {d0[0]}, [r5], r2     @row 1: load, pu1_src_tmp += src_strd
    add           r0, r0, #4            @pu1_src += 4
    vst1.32       {d0[0]}, [r6], r3     @row 1: store, pu1_dst_tmp += dst_strd
    vld1.32       {d0[0]}, [r5], r2     @row 2: load
    subs          r4, r4, #4            @(wd - 4), flags consumed by the bgt below
    vst1.32       {d0[0]}, [r6], r3     @row 2: store
    vld1.32       {d0[0]}, [r5], r2     @row 3: load
    add           r1, r1, #4            @pu1_dst += 4
    vst1.32       {d0[0]}, [r6], r3     @row 3: store

    bgt           inner_loop_wd_4

    subs          r7, r7, #4            @ht -= 4
    sub           r0, r5, r11           @pu1_src = start of the next 4-row band
    sub           r1, r6, r11           @pu1_dst = start of the next 4-row band
    bgt           outer_loop_wd_4

end_loops:
    ldmfd         sp!, {r4-r12, r15}    @restore ARM registers and return (pc <- saved lr)



core_loop_wd_8:
    sub           r11, r12, #8          @r11 = wd - 8

outer_loop_wd_8:
    mov           r4, r12               @r4 = wd, bytes left in this band

inner_loop_wd_8:
    add           r5, r0, r2            @pu1_src_tmp = pu1_src + src_strd
    vld1.8        {d0}, [r0]!           @row 0: load 8 bytes, pu1_src += 8
    add           r6, r1, r3            @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.8        {d0}, [r1]!           @row 0: store 8 bytes, pu1_dst += 8
    vld1.8        {d1}, [r5], r2        @row 1: load
    vst1.8        {d1}, [r6], r3        @row 1: store
    subs          r4, r4, #8            @wd - 8 (loop condition)
    vld1.8        {d2}, [r5], r2        @row 2: load
    vst1.8        {d2}, [r6], r3        @row 2: store
    vld1.8        {d3}, [r5], r2        @row 3: load
    vst1.8        {d3}, [r6], r3        @row 3: store
    bgt           inner_loop_wd_8

    subs          r7, r7, #4            @ht -= 4
    sub           r0, r5, r11           @pu1_src = start of the next 4-row band
    sub           r1, r6, r11           @pu1_dst = start of the next 4-row band
    bgt           outer_loop_wd_8

    ldmfd         sp!, {r4-r12, r15}    @restore ARM registers and return

core_loop_wd_16:
    sub           r11, r12, #16         @r11 = wd - 16

outer_loop_wd_16:
    mov           r4, r12               @r4 = wd, bytes left in this band

inner_loop_wd_16:
    add           r5, r0, r2            @pu1_src_tmp = pu1_src + src_strd
    vld1.8        {q0}, [r0]!           @row 0: load 16 bytes, pu1_src += 16
    add           r6, r1, r3            @pu1_dst_tmp = pu1_dst + dst_strd
    vst1.8        {q0}, [r1]!           @row 0: store 16 bytes, pu1_dst += 16
    vld1.8        {q1}, [r5], r2        @row 1: load
    vst1.8        {q1}, [r6], r3        @row 1: store
    subs          r4, r4, #16           @wd - 16 (loop condition)
    vld1.8        {q2}, [r5], r2        @row 2: load
    vst1.8        {q2}, [r6], r3        @row 2: store
    vld1.8        {q3}, [r5], r2        @row 3: load
    vst1.8        {q3}, [r6], r3        @row 3: store
    bgt           inner_loop_wd_16

    subs          r7, r7, #4            @ht -= 4
    sub           r0, r5, r11           @pu1_src = start of the next 4-row band
    sub           r1, r6, r11           @pu1_dst = start of the next 4-row band
    bgt           outer_loop_wd_16

    ldmfd         sp!, {r4-r12, r15}    @restore ARM registers and return


@ *
@ ********************************************************************************
@ *
@ * @brief This function copies a 4x4 block to destination
@ *
@ * @par Description:
@ *   Copies a 4x4 block to destination, where both src and dst are interleaved.
@ *   Four rows of 8 bytes are read from the source and from the output; in each
@ *   row the even-indexed bytes of the output are replaced by the source bytes
@ *   and the odd-indexed bytes of the output are written back unchanged.
@ *
@ * @param[in] pi2_src
@ *   Source
@ *
@ * @param[in] pu1_out
@ *   Output pointer
@ *
@ * @param[in] pred_strd
@ *   Prediction buffer stride
@ *
@ * @param[in] out_strd
@ *   Output buffer stride
@ *
@ * @returns none
@ *
@ * @remarks
@ *   Currently wd and ht are not used, i.e. a 4x4 block is always copied
@ *
@ *******************************************************************************
@ *
@ void ih264_interleave_copy(WORD16 *pi2_src,
@                            UWORD8 *pu1_out,
@                            WORD32 pred_strd,
@                            WORD32 out_strd,
@                            WORD32 wd,
@                            WORD32 ht)
@ Register Usage
@ r0 : pi2_src (reused as the store pointer after the source rows are loaded)
@ r1 : pu1_out
@ r2 : src_strd
@ r3 : out_strd
@ Neon registers d2-d5 (q1-q2), d18-d21 (q9-q10) and q15 (d30-d31) are used
@ No ARM or NEON register is pushed: d8-d15 and r4-r11 are not touched

    .global ih264_interleave_copy_a9
ih264_interleave_copy_a9:

    vld1.u8       {d2}, [r0], r2        @load 4 source rows of 8 bytes into q1, q2
    vld1.u8       {d3}, [r0], r2
    vld1.u8       {d4}, [r0], r2
    vld1.u8       {d5}, [r0], r2

    mov           r0, r1                @keep the start of the output block for the stores

    vld1.u8       {d18}, [r1], r3       @load 4 output rows of 8 bytes into q9, q10
    vld1.u8       {d19}, [r1], r3
    vmov.u16      q15, #0x00ff          @mask: low byte of every 16-bit lane
    vld1.u8       {d20}, [r1], r3
    vld1.u8       {d21}, [r1], r3

    vbit.u8       q9, q1, q15           @insert source bytes where the mask is set, keep the rest of out
    vbit.u8       q10, q2, q15

    vst1.u8       {d18}, [r0], r3       @store the 4 merged rows back to out
    vst1.u8       {d19}, [r0], r3
    vst1.u8       {d20}, [r0], r3
    vst1.u8       {d21}, [r0], r3

    bx            lr


