@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     inter prediction luma function for copy
@*
@* @par description:
@*     copies the array of width 'wd' and height 'ht' from the location pointed
@*     to by 'src' to the location pointed to by 'dst'
@*
@* @param[in] pu1_src
@*     uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*     uword8 pointer to the destination
@*
@* @param[in] src_strd
@*     integer source stride
@*
@* @param[in] dst_strd
@*     integer destination stride
@*
@* @param[in] pi1_coeff
@*     word8 pointer to the filter coefficients (never read by this routine)
@*
@* @param[in] ht
@*     integer height of the array
@*
@* @param[in] wd
@*     integer width of the array
@*
@* @returns
@*     nothing
@*
@* @remarks
@*     - Nothing is copied when ht <= 0 or wd <= 0.
@*     - Rows are processed four at a time, so a height that is not a multiple
@*       of 4 is effectively rounded up to the next multiple of 4.
@*     - wd must be a multiple of 4: columns are copied in chunks of 16, 8 or
@*       4 bytes and the per-band pointer rewind (wd - chunk) is only exact
@*       when wd is a whole number of chunks.
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_copy (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd   )

@**************variables vs registers*****************************************
@   r0  => *pu1_src   (start of the current column chunk, first row of band)
@   r1  => *pu1_dst   (start of the current column chunk, first row of band)
@   r2  => src_strd
@   r3  => dst_strd
@   r4  => columns still to copy in the current 4-row band
@   r5  => source pointer walking rows 1..3 of the band
@   r6  => destination pointer walking rows 1..3 of the band
@   r7  => ht (rows still to copy)
@   r11 => wd - chunk width (rewind distance back to column 0)
@   r12 => wd

@ Offsets of the stack-passed arguments after the prologue below has pushed
@ 10 core registers (40 bytes) and d8-d15 (64 bytes): 40 + 64 = 104.
.equ    coeff_offset,   104             @ pi1_coeff (unused, kept for layout)
.equ    ht_offset,      108             @ ht
.equ    wd_offset,      112             @ wd

.text
.align 4                                @ 2^4-byte alignment (ARM gas semantics)

.globl ihevc_inter_pred_luma_copy_a9q

.type ihevc_inter_pred_luma_copy_a9q, %function

ihevc_inter_pred_luma_copy_a9q:
    stmfd       sp!, {r4-r12, r14}      @ save core registers and return address
    @ NOTE(review): only d0-d7 (q0-q3) are written below, so saving d8-d15
    @ looks unnecessary; it is kept so the *_offset constants stay valid.
    vpush       {d8 - d15}
    ldr         r12, [sp, #wd_offset]   @ r12 = wd
    ldr         r7, [sp, #ht_offset]    @ r7  = ht
    cmp         r7, #0
    ble         .Lcopy_done             @ ht <= 0: nothing to copy
    cmp         r12, #0
    ble         .Lcopy_done             @ wd <= 0: nothing to copy
    tst         r12, #15
    beq         .Lcopy_wd_16            @ wd multiple of 16: 16-byte chunks
    tst         r12, #7
    beq         .Lcopy_wd_8             @ wd multiple of 8: 8-byte chunks

    @------------------------------------------------------------------------
    @ 4-byte chunks, four rows per pass
    @------------------------------------------------------------------------
    sub         r11, r12, #4            @ rewind distance = wd - 4

.Lrow_loop_wd_4:
    mov         r4, r12                 @ r4 = columns left in this band (> 0)

.Lcol_loop_wd_4:
    vld1.32     {d0[0]}, [r0]           @ row 0: load 4 bytes
    add         r5, r0, r2              @ r5 = src, row 1 of this chunk
    add         r6, r1, r3              @ r6 = dst, row 1 of this chunk
    vst1.32     {d0[0]}, [r1]           @ row 0: store
    vld1.32     {d0[0]}, [r5], r2       @ row 1: load, step to row 2
    add         r0, r0, #4              @ src -> next column chunk
    vst1.32     {d0[0]}, [r6], r3       @ row 1: store, step to row 2
    vld1.32     {d0[0]}, [r5], r2       @ row 2: load, step to row 3
    subs        r4, r4, #4              @ columns left -= 4 (flags for bgt)
    vst1.32     {d0[0]}, [r6], r3       @ row 2: store, step to row 3
    vld1.32     {d0[0]}, [r5], r2       @ row 3: load, r5 -> row 4
    add         r1, r1, #4              @ dst -> next column chunk
    vst1.32     {d0[0]}, [r6], r3       @ row 3: store, r6 -> row 4
    bgt         .Lcol_loop_wd_4

    @ r5/r6 now point 4 rows below the last chunk; step back to column 0.
    subs        r7, r7, #4              @ ht -= 4 (flags for bgt)
    sub         r0, r5, r11             @ src = first column of next band
    sub         r1, r6, r11             @ dst = first column of next band
    bgt         .Lrow_loop_wd_4

.Lcopy_done:
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}      @ restore registers and return

    @------------------------------------------------------------------------
    @ 8-byte chunks, four rows per pass
    @------------------------------------------------------------------------
.Lcopy_wd_8:
    sub         r11, r12, #8            @ rewind distance = wd - 8

.Lrow_loop_wd_8:
    mov         r4, r12                 @ r4 = columns left in this band (> 0)

.Lcol_loop_wd_8:
    add         r5, r0, r2              @ r5 = src, row 1 of this chunk
    vld1.8      {d0}, [r0]!             @ row 0: load 8 bytes, src += 8
    add         r6, r1, r3              @ r6 = dst, row 1 of this chunk
    vst1.8      {d0}, [r1]!             @ row 0: store, dst += 8
    vld1.8      {d1}, [r5], r2          @ row 1: load, step to row 2
    vst1.8      {d1}, [r6], r3          @ row 1: store, step to row 2
    subs        r4, r4, #8              @ columns left -= 8 (flags for bgt)
    vld1.8      {d2}, [r5], r2          @ row 2: load, step to row 3
    vst1.8      {d2}, [r6], r3          @ row 2: store, step to row 3
    vld1.8      {d3}, [r5], r2          @ row 3: load, r5 -> row 4
    vst1.8      {d3}, [r6], r3          @ row 3: store, r6 -> row 4
    bgt         .Lcol_loop_wd_8

    subs        r7, r7, #4              @ ht -= 4 (flags for bgt)
    sub         r0, r5, r11             @ src = first column of next band
    sub         r1, r6, r11             @ dst = first column of next band
    bgt         .Lrow_loop_wd_8

    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}      @ restore registers and return

    @------------------------------------------------------------------------
    @ 16-byte chunks, four rows per pass
    @------------------------------------------------------------------------
.Lcopy_wd_16:
    sub         r11, r12, #16           @ rewind distance = wd - 16

.Lrow_loop_wd_16:
    mov         r4, r12                 @ r4 = columns left in this band (> 0)

.Lcol_loop_wd_16:
    add         r5, r0, r2              @ r5 = src, row 1 of this chunk
    vld1.8      {q0}, [r0]!             @ row 0: load 16 bytes, src += 16
    add         r6, r1, r3              @ r6 = dst, row 1 of this chunk
    vst1.8      {q0}, [r1]!             @ row 0: store, dst += 16
    vld1.8      {q1}, [r5], r2          @ row 1: load, step to row 2
    vst1.8      {q1}, [r6], r3          @ row 1: store, step to row 2
    subs        r4, r4, #16             @ columns left -= 16 (flags for bgt)
    vld1.8      {q2}, [r5], r2          @ row 2: load, step to row 3
    vst1.8      {q2}, [r6], r3          @ row 2: store, step to row 3
    vld1.8      {q3}, [r5], r2          @ row 3: load, r5 -> row 4
    vst1.8      {q3}, [r6], r3          @ row 3: store, r6 -> row 4
    bgt         .Lcol_loop_wd_16

    subs        r7, r7, #4              @ ht -= 4 (flags for bgt)
    sub         r0, r5, r11             @ src = first column of next band
    sub         r1, r6, r11             @ dst = first column of next band
    bgt         .Lrow_loop_wd_16

    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}      @ restore registers and return

.size ihevc_inter_pred_luma_copy_a9q, .-ihevc_inter_pred_luma_copy_a9q