@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class3.s
@*
@* @brief
@*  Contains function definitions for SAO edge offset, class 3 (45-degree
@*  diagonal). Functions are coded using NEON intrinsics and can be compiled
@*  using ARM RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class3_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
@                                  WORD32 src_strd,
@                                  UWORD8 *pu1_src_left,
@                                  UWORD8 *pu1_src_top,
@                                  UWORD8 *pu1_src_top_left,
@                                  UWORD8 *pu1_src_top_right,
@                                  UWORD8 *pu1_src_bot_left,
@                                  UWORD8 *pu1_avail,
@                                  WORD8 *pi1_sao_offset,
@                                  WORD32 wd,
@                                  WORD32 ht)
@
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht
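@
@ Reference sketch (added for clarity): the assembly below implements, per
@ pixel, the class-3 edge-offset decision described by the comments in this
@ file. The C-style pseudo-code is only an illustration and assumes 8-bit
@ samples; SIGN(), CLIP3() and the u1_a/u1_b temporaries are illustrative
@ names, while gi1_table_edge_idx and pi1_sao_offset are the tables actually
@ used by the code.
@
@   for(row = 0; row < ht; row++)
@       for(col = 0; col < wd; col++)
@       {
@           /* class 3: top-right and bottom-left neighbours */
@           u1_a = pu1_src[(row - 1) * src_strd + (col + 1)];
@           u1_b = pu1_src[(row + 1) * src_strd + (col - 1)];
@           edge_idx = 2 + SIGN(pu1_src[row * src_strd + col] - u1_a)
@                        + SIGN(pu1_src[row * src_strd + col] - u1_b);
@           edge_idx = gi1_table_edge_idx[edge_idx];
@           if(0 != edge_idx)
@               pu1_src[row * src_strd + col] =
@                   CLIP3(pu1_src[row * src_strd + col] + pi1_sao_offset[edge_idx],
@                         0, (1 << bit_depth) - 1);
@       }
@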

.equ    pu1_src_top_left_offset,    264
.equ    pu1_src_top_right_offset,   268
.equ    pu1_src_bot_left_offset,    272
.equ    pu1_avail_offset,           276
.equ    pi1_sao_offset,             280
.equ    wd_offset,                  284
.equ    ht_offset,                  288

.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8
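@ Note on the three literals above: each stores gi1_table_edge_idx minus the
@ address of the matching "add rX,rX,pc" (labels ulbl1/ulbl2/ulbl3) minus 8.
@ In ARM state the PC reads as the address of the current instruction plus 8,
@ so "LDR rX, gi1_table_edge_idx_addr_N" followed by the add at ulblN yields
@ the absolute address of gi1_table_edge_idx in a position-independent way.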

ihevc_sao_edge_offset_class3_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
    vpush       {d8 - d15}
    SUB         sp,sp,#160                  @Decrement the stack pointer to store some temp arr values
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r8,[sp,#ht_offset]          @Loads ht
    SUB         r9,r7,#1                    @wd - 1

    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]

    MOV         r9,r7                       @Move width to r9 for loop count

    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDR         r6,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
    STR         r3,[sp,#156]                @Store pu1_src_top in sp


    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         r10,r8,#1                   @ht - 1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#2                   @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP:
    LDRB        r9,[r5,#5]                  @pu1_avail[5]
    CMP         r9,#0
    SUB         r10,r7,#1                   @[wd - 1]
    LDRB        r9,[r0,r10]                 @u1_pos_0_0_tmp = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP

    LDR         r11,[sp,#pu1_src_top_right_offset]  @Load pu1_src_top_right from sp
    SUB         r10,r10,#1                  @[wd - 1 - 1]

    LDRB        r11,[r11]                   @pu1_src_top_right[0]
    SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]

    ADD         r11,r0,r1                   @pu1_src + src_strd

    LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
    CMP         r12,#0
    MVNLT       r12,#0
    SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]

    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    LDR         r14, gi1_table_edge_idx_addr_1      @table pointer
ulbl1:
    add         r14,r14,pc
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    ADD         r11,r11,#2                  @edge_idx

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP
    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP:
    LDRB        r10,[r5,#6]                 @pu1_avail[6]
    SUB         r11,r8,#1                   @ht - 1

    CMP         r10,#0
    STR         r0,[sp,#148]                @Store pu1_src in sp
    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]

    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
    BEQ         PU1_AVAIL_3_LOOP

    LDR         r14,[sp,#pu1_src_bot_left_offset]   @Load pu1_src_bot_left from sp
    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]

    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]

    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]

    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])

    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14                 @Add 2 sign values

    LDR         r14, gi1_table_edge_idx_addr_2      @table pointer
ulbl2:
    add         r14,r14,pc
    ADD         r11,r11,#2                  @edge_idx

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STR         r2,[sp,#152]                @Store pu1_src_left in sp
    MOV         r12,r8                      @Move ht

    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    LDRB        r11,[r5,#3]                 @pu1_avail[3]

    CMP         r11,#0
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP         r5,#0

    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    LDR         r6, gi1_table_edge_idx_addr_3       @table pointer
ulbl3:
    add         r6,r6,pc
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1

    STR         r0,[sp,#144]                @Store pu1_src in sp
    VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         r6,r7                       @move wd to r6 loop_count

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If wd < 16, jump to WIDTH_RESIDUE where the loop handles the 8-pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4, jump to WD_16_HT_4_LOOP
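
@ The code below processes the block in vertical stripes of 16 pixels.
@ WIDTH_LOOP_16 handles full 16-wide stripes when ht > 4, WD_16_HT_4_LOOP
@ handles 16-wide stripes with ht <= 4, and WIDTH_RESIDUE handles a trailing
@ width of 8 or less. Within a stripe, sign_up for the next row is obtained
@ by negating the current row's sign_down and rotating it by one lane
@ (VNEG + VEXT); only the lane that crosses the stripe boundary is recomputed
@ from scalar loads.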

WIDTH_LOOP_16:
    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    LDR         r4,[sp,#ht_offset]          @Loads ht
    SUBEQ       r8,r0,r1                    @pu1_src - src_strd

    MOVNE       r8,r3
    ADD         r5,sp,#66                   @*au1_src_left_tmp

    LDR         r7,[sp,#wd_offset]          @Loads wd
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1

    SUB         r7,r7,r6                    @(wd - col)
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r8,[sp,#148]                @Loads *pu1_src
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r7,r7,#15                   @15 + (wd - col)

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP

    VMOV.I8     Q9,#0
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)

    ADD         r8,r0,r1                    @I *pu1_src + src_strd
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

    SUB         r5,r12,r7                   @I ht_tmp - row
    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]

    ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

    LDR         r5,[sp,#pu1_avail_offset]   @I Loads pu1_avail
    VMOV.8      D19[7],r8                   @I vsetq_lane_u8
    LDRB        r5,[r5,#2]                  @I pu1_avail[2]

    VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    CMP         r5,#0                       @I
    BNE         SIGN_UP_CHANGE_DONE         @I

SIGN_UP_CHANGE:
    LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
    SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]

    LDRB        r5,[r5,#16]                 @I load the value
    SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0                       @I
    MVNLT       r8,#0                       @I
    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)

    VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8
    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
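
@ PU1_SRC_LOOP below is software pipelined: the @I/@II/@III prefixes in the
@ comments mark three consecutive rows in flight at once, so the narrowing,
@ clipping and store of one row overlap with the compares and table lookups
@ of the next two rows.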

PU1_SRC_LOOP:
    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @II ht_tmp - row

    ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r2,r8,r1                    @III *pu1_src + src_strd

    LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1

    ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r2,#8
    LDRB        r8,[r8,#1]

    LDRB        r4,[r0,#16]                 @II load the value
    VMOV.8      D19[7],r8                   @II vsetq_lane_u8
    SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r11,#0                      @II
    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r5,r12,r7                   @III ht_tmp - row

    MVNLT       r11,#0                      @II
    VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    CMP         r7,#1                       @III

    BNE         NEXT_ROW_ELSE_2             @III
    LDR         r5,[sp,#pu1_avail_offset]   @III Loads pu1_avail
    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    CMP         r5,#0                       @III
    SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]

NEXT_ROW_ELSE_2:
    LDRB        r8,[r8,#1]                  @III
    VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r5,r0,r1

    LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
    VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    LDRB        r5,[r0,#16]                 @III load the value

    SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    CMP         r2,#0                       @III

    MVNLT       r2,#0                       @III
    VMOV.8      D19[7],r8                   @III vsetq_lane_u8
    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)

    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)

    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
    VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    CMP         r7,#1                       @III
    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BGT         PU1_SRC_LOOP                @If more than one row remains, jump back to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE

    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail

    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r5,#0

    ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r5,[r0,#16]                 @load the value

    BEQ         NEXT_ROW_ELSE_3
    LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    B           NEXT_ROW_POINTER_ASSIGNED_3
NEXT_ROW_ELSE_3:
    SUB         r11,r12,r7                  @ht_tmp - row
    ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MVNLT       r8,#0

    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)

    VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

INNER_LOOP_DONE:
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r8,[sp,#ht_offset]          @Loads ht

    VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r5,sp,#66                   @*au1_src_left_tmp

    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR         r2,[sp,#152]                @Loads *pu1_src_left
SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r0,[sp,#144]                @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If full columns remain, jump back to WIDTH_LOOP_16
    BEQ         WIDTH_RESIDUE               @If residue remains, jump to the residue loop

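
@ WD_16_HT_4_LOOP below handles the same 16-wide stripe processing as
@ WIDTH_LOOP_16, but one row at a time without the three-row software
@ pipeline; it is used when ht <= 4.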

WD_16_HT_4_LOOP:
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDR         r7,[sp,#wd_offset]          @Loads wd
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8

    ADD         r3,r3,#16
    ADD         r5,sp,#66                   @*au1_src_left_tmp
    LDR         r4,[sp,#ht_offset]          @Loads ht
    LDR         r7,[sp,#wd_offset]          @Loads wd
    SUB         r7,r7,r6                    @(wd - col)
    ADD         r7,r7,#15                   @15 + (wd - col)
    LDR         r8,[sp,#148]                @Loads *pu1_src
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
    CMP         r7,#1
    LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
NEXT_ROW_ELSE_WD_16_HT_4:
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    ADD         r5,r0,#16                   @pu1_src_cpy[16]
    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                     @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If rows remain, jump back to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#ht_offset]          @Loads ht
    ADD         r5,sp,#66                   @*au1_src_left_tmp
    LDR         r2,[sp,#152]                @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r0,[sp,#144]                @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WD_16_HT_4_LOOP             @If columns remain, jump back to WD_16_HT_4_LOOP

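
@ WIDTH_RESIDUE below handles the final stripe when 8 or fewer columns
@ remain: the edge classification still runs on a full Q register, but only
@ the low 8 result bytes (D30) are stored back per row.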

WIDTH_RESIDUE:
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

PU1_AVAIL_2_RESIDUE:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8


    ADD         r5,sp,#66                   @*au1_src_left_tmp
    LDR         r4,[sp,#ht_offset]          @Loads ht
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r8,[sp,#148]                @Loads *pu1_src
    SUB         r7,r7,#1                    @(wd - 1)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_RESIDUE
    CMP         r7,#1
    LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
NEXT_ROW_ELSE_RESIDUE:
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    ADD         r5,r0,#16                   @pu1_src_cpy[16]
    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                     @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#ht_offset]          @Loads ht
    LDR         r2,[sp,#152]                @Loads *pu1_src_left
    ADD         r5,sp,#66                   @*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r0,[sp,#148]                @Loads *pu1_src

    LDR         r11,[sp,#ht_offset]         @Loads ht
    ADD         r8,r0,r7                    @pu1_src[wd]

    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
    SUB         r11,r11,#1                  @ht - 1

    STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_0_0_tmp
    MLA         r6,r11,r1,r0                @pu1_src_org[(ht - 1) * src_strd]

    LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
    ADD         r12,sp,#2

    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#156]                @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#160
    vpop        {d8 - d15}
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP and return
