@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class3_chroma.s
@*
@* ,:brief
@*  Contains function definitions for the SAO edge offset filter of class 3
@*  (45-degree diagonal) applied to interleaved chroma. Functions are coded
@*  using NEON intrinsics and can be compiled using ARM RVCT.
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
@                                         WORD32 src_strd,
@                                         UWORD8 *pu1_src_left,
@                                         UWORD8 *pu1_src_top,
@                                         UWORD8 *pu1_src_top_left,
@                                         UWORD8 *pu1_src_top_right,
@                                         UWORD8 *pu1_src_bot_left,
@                                         UWORD8 *pu1_avail,
@                                         WORD8 *pi1_sao_offset_u,
@                                         WORD8 *pi1_sao_offset_v,
@                                         WORD32 wd,
@                                         WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 => *pu1_src
@r1 => src_strd
@r2 => *pu1_src_left
@r3 => *pu1_src_top
@r4 => *pu1_src_top_left
@r5 => *pu1_avail
@r6 => *pi1_sao_offset_u
@r9 => *pi1_sao_offset_v
@r7 => wd
@r8 => ht
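@
@ Per-pixel operation implemented by the NEON loops below, shown as a short C
@ sketch for reference. This is an illustrative sketch only: row, col, cur,
@ top_right, bot_left and the U/V selection via (col & 1) are assumed names
@ and conventions, not taken from this file, and availability / picture
@ boundary handling is omitted. Class 3 uses the 45-degree diagonal
@ neighbours; the horizontal step is 2 bytes because U and V are interleaved.
@
@     WORD32 cur       = pu1_src[row * src_strd + col];
@     WORD32 top_right = pu1_src[(row - 1) * src_strd + (col + 2)];
@     WORD32 bot_left  = pu1_src[(row + 1) * src_strd + (col - 2)];
@     WORD32 edge_idx  = gi1_table_edge_idx[2 + SIGN(cur - top_right)
@                                             + SIGN(cur - bot_left)];
@     WORD8  offset    = (col & 1) ? pi1_sao_offset_v[edge_idx]
@                                  : pi1_sao_offset_u[edge_idx];
@     pu1_src[row * src_strd + col] =
@         (UWORD8)CLIP3(cur + offset, 0, (1 << bit_depth) - 1);
@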

.equ    pu1_src_top_left_offset,    328
.equ    pu1_src_top_right_offset,   332
.equ    pu1_src_bot_left_offset,    336
.equ    pu1_avail_offset,           340
.equ    pi1_sao_u_offset,           344
.equ    pi1_sao_v_offset,           348
.equ    wd_offset,                  352
.equ    ht_offset,                  356

.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_chroma_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

gi1_table_edge_idx_addr_4:
.long gi1_table_edge_idx - ulbl4 - 8

gi1_table_edge_idx_addr_5:
.long gi1_table_edge_idx - ulbl5 - 8

ihevc_sao_edge_offset_class3_chroma_a9q:


    STMFD       sp!,{r4-r12,r14}  @stack stores the values of the arguments
    vpush       {d8 - d15}
    SUB         sp,sp,#224  @Decrement the stack pointer to store some temp arr values

    LDR         r7,[sp,#wd_offset]  @Loads wd
    LDR         r8,[sp,#ht_offset]  @Loads ht
    SUB         r9,r7,#2  @wd - 2

    LDR         r4,[sp,#pu1_src_top_left_offset]  @Loads pu1_src_top_left
    LDRH        r10,[r3,r9]  @pu1_src_top[wd - 2]

    MOV         r9,r7  @Move width to r9 for loop count

    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail
    LDR         r6,[sp,#pi1_sao_u_offset]  @Loads pi1_sao_offset_u

    STR         r3,[sp,#220]  @Store pu1_src_top in sp

    STRH        r10,[sp]  @u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB         r10,r8,#1  @ht-1
    MLA         r11,r10,r1,r0  @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#10  @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!  @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8  @Decrement the loop count by 8
    VST1.8      D0,[r12]!  @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP_U:
    LDRB        r9,[r5,#5]  @pu1_avail[5]
    CMP         r9,#0
    SUB         r14,r7,#2  @[wd - 2]
    LDRB        r9,[r0,r14]  @u1_pos_0_0_tmp_u = pu1_src[wd - 2]
    SUB         r11,r7,#1  @[wd - 1]
    LDRB        r10,[r0,r11]  @u1_pos_0_0_tmp_v = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP_U

    LDR         r11,[sp,#pu1_src_top_right_offset]  @Load pu1_src_top_right from sp
    LDRB        r11,[r11]  @pu1_src_top_right[0]
    SUB         r12,r9,r11  @pu1_src[wd - 2] - pu1_src_top_right[0]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1  @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
    ADD         r11,r0,r1  @pu1_src + src_strd
    SUB         r14,r14,#2  @[wd - 2 - 2]
    LDRB        r14,[r11,r14]  @pu1_src[wd - 2 - 2 + src_strd]
    SUB         r11,r9,r14  @pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1  @SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r12,r11  @SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) + SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
    ADD         r11,r11,#2  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_1  @table pointer
ulbl1:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0  @0 != edge_idx
    BEQ         PU1_AVAIL_5_LOOP_V
    LDRSB       r11,[r6,r12]  @pi1_sao_offset_u[edge_idx]
    ADD         r9,r9,r11  @pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
    USAT        r9,#8,r9  @u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_5_LOOP_V:

    LDR         r11,[sp,#pu1_src_top_right_offset]  @Load pu1_src_top_right from sp
    LDRB        r11,[r11,#1]  @pu1_src_top_right[1]
    SUB         r12,r10,r11  @pu1_src[wd - 1] - pu1_src_top_right[1]
    CMP         r12,#0
    MVNLT       r12,#0
    MOVGT       r12,#1  @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
    ADD         r11,r0,r1  @pu1_src + src_strd
    SUB         r14,r7,#3  @[wd - 1 - 2]
    LDRB        r14,[r11,r14]  @pu1_src[wd - 1 - 2 + src_strd]
    SUB         r11,r10,r14  @pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1  @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r12,r11  @SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) + SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
    ADD         r11,r11,#2  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_2  @table pointer
ulbl2:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0  @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP_U
    LDR         r11,[sp,#pi1_sao_v_offset]  @Loads pi1_sao_offset_v
    LDRSB       r11,[r11,r12]  @pi1_sao_offset_v[edge_idx]
    ADD         r10,r10,r11  @pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
    USAT        r10,#8,r10  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_U:
    STRB        r9,[sp,#6]
    STRB        r10,[sp,#7]
    STR         r0,[sp,#212]  @Store pu1_src in sp

    LDRB        r10,[r5,#6]  @pu1_avail[6]
    CMP         r10,#0
    SUB         r11,r8,#1  @ht - 1
    MLA         r12,r11,r1,r0  @pu1_src[(ht - 1) * src_strd]
    LDRB        r10,[r12]  @u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
    LDRB        r9,[r12,#1]  @u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
    BEQ         PU1_AVAIL_3_LOOP

    SUB         r11,r12,r1  @pu1_src[(ht - 1) * src_strd - src_strd]
    ADD         r11,r11,#2  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    LDRB        r11,[r11]  @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    SUB         r11,r10,r11  @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1  @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd])

    LDR         r14,[sp,#pu1_src_bot_left_offset]  @Load pu1_src_bot_left from sp
    LDRB        r14,[r14]  @Load pu1_src_bot_left[0]
    SUB         r14,r10,r14  @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1  @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14  @Add 2 sign value
    ADD         r11,r11,#2  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_3  @table pointer
ulbl3:
    add         r14,r14,pc

    LDRSB       r14,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r14,#0
    BEQ         PU1_AVAIL_6_LOOP_V
    LDRSB       r11,[r6,r14]  @pi1_sao_offset_u[edge_idx]
    ADD         r10,r10,r11  @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP_V:
    ADD         r12,r12,#1  @pu1_src[(ht - 1) * src_strd + 1]
    SUB         r11,r12,r1  @pu1_src[(ht - 1) * src_strd + 1) - src_strd]
    ADD         r11,r11,#2  @pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    LDRB        r11,[r11]  @Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
    SUB         r11,r9,r11  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1  @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])

    LDR         r14,[sp,#pu1_src_bot_left_offset]  @Load pu1_src_bot_left from sp
    LDRB        r14,[r14,#1]  @Load pu1_src_bot_left[1]
    SUB         r14,r9,r14  @pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1  @SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])

    ADD         r11,r11,r14  @Add 2 sign value
    ADD         r11,r11,#2  @edge_idx
    LDR         r14, gi1_table_edge_idx_addr_4  @table pointer
ulbl4:
    add         r14,r14,pc

    LDRSB       r12,[r14,r11]  @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDR         r14,[sp,#pi1_sao_v_offset]  @Loads pi1_sao_offset_v
    LDRSB       r11,[r14,r12]  @pi1_sao_offset_v[edge_idx]
    ADD         r9,r9,r11  @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9  @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB        r10,[sp,#8]
    STRB        r9,[sp,#9]
    STR         r2,[sp,#216]  @Store pu1_src_left in sp

    MOV         r12,r8  @Move ht
    MOV         r14,r2  @Move pu1_src_left to pu1_src_left_cpy
    LDRB        r11,[r5,#3]  @pu1_avail[3]
    CMP         r11,#0
    BNE         PU1_AVAIL_2_LOOP
    SUB         r12,r12,#1  @ht_tmp--

PU1_AVAIL_2_LOOP:
    LDRB        r5,[r5,#2]  @pu1_avail[2]
    CMP         r5,#0
    BNE         PU1_AVAIL_2_LOOP_END

    ADD         r0,r0,r1  @pu1_src += src_strd
    SUB         r12,r12,#1  @ht_tmp--
    ADD         r14,r14,#2  @pu1_src_left_cpy += 2

PU1_AVAIL_2_LOOP_END:
    STR         r0,[sp,#2]  @Store pu1_src in sp
    VMOV.I8     Q0,#2  @const_2 = vdupq_n_s8(2)
    VMOV.I16    Q1,#0  @const_min_clip = vdupq_n_s16(0)
    VMOV.I16    Q2,#255  @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    VLD1.8      D6,[r6]  @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    LDR         r6,[sp,#pi1_sao_v_offset]  @Loads pi1_sao_offset_v
    VLD1.8      D7,[r6]  @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    LDR         r2, gi1_table_edge_idx_addr_5  @table pointer
ulbl5:
    add         r2,r2,pc
    @VLD1.8     D6,[r6]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VMOV.S8     Q4,#0xFF  @au1_mask = vdupq_n_s8(-1)
    MOV         r6,r7  @move wd to r6 loop_count
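
@ Rough control flow from here on (editorial note, not from the original
@ source): wd is processed in 16-byte columns (8 interleaved chroma pairs) by
@ WIDTH_LOOP_16; WD_16_HT_4_LOOP is a lighter variant taken when ht <= 4; and
@ WIDTH_RESIDUE handles a remaining 8-byte column. Each path saves the top
@ row, left column and corner samples into the temporary stack area so that
@ pu1_src_top and pu1_src_left can be updated after filtering.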

    CMP         r7,#16  @Compare wd with 16
    BLT         WIDTH_RESIDUE  @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    CMP         r8,#4  @Compare ht with 4
    BLE         WD_16_HT_4_LOOP  @If jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
    LDR         r7,[sp,#wd_offset]  @Loads wd
    CMP         r6,r7  @col == wd
    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail

    LDRBEQ      r8,[r5]  @pu1_avail[0]
    MOVNE       r8,#-1

    VMOV.8      D8[0],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    LDRB        r11,[r5,#2]  @pu1_avail[2]

    CMP         r6,#16  @if(col == 16)
    VMOV.8      D8[1],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]  @pu1_avail[1]
    VMOV.8      D9[6],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    VMOV.8      D9[7],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    CMP         r11,#0
    VLD1.8      D12,[r0]!  @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]  @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r5,sp,#75  @*au1_src_left_tmp

    SUBEQ       r8,r0,r1  @pu1_src - src_strd
    VMOV.I8     Q9,#0
    MOVNE       r8,r3

    ADD         r8,r8,#2  @pu1_src - src_strd + 2
    VLD1.8      D10,[r8]!  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r4,[sp,#ht_offset]  @Loads ht
    VCGT.U8     Q7,Q6,Q5  @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#wd_offset]  @Loads wd

    SUB         r7,r7,r6  @(wd - col)
    VCLT.U8     Q8,Q6,Q5  @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14  @15 + (wd - col)

    LDR         r8,[sp,#212]  @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7  @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7  @pu1_src[0 * src_strd + 15 + (wd - col)]

AU1_SRC_LEFT_LOOP:
    LDRH        r8,[r7]  @load the value and increment by src_strd
    SUBS        r4,r4,#1  @decrement the loop count

    STRH        r8,[r5],#2  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP


    MOV         r7,r12  @row count, move ht_tmp to r7
    VMOV.I8     Q9,#0  @I
    ADD         r11,r0,r1  @I *pu1_src + src_strd

    SUB         r5,r12,r7  @I ht_tmp - row
    VLD1.8      D16,[r11]!  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]  @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    ADD         r8,r14,r5,LSL #1  @I pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r5,[r8,#2]  @I
    VMOV.16     D19[3],r5  @I vsetq_lane_u8
    LDR         r11,[sp,#pu1_avail_offset]  @I Loads pu1_avail

    LDRB        r11,[r11,#2]  @I pu1_avail[2]
    VEXT.8      Q9,Q9,Q8,#14  @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    CMP         r11,#0  @I
    BNE         SIGN_UP_CHANGE_DONE  @I

    LDRB        r8,[r0,#14]  @I pu1_src_cpy[14]
    SUB         r5,r0,r1  @I

    LDRB        r11,[r5,#16]  @I load the value pu1_src_cpy[16 - src_strd]

    LDRB        r9,[r0,#15]  @I pu1_src_cpy[15]
    SUB         r8,r8,r11  @I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r5,#17]  @I load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0  @I

    MVNLT       r8,#0  @I
    SUB         r9,r9,r10  @I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1  @I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    CMP         r9,#0  @I

    MVNLT       r9,#0  @I
    VMOV.8      D15[6],r8  @I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MOVGT       r9,#1  @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    VMOV.8      D15[7],r9  @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)

SIGN_UP_CHANGE_DONE:
    VLD1.8      D28,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q10,Q6,Q9  @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9  @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10  @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7  @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11  @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18  @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11  @I sign_up = vnegq_s8(sign_down)

    VTBL.8      D19,{D28},D19  @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2  @I sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q10,D12  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4  @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VUZP.8      D18,D19  @I
    VTBL.8      D22,{D6},D18  @I
    VTBL.8      D23,{D7},D19  @I
    VZIP.8      D22,D23  @I

    VMOVL.U8    Q9,D13  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D22  @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8  @I pu1_cur_row = pu1_next_row
    VADDW.S8    Q9,Q9,D23  @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB         r7,r7,#1  @I Decrement the ht_tmp loop count by 1
    VMAX.S16    Q9,Q9,Q1  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMIN.U16    Q9,Q9,Q2  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


PU1_SRC_LOOP:
    ADD         r11,r0,r1,LSL #1  @II *pu1_src + src_strd
    VMOVN.I16   D20,Q10  @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7  @II ht_tmp - row

    ADD         r4,r0,r1  @III *pu1_src + src_strd
    VMOVN.I16   D21,Q9  @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r8,r14,r5,LSL #1  @II pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRH        r9,[r8,#2]
    VLD1.8      D16,[r11]!  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]  @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r10,[r4,#14]  @II pu1_src_cpy[14]

    LDRB        r8,[r4,#15]  @II pu1_src_cpy[15]
    VMOV.16     D29[3],r9  @II vsetq_lane_u8
    ADD         r4,r11,r1  @III *pu1_src + src_strd

    LDRB        r5,[r0,#17]  @II load the value pu1_src_cpy[17 - src_strd]
    VLD1.8      D30,[r4]!  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r4]  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r4,#8
    LDRB        r11,[r0,#16]  @II load the value pu1_src_cpy[16 - src_strd]

    SUB         r7,r7,#1  @II Decrement the ht_tmp loop count by 1
    VST1.8      {Q10},[r0],r1  @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r10,r10,r11  @II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    CMP         r10,#0  @II
    VEXT.8      Q14,Q14,Q8,#14  @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r8,r8,r5  @II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r10,#0  @II
    VLD1.8      D21,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r10,#1  @II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r8,#0  @II
    VMOV.8      D15[6],r10  @II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MVNLT       r8,#0  @II

    MOVGT       r8,#1  @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    SUB         r10,r12,r7  @III ht_tmp - row
    VMOV.8      D15[7],r8  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    ADD         r11,r14,r10,LSL #1  @III pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r7,#1  @III
    VCGT.U8     Q11,Q6,Q14  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    BNE         NEXT_ROW_POINTER_ASSIGNED_2  @III

    LDR         r5,[sp,#pu1_avail_offset]  @III Loads pu1_avail
    LDRB        r5,[r5,#3]  @III pu1_avail[3]
    CMP         r5,#0  @III
    SUBNE       r11,r4,#4  @III pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_2:
    LDRH        r5,[r11,#2]  @III
    VCLT.U8     Q12,Q6,Q14  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r11,r0,r1  @III

    LDRB        r9,[r11,#14]  @III pu1_src_cpy[14]
    VMOV.16     D19[3],r5  @III vsetq_lane_u8
    LDRB        r8,[r11,#15]  @III pu1_src_cpy[15]

    LDRB        r11,[r0,#16]  @III load the value pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q12,Q11  @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    LDRB        r10,[r0,#17]  @III load the value pu1_src_cpy[17 - src_strd]

    SUB         r9,r9,r11  @III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VEXT.8      Q9,Q9,Q15,#14  @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r8,r10  @III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    CMP         r9,#0  @III
    VADD.I8     Q13,Q0,Q7  @II edge_idx = vaddq_s8(const_2, sign_up)
    MVNLT       r9,#0  @III

    MOVGT       r9,#1  @III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
    VADD.I8     Q13,Q13,Q12  @II edge_idx = vaddq_s8(edge_idx, sign_down)
    CMP         r10,#0  @III

    VNEG.S8     Q7,Q12  @II sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D21},D26  @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    MVNLT       r10,#0  @III
    MOVGT       r10,#1  @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    VEXT.8      Q7,Q7,Q7,#2  @II sign_up = vextq_s8(sign_up, sign_up, 2)
    VTBL.8      D27,{D21},D27  @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCGT.U8     Q11,Q8,Q9  @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[6],r9  @III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    VAND        Q13,Q13,Q4  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOV.8      D15[7],r10  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    VUZP.8      D26,D27  @II

    VCLT.U8     Q10,Q8,Q9  @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VTBL.8      D24,{D6},D26  @II
    VSUB.U8     Q11,Q10,Q11  @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7  @III edge_idx = vaddq_s8(const_2, sign_up)
    VTBL.8      D25,{D7},D27  @II
    VADD.I8     Q9,Q9,Q11  @III edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8      D20,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VZIP.8      D24,D25  @II

    VMOVL.U8    Q14,D12  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VTBL.8      D18,{D20},D18  @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q11  @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24  @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D19,{D20},D19  @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2  @III sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q13,D13  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4  @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D16  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19  @III

    VMAX.S16    Q14,Q14,Q1  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D22,{D6},D18  @III
    VMIN.U16    Q14,Q14,Q2  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q13,Q13,D25  @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D23,{D7},D19  @III
    VMAX.S16    Q13,Q13,Q1  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q9,D17  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23  @III

    VMOVN.I16   D28,Q14  @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q10,Q10,D22  @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOV        Q6,Q15  @III pu1_cur_row = pu1_next_row
    VMIN.U16    Q13,Q13,Q2  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1  @III Decrement the ht_tmp loop count by 1
    VMAX.S16    Q10,Q10,Q1  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    CMP         r7,#1  @III

    VMOVN.I16   D29,Q13  @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMIN.U16    Q10,Q10,Q2  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23  @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q9,Q9,Q1  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VST1.8      {Q14},[r0],r1  @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q9,Q9,Q2  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT         PU1_SRC_LOOP  @If not equal jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE


    ADD         r11,r0,r1,LSL #1  @*pu1_src + src_strd
    VMOVN.I16   D20,Q10  @III vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7  @ht_tmp - row

    ADD         r8,r14,r5,LSL #1  @pu1_src_left_cpy[(ht_tmp - row) * 2]
    VMOVN.I16   D21,Q9  @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r7,#1

    LDRB        r4,[r0,#16]  @load the value pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r11]!  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r11]  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r11,#8
    LDRB        r9,[r0,#17]  @load the value pu1_src_cpy[17 - src_strd]

    BNE         NEXT_ROW_POINTER_ASSIGNED_3
    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail
    LDRB        r5,[r5,#3]  @pu1_avail[3]
    CMP         r5,#0
    SUBNE       r8,r11,#4  @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRH        r5,[r8,#2]
    VST1.8      {Q10},[r0],r1  @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDRB        r8,[r0,#14]  @pu1_src_cpy[14]

    SUB         r8,r8,r4  @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
    VMOV.16     D19[3],r5  @vsetq_lane_u8
    LDRB        r10,[r0,#15]  @pu1_src_cpy[15]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#14  @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
    SUB         r10,r10,r9  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MVNLT       r8,#0
    VLD1.8      D28,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOVGT       r8,#1  @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8  @sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MVNLT       r10,#0

    MOVGT       r10,#1  @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    VMOV.8      D15[7],r10  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
    VCGT.U8     Q10,Q6,Q9  @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q11,Q6,Q9  @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q11,Q11,Q10  @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7  @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q11  @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D28},D18  @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D19,{D28},D19  @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q9,Q9,Q4  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q10,D12  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VUZP.8      D18,D19

    VTBL.8      D22,{D6},D18
    VTBL.8      D23,{D7},D19

    VMOVL.U8    Q9,D13  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VZIP.8      D22,D23

    VADDW.S8    Q10,Q10,D22  @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q10,Q10,Q1  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D23  @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q9,Q9,Q1  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q9,Q9,Q2  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:

    LDR         r8,[sp,#ht_offset]  @Loads ht
    VMOVN.I16   D20,Q10  @III vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD         r5,sp,#75  @*au1_src_left_tmp

    LSL         r8,r8,#1
    VMOVN.I16   D21,Q9  @III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDR         r11,[sp,#216]  @Loads *pu1_src_left

SRC_LEFT_LOOP:
    LDR         r7,[r5],#4  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r11],#4  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16  @Decrement the wd loop count by 16
    VST1.8      {Q10},[r0],r1  @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP         r6,#8  @Check whether residue remains

    BLT         RE_ASSINING_LOOP  @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]  @Loads wd
    LDR         r0,[sp,#0x02]  @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16  @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE  @If residue remains jump to residue loop

WD_16_HT_4_LOOP:
    LDR         r7,[sp,#wd_offset]  @Loads wd

    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail
    CMP         r6,r7  @col == wd

    LDRBEQ      r8,[r5]  @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      D8[0],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16  @if(col == 16)
    VMOV.8      D8[1],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]  @pu1_avail[1]
    VMOV.8      D9[6],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    VMOV.8      D9[7],r8  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r11,[r5,#2]  @pu1_avail[2]
    CMP         r11,#0
    SUBEQ       r8,r0,r1  @pu1_src - src_strd

    MOVNE       r8,r3
    VLD1.8      D12,[r0]!  @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]  @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r8,r8,#2  @pu1_src - src_strd + 2

    ADD         r3,r3,#16
    VLD1.8      D10,[r8]!  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r8]  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r8,#8
    ADD         r5,sp,#75  @*au1_src_left_tmp

    LDR         r4,[sp,#ht_offset]  @Loads ht
    VCGT.U8     Q7,Q6,Q5  @vcgtq_u8(pu1_cur_row, pu1_top_row)
    LDR         r7,[sp,#wd_offset]  @Loads wd

    SUB         r7,r7,r6  @(wd - col)
    VCLT.U8     Q8,Q6,Q5  @vcltq_u8(pu1_cur_row, pu1_top_row)
    ADD         r7,r7,#14  @15 + (wd - col)

    LDR         r8,[sp,#212]  @Loads *pu1_src
    VSUB.U8     Q7,Q8,Q7  @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    ADD         r7,r8,r7  @pu1_src[0 * src_strd + 15 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH        r8,[r7]  @load the value and increment by src_strd
    SUBS        r4,r4,#1  @decrement the loop count

    STRH        r8,[r5],#2  @store it in the stack pointer
    ADD         r7,r7,r1
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VMOV.I8     Q9,#0
    MOV         r7,r12  @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r9,r0,r1  @*pu1_src + src_strd

    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail
    VLD1.8      D16,[r9]!  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDRB        r5,[r5,#3]  @pu1_avail[3]

    SUB         r11,r12,r7  @ht_tmp - row
    ADD         r8,r14,r11,LSL #1  @pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD         r8,r8,#2  @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    CMP         r5,#0
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
    CMP         r7,#1
    SUBEQ       r8,r9,#2  @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    LDRH        r5,[r8]
    VMOV.16     D19[3],r5  @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#14  @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    CMP         r7,r12
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail
    LDRB        r5,[r5,#2]  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#14]  @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]  @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]  @pu1_src_cpy[15]
    SUB         r8,r8,r5  @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]  @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1  @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8  @sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MVNLT       r10,#0

    MOVGT       r10,#1  @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    VMOV.8      D15[7],r10  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VLD1.8      D20,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9  @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9  @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11  @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7  @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12  @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12  @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26  @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27  @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2  @sign_up = vextq_s8(sign_up, sign_up, 2)

    VMOVL.U8    Q14,D12  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOVL.U8    Q15,D13  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24  @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8  @pu1_cur_row = pu1_next_row
    VADDW.S8    Q15,Q15,D25  @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q15,Q15,Q1  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14  @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15  @vmovn_s16(pi2_tmp_cur_row.val[1])

    SUBS        r7,r7,#1  @Decrement the ht_tmp loop count by 1
    VST1.8      {Q14},[r0],r1  @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BNE         PU1_SRC_LOOP_WD_16_HT_4  @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#ht_offset]  @Loads ht
    ADD         r5,sp,#75  @*au1_src_left_tmp
    LDR         r11,[sp,#216]  @Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16  @Decrement the wd loop count by 16
    CMP         r6,#8
    BLT         RE_ASSINING_LOOP  @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]  @Loads wd
    LDR         r0,[sp,#0x02]  @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WD_16_HT_4_LOOP  @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE  @If residue remains jump to residue loop

WIDTH_RESIDUE:
    LDR         r7,[sp,#wd_offset]  @Loads wd

    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail
    CMP         r6,r7  @wd_residue == wd

    LDRBEQ      r8,[r5]  @pu1_avail[0]

    MOVNE       r8,#-1
    LDRB        r11,[r5,#1]  @pu1_avail[1]

    LDRB        r9,[r5,#2]  @pu1_avail[2]
    VMOV.8      d8[0],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    CMP         r9,#0

    SUBEQ       r10,r0,r1  @pu1_src - src_strd
    VMOV.8      d8[1],r8  @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    MOVNE       r10,r3

    ADD         r10,r10,#2  @pu1_src - src_strd + 2
    VMOV.8      d8[6],r11  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    ADD         r5,sp,#75  @*au1_src_left_tmp

    LDR         r4,[sp,#ht_offset]  @Loads ht
    VMOV.8      d8[7],r11  @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
    LDR         r7,[sp,#wd_offset]  @Loads wd

    LDR         r8,[sp,#212]  @Loads *pu1_src
    VLD1.8      D10,[r10]!  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    VLD1.8      D11,[r10]  @pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
    SUB         r10,#8
    SUB         r7,r7,#2  @(wd - 2)

    ADD         r7,r8,r7  @pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH        r8,[r7]  @load the value and increment by src_strd
    ADD         r7,r7,r1
    STRH        r8,[r5],#2  @store it in the stack pointer
    SUBS        r4,r4,#1  @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!  @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]  @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VMOV.I8     Q9,#0
    VCGT.U8     Q7,Q6,Q5  @vcgtq_u8(pu1_cur_row, pu1_top_row)

    VCLT.U8     Q8,Q6,Q5  @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7  @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12  @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    ADD         r9,r0,r1  @*pu1_src + src_strd

    SUB         r11,r12,r7  @ht_tmp - row
    VLD1.8      D16,[r9]!  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r9]  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r9,#8
    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail

    LDRB        r5,[r5,#3]  @pu1_avail[3]
    ADD         r8,r14,r11,LSL #1  @pu1_src_left_cpy[(ht_tmp - row) * 2]

    CMP         r5,#0
    ADD         r8,r8,#2  @pu1_src_left_cpy[(ht_tmp - row + 1) * 2]

    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
    CMP         r7,#1
    SUBEQ       r8,r9,#2  @pu1_src[src_strd - 2]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    LDRB        r5,[r8]

    LDRB        r8,[r8,#1]
    VMOV.8      D19[6],r5  @vsetq_lane_u8
    CMP         r7,r12

    VMOV.8      D19[7],r8  @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#14  @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)

    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#pu1_avail_offset]  @Loads pu1_avail
    LDRB        r5,[r5,#2]  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#14]  @pu1_src_cpy[14]
    SUB         r9,r0,r1

    LDRB        r5,[r9,#16]  @load the value pu1_src_cpy[16 - src_strd]

    LDRB        r10,[r0,#15]  @pu1_src_cpy[15]
    SUB         r8,r8,r5  @pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]

    LDRB        r11,[r9,#17]  @load the value pu1_src_cpy[17 - src_strd]
    CMP         r8,#0

    MVNLT       r8,#0
    SUB         r10,r10,r11  @pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]

    MOVGT       r8,#1  @SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])

    CMP         r10,#0
    VMOV.8      D15[6],r8  @sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
    MVNLT       r10,#0

    MOVGT       r10,#1  @SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
    VMOV.8      D15[7],r10  @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VLD1.8      D20,[r2]  @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VCGT.U8     Q11,Q6,Q9  @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VCLT.U8     Q12,Q6,Q9  @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11  @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7  @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12  @edge_idx = vaddq_s8(edge_idx, sign_down)

    VNEG.S8     Q7,Q12  @sign_up = vnegq_s8(sign_down)
    VTBL.8      D26,{D20},D26  @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))

    VTBL.8      D27,{D20},D27  @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#2  @sign_up = vextq_s8(sign_up, sign_up, 14)

    VMOVL.U8    Q14,D12  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4  @edge_idx = vandq_s8(edge_idx, au1_mask)


    VUZP.8      D26,D27
    VTBL.8      D24,{D6},D26
    VTBL.8      D25,{D7},D27
    VZIP.8      D24,D25

    VMOV        Q6,Q8  @pu1_cur_row = pu1_next_row
    VADDW.S8    Q14,Q14,D24  @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q14,Q14,Q1  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SUBS        r7,r7,#1  @Decrement the ht_tmp loop count by 1
    VMOVN.I16   D30,Q14  @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1  @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    BNE         PU1_SRC_LOOP_RESIDUE  @If not equal jump to PU1_SRC_LOOP

    LDR         r8,[sp,#ht_offset]  @Loads ht
    ADD         r5,sp,#75  @*au1_src_left_tmp

    LDR         r11,[sp,#216]  @Loads *pu1_src_left

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2
    STR         r7,[r11],#4  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


RE_ASSINING_LOOP:
    LDR         r7,[sp,#wd_offset]  @Loads wd
    LDR         r8,[sp,#ht_offset]  @Loads ht

    LDR         r0,[sp,#212]  @Loads *pu1_src
    SUB         r10,r7,#2  @wd - 2

    LDRH        r9,[sp,#6]
    SUB         r8,r8,#1  @ht - 1

    STRH        r9,[r0,r10]  @pu1_src_org[0] = u1_pos_0_0_tmp
    MLA         r6,r8,r1,r0  @pu1_src[(ht - 1) * src_strd]

    LDR         r4,[sp,#pu1_src_top_left_offset]  @Loads pu1_src_top_left

    LDRH        r9,[sp,#8]
    ADD         r12,sp,#10

    STRH        r9,[r6]  @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u

    LDRH        r10,[sp]  @load u1_src_top_left_tmp from stack pointer
    STRH        r10,[r4]  @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#220]  @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!  @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS        r7,r7,#8  @Decrement the width
    VST1.8      D0,[r3]!  @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#224
    vpop        {d8 - d15}
    LDMFD       sp!,{r4-r12,r15}  @Reload the registers from SP
