///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_sao_edge_offset_class2_chroma.s
//*
//* @brief
//*  Contains function definitions for SAO edge offset, class 2, chroma.
//*  Functions are coded using NEON intrinsics and can be compiled using ARM
//*  RVCT
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
//                                         WORD32 src_strd,
//                                         UWORD8 *pu1_src_left,
//                                         UWORD8 *pu1_src_top,
//                                         UWORD8 *pu1_src_top_left,
//                                         UWORD8 *pu1_src_top_right,
//                                         UWORD8 *pu1_src_bot_left,
//                                         UWORD8 *pu1_avail,
//                                         WORD8 *pi1_sao_offset_u,
//                                         WORD8 *pi1_sao_offset_v,
//                                         WORD32 wd,
//                                         WORD32 ht)
//**************Variables Vs Registers*****************************************
//x0 =>  *pu1_src
//x1 =>  src_strd
//x2 =>  *pu1_src_left
//x3 =>  *pu1_src_top
//x4 =>  *pu1_src_top_left
//x5 =>  *pu1_avail
//x6 =>  *pi1_sao_offset_u
//x9 =>  *pi1_sao_offset_v
//x7 =>  wd
//x8 =>  ht

.text
.p2align 2
.include "ihevc_neon_macros.s"

.globl gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class2_chroma_av8

ihevc_sao_edge_offset_class2_chroma_av8:


    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments

    ldr     x8,[sp,#0]
    ldr     x9,[sp,#8]
    ldr     w10,[sp,#16]
    ldr     w11,[sp,#24]



    // STMFD sp!, {x4-x12, x14}          //stack stores the values of the arguments
    stp     x19, x20,[sp,#-16]!
    stp     x21, x22,[sp,#-16]!
    stp     x23, x24,[sp,#-16]!
    stp     x25, x26,[sp,#-16]!
    stp     x27, x28,[sp,#-16]!
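    // Per AAPCS64 the first eight arguments arrive in x0-x7 (pu1_src,
    // src_strd, pu1_src_left, pu1_src_top, pu1_src_top_left,
    // pu1_src_top_right, pu1_src_bot_left, pu1_avail); the remaining four
    // (pi1_sao_offset_u, pi1_sao_offset_v, wd, ht) are fetched from the
    // caller's stack by the ldr instructions above.
    //
    // A minimal scalar C sketch of the operation this routine vectorizes,
    // as an illustration only: it assumes 8-bit samples with U at even and
    // V at odd byte offsets, omits the pu1_avail boundary handling, and the
    // names sao_class2_chroma_sketch/sign3 are hypothetical, not part of
    // the library (the typedefs are the library's own):
    //
    //   static WORD32 sign3(WORD32 a) { return (a > 0) - (a < 0); }
    //
    //   static void sao_class2_chroma_sketch(UWORD8 *pu1_src, WORD32 strd,
    //                                        const WORD8 *pi1_off_u,
    //                                        const WORD8 *pi1_off_v,
    //                                        const WORD8 *pi1_edge_tbl,
    //                                        WORD32 wd, WORD32 ht)
    //   {
    //       for(WORD32 row = 1; row < ht - 1; row++)
    //           for(WORD32 col = 2; col < wd - 2; col++)
    //           {
    //               UWORD8 *c = pu1_src + row * strd + col;
    //               /* class 2 = 135 degree diagonal; the horizontal step
    //                  is +/-2 bytes because U and V are interleaved */
    //               WORD32 e = 2 + sign3(c[0] - c[-strd - 2])
    //                            + sign3(c[0] - c[strd + 2]);
    //               e = pi1_edge_tbl[e];
    //               WORD32 v = c[0] + ((col & 1) ? pi1_off_v[e]
    //                                            : pi1_off_u[e]);
    //               c[0] = (UWORD8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    //           }
    //   }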
    mov     x15,x4                      // *pu1_src_top_left 0x28
    //mov   x16,x5                      // *pu1_src_top_right 0x2c
    mov     x17,x6                      // *pu1_src_bot_left 0x30
    mov     x21,x7                      // *pu1_avail 0x34
    mov     x22,x8                      // *pi1_sao_offset_u 0x38
    mov     x23,x9                      // *pi1_sao_offset_v 0x3c
    mov     x24,x10                     // wd 0x40
    mov     x25,x11                     // ht 0x44


    mov     w7, w24                     //Loads wd
    mov     w8, w25                     //Loads ht
    SUB     x9,x7,#2                    //wd - 2

    mov     x4, x15                     //Loads pu1_src_top_left
    LDRH    w10,[x3,x9]                 //pu1_src_top[wd - 2]

    mov     x26, x0                     //Store pu1_src in sp
    MOV     x9,x7                       //Move width to x9 for loop count

    mov     x17, x2                     //Store pu1_src_left in sp
    mov     x5, x21                     //Loads pu1_avail
    mov     x6, x22                     //Loads pi1_sao_offset_u

    mov     x22, x3                     //Store pu1_src_top in sp
    SUB     sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values

    STRH    w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
    SUB     x10,x8,#1                   //ht-1
    madd    x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
    ADD     x12,sp,#10                  //temp array

AU1_SRC_TOP_LOOP:
    LD1     {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
    SUBS    x9,x9,#8                    //Decrement the loop count by 8
    ST1     {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE     AU1_SRC_TOP_LOOP

PU1_AVAIL_4_LOOP_U:
    LDRB    w9,[x5,#4]                  //pu1_avail[4]
    CMP     x9,#0
    LDRB    w9,[x0]                     //u1_pos_0_0_tmp_u = pu1_src[0]
    LDRB    w10,[x0,#1]                 //u1_pos_0_0_tmp_v = pu1_src[1]
    BEQ     PU1_AVAIL_7_LOOP_U

    LDRB    w11,[x4]                    //pu1_src_top_left[0]
    ADD     x14,x0,x1                   //pu1_src + src_strd

    SUB     x12,x9,x11                  //pu1_src[0] - pu1_src_top_left[0]

    LDRB    w14,[x14,#2]                //pu1_src[2 + src_strd]
    CMP     x12,#0

    movn    x20,#0
    csel    x12, x20, x12,LT
    SUB     x11,x9,x14                  //pu1_src[0] - pu1_src[2 + src_strd]

    MOV     x20,#1
    csel    x12, x20, x12,GT            //SIGN(pu1_src[0] - pu1_src_top_left[0])

    CMP     x11,#0
    movn    x20,#0
    csel    x11, x20, x11,LT
    ADRP    x14, :got:gi1_table_edge_idx //table pointer
    LDR     x14, [x14, #:got_lo12:gi1_table_edge_idx]
    MOV     x20,#1
    csel    x11, x20, x11,GT            //SIGN(pu1_src[0] - pu1_src[2 + src_strd])

    ADD     x11,x12,x11                 //SIGN(pu1_src[0] - pu1_src_top_left[0]) + SIGN(pu1_src[0] - pu1_src[2 + src_strd])
    ADD     x11,x11,#2                  //edge_idx

    LDRSB   x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     x12,#0                      //0 != edge_idx
    BEQ     PU1_AVAIL_4_LOOP_V
    LDRSB   x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
    ADD     x9,x9,x11                   //pu1_src[0] + pi1_sao_offset_u[edge_idx]
    mov     x20,#255
    cmp     x9,x20
    csel    x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
    mov     x20,#0
    cmp     x9,x20
    csel    x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_4_LOOP_V:

    LDRB    w11,[x4,#1]                 //pu1_src_top_left[1]
    ADD     x14,x0,x1                   //pu1_src + src_strd

    SUB     x12,x10,x11                 //pu1_src[1] - pu1_src_top_left[1]
    LDRB    w14,[x14,#3]                //pu1_src[3 + src_strd]

    CMP     x12,#0
    movn    x20,#0
    csel    x12, x20, x12,LT
    SUB     x11,x10,x14                 //pu1_src[1] - pu1_src[3 + src_strd]
    MOV     x20,#1
    csel    x12, x20, x12,GT            //SIGN(pu1_src[1] - pu1_src_top_left[1])

    CMP     x11,#0
    movn    x20,#0
    csel    x11, x20, x11,LT
    ADRP    x14, :got:gi1_table_edge_idx //table pointer
    LDR     x14, [x14, #:got_lo12:gi1_table_edge_idx]
    MOV     x20,#1
    csel    x11, x20, x11,GT            //SIGN(pu1_src[1] - pu1_src[3 + src_strd])

    ADD     x11,x12,x11                 //SIGN(pu1_src[1] - pu1_src_top_left[1]) + SIGN(pu1_src[1] - pu1_src[3 + src_strd])
    ADD     x11,x11,#2                  //edge_idx
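    // edge_idx is 2 + SIGN(cur - top_left) + SIGN(cur - bottom_right), a
    // value in [0,4]; gi1_table_edge_idx[] remaps it so that the non-edge
    // cases pick up a zero offset, and the offset result is clipped to
    // [0, (1 << bit_depth) - 1] by the cmp/csel pairs below.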

    LDRSB   x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     x12,#0                      //0 != edge_idx
    BEQ     PU1_AVAIL_7_LOOP_U
    mov     x11, x23                    //Loads pi1_sao_offset_v
    LDRSB   x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
    ADD     x10,x10,x11                 //pu1_src[1] + pi1_sao_offset_v[edge_idx]
    mov     x20,#255
    cmp     x10,x20
    csel    x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    mov     x20,#0
    cmp     x10,x20
    csel    x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_7_LOOP_U:
    STRB    w10,[sp,#7]
    STRB    w9,[sp,#6]

    LDRB    w10,[x5,#7]                 //pu1_avail[7]
    CMP     x10,#0
    SUB     x10,x7,#2                   //wd - 2
    SUB     x11,x8,#1                   //ht - 1
    madd    x12, x11, x1, x10           //wd - 2 + (ht - 1) * src_strd
    ADD     x12,x12,x0                  //pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB    w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB    w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]
    BEQ     PU1_AVAIL_3_LOOP

    SUB     x11,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
    SUB     x11,x11,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    LDRB    w11,[x11]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    SUB     x11,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
    CMP     x11,#0
    movn    x20,#0
    csel    x11, x20, x11,LT
    MOV     x20,#1
    csel    x11, x20, x11,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd])

    ADD     x14,x12,x1                  //pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
    ADD     x14,x14,#2                  //pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    LDRB    w14,[x14]                   //Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    SUB     x14,x10,x14                 //pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    CMP     x14,#0
    movn    x20,#0
    csel    x14, x20, x14,LT
    MOV     x20,#1
    csel    x14, x20, x14,GT            //SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])

    ADD     x11,x11,x14                 //Add 2 sign value
    ADD     x11,x11,#2                  //edge_idx
    ADRP    x14, :got:gi1_table_edge_idx //table pointer
    LDR     x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB   x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     x14,#0
    BEQ     PU1_AVAIL_7_LOOP_V
    LDRSB   x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
    ADD     x10,x10,x11                 //pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    mov     x20,#255
    cmp     x10,x20
    csel    x10, x20, x10, ge           //u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
    mov     x20,#0
    cmp     x10,x20
    csel    x10, x20, x10, LT           //u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_7_LOOP_V:
    ADD     x12,x12,#1
    SUB     x11,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
    SUB     x11,x11,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    LDRB    w11,[x11]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    SUB     x11,x9,x11                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
    CMP     x11,#0
    movn    x20,#0
    csel    x11, x20, x11,LT
    MOV     x20,#1
    csel    x11, x20, x11,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])

    ADD     x14,x12,x1                  //pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
    ADD     x14,x14,#2                  //pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    LDRB    w14,[x14]                   //Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    SUB     x14,x9,x14                  //pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
    CMP     x14,#0
    movn    x20,#0
    csel    x14, x20, x14,LT
    MOV     x20,#1
    csel    x14, x20, x14,GT            //SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])

    ADD     x11,x11,x14                 //Add 2 sign value
    ADD     x11,x11,#2                  //edge_idx
    ADRP    x14, :got:gi1_table_edge_idx //table pointer
    LDR     x14, [x14, #:got_lo12:gi1_table_edge_idx]

    LDRSB   x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
    CMP     x12,#0
    BEQ     PU1_AVAIL_3_LOOP
    mov     x14, x23                    //Loads pi1_sao_offset_v
    LDRSB   x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
    ADD     x9,x9,x11                   //pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx]
    mov     x20,#255
    cmp     x9,x20
    csel    x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
    mov     x20,#0
    cmp     x9,x20
    csel    x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STRB    w10,[sp,#8]
    movi    v0.16b, #2                  //const_2 = vdupq_n_s8(2)
    STRB    w9,[sp,#9]

    MOV     x12,x8                      //Move ht
    movi    v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
    MOV     x14,x2                      //Move pu1_src_left to pu1_src_left_cpy

    LDRB    w11,[x5,#3]                 //pu1_avail[3]
    movi    v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP     x11,#0

    SUB     x20,x12,#1                  //ht_tmp--
    csel    x12, x20, x12,EQ
    LDRB    w5,[x5,#2]                  //pu1_avail[2]

    CMP     x5,#0

    ADD     x20,x0,x1                   //pu1_src += src_strd
    csel    x0, x20, x0,EQ
    LD1     {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
    SUB     x20,x12,#1                  //ht_tmp--
    csel    x12, x20, x12,EQ

    mov     x6, x23                     //Loads pi1_sao_offset_v
    ADD     x20,x14,#2                  //pu1_src_left_cpy += 2
    csel    x14, x20, x14,EQ

    mov     x27, x0                     //Store pu1_src in sp
    LD1     {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
    ADRP    x2, :got:gi1_table_edge_idx //table pointer
    LDR     x2, [x2, #:got_lo12:gi1_table_edge_idx]

    MOV     x6,x7                       //move wd to x6 loop_count
    movi    v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
    CMP     x7,#16                      //Compare wd with 16

    BLT     WIDTH_RESIDUE               //If wd < 16 jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    CMP     x8,#4                       //Compare ht with 4
    BLE     WD_16_HT_4_LOOP             //If ht <= 4 jump to WD_16_HT_4_LOOP

WIDTH_LOOP_16:
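    // Each iteration filters a 16-byte chunk (8 interleaved U/V pairs).
    // au1_mask (v1) is patched below so that edge_idx becomes 0 in the
    // first and last pairs when pu1_avail[0]/pu1_avail[1] flag the left
    // and right neighbours as unavailable, leaving those samples unchanged.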
    mov     x5, x21                     //Loads pu1_avail
    mov     w7, w24                     //Loads wd
    CMP     x6,x7                       //col == wd
    LDRB    w20, [x5]                   //pu1_avail[0]
    csel    w8,w20,w8,EQ

    MOV     x20,#-1
    csel    x8, x20, x8,NE
    mov     v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP     x6,#16                      //if(col == 16)
    mov     v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    BNE     SKIP_AU1_MASK_VAL
    LDRB    w8,[x5,#1]                  //pu1_avail[1]
    mov     v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov     v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB    w9,[x5,#2]                  //pu1_avail[2]
    LD1     {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1   {v13.8b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB   x0, x0,#8
    CMP     x9,#0

    mov     w4, w25                     //Loads ht
    SUB     x20,x0,x1                   //pu1_src - src_strd
    csel    x8, x20, x8,EQ

    mov     w7, w24                     //Loads wd
    csel    x8, x3, x8,NE               //pu1_src_top_cpy

    SUB     x8,x8,#2                    //pu1_src - src_strd - 2
    ADD     x3,x3,#16

    ADD     x5,sp,#0x4B                 //*au1_src_left_tmp
    LD1     {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //LD1   {v11.8b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //SUB   x8, x8,#8
    SUB     x7,x7,x6                    //(wd - col)

    ADD     x7,x7,#14                   //14 + (wd - col), offset of the last U/V pair in this chunk
    cmhi    v17.16b, v5.16b, v3.16b     //vcgtq_u8(pu1_cur_row, pu1_top_row)
    mov     x8, x26                     //Loads *pu1_src

    ADD     x7,x8,x7                    //pu1_src[0 * src_strd + 14 + (wd - col)]
    cmhi    v16.16b, v3.16b, v5.16b     //vcltq_u8(pu1_cur_row, pu1_top_row)

AU1_SRC_LEFT_LOOP:
    LDRH    w8,[x7]                     //load the value and increment by src_strd
    SUBS    x4,x4,#1                    //decrement the loop count

    STRH    w8,[x5],#2                  //store it in the stack pointer
    ADD     x7,x7,x1

    BNE     AU1_SRC_LEFT_LOOP

    ADD     x8,x0,x1                    //I *pu1_src + src_strd
    SUB     v17.16b, v16.16b, v17.16b   //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV     x7,x12                      //row count, move ht_tmp to x7

    LD1     {v16.16b},[x8]              //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1   {v17.8b},[x8]               //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB   x8, x8,#8

    ADD     x8,x8,#16                   //I
    movi    v18.16b, #0
    LDRH    w5,[x8]                     //I pu1_src_cpy[src_strd + 16]

    mov     x10, x21                    //I Loads pu1_avail
    mov     v18.h[0], w5                //I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    LDRB    w10,[x10,#2]                //I pu1_avail[2]

    CMP     x10,#0                      //I
    EXT     v18.16b, v16.16b, v18.16b,#2 //I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    BNE     SIGN_UP_CHANGE_DONE         //I

    LDRB    w11,[x0]                    //I pu1_src_cpy[0]
    SUB     x4,x12,x7                   //I ht_tmp - row

    LDRB    w10,[x0,#1]                 //I pu1_src_cpy[1]
    LSL     x4,x4,#1                    //I (ht_tmp - row) * 2

    ADD     x9,x14,x4                   //I pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub     x13,x9,#2
    LDRB    w5,[x13]                    //I load the value

    SUB     x8,x11,x5                   //I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    sub     x13,x9,#1
    LDRB    w5,[x13]                    //I load the value

    CMP     x8,#0                       //I
    SUB     x4,x10,x5                   //I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]

    movn    x20,#0
    csel    x8, x20, x8,LT              //I
    MOV     x20,#1
    csel    x8, x20, x8,GT              //I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    CMP     x4,#0                       //I
    mov     v17.b[0], w8                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    movn    x20,#0
    csel    x4, x20, x4,LT              //I

    MOV     x20,#1
    csel    x4, x20, x4,GT              //I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    mov     v17.b[1], w4                //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE:
    LD1     {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    cmhi    v20.16b, v5.16b, v18.16b    //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi    v22.16b, v18.16b, v5.16b    //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB     v22.16b, v22.16b, v20.16b   //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD     v18.16b, v0.16b, v17.16b    //I edge_idx = vaddq_s8(const_2, sign_up)
    ADD     v18.16b, v18.16b, v22.16b   //I edge_idx = vaddq_s8(edge_idx, sign_down)

    TBL     v18.16b, {v30.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG     v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)

    //TBL   v19.8b, {v30.16b},v19.8b    //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
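    // For interleaved chroma a one-pixel shift is two bytes: sign_up is
    // rotated by a whole U/V pair (EXT #14), and the edge_idx lanes are
    // de-interleaved with UZP1/UZP2 so U lanes index offset_tbl_u and V
    // lanes index offset_tbl_v, before ZIP1/ZIP2 re-interleaves the offsets.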
    EXT     v17.16b, v17.16b, v17.16b,#14 //I sign_up = vextq_s8(sign_up, sign_up, 14)

    Uxtl    v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND     v22.16b, v18.16b, v1.16b    //I edge_idx = vandq_s8(edge_idx, au1_mask)
    mov     v23.d[0],v22.d[1]

    Uxtl2   v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    UZP1    v31.8b, v22.8b, v23.8b      //I de-interleave edge_idx into U lanes
    UZP2    v23.8b, v22.8b, v23.8b      //I and V lanes
    mov     v22.8b,v31.8b

    TBL     v22.8b, {v6.16b},v22.8b     //I offsets for the U lanes
    TBL     v23.8b, {v7.16b},v23.8b     //I offsets for the V lanes
    ZIP1    v31.8b, v22.8b, v23.8b      //I re-interleave the offsets
    ZIP2    v23.8b, v22.8b, v23.8b      //I
    mov     v22.8b,v31.8b

    mov     v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
    SADDW   v20.8h, v20.8h, v22.8b      //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    SMAX    v20.8h, v20.8h, v2.8h       //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN    v20.8h, v20.8h, v4.8h       //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW   v18.8h, v18.8h, v23.8b      //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX    v18.8h, v18.8h, v2.8h       //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    UMIN    v18.8h, v18.8h, v4.8h       //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
    SUB     x7,x7,#1                    //I Decrement the ht_tmp loop count by 1

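    // Main row loop, software-pipelined two rows per iteration: statements
    // tagged II and III belong to the two rows in flight, so each row's
    // store is overlapped with the classification of the row below it.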
PU1_SRC_LOOP:
    ADD     x8,x0,x1,LSL #1             //II *pu1_src + 2 * src_strd
    xtn     v20.8b, v20.8h              //I vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD     x11,x8,x1                   //III *pu1_src + 3 * src_strd

    LD1     {v16.16b},[x8]              //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1   {v17.8b},[x8]               //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB   x8, x8,#8
    LD1     {v30.16b},[x11]             //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1   {v31.8b},[x11]              //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB   x11, x11,#8

    ADD     x8,x8,#16                   //II
    xtn2    v20.16b, v18.8h             //I vmovn_s16(pi2_tmp_cur_row.val[1])
    LDRH    w5,[x8]                     //II pu1_src_cpy[src_strd + 16]

    ADD     x11,x11,#16                 //III
    mov     v28.h[0], w5                //II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    LDRH    w4,[x11]                    //III pu1_src_cpy[src_strd + 16]

    LDRB    w8,[x0,x1]                  //II pu1_src_cpy[0]
    EXT     v28.16b, v16.16b, v28.16b,#2 //II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    SUB     x5,x12,x7                   //II ht_tmp - row

    LSL     x5,x5,#1                    //II (ht_tmp - row) * 2
    mov     v18.h[0], w4                //III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD     x9,x14,x5                   //II pu1_src_left_cpy[(ht_tmp - row) * 2]

    sub     x13,x9,#2
    LDRB    w11,[x13]                   //II load the value
    ST1     {v20.16b},[x0],x1           //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB     x8,x8,x11                   //II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    CMP     x8,#0                       //II
    EXT     v18.16b, v30.16b, v18.16b,#2 //III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    LDRB    w11,[x0,#1]                 //II pu1_src_cpy[1]

    movn    x20,#0
    csel    x8, x20, x8,LT              //II
    cmhi    v22.16b, v5.16b, v28.16b    //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOV     x20,#1
    csel    x8, x20, x8,GT              //II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    sub     x13,x9,#1
    LDRB    w5,[x13]                    //II load the value
    mov     v17.b[0], w8                //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    SUB     x7,x7,#1                    //II Decrement the ht_tmp loop count by 1

    SUB     x11,x11,x5                  //II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    cmhi    v24.16b, v28.16b, v5.16b    //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    CMP     x11,#0                      //II

    movn    x20,#0
    csel    x11, x20, x11,LT            //II
    SUB     v24.16b, v24.16b, v22.16b   //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV     x20,#1
    csel    x11, x20, x11,GT            //II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])

    LDRB    w4,[x0,x1]                  //III pu1_src_cpy[0]
    LD1     {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SUB     x5,x12,x7                   //III ht_tmp - row

    ADD     x10,x0,x1
    mov     v17.b[1], w11               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    LSL     x5,x5,#1                    //III (ht_tmp - row) * 2

    ADD     x9,x14,x5                   //III pu1_src_left_cpy[(ht_tmp - row) * 2]
    ADD     v26.16b, v0.16b, v17.16b    //II edge_idx = vaddq_s8(const_2, sign_up)
    LDRB    w10,[x10,#1]                //III pu1_src_cpy[1]

    sub     x13,x9,#2
    LDRB    w5,[x13]                    //III load the value
    ADD     v26.16b, v26.16b, v24.16b   //II edge_idx = vaddq_s8(edge_idx, sign_down)
    SUB     x4,x4,x5                    //III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    mov     v22.d[1],v22.d[0]
    CMP     x4,#0                       //III
    sub     x13,x9,#1
    LDRB    w9,[x13]                    //III load the value
    TBL     v26.16b, {v22.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG     v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)

    movn    x20,#0
    csel    x4, x20, x4,LT              //III
    SUB     x10,x10,x9                  //III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    //TBL   v27.8b, {v22.16b},v27.8b    //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT     v17.16b, v17.16b, v17.16b,#14 //II sign_up = vextq_s8(sign_up, sign_up, 14)

    MOV     x20,#1
    csel    x4, x20, x4,GT              //III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    AND     v26.16b, v26.16b, v1.16b    //II edge_idx = vandq_s8(edge_idx, au1_mask)
    CMP     x10,#0                      //III

    mov     v27.d[0],v26.d[1]
    UZP1    v31.8b, v26.8b, v27.8b      //II
    UZP2    v27.8b, v26.8b, v27.8b      //II
    mov     v26.8b,v31.8b
    mov     v17.b[0], w4                //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    movn    x20,#0
    csel    x10, x20, x10,LT            //III
    MOV     x20,#1
    csel    x10, x20, x10,GT            //III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    TBL     v24.8b, {v6.16b},v26.8b     //II
    cmhi    v20.16b, v16.16b, v18.16b   //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    cmhi    v22.16b, v18.16b, v16.16b   //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    TBL     v25.8b, {v7.16b},v27.8b     //II
    SUB     v22.16b, v22.16b, v20.16b   //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    mov     v17.b[1], w10               //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    ZIP1    v31.8b, v24.8b, v25.8b      //II
    ZIP2    v25.8b, v24.8b, v25.8b      //II
    mov     v24.8b,v31.8b

    Uxtl    v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    ADD     v18.16b, v0.16b, v17.16b    //III edge_idx = vaddq_s8(const_2, sign_up)

    LD1     {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    SADDW   v28.8h, v28.8h, v24.8b      //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    ADD     v18.16b, v18.16b, v22.16b   //III edge_idx = vaddq_s8(edge_idx, sign_down)
    SMAX    v28.8h, v28.8h, v2.8h       //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    UMIN    v28.8h, v28.8h, v4.8h       //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    TBL     v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    NEG     v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)

    //TBL   v19.8b, {v20.16b},v19.8b    //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    EXT     v17.16b, v17.16b, v17.16b,#14 //III sign_up = vextq_s8(sign_up, sign_up, 14)

    Uxtl2   v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    AND     v18.16b, v18.16b, v1.16b    //III edge_idx = vandq_s8(edge_idx, au1_mask)

    mov     v19.d[0],v18.d[1]
    UZP1    v31.8b, v18.8b, v19.8b      //III
    UZP2    v19.8b, v18.8b, v19.8b      //III
    mov     v18.8b,v31.8b
    TBL     v22.8b, {v6.16b},v18.8b     //III
    SADDW   v26.8h, v26.8h, v25.8b      //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    mov     v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
    TBL     v23.8b, {v7.16b},v19.8b     //III
    SMAX    v26.8h, v26.8h, v2.8h       //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    Uxtl    v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    UMIN    v26.8h, v26.8h, v4.8h       //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    ZIP1    v31.8b, v22.8b, v23.8b      //III
    ZIP2    v23.8b, v22.8b, v23.8b      //III
    mov     v22.8b,v31.8b
    xtn     v28.8b, v28.8h              //II vmovn_s16(pi2_tmp_cur_row.val[0])

    xtn2    v28.16b, v26.8h             //II vmovn_s16(pi2_tmp_cur_row.val[1])
    SADDW   v20.8h, v20.8h, v22.8b      //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    Uxtl2   v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SMAX    v20.8h, v20.8h, v2.8h       //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    UMIN    v20.8h, v20.8h, v4.8h       //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    SADDW   v18.8h, v18.8h, v23.8b      //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB     x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
    SMAX    v18.8h, v18.8h, v2.8h       //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    CMP     x7,#1

    ST1     {v28.16b},[x0],x1           //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    UMIN    v18.8h, v18.8h, v4.8h       //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT     PU1_SRC_LOOP                //If two or more rows remain, loop back
    BLT     INNER_LOOP_DONE
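    // Fall-through: exactly one row left (ht_tmp was odd); it is filtered
    // below with the same classify/offset/clip sequence before
    // INNER_LOOP_DONE.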

    ADD     x8,x0,x1,LSL #1             //*pu1_src + 2 * src_strd
    xtn     v20.8b, v20.8h              //III vmovn_s16(pi2_tmp_cur_row.val[0])

    LDRB    w11,[x0,x1]                 //pu1_src_cpy[0]
    LD1     {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1   {v17.8b},[x8]               //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB   x8, x8,#8
    SUB     x4,x12,x7                   //ht_tmp - row

    ADD     x8,x8,#16
    xtn2    v20.16b, v18.8h             //III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDRH    w5,[x8]                     //pu1_src_cpy[src_strd + 16]

    LSL     x4,x4,#1                    //(ht_tmp - row) * 2
    mov     v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD     x9,x14,x4                   //pu1_src_left_cpy[(ht_tmp - row) * 2]

    sub     x13,x9,#2
    LDRB    w5,[x13]                    //load the value
    EXT     v18.16b, v16.16b, v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    SUB     x8,x11,x5                   //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    CMP     x8,#0
    ST1     {v20.16b},[x0],x1           //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    movn    x20,#0
    csel    x8, x20, x8,LT

    MOV     x20,#1
    csel    x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    LD1     {v30.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)

    LDRB    w11,[x0,#1]                 //pu1_src_cpy[1]
    mov     v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    sub     x13,x9,#1
    LDRB    w5,[x13]                    //load the value

    SUB     x4,x11,x5                   //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    cmhi    v22.16b, v5.16b, v18.16b    //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    CMP     x4,#0

    movn    x20,#0
    csel    x4, x20, x4,LT
    cmhi    v24.16b, v18.16b, v5.16b    //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOV     x20,#1
    csel    x4, x20, x4,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])

    mov     v17.b[1], w4                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    SUB     v24.16b, v24.16b, v22.16b   //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD     v26.16b, v0.16b, v17.16b    //edge_idx = vaddq_s8(const_2, sign_up)
    ADD     v26.16b, v26.16b, v24.16b   //edge_idx = vaddq_s8(edge_idx, sign_down)

    mov     v30.d[1],v30.d[0]
    TBL     v26.16b, {v30.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL   v27.8b, {v30.16b},v27.8b    //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    Uxtl    v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    AND     v26.16b, v26.16b, v1.16b    //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov     v27.d[0],v26.d[1]

    Uxtl2   v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    UZP1    v31.8b, v26.8b, v27.8b
    UZP2    v27.8b, v26.8b, v27.8b
    mov     v26.8b,v31.8b

    TBL     v24.8b, {v6.16b},v26.8b
    TBL     v25.8b, {v7.16b},v27.8b
    ZIP1    v31.8b, v24.8b, v25.8b
    ZIP2    v25.8b, v24.8b, v25.8b
    mov     v24.8b,v31.8b

    SADDW   v20.8h, v20.8h, v24.8b      //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX    v20.8h, v20.8h, v2.8h       //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN    v20.8h, v20.8h, v4.8h       //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    SADDW   v18.8h, v18.8h, v25.8b      //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX    v18.8h, v18.8h, v2.8h       //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN    v18.8h, v18.8h, v4.8h       //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))


INNER_LOOP_DONE:
    mov     w8, w25                     //Loads ht
    xtn     v20.8b, v20.8h              //vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD     x5,sp,#0x4B                 //*au1_src_left_tmp

    mov     x11, x17                    //Loads *pu1_src_left
    xtn2    v20.16b, v18.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])


SRC_LEFT_LOOP:
    LDR     w7, [x5],#4                 //au1_src_left_tmp[row]
    SUBS    x8,x8,#2
    STR     w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]
    BNE     SRC_LEFT_LOOP

    SUBS    x6,x6,#16                   //Decrement the wd loop count by 16
    ST1     {v20.16b},[x0],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)
    CMP     x6,#8                       //Check whether residue remains

    BLT     RE_ASSINING_LOOP            //Jump to re-assigning loop
    mov     w7, w24                     //Loads wd
    mov     x0, x27                     //Loads *pu1_src
    SUB     x7,x7,x6
    ADD     x0,x0,x7
    BGT     WIDTH_LOOP_16               //If more 16-byte chunks remain, loop back
    BEQ     WIDTH_RESIDUE               //If residue remains jump to residue loop

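    // Variant of the 16-wide pass for blocks with ht <= 4, processing one
    // row per iteration since the two-row software pipeline above does not
    // pay off for such short columns.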
WD_16_HT_4_LOOP:
    mov     x5, x21                     //Loads pu1_avail
    mov     w7, w24                     //Loads wd
    CMP     x6,x7                       //col == wd
    LDRB    w20, [x5]                   //pu1_avail[0]
    csel    w8,w20,w8,EQ

    MOV     x20,#-1
    csel    x8, x20, x8,NE
    mov     v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    mov     v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    CMP     x6,#16                      //if(col == 16)
    BNE     SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB    w8,[x5,#1]                  //pu1_avail[1]
    mov     v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    mov     v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB    w8,[x5,#2]                  //pu1_avail[2]
    CMP     x8,#0

    SUB     x20,x0,x1                   //pu1_src - src_strd
    csel    x8, x20, x8,EQ
    csel    x8, x3, x8,NE               //pu1_src_top_cpy
    SUB     x8,x8,#2                    //pu1_src - src_strd - 2
    LD1     {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //LD1   {v11.8b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    //SUB   x8, x8,#8

    ADD     x3,x3,#16
    ADD     x5,sp,#0x4B                 //*au1_src_left_tmp
    mov     w4, w25                     //Loads ht
    mov     x7, x24                     //Loads wd
    SUB     x7,x7,x6                    //(wd - col)
    ADD     x7,x7,#14                   //14 + (wd - col)
    mov     x8, x26                     //Loads *pu1_src
    ADD     x7,x8,x7                    //pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRH    w8,[x7]                     //load the value and increment by src_strd
    STRH    w8,[x5],#2                  //store it in the stack pointer
    ADD     x7,x7,x1

    SUBS    x4,x4,#1                    //decrement the loop count
    BNE     AU1_SRC_LEFT_LOOP_WD_16_HT_4

    LD1     {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1   {v13.8b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB   x0, x0,#8

    cmhi    v17.16b, v5.16b, v3.16b     //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi    v16.16b, v3.16b, v5.16b     //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB     v17.16b, v16.16b, v17.16b   //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    movi    v18.16b, #0
    MOV     x7,x12                      //row count, move ht_tmp to x7

PU1_SRC_LOOP_WD_16_HT_4:
    movi    v18.16b, #0
    ADD     x8,x0,x1                    //*pu1_src + src_strd
    LD1     {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1   {v17.8b},[x8]               //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB   x8, x8,#8

    ADD     x8,x8,#16
    LDRH    w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    mov     v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT     v18.16b, v16.16b, v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP     x7,x12
    BLT     SIGN_UP_CHANGE_WD_16_HT_4
    mov     x5, x21                     //Loads pu1_avail
    LDRB    w5,[x5,#2]                  //pu1_avail[2]
    CMP     x5,#0
    BNE     SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB    w8,[x0]                     //pu1_src_cpy[0]
    SUB     x5,x12,x7                   //ht_tmp - row
    LSL     x5,x5,#1                    //(ht_tmp - row) * 2
    ADD     x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub     x13,x9,#2
    LDRB    w5,[x13]                    //load the value
    SUB     x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP     x8,#0
    movn    x20,#0
    csel    x8, x20, x8,LT
    MOV     x20,#1
    csel    x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov     v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB    w8,[x0,#1]                  //pu1_src_cpy[1]
    sub     x13,x9,#1
    LDRB    w5,[x13]                    //load the value
    SUB     x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP     x8,#0
    movn    x20,#0
    csel    x8, x20, x8,LT
    MOV     x20,#1
    csel    x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    mov     v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    cmhi    v22.16b, v5.16b, v18.16b    //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi    v24.16b, v18.16b, v5.16b    //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB     v24.16b, v24.16b, v22.16b   //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD     v26.16b, v0.16b, v17.16b    //edge_idx = vaddq_s8(const_2, sign_up)
    ADD     v26.16b, v26.16b, v24.16b   //edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1     {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    TBL     v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL   v27.8b, {v22.16b},v27.8b    //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND     v26.16b, v26.16b, v1.16b    //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov     v27.d[0],v26.d[1]

    NEG     v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT     v17.16b, v17.16b, v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)

    UZP1    v31.8b, v26.8b, v27.8b
    UZP2    v27.8b, v26.8b, v27.8b
    mov     v26.8b,v31.8b
    TBL     v24.8b, {v6.16b},v26.8b
    TBL     v25.8b, {v7.16b},v27.8b
    ZIP1    v31.8b, v24.8b, v25.8b
    ZIP2    v25.8b, v24.8b, v25.8b
    mov     v24.8b,v31.8b

    Uxtl    v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW   v28.8h, v28.8h, v24.8b      //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX    v28.8h, v28.8h, v2.8h       //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN    v28.8h, v28.8h, v4.8h       //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    Uxtl2   v26.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    SADDW   v26.8h, v26.8h, v25.8b      //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    SMAX    v26.8h, v26.8h, v2.8h       //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    UMIN    v26.8h, v26.8h, v4.8h       //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    xtn     v28.8b, v28.8h              //vmovn_s16(pi2_tmp_cur_row.val[0])
    xtn2    v28.16b, v26.8h             //vmovn_s16(pi2_tmp_cur_row.val[1])

    ST1     {v28.16b},[x0],x1           //vst1q_u8(pu1_src_cpy, pu1_cur_row)

    mov     v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS    x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    BNE     PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    mov     w8, w25                     //Loads ht
    ADD     x5,sp,#0x4B                 //*au1_src_left_tmp
    mov     x11, x17                    //Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR     w7, [x5],#4                 //au1_src_left_tmp[row]
    STR     w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]

    SUBS    x8,x8,#2
    BNE     SRC_LEFT_LOOP_WD_16_HT_4


    SUBS    x6,x6,#16                   //Decrement the wd loop count by 16
    CMP     x6,#8                       //Check whether residue remains
    BLT     RE_ASSINING_LOOP            //Jump to re-assigning loop
    mov     w7, w24                     //Loads wd
    mov     x0, x27                     //Loads *pu1_src
    SUB     x7,x7,x6
    ADD     x0,x0,x7
    BGT     WD_16_HT_4_LOOP
    BEQ     WIDTH_RESIDUE               //If residue remains jump to residue loop

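    // Residue pass for the final 8 bytes (4 U/V pairs) when wd is not a
    // multiple of 16; only the low half of each vector is stored, and the
    // right-edge mask lanes are 6/7 instead of 14/15.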
WIDTH_RESIDUE:
    mov     w7, w24                     //Loads wd
    mov     x5, x21                     //Loads pu1_avail
    CMP     x6,x7                       //wd_residue == wd
    LDRB    w20, [x5]                   //pu1_avail[0]
    csel    w8,w20,w8,EQ

    MOV     x20,#-1
    csel    x8, x20, x8,NE
    mov     v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    mov     v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 1)

    LDRB    w8,[x5,#1]                  //pu1_avail[1]
    mov     v1.b[6], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    mov     v1.b[7], w8                 //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)

    LDRB    w8,[x5,#2]                  //pu1_avail[2]
    CMP     x8,#0

    SUB     x20,x0,x1                   //pu1_src - src_strd
    csel    x8, x20, x8,EQ
    csel    x8, x3, x8,NE
    SUB     x8,x8,#2                    //pu1_src - src_strd - 2
    LD1     {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    //LD1   {v11.8b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    //SUB   x8, x8,#8

    ADD     x5,sp,#0x4B                 //*au1_src_left_tmp
    mov     w4, w25                     //Loads ht
    mov     w7, w24                     //Loads wd
    mov     x8, x26                     //Loads *pu1_src
    SUB     x7,x7,#2                    //(wd - 2)
    ADD     x7,x8,x7                    //pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRH    w8,[x7]                     //load the value and increment by src_strd
    STRH    w8,[x5],#2                  //store it in the stack pointer
    ADD     x7,x7,x1
    SUBS    x4,x4,#1                    //decrement the loop count
    BNE     AU1_SRC_LEFT_LOOP_RESIDUE

    LD1     {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //LD1   {v13.8b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
    //SUB   x0, x0,#8

    cmhi    v17.16b, v5.16b, v3.16b     //vcgtq_u8(pu1_cur_row, pu1_top_row)
    cmhi    v16.16b, v3.16b, v5.16b     //vcltq_u8(pu1_cur_row, pu1_top_row)
    SUB     v17.16b, v16.16b, v17.16b   //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV     x7,x12                      //row count, move ht_tmp to x7

PU1_SRC_LOOP_RESIDUE:
    movi    v18.16b, #0
    ADD     x8,x0,x1                    //*pu1_src + src_strd
    LD1     {v16.16b},[x8]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //LD1   {v17.8b},[x8]               //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    //SUB   x8, x8,#8

    ADD     x8,x8,#16
    LDRH    w5,[x8]                     //pu1_src_cpy[src_strd + 16]
    mov     v18.h[0], w5                //pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    EXT     v18.16b, v16.16b, v18.16b,#2 //pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP     x7,x12
    BLT     SIGN_UP_CHANGE_RESIDUE
    mov     x5, x21                     //Loads pu1_avail
    LDRB    w5,[x5,#2]                  //pu1_avail[2]
    CMP     x5,#0
    BNE     SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB    w8,[x0]                     //pu1_src_cpy[0]
    SUB     x5,x12,x7                   //ht_tmp - row
    LSL     x5,x5,#1                    //(ht_tmp - row) * 2
    ADD     x9,x14,x5                   //pu1_src_left_cpy[(ht_tmp - row) * 2]
    sub     x13,x9,#2
    LDRB    w5,[x13]                    //load the value
    SUB     x8,x8,x5                    //pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP     x8,#0
    movn    x20,#0
    csel    x8, x20, x8,LT
    MOV     x20,#1
    csel    x8, x20, x8,GT              //SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    mov     v17.b[0], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB    w8,[x0,#1]                  //pu1_src_cpy[1]
    sub     x13,x9,#1
    LDRB    w5,[x13]                    //load the value
    SUB     x8,x8,x5                    //pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP     x8,#0
    movn    x20,#0
    csel    x8, x20, x8,LT
    MOV     x20,#1
    csel    x8, x20, x8,GT              //SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1])
    mov     v17.b[1], w8                //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_RESIDUE:
    cmhi    v22.16b, v5.16b, v18.16b    //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    cmhi    v24.16b, v18.16b, v5.16b    //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    SUB     v24.16b, v24.16b, v22.16b   //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    ADD     v26.16b, v0.16b, v17.16b    //edge_idx = vaddq_s8(const_2, sign_up)
    ADD     v26.16b, v26.16b, v24.16b   //edge_idx = vaddq_s8(edge_idx, sign_down)

    LD1     {v22.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    mov     v22.d[1],v22.d[0]
    TBL     v26.16b, {v22.16b},v26.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    //TBL   v27.8b, {v22.16b},v27.8b    //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    AND     v26.16b, v26.16b, v1.16b    //edge_idx = vandq_s8(edge_idx, au1_mask)
    mov     v27.d[0],v26.d[1]

    NEG     v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
    EXT     v17.16b, v17.16b, v17.16b,#14 //sign_up = vextq_s8(sign_up, sign_up, 14)

    UZP1    v31.8b, v26.8b, v27.8b
    UZP2    v27.8b, v26.8b, v27.8b
    mov     v26.8b,v31.8b
    TBL     v24.8b, {v6.16b},v26.8b
    TBL     v25.8b, {v7.16b},v27.8b
    ZIP1    v31.8b, v24.8b, v25.8b
    ZIP2    v25.8b, v24.8b, v25.8b
    mov     v24.8b,v31.8b

    Uxtl    v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    SADDW   v28.8h, v28.8h, v24.8b      //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    SMAX    v28.8h, v28.8h, v2.8h       //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    UMIN    v28.8h, v28.8h, v4.8h       //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    xtn     v28.8b, v28.8h              //vmovn_s16(pi2_tmp_cur_row.val[0])

    ST1     {v28.8b},[x0],x1            //vst1_u8(pu1_src_cpy, low half of pu1_cur_row)

    mov     v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
    SUBS    x7,x7,#1                    //Decrement the ht_tmp loop count by 1
    BNE     PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP_RESIDUE

    mov     w8, w25                     //Loads ht
    mov     x11, x17                    //Loads *pu1_src_left
    ADD     x5,sp,#0x4B                 //*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR     w7, [x5],#4                 //au1_src_left_tmp[row]
    SUBS    x8,x8,#2
    STR     w7, [x11],#4                //pu1_src_left[row] = au1_src_left_tmp[row]

    BNE     SRC_LEFT_LOOP_RESIDUE

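    // Restore bookkeeping for neighbouring blocks: the corner U/V pairs
    // computed up front and parked at sp+#6 and sp+#8 are written back into
    // the filtered block, and *pu1_src_top_left is refreshed from the value
    // saved before filtering.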
RE_ASSINING_LOOP:
    mov     w8, w25                     //Loads ht

    mov     x0, x26                     //Loads *pu1_src
    SUB     x8,x8,#1                    //ht - 1

    mov     w7, w24                     //Loads wd

    LDRH    w9,[sp,#6]
    madd    x6, x8, x1, x7              //wd + (ht - 1) * src_strd

    STRH    w9,[x0]                     //pu1_src_org[0] = u1_pos_0_0_tmp
    ADD     x6,x0,x6                    //pu1_src[wd + (ht - 1) * src_strd]

    LDRH    w9,[sp,#8]
    ADD     x12,sp,#10
    sub     x13,x6,#2
    STRH    w9,[x13]                    //pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u/v

    mov     x4, x15                     //Loads pu1_src_top_left
    LDRH    w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
    STRH    w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
    mov     x3, x22                     //Loads pu1_src_top

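    // Copy the pre-filter bottom row saved in au1_src_top_tmp back into
    // pu1_src_top so the block below classifies against unfiltered samples.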
SRC_TOP_LOOP:
    LD1     {v0.8b},[x12],#8            //pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS    x7,x7,#8                    //Decrement the width
    ST1     {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
    BNE     SRC_TOP_LOOP

END_LOOPS:
    ADD     sp,sp,#0xE0
    // LDMFD sp!,{x4-x12,x15}           //Reload the registers from SP
    ldp     x27, x28,[sp],#16
    ldp     x25, x26,[sp],#16
    ldp     x23, x24,[sp],#16
    ldp     x21, x22,[sp],#16
    ldp     x19, x20,[sp],#16

    ret
