///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * @file
// *  ihevc_itrans_recon_8x8_neon.s
// *
// * @brief
// *  contains function definitions for single stage inverse transform
// *
// * @author
// *  anand s
// *
// * @par list of functions:
// *  - ihevc_itrans_recon_8x8()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for an 8x8
// *  input block
// *
// * @par description:
// *  performs inverse transform, adds the prediction data and clips the
// *  output to 8 bit
// *
// * @param[in] pi2_src
// *  input 8x8 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 8x8 buffer for storing the 1st stage output of the inverse
// *  transform
// *
// * @param[in] pu1_pred
// *  prediction 8x8 block
// *
// * @param[out] pu1_dst
// *  output 8x8 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] shift
// *  output shift
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src
// *
// * @param[in] zero_rows
// *  zero rows in pi2_src
// *
// * @returns void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void ihevc_itrans_recon_8x8(word16 *pi2_src,
//                            word16 *pi2_tmp,
//                            uword8 *pu1_pred,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 pred_strd,
//                            word32 dst_strd,
//                            word32 zero_cols,
//                            word32 zero_rows)

//**************variables vs registers*************************
//  x0 => *pi2_src
//  x1 => *pi2_tmp
//  x2 => *pu1_pred
//  x3 => *pu1_dst
//  src_strd
//  pred_strd
//  dst_strd
//  zero_cols


.text
.align 4
.include "ihevc_neon_macros.s"


.set width_x_size_x5 ,  40
.set width_x_size_x2 ,  32
.set shift_stage1_idct , 7
.set shift_stage2_idct , 12

.globl ihevc_itrans_recon_8x8_av8

.extern g_ai2_ihevc_trans_8_transpose

.type ihevc_itrans_recon_8x8_av8, %function

ihevc_itrans_recon_8x8_av8:
////register usage.extern - loading and until idct of columns
//// cosine constants     -   d0
//// sine constants       -   d1
//// row 0 first half     -   d2  - y0
//// row 1 first half     -   d6  - y1
//// row 2 first half     -   d3  - y2
//// row 3 first half     -   d7  - y3
//// row 4 first half     -   d10 - y4
//// row 5 first half     -   d14 - y5
//// row 6 first half     -   d11 - y6
//// row 7 first half     -   d15 - y7

//// row 0 second half    -   d4  - y0
//// row 1 second half    -   d8  - y1
//// row 2 second half    -   d5  - y2
//// row 3 second half    -   d9  - y3
//// row 4 second half    -   d12 - y4
//// row 5 second half    -   d16 - y5
//// row 6 second half    -   d13 - y6
//// row 7 second half    -   d17 - y7
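//// ---------------------------------------------------------------------------
//// To make the scheduling below easier to follow, here is a minimal scalar
//// sketch (ours, not part of the library) of what the kernel computes.
//// idct8_1d() and clip_u8() are hypothetical helpers, strides are ignored
//// for brevity, and the 8-point butterfly itself is sketched further below:
////
////     int16_t tmp[8*8], out[8*8];
////     for (int c = 0; c < 8; c++)                  /* stage 1: columns */
////         idct8_1d(&src[8*0 + c], 8, &tmp[c], shift_stage1_idct);  /* >> 7 */
////     for (int r = 0; r < 8; r++)                  /* stage 2: rows    */
////         idct8_1d(&tmp[8*r], 1, &out[8*r], shift_stage2_idct);    /* >> 12 */
////     for (int r = 0; r < 8; r++)                  /* reconstruction   */
////         for (int c = 0; c < 8; c++)
////             dst[r][c] = clip_u8(pred[r][c] + out[8*r + c]);
////
//// zero_rows/zero_cols are bitmasks flagging all-zero input rows/columns;
//// the branches below use them to skip arithmetic on all-zero halves.
//// ---------------------------------------------------------------------------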
    //// copy the input pointer to another register
    //// step 1 : load all constants
    // stmfd sp!,{x4-x12,x14}

    ldr w11, [sp]               // zero rows

    push_v_regs
    stp x19, x20,[sp,#-16]!

    mov x12, x7                 // zero columns
    mov x8, x5                  // prediction stride
    mov x7, x6                  // destination stride
    mov x6, x4                  // src stride
    lsl x6, x6, #1              // x sizeof(word16)
    add x9,x0,x6, lsl #1        // 2 rows

    add x10,x6,x6, lsl #1       // 3 rows

    sub x10,x10, #8             // - 4 cols * sizeof(word16)
    sub x5,x6, #8               // src_strd - 4 cols * sizeof(word16)

    adrp x14, :got:g_ai2_ihevc_trans_8_transpose
    ldr x14, [x14, #:got_lo12:g_ai2_ihevc_trans_8_transpose]

    ld1 {v0.4h, v1.4h},[x14]    ////d0,d1 are used for storing the constant data

    ////step 2 : load all the input data
    ////step 3 : operate on the first 4 columns at a time

    and x11,x11,#0xff
    and x12,x12,#0xff

    cmp x11,#0xf0
    bge skip_last4_rows


    ld1 {v2.4h},[x0],#8
    ld1 {v3.4h},[x9],#8
    ld1 {v4.4h},[x0],x5
    smull v20.4s, v2.4h, v0.h[0]    //// y0 * cos4(part of c0 and c1)
    ld1 {v5.4h},[x9],x5
    smull v18.4s, v3.4h, v1.h[2]    //// y2 * sin2 (q3 is freed by this time)(part of d1)
    ld1 {v6.4h},[x0],#8
    ld1 {v7.4h},[x9],#8
    smull v24.4s, v6.4h, v0.h[1]    //// y1 * cos1(part of b0)
    ld1 {v8.4h},[x0],x10
    smull v26.4s, v6.4h, v0.h[3]    //// y1 * cos3(part of b1)
    ld1 {v9.4h},[x9],x10
    smull v28.4s, v6.4h, v1.h[1]    //// y1 * sin3(part of b2)
    ld1 {v10.4h},[x0],#8
    smull v30.4s, v6.4h, v1.h[3]    //// y1 * sin1(part of b3)
    ld1 {v11.4h},[x9],#8
    smlal v24.4s, v7.4h, v0.h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
    ld1 {v12.4h},[x0],x5
    smlsl v26.4s, v7.4h, v1.h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
    ld1 {v13.4h},[x9],x5
    smlsl v28.4s, v7.4h, v0.h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
    ld1 {v14.4h},[x0],#8
    smlsl v30.4s, v7.4h, v1.h[1]    //// y1 * sin1 - y3 * sin3(part of b3)
    ld1 {v15.4h},[x9],#8
    smull v22.4s, v10.4h, v0.h[0]   //// y4 * cos4(part of c0 and c1)
    ld1 {v16.4h},[x0],x10
    smull v6.4s, v3.4h, v0.h[2]     //// y2 * cos2(part of d0)
    ld1 {v17.4h},[x9],x10

    ///* the following loads were used when the input was not guaranteed to be aligned */
//// vld1.16 d2,[x0]!
//// vld1.16 d3,[x2]!
//// vld1.16 d4,[x0]!
//// vld1.16 d5,[x2]!
//// vld1.16 d6,[x0]!
//// vld1.16 d7,[x2]!
//// vld1.16 d8,[x0],x3
//// vld1.16 d9,[x2],x3
//// vld1.16 d10,[x0]!
//// vld1.16 d11,[x2]!
//// vld1.16 d12,[x0]!
//// vld1.16 d13,[x2]!
//// vld1.16 d14,[x0]!
//// vld1.16 d15,[x2]!
//// vld1.16 d16,[x0],x3
//// vld1.16 d17,[x2],x3
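    //// The smull/smlal/smlsl chains around this point implement the even/odd
    //// decomposition of the 8-point inverse transform. Assuming
    //// g_ai2_ihevc_trans_8_transpose holds the usual HEVC constants
    //// (cos4 = 64, cos1 = 89, cos2 = 83, cos3 = 75, sin3 = 50, sin2 = 36,
    //// sin1 = 18), one column works out to (our sketch):
    ////
    ////     c0 = 64*y0 + 64*y4;             c1 = 64*y0 - 64*y4;
    ////     d0 = 83*y2 + 36*y6;             d1 = 36*y2 - 83*y6;
    ////     a0 = c0 + d0;  a3 = c0 - d0;    a1 = c1 + d1;  a2 = c1 - d1;
    ////     b0 = 89*y1 + 75*y3 + 50*y5 + 18*y7;
    ////     b1 = 75*y1 - 18*y3 - 89*y5 - 50*y7;
    ////     b2 = 50*y1 - 89*y3 + 18*y5 + 75*y7;
    ////     b3 = 18*y1 - 50*y3 + 75*y5 - 89*y7;
    ////     x0/x7 = (a0 +/- b0 + rnd) >> shift;  x1/x6 = (a1 +/- b1 + rnd) >> shift;
    ////     x2/x5 = (a2 +/- b2 + rnd) >> shift;  x3/x4 = (a3 +/- b3 + rnd) >> shift;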
    smlal v24.4s, v14.4h, v1.h[1]   //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl v26.4s, v14.4h, v0.h[1]   //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal v28.4s, v14.4h, v1.h[3]   //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal v30.4s, v14.4h, v0.h[3]   //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl v18.4s, v11.4h, v0.h[2]   //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal v6.4s, v11.4h, v1.h[2]    //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v10.4s, v20.4s , v22.4s     //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s , v22.4s     //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v15.4h, v1.h[3]   //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl v26.4s, v15.4h, v1.h[1]   //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal v28.4s, v15.4h, v0.h[3]   //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl v30.4s, v15.4h, v0.h[1]   //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    add v14.4s, v10.4s , v6.4s      //// a0 = c0 + d0(part of x0,x7)
    sub v10.4s, v10.4s , v6.4s      //// a3 = c0 - d0(part of x3,x4)
    sub v22.4s, v20.4s , v18.4s     //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s , v18.4s     //// a1 = c1 + d1(part of x1,x6)

    add v20.4s, v14.4s , v24.4s     //// a0 + b0(part of x0)
    sub v6.4s, v14.4s , v24.4s      //// a0 - b0(part of x7)

    add v24.4s, v22.4s , v28.4s     //// a2 + b2(part of x2)
    sub v22.4s, v22.4s , v28.4s     //// a2 - b2(part of x5)

    add v28.4s, v18.4s , v26.4s     //// a1 + b1(part of x1)
    sub v18.4s, v18.4s , v26.4s     //// a1 - b1(part of x6)

    add v26.4s, v10.4s , v30.4s     //// a3 + b3(part of x3)
    sub v30.4s, v10.4s , v30.4s     //// a3 - b3(part of x4)

    sqrshrn v2.4h, v20.4s,#shift_stage1_idct    //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v15.4h, v6.4s,#shift_stage1_idct    //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v3.4h, v24.4s,#shift_stage1_idct    //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v14.4h, v22.4s,#shift_stage1_idct   //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v6.4h, v28.4s,#shift_stage1_idct    //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v11.4h, v18.4s,#shift_stage1_idct   //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v7.4h, v26.4s,#shift_stage1_idct    //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v10.4h, v30.4s,#shift_stage1_idct   //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)


    b last4_cols


skip_last4_rows:

    ld1 {v2.4h},[x0],#8
    ld1 {v3.4h},[x9],#8
    ld1 {v4.4h},[x0],x5
    ld1 {v5.4h},[x9],x5
    ld1 {v6.4h},[x0],#8
    ld1 {v7.4h},[x9],#8
    ld1 {v8.4h},[x0],x10
    ld1 {v9.4h},[x9],x10

    movi v12.4h, #0
    movi v13.4h, #0
    movi v16.4h, #0
    movi v17.4h, #0

    smull v24.4s, v6.4h, v0.h[1]    //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3]    //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1]    //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3]    //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v1.h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v0.h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1]    //// y1 * sin1 - y3 * sin3(part of b3)

    smull v18.4s, v3.4h, v1.h[2]    //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull v6.4s, v3.4h, v0.h[2]     //// y2 * cos2(part of d0)

    smull v20.4s, v2.4h, v0.h[0]    //// y0 * cos4(part of c0 and c1)


    add v14.4s, v20.4s , v6.4s      //// a0 = c0 + d0(part of x0,x7)
    sub v10.4s, v20.4s , v6.4s      //// a3 = c0 - d0(part of x3,x4)
    sub v22.4s, v20.4s , v18.4s     //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s , v18.4s     //// a1 = c1 + d1(part of x1,x6)

    add v20.4s, v14.4s , v24.4s     //// a0 + b0(part of x0)
    sub v6.4s, v14.4s , v24.4s      //// a0 - b0(part of x7)

    add v24.4s, v22.4s , v28.4s     //// a2 + b2(part of x2)
    sub v22.4s, v22.4s , v28.4s     //// a2 - b2(part of x5)

    add v28.4s, v18.4s , v26.4s     //// a1 + b1(part of x1)
    sub v18.4s, v18.4s , v26.4s     //// a1 - b1(part of x6)

    add v26.4s, v10.4s , v30.4s     //// a3 + b3(part of x3)
    sub v30.4s, v10.4s , v30.4s     //// a3 - b3(part of x4)

    sqrshrn v2.4h, v20.4s,#shift_stage1_idct    //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v15.4h, v6.4s,#shift_stage1_idct    //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v3.4h, v24.4s,#shift_stage1_idct    //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v14.4h, v22.4s,#shift_stage1_idct   //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v6.4h, v28.4s,#shift_stage1_idct    //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v11.4h, v18.4s,#shift_stage1_idct   //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v7.4h, v26.4s,#shift_stage1_idct    //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v10.4h, v30.4s,#shift_stage1_idct   //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)

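    //// skip_last4_rows above is the zero_rows shortcut: when rows 4-7 of the
    //// input are known to be zero, y4..y7 drop out of the butterfly sketched
    //// earlier, so c0 = c1 = 64*y0, d0/d1 keep only their y2 term, b0..b3
    //// keep only the y1/y3 terms, and the corresponding multiplies are never
    //// issued (the second-half registers are zeroed with movi, not loaded).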
last4_cols:

    cmp x12,#0xf0
    bge skip_last4cols

    smull v24.4s, v8.4h, v0.h[1]    //// y1 * cos1(part of b0)
    smull v26.4s, v8.4h, v0.h[3]    //// y1 * cos3(part of b1)
    smull v28.4s, v8.4h, v1.h[1]    //// y1 * sin3(part of b2)
    smull v30.4s, v8.4h, v1.h[3]    //// y1 * sin1(part of b3)

    smlal v24.4s, v9.4h, v0.h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v9.4h, v1.h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v9.4h, v0.h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v9.4h, v1.h[1]    //// y1 * sin1 - y3 * sin3(part of b3)

    smull v18.4s, v5.4h, v1.h[2]    //// y2 * sin2 (q4 is freed by this time)(part of d1)
    smull v8.4s, v5.4h, v0.h[2]     //// y2 * cos2(part of d0)

    smull v20.4s, v4.4h, v0.h[0]    //// y0 * cos4(part of c0 and c1)
    smull v22.4s, v12.4h, v0.h[0]   //// y4 * cos4(part of c0 and c1)

    smlal v24.4s, v16.4h, v1.h[1]   //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl v26.4s, v16.4h, v0.h[1]   //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal v28.4s, v16.4h, v1.h[3]   //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal v30.4s, v16.4h, v0.h[3]   //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl v18.4s, v13.4h, v0.h[2]   //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal v8.4s, v13.4h, v1.h[2]    //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v12.4s, v20.4s , v22.4s     //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s , v22.4s     //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v17.4h, v1.h[3]   //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    smlsl v26.4s, v17.4h, v1.h[1]   //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    smlal v28.4s, v17.4h, v0.h[3]   //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    smlsl v30.4s, v17.4h, v0.h[1]   //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

    add v16.4s, v12.4s , v8.4s      //// a0 = c0 + d0(part of e0,e7)
    sub v12.4s, v12.4s , v8.4s      //// a3 = c0 - d0(part of e3,e4)
    sub v22.4s, v20.4s , v18.4s     //// a2 = c1 - d1(part of e2,e5)
    add v18.4s, v20.4s , v18.4s     //// a1 = c1 + d1(part of e1,e6)

    add v20.4s, v16.4s , v24.4s     //// a0 + b0(part of e0)
    sub v8.4s, v16.4s , v24.4s      //// a0 - b0(part of e7)

    add v24.4s, v22.4s , v28.4s     //// a2 + b2(part of e2)
    sub v22.4s, v22.4s , v28.4s     //// a2 - b2(part of e5)

    add v28.4s, v18.4s , v26.4s     //// a1 + b1(part of e1)
    sub v18.4s, v18.4s , v26.4s     //// a1 - b1(part of e6)

    add v26.4s, v12.4s , v30.4s     //// a3 + b3(part of e3)
    sub v30.4s, v12.4s , v30.4s     //// a3 - b3(part of e4)

    sqrshrn v4.4h, v20.4s,#shift_stage1_idct    //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v17.4h, v8.4s,#shift_stage1_idct    //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v5.4h, v24.4s,#shift_stage1_idct    //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v16.4h, v22.4s,#shift_stage1_idct   //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v8.4h, v28.4s,#shift_stage1_idct    //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v13.4h, v18.4s,#shift_stage1_idct   //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v9.4h, v26.4s,#shift_stage1_idct    //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    sqrshrn v12.4h, v30.4s,#shift_stage1_idct   //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    b end_skip_last4cols
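    //// skip_last4cols below is the zero_cols counterpart: when columns 4-7
    //// of every input row are zero, the right-hand 4x4 quadrants of the
    //// stage-1 output stay zero, so only the left-half results are
    //// transposed and stage 2 needs just the y0..y3 terms of each row.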


skip_last4cols:

    umov x15,v25.d[0]

    trn1 v25.4h, v2.4h, v6.4h
    trn2 v29.4h, v2.4h, v6.4h       ////[x3,x1],[x2,x0] first quadrant transposing

    trn1 v27.4h, v3.4h, v7.4h
    trn2 v31.4h, v3.4h, v7.4h       ////[x3,x1],[x2,x0] first quadrant transposing

    trn1 v6.2s, v29.2s, v31.2s
    trn2 v7.2s, v29.2s, v31.2s      ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1 v2.2s, v25.2s, v27.2s
    trn2 v3.2s, v25.2s, v27.2s      ////x0,x1,x2,x3 first quadrant transposing continued.....


    trn1 v25.4h, v10.4h, v14.4h
    trn2 v29.4h, v10.4h, v14.4h     ////[x7,x5],[x6,x4] third quadrant transposing

    trn1 v27.4h, v11.4h, v15.4h
    trn2 v31.4h, v11.4h, v15.4h     ////[x7,x5],[x6,x4] third quadrant transposing

    trn1 v10.2s, v25.2s, v27.2s
    trn2 v11.2s, v25.2s, v27.2s     ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1 v14.2s, v29.2s, v31.2s
    trn2 v15.2s, v29.2s, v31.2s     ////x4,x5,x6,x7 third quadrant transposing continued.....
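    //// The trn1/trn2 pairs transpose a 4x4 block of 16-bit elements in two
    //// passes: the .4h pass interleaves adjacent rows, the .2s pass swaps
    //// the off-diagonal 2x2 sub-blocks. Schematically (our sketch):
    ////
    ////     a0 a1 a2 a3        a0 b0 a2 b2        a0 b0 c0 d0
    ////     b0 b1 b2 b3  .4h   a1 b1 a3 b3  .2s   a1 b1 c1 d1
    ////     c0 c1 c2 c3  --->  c0 d0 c2 d2  --->  a2 b2 c2 d2
    ////     d0 d1 d2 d3        c1 d1 c3 d3        a3 b3 c3 d3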

    mov v25.d[0],x15

    smull v24.4s, v6.4h, v0.h[1]    //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3]    //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1]    //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3]    //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v1.h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v0.h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1]    //// y1 * sin1 - y3 * sin3(part of b3)

    smull v20.4s, v2.4h, v0.h[0]    //// y0 * cos4(part of c0 and c1)
// vmull.s16 q11,d4,d0[0]           @// y4 * cos4(part of c0 and c1)

    smull v18.4s, v3.4h, v1.h[2]    //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull v6.4s, v3.4h, v0.h[2]     //// y2 * cos2(part of d0)


    sub v22.4s, v20.4s , v6.4s      //// a3 = c0 - d0(part of x3,x4)
    add v4.4s, v20.4s , v6.4s       //// a0 = c0 + d0(part of x0,x7)

    add v2.4s, v4.4s , v24.4s       //// a0 + b0(part of x0)
    sub v6.4s, v4.4s , v24.4s       //// a0 - b0(part of x7)

    add v8.4s, v22.4s , v30.4s      //// a3 + b3(part of x3)
    sub v24.4s, v22.4s , v30.4s     //// a3 - b3(part of x4)

    sqrshrn v5.4h, v8.4s,#shift_stage2_idct
    sqrshrn v2.4h, v2.4s,#shift_stage2_idct
    sqrshrn v9.4h, v6.4s,#shift_stage2_idct
    sqrshrn v6.4h, v24.4s,#shift_stage2_idct

    sub v22.4s, v20.4s , v18.4s     //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s , v18.4s     //// a1 = c1 + d1(part of x1,x6)

    add v30.4s, v22.4s , v28.4s     //// a2 + b2(part of x2)
    sub v24.4s, v22.4s , v28.4s     //// a2 - b2(part of x5)

    add v28.4s, v18.4s , v26.4s     //// a1 + b1(part of x1)
    sub v22.4s, v18.4s , v26.4s     //// a1 - b1(part of x6)

    sqrshrn v4.4h, v30.4s,#shift_stage2_idct
    sqrshrn v7.4h, v24.4s,#shift_stage2_idct
    sqrshrn v3.4h, v28.4s,#shift_stage2_idct
    sqrshrn v8.4h, v22.4s,#shift_stage2_idct


    umov x19,v25.d[0]
    umov x20,v25.d[1]

    trn1 v27.4h, v2.4h, v3.4h
    trn2 v29.4h, v2.4h, v3.4h
    trn1 v25.4h, v4.4h, v5.4h
    trn2 v31.4h, v4.4h, v5.4h

    trn1 v2.2s, v27.2s, v25.2s
    trn2 v4.2s, v27.2s, v25.2s
    trn1 v3.2s, v29.2s, v31.2s
    trn2 v5.2s, v29.2s, v31.2s

    trn1 v27.4h, v6.4h, v7.4h
    trn2 v29.4h, v6.4h, v7.4h
    trn1 v25.4h, v8.4h, v9.4h
    trn2 v31.4h, v8.4h, v9.4h

    trn1 v6.2s, v27.2s, v25.2s
    trn2 v8.2s, v27.2s, v25.2s
    trn1 v7.2s, v29.2s, v31.2s
    trn2 v9.2s, v29.2s, v31.2s

    mov v25.d[0],x19
    mov v25.d[1],x20

    smull v24.4s, v14.4h, v0.h[1]   //// y1 * cos1(part of b0)

    smull v26.4s, v14.4h, v0.h[3]   //// y1 * cos3(part of b1)
    smull v28.4s, v14.4h, v1.h[1]   //// y1 * sin3(part of b2)
    smull v30.4s, v14.4h, v1.h[3]   //// y1 * sin1(part of b3)

    smlal v24.4s, v15.4h, v0.h[3]   //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v15.4h, v1.h[3]   //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v15.4h, v0.h[1]   //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v15.4h, v1.h[1]   //// y1 * sin1 - y3 * sin3(part of b3)
    smull v20.4s, v10.4h, v0.h[0]   //// y0 * cos4(part of c0 and c1)
    smull v18.4s, v11.4h, v1.h[2]   //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull v14.4s, v11.4h, v0.h[2]   //// y2 * cos2(part of d0)


    add x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data

    add x5,x8,x8, lsl #1            // x5 = 3 * pred_strd

    add x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data

    add x10,x7,x7, lsl #1           // x10 = 3 * dst_strd

    // swapping v3 and v6
    mov v31.d[0], v3.d[0]
    mov v3.d[0], v6.d[0]
    mov v6.d[0], v31.d[0]

    // swapping v5 and v8
    mov v31.d[0], v5.d[0]
    mov v5.d[0], v8.d[0]
    mov v8.d[0], v31.d[0]

    sub v22.4s, v20.4s , v14.4s     //// a3 = c0 - d0(part of x3,x4)
    add v12.4s, v20.4s , v14.4s     //// a0 = c0 + d0(part of x0,x7)

    add v0.4s, v12.4s , v24.4s      //// a0 + b0(part of x0)

    sub v24.4s, v12.4s , v24.4s     //// a0 - b0(part of x7)

    add v12.4s, v22.4s , v30.4s     //// a3 + b3(part of x3)

    sub v14.4s, v22.4s , v30.4s     //// a3 - b3(part of x4)

    sqrshrn v10.4h, v0.4s,#shift_stage2_idct
    sqrshrn v17.4h, v24.4s,#shift_stage2_idct
    sqrshrn v13.4h, v12.4s,#shift_stage2_idct
    sqrshrn v14.4h, v14.4s,#shift_stage2_idct

    sub v22.4s, v20.4s , v18.4s     //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s , v18.4s     //// a1 = c1 + d1(part of x1,x6)

    add v0.4s, v22.4s , v28.4s      //// a2 + b2(part of x2)

    sub v24.4s, v22.4s , v28.4s     //// a2 - b2(part of x5)

    add v28.4s, v18.4s , v26.4s     //// a1 + b1(part of x1)

    sub v26.4s, v18.4s , v26.4s     //// a1 - b1(part of x6)
    ld1 {v18.8b},[x2],x8            // load prediction data

    sqrshrn v12.4h, v0.4s,#shift_stage2_idct
    ld1 {v20.8b},[x2],x5

    sqrshrn v15.4h, v24.4s,#shift_stage2_idct
    ld1 {v19.8b},[x2],x8

    sqrshrn v11.4h, v28.4s,#shift_stage2_idct
    ld1 {v22.8b},[x4],x8

    sqrshrn v16.4h, v26.4s,#shift_stage2_idct
    ld1 {v21.8b},[x2],x5


    b pred_buff_addition

end_skip_last4cols:

    umov x19,v25.d[0]
    umov x20,v25.d[1]

///* now that the idct of the columns is done, transpose so that the row idct is done efficiently (step 5) */
    trn1 v27.4h, v2.4h, v6.4h
    trn2 v29.4h, v2.4h, v6.4h       ////[x3,x1],[x2,x0] first quadrant transposing
    trn1 v25.4h, v3.4h, v7.4h
    trn2 v31.4h, v3.4h, v7.4h       ////[x3,x1],[x2,x0] first quadrant transposing

    trn1 v2.2s, v27.2s, v25.2s
    trn2 v3.2s, v27.2s, v25.2s      ////x0,x1,x2,x3 first quadrant transposing continued.....
    trn1 v6.2s, v29.2s, v31.2s
    trn2 v7.2s, v29.2s, v31.2s      ////x0,x1,x2,x3 first quadrant transposing continued.....

    trn1 v27.4h, v4.4h, v8.4h
    trn2 v29.4h, v4.4h, v8.4h       ////[x3,x1],[x2,x0] second quadrant transposing
    trn1 v25.4h, v5.4h, v9.4h
    trn2 v31.4h, v5.4h, v9.4h       ////[x3,x1],[x2,x0] second quadrant transposing

    trn1 v4.2s, v27.2s, v25.2s
    trn2 v5.2s, v27.2s, v25.2s      ////x0,x1,x2,x3 second quadrant transposing continued.....
    trn1 v8.2s, v29.2s, v31.2s
    trn2 v9.2s, v29.2s, v31.2s      ////x0,x1,x2,x3 second quadrant transposing continued.....

    trn1 v27.4h, v10.4h, v14.4h
    trn2 v29.4h, v10.4h, v14.4h     ////[x7,x5],[x6,x4] third quadrant transposing
    trn1 v25.4h, v11.4h, v15.4h
    trn2 v31.4h, v11.4h, v15.4h     ////[x7,x5],[x6,x4] third quadrant transposing

    trn1 v10.2s, v27.2s, v25.2s
    trn2 v11.2s, v27.2s, v25.2s     ////x4,x5,x6,x7 third quadrant transposing continued.....
    trn1 v14.2s, v29.2s, v31.2s
    trn2 v15.2s, v29.2s, v31.2s     ////x4,x5,x6,x7 third quadrant transposing continued.....

    trn1 v27.4h, v12.4h, v16.4h
    trn2 v29.4h, v12.4h, v16.4h     ////[x7,x5],[x6,x4] fourth quadrant transposing
    trn1 v25.4h, v13.4h, v17.4h
    trn2 v31.4h, v13.4h, v17.4h     ////[x7,x5],[x6,x4] fourth quadrant transposing

    trn1 v12.2s, v27.2s, v25.2s
    trn2 v13.2s, v27.2s, v25.2s     ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    trn1 v16.2s, v29.2s, v31.2s
    trn2 v17.2s, v29.2s, v31.2s     ////x4,x5,x6,x7 fourth quadrant transposing continued.....
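    //// Stage 2 below repeats the same butterfly on the transposed data; only
    //// the narrowing shift changes. sqrshrn computes
    //// (v + (1 << (shift - 1))) >> shift with signed 16-bit saturation, so
    //// shift_stage1_idct = 7 and shift_stage2_idct = 12 give the rounded
    //// scaling HEVC specifies for 8-bit video.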

    mov v25.d[0],x19
    mov v25.d[1],x20

    ////step 6 : operate on the first four rows and find their idct
    ////register usage.extern - storing and idct of rows
//// cosine constants        -   d0
//// sine constants          -   d1
//// element 0 first four    -   d2  - y0
//// element 1 first four    -   d6  - y1
//// element 2 first four    -   d3  - y2
//// element 3 first four    -   d7  - y3
//// element 4 first four    -   d4  - y4
//// element 5 first four    -   d8  - y5
//// element 6 first four    -   d5  - y6
//// element 7 first four    -   d9  - y7
//// element 0 second four   -   d10 - y0
//// element 1 second four   -   d14 - y1
//// element 2 second four   -   d11 - y2
//// element 3 second four   -   d15 - y3
//// element 4 second four   -   d12 - y4
//// element 5 second four   -   d16 - y5
//// element 6 second four   -   d13 - y6
//// element 7 second four   -   d17 - y7

    //// map between first kernel code seq and current
//// d2  -> d2
//// d6  -> d6
//// d3  -> d3
//// d7  -> d7
//// d10 -> d4
//// d14 -> d8
//// d11 -> d5
//// d15 -> d9
//// q3  -> q3
//// q5  -> q2
//// q7  -> q4

    smull v24.4s, v6.4h, v0.h[1]    //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3]    //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1]    //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3]    //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3]    //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v1.h[3]    //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v0.h[1]    //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1]    //// y1 * sin1 - y3 * sin3(part of b3)

    smull v20.4s, v2.4h, v0.h[0]    //// y0 * cos4(part of c0 and c1)
    smull v22.4s, v4.4h, v0.h[0]    //// y4 * cos4(part of c0 and c1)

    smull v18.4s, v3.4h, v1.h[2]    //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull v6.4s, v3.4h, v0.h[2]     //// y2 * cos2(part of d0)


    smlal v24.4s, v8.4h, v1.h[1]    //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl v26.4s, v8.4h, v0.h[1]    //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal v28.4s, v8.4h, v1.h[3]    //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal v30.4s, v8.4h, v0.h[3]    //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl v18.4s, v5.4h, v0.h[2]    //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal v6.4s, v5.4h, v1.h[2]     //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v2.4s, v20.4s , v22.4s      //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s , v22.4s     //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v9.4h, v1.h[3]    //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl v26.4s, v9.4h, v1.h[1]    //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal v28.4s, v9.4h, v0.h[3]    //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl v30.4s, v9.4h, v0.h[1]    //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub v22.4s, v2.4s , v6.4s       //// a3 = c0 - d0(part of x3,x4)
    add v4.4s, v2.4s , v6.4s        //// a0 = c0 + d0(part of x0,x7)

    add v2.4s, v4.4s , v24.4s       //// a0 + b0(part of x0)

    sub v6.4s, v4.4s , v24.4s       //// a0 - b0(part of x7)

    add v8.4s, v22.4s , v30.4s      //// a3 + b3(part of x3)

    sub v24.4s, v22.4s , v30.4s     //// a3 - b3(part of x4)

    sqrshrn v5.4h, v8.4s,#shift_stage2_idct
    sqrshrn v2.4h, v2.4s,#shift_stage2_idct
    sqrshrn v9.4h, v6.4s,#shift_stage2_idct
    sqrshrn v6.4h, v24.4s,#shift_stage2_idct

    sub v22.4s, v20.4s , v18.4s     //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s , v18.4s     //// a1 = c1 + d1(part of x1,x6)

    add v30.4s, v22.4s , v28.4s     //// a2 + b2(part of x2)

    sub v24.4s, v22.4s , v28.4s     //// a2 - b2(part of x5)

    add v28.4s, v18.4s , v26.4s     //// a1 + b1(part of x1)

    sub v22.4s, v18.4s , v26.4s     //// a1 - b1(part of x6)
    sqrshrn v4.4h, v30.4s,#shift_stage2_idct
    sqrshrn v7.4h, v24.4s,#shift_stage2_idct
    sqrshrn v3.4h, v28.4s,#shift_stage2_idct
    sqrshrn v8.4h, v22.4s,#shift_stage2_idct


    umov x19,v25.d[0]
    umov x20,v25.d[1]

    trn1 v27.4h, v2.4h, v3.4h
    trn2 v29.4h, v2.4h, v3.4h
    trn1 v25.4h, v4.4h, v5.4h
    trn2 v31.4h, v4.4h, v5.4h

    trn1 v2.2s, v27.2s, v25.2s
    trn2 v4.2s, v27.2s, v25.2s
    trn1 v3.2s, v29.2s, v31.2s
    trn2 v5.2s, v29.2s, v31.2s

    trn1 v27.4h, v6.4h, v7.4h
    trn2 v29.4h, v6.4h, v7.4h
    trn1 v25.4h, v8.4h, v9.4h
    trn2 v31.4h, v8.4h, v9.4h

    trn1 v6.2s, v27.2s, v25.2s
    trn2 v8.2s, v27.2s, v25.2s
    trn1 v7.2s, v29.2s, v31.2s
    trn2 v9.2s, v29.2s, v31.2s

    mov v25.d[0],x19
    mov v25.d[1],x20


    smull v24.4s, v14.4h, v0.h[1]   //// y1 * cos1(part of b0)
    smull v26.4s, v14.4h, v0.h[3]   //// y1 * cos3(part of b1)
    smull v28.4s, v14.4h, v1.h[1]   //// y1 * sin3(part of b2)
    smull v30.4s, v14.4h, v1.h[3]   //// y1 * sin1(part of b3)
    smlal v24.4s, v15.4h, v0.h[3]   //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v15.4h, v1.h[3]   //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v15.4h, v0.h[1]   //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v15.4h, v1.h[1]   //// y1 * sin1 - y3 * sin3(part of b3)
    smull v20.4s, v10.4h, v0.h[0]   //// y0 * cos4(part of c0 and c1)
    smull v22.4s, v12.4h, v0.h[0]   //// y4 * cos4(part of c0 and c1)
    smull v18.4s, v11.4h, v1.h[2]   //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull v14.4s, v11.4h, v0.h[2]   //// y2 * cos2(part of d0)
    smlal v24.4s, v16.4h, v1.h[1]   //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

    add x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
    smlsl v26.4s, v16.4h, v0.h[1]   //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

    add x5,x8,x8, lsl #1            // x5 = 3 * pred_strd
    smlal v28.4s, v16.4h, v1.h[3]   //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

    add x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data
    smlal v30.4s, v16.4h, v0.h[3]   //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    add x10,x7,x7, lsl #1           // x10 = 3 * dst_strd
    smlsl v18.4s, v13.4h, v0.h[2]   //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)

    smlal v14.4s, v13.4h, v1.h[2]   //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v12.4s, v20.4s , v22.4s     //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s , v22.4s     //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v17.4h, v1.h[3]   //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)

    // swapping v3 and v6
    mov v31.d[0], v3.d[0]
    mov v3.d[0], v6.d[0]
    mov v6.d[0], v31.d[0]

    smlsl v26.4s, v17.4h, v1.h[1]   //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    // swapping v5 and v8
    mov v31.d[0], v5.d[0]
    mov v5.d[0], v8.d[0]
    mov v8.d[0], v31.d[0]

    smlal v28.4s, v17.4h, v0.h[3]   //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl v30.4s, v17.4h, v0.h[1]   //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub v22.4s, v12.4s , v14.4s     //// a3 = c0 - d0(part of x3,x4)
    add v12.4s, v12.4s , v14.4s     //// a0 = c0 + d0(part of x0,x7)

    add v0.4s, v12.4s , v24.4s      //// a0 + b0(part of x0)
    sub v24.4s, v12.4s , v24.4s     //// a0 - b0(part of x7)

    add v12.4s, v22.4s , v30.4s     //// a3 + b3(part of x3)

    sub v14.4s, v22.4s , v30.4s     //// a3 - b3(part of x4)

    sqrshrn v10.4h, v0.4s,#shift_stage2_idct
    sqrshrn v17.4h, v24.4s,#shift_stage2_idct
    sqrshrn v13.4h, v12.4s,#shift_stage2_idct
    sqrshrn v14.4h, v14.4s,#shift_stage2_idct

    sub v22.4s, v20.4s , v18.4s     //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s , v18.4s     //// a1 = c1 + d1(part of x1,x6)

    add v0.4s, v22.4s , v28.4s      //// a2 + b2(part of x2)

    sub v24.4s, v22.4s , v28.4s     //// a2 - b2(part of x5)

    add v28.4s, v18.4s , v26.4s     //// a1 + b1(part of x1)

    sub v26.4s, v18.4s , v26.4s     //// a1 - b1(part of x6)
    ld1 {v18.8b},[x2],x8            // load prediction data

    sqrshrn v12.4h, v0.4s,#shift_stage2_idct
    ld1 {v20.8b},[x2],x5

    sqrshrn v15.4h, v24.4s,#shift_stage2_idct
    ld1 {v19.8b},[x2],x8

    sqrshrn v11.4h, v28.4s,#shift_stage2_idct
    ld1 {v22.8b},[x4],x8

    sqrshrn v16.4h, v26.4s,#shift_stage2_idct
    ld1 {v21.8b},[x2],x5


pred_buff_addition:

    umov x19,v25.d[0]
    umov x20,v25.d[1]

    trn1 v27.4h, v10.4h, v11.4h
    trn2 v29.4h, v10.4h, v11.4h
    trn1 v25.4h, v12.4h, v13.4h
    trn2 v31.4h, v12.4h, v13.4h

    trn1 v10.2s, v27.2s, v25.2s
    trn2 v12.2s, v27.2s, v25.2s
    trn1 v11.2s, v29.2s, v31.2s
    trn2 v13.2s, v29.2s, v31.2s

    trn1 v27.4h, v14.4h, v15.4h
    trn2 v29.4h, v14.4h, v15.4h
    trn1 v25.4h, v16.4h, v17.4h
    trn2 v31.4h, v16.4h, v17.4h

    trn1 v14.2s, v27.2s, v25.2s
    trn2 v16.2s, v27.2s, v25.2s
    trn1 v15.2s, v29.2s, v31.2s
    trn2 v17.2s, v29.2s, v31.2s


    mov v25.d[0],x19
    mov v25.d[1],x20


    ld1 {v24.8b},[x4],x5            // load the remaining prediction rows
    ld1 {v23.8b},[x4],x8
    ld1 {v25.8b},[x4],x5
    mov v2.d[1], v3.d[0]
    mov v4.d[1], v5.d[0]
    mov v6.d[1], v7.d[0]
    mov v8.d[1], v9.d[0]
    uaddw v2.8h, v2.8h , v18.8b
    uaddw v4.8h, v4.8h , v22.8b
    uaddw v6.8h, v6.8h , v20.8b
    uaddw v8.8h, v8.8h , v24.8b

    // swapping v11 and v14
    mov v31.d[0], v11.d[0]
    mov v11.d[0], v14.d[0]
    mov v14.d[0], v31.d[0]

    // swapping v13 and v16
    mov v31.d[0], v13.d[0]
    mov v13.d[0], v16.d[0]
    mov v16.d[0], v31.d[0]

//// row values stored in the q registers:
//q1: x0
//q3: x1
//q2: x2
//q4: x3
//q5: x4
//q7: x5
//q6: x6
//q8: x7


    ///// adding the prediction buffer: the prediction data was loaded above;
    ///// the uaddw/sqxtun pairs below add the recon with the prediction and
    ///// clip the result to 8 bit

    mov v10.d[1], v11.d[0]
    mov v12.d[1], v13.d[0]
    mov v14.d[1], v15.d[0]
    mov v16.d[1], v17.d[0]
    uaddw v10.8h, v10.8h , v19.8b
    sqxtun v2.8b, v2.8h             // saturating narrow clips recon to [0,255]
    uaddw v14.8h, v14.8h , v21.8b
    sqxtun v4.8b, v4.8h
    uaddw v12.8h, v12.8h , v23.8b
    sqxtun v6.8b, v6.8h
    uaddw v16.8h, v16.8h , v25.8b
    sqxtun v8.8b, v8.8h


    st1 {v2.8b},[x3],x7
    sqxtun v10.8b, v10.8h
    st1 {v6.8b},[x3],x10
    sqxtun v14.8b, v14.8h
    st1 {v4.8b},[x0],x7
    sqxtun v12.8b, v12.8h
    st1 {v8.8b},[x0],x10
    sqxtun v16.8b, v16.8h


    st1 {v10.8b},[x3],x7
    st1 {v14.8b},[x3],x10
    st1 {v12.8b},[x0],x7
    st1 {v16.8b},[x0],x10


    // ldmfd sp!,{x4-x12,pc}
    ldp x19, x20,[sp],#16
    pop_v_regs
    ret