// ---------------------------------------------------------------------------
// AArch64 (GNU as syntax) NEON kernel: ixheaacd_post_twid_overlap_add_armv8.
//
// The commented-out 32-bit ARM mnemonics kept throughout (VUZP, VZIP, SMULWB,
// LDR [sp, #..], ...) appear to record the ARMv7 instruction that each AArch64
// sequence below replaces.
// ---------------------------------------------------------------------------

// push_v_regs: spill q8-q15 (4 x 32 bytes) and then x8-x30 (12 x 16 bytes)
// with pre-decrement stores. The frame is 320 bytes, so sp stays 16-byte
// aligned.
// NOTE(review): x29 is stored twice (paired with x28 and again with x30);
// pop_v_regs mirrors that exactly, so the frame is balanced.
.macro push_v_regs
    stp q8, q9, [sp, #-32]!
    stp q10, q11, [sp, #-32]!
    stp q12, q13, [sp, #-32]!
    stp q14, q15, [sp, #-32]!
    //st1 { v8.2d, v9.2d, v10.2d, v11.2d}, [sp, #-64]!
    //st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp, #-64]!
    stp X8, X9, [sp, #-16]!
    stp X10, X11, [sp, #-16]!
    stp X12, X13, [sp, #-16]!
    stp X14, X15, [sp, #-16]!
    stp X16, X17, [sp, #-16]!
    stp X18, X19, [sp, #-16]!
    stp X20, X21, [sp, #-16]!
    stp X22, X23, [sp, #-16]!
    stp X24, X25, [sp, #-16]!
    stp X26, X27, [sp, #-16]!
    stp X28, X29, [sp, #-16]!
    stp X30, X29, [sp, #-16]!
.endm

// pop_v_regs: exact reverse of push_v_regs, using post-increment loads.
.macro pop_v_regs
    ldp X30, X29, [sp], #16
    ldp X28, X29, [sp], #16
    ldp X26, X27, [sp], #16
    ldp X24, X25, [sp], #16
    ldp X22, X23, [sp], #16
    ldp X20, X21, [sp], #16
    ldp X18, X19, [sp], #16
    ldp X16, X17, [sp], #16
    ldp X14, X15, [sp], #16
    ldp X12, X13, [sp], #16
    ldp X10, X11, [sp], #16
    ldp X8, X9, [sp], #16
    //ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [sp], #64
    //ld1 { v8.2d, v9.2d, v10.2d, v11.2d}, [sp], #64
    ldp q14, q15, [sp], #32
    ldp q12, q13, [sp], #32
    ldp q10, q11, [sp], #32
    ldp q8, q9, [sp], #32
.endm

.text
.p2align 2
.global ixheaacd_post_twid_overlap_add_armv8

// ---------------------------------------------------------------------------
// ixheaacd_post_twid_overlap_add_armv8
//
// Register arguments, described by how the code below consumes them (the C
// prototype is not visible in this file - TODO confirm the semantic names):
//   x0 : base of a 16-bit output buffer; written one halfword at a time.
//   x1 : 32-bit input words; two are read by scalar code, then vectors are
//        read forward from there (via x8) and backward from
//        (x1 after those two words) + x3*4 - 40.
//   x2 : table base; 7500 bytes are added before 16-bit pairs are read.
//   x3 : element count; drives the offsets and the CORE_LOOP trip count.
//   x4 : buffer of 32-bit words that is read and rewritten in place.
//   x5 : shift amount (presumably a Q-format shift). Kept in x16;
//        x5 - 16 is kept in w25 and 1 << (15 - x5) in w26.
//   x6 : table of 16-bit pairs; advanced by x3*2 - 4 bytes, read backwards.
//   x7 : output stride in halfwords (kept in x17, doubled into a byte stride).
//
// Output addressing (set up at the end of the scalar prologue):
//   x10 = x7*2 bytes, x9 = -x10.
//   x5  = x0 + (x3 >> 1) * x10 and walks upward by x10 per store.
//   x0  = x5 - x10             and walks downward by x10 per store.
//
// Long-lived NEON constants:
//   v18.4h = 50 in every lane.
//   v20.4s = 0x8000 in every lane (added before each final >> 16).
//   v16.4s = w5 in every lane (SQSHL shift count).
//
// Recurring idiom: a 32-bit lane is split with UZP1/UZP2 into its low and
// high halfwords; "uMULL low ; ushR/sshR #16 ; sMLAL high" then forms a
// 32x16-bit product with the low 16 bits discarded.
//
// Callee-saved registers touched here (x19, x20, x25, x26, v8-v15) are
// covered by push_v_regs / pop_v_regs.
// ---------------------------------------------------------------------------
ixheaacd_post_twid_overlap_add_armv8:

    // STMFD sp!, {x4-x12}
    push_v_regs
    //stp x19, x20,[sp,#-16]!
    //VPUSH {d8 - d15}

    //LDR w4, [sp, #100]
    //sxtw x4,w4
    //LDR w5, [sp, #104]
    //sxtw x5,w5
    //LDR w6, [sp, #108]
    //sxtw x6,w6
    // Keep copies of x5 and x7: both registers are reused as scratch below.
    MOV x16, x5
    MOV x17, x7

    // x6 += x3*2 - 4: move to the end of the 16-bit pair table.
    LSL x9, x3, #2
    ASR x9, x9, #1
    ADD x6, x6, x9
    SUB x6, x6, #4

    // x2 += 7500 (byte offset into whatever x2 points at - TODO confirm layout).
    MOV w8, #7500
    sxtw x8, w8
    ADD x2, x2, x8

    movi v18.4h, #50
    // x9 = 15 - x5, used below as the shift count for the rounding constant.
    sub x20, x5, #15
    neg x9, x20
    movi v20.4s, #0x80, LSL #8
    dup v16.4s, w5
    SUB x5, x5, #16
    //STR w5, [sp, #116]
    // w25 = x5 - 16: vector shift count used with SQSHL in the NEON sections.
    MOV w25, w5
    sxtw x25, w25
    MOV x8, #1
    LSL x8, x8, x9
    //STR w8, [sp, #120]
    // w26 = 1 << (15 - x5): added with SQADD before the SQSHL by w25.
    MOV w26, w8
    //sxtw x8,w8

// ---------------------------------------------------------------------------
// Scalar prologue: handles the first pair of input words with integer code.
// ---------------------------------------------------------------------------
ARM_PROLOGUE:

    LDR w8, [x1], #4
    sxtw x8, w8
    LDR w9, [x1], #4
    sxtw x9, w9

    // One 32-bit word from x2 holds two 16-bit factors: low half -> x19,
    // high half -> w10.
    LDR w10, [x2], #4
    sxtw x10, w10
    AND w19, w10, 0xFFFF
    sxth x19, w19
    ASR w10, w10, #16

    // SMULWT x11, x8, x10
    //
    // SMULWB x12, x9, x10
    // SMULWB x5, x8, x10
    // SMLAWT x7, x9, x10, x5
    // Each SMULL + ASR #16 pair forms (32-bit * 16-bit) >> 16.
    SMULL x11, w8, w10
    ASR x11, x11, #16
    SMULL x12, w9, w19
    ASR x12, x12, #16
    SMULL x5, w8, w19
    ASR x5, x5, #16
    SMULL x7, w9, w10
    ASR x7, x7, #16
    ADD x7, x7, x5

    SUB x8, x12, x11
    // x5 = -x7 (MVN then +1).
    MVN x5, x7
    ADD x5, x5, #1

    // Cross-correction with the constants +50 / -50, each applied as
    // (value * const) >> 16.
    MOV x9, #50
    MOV x12, #-50
    AND w19, w9, 0xFFFF
    sxth x19, w19
    SMULL x10, w5, w19
    ASR x10, x10, #16
    AND w19, w12, 0xFFFF
    sxth x19, w19
    SMULL x11, w8, w19
    ASR x11, x11, #16
    ADD x8, x8, x10
    ADD x5, x5, x11

    //LDR w11, [sp, #104]
    // x11 = original x5 (shift amount), sign-extended from 16 bits.
    MOV w11, w16
    sxth x11, w11
    // Last pair of the x6 table; x6 then steps back 32 bytes for the NEON loads.
    LDR w10, [x6], #-32
    sxtw x10, w10
    AND w19, w10, 0xFFFF
    sxth x19, w19
    ASR w20, w10, #16

    //SMULWB x7, x8, x10
    SMULL x7, w8, w19
    ASR x7, x7, #16
    // x8 = -x8.
    MVN x8, x8
    ADD x8, x8, #1
    //SMULWT x12, x8, x10
    SMULL x12, w8, w20
    ASR x12, x12, #16

    // Negative shift amounts take the NEXT path.
    CMP x11, #0
    BLT NEXT

    // Shift >= 0: w5 = sat(w5 + w26) >> (16 - shift).
    // The saturating add is done in lane 0 of a NEON register.
    SUB x9, x11, #16
    negs x9, x9
    // LDR w8, [sp, #120]
    //sxtw x8,w8
    MOV v1.s[0], w26
    MOV v2.s[0], w5
    //sQADD w5, w5, w8
    //ASR w5, w5, w9
    SQADD v2.2s, v2.2s, v1.2s
    MOV w5, v2.s[0]
    ASR w5, w5, w9

    // x7 is replaced by a constant or by x7 << shift; the CSELs below pick
    // the result from the flags set by ADDS (and CMN when ADDS is negative).
    // x20 = x7 >> (31 - shift) is the value tested.
    SUB x9, x11, #31
    negs x9, x9
    ASR x20, x7, x9
    //MOV x8, x20
    ADDS x8, x20, #0
    BGE NEXT2
    CMN x8, #1
NEXT2:
    MOV x20, #0x80000000
    csel x7, x20, x7, LT
    MOV x20, #0x7fffffff
    csel x7, x20, x7, GT
    LSL x20, x7, x11
    csel x7, x20, x7, EQ

    // Same sequence for x12.
    SUB x9, x11, #31
    negs x9, x9
    ASR x20, x12, x9
    //MOV x8, x20
    ADDS x8, x20, #0
    BGE NEXT3
    CMN x8, #1
NEXT3:
    MOV x20, #0x80000000
    csel x12, x20, x12, LT
    MOV x20, #0x7fffffff
    csel x12, x20, x12, GT
    LSL x20, x12, x11
    csel x12, x20, x12, EQ
    B NEXT1

// Shift < 0: arithmetic right shifts by -shift; w5 is additionally rounded
// with a saturating add of 0x8000 followed by >> 16.
NEXT:
    MVN w11, w11
    ADD w11, w11, #1
    ASR w5, w5, w11
    MOV w8, #0x8000
    MOV v1.s[0], w8
    MOV v2.s[0], w5
    //QADD x5, x5, x8
    SQADD v2.2s, v2.2s, v1.2s
    MOV w5, v2.s[0]
    ASR w5, w5, #16
    ASR w7, w7, w11
    ASR w12, w12, w11

NEXT1:
    // Read the old word at [x4] into w9, then overwrite it with w5.
    LDR w9, [x4]
    sxtw x9, w9
    MOV w8, #0x8000
    //sxtw x8,w8
    STR w5, [x4], #4
    sxtw x5, w5

    // Split the x6 table word in w10 into zero-extended halves:
    // w5 = high 16 bits, w10 = low 16 bits.
    ROR w20, w10, #16
    //UXTH x5, x10, ROR #16
    UXTH w5, w20
    UXTH w10, w10

    // v8.2s = saturating narrow of (old [x4] word * {low, high} table half).
    dup v0.2s, w9
    dup v2.2s, w10
    dup v3.2s, w5
    //VZIP.32 D2, D3
    ZIP1 v28.2s, v2.2s, v3.2s
    ZIP2 v3.2s, v2.2s, v3.2s
    MOV v2.8b, v28.8b
    sMULL v0.2d, v2.2s, v0.2s
    Sqxtn v8.2s, v0.2d

    // v0.2s = {w12, w7}; v8 = sat(v0 - v8), << 2, + 0x8000, >> 16.
    dup v0.2s, w12
    dup v1.2s, w7
    //VZIP.32 D0, D1
    ZIP1 v28.2s, v0.2s, v1.2s
    ZIP2 v1.2s, v0.2s, v1.2s
    MOV v0.8b, v28.8b
    SQSUB v8.2s, v0.2s , v8.2s
    sQshL v8.2s, v8.2s, #2
    dup v0.2s, w8
    SQADD v8.2s, v8.2s , v0.2s
    sshR v8.2s, v8.2s, #16

    // Output pointers: x10 = stride in bytes, x5 = x0 + (x3 >> 1) * x10,
    // x0 = x5 - x10, x9 = -x10.
    MOV x7, x17
    //sxtw x7,w7
    LSL x10, x7, #1
    ASR x5, x3, #1
    //SMULBB x5, x10, x5
    AND w5, w5, 0xFFFF
    sxth x5, w5
    AND w19, w10, 0xFFFF
    sxth x19, w19
    SMULL x5, w19, w5
    ADD x5, x5, x0
    SUB x0, x5, x10
    MVN x9, x10
    ADD x9, x9, #1

    // First two output samples: low halfword of each 32-bit result lane.
    ST1 {v8.h}[2], [x0], x9
    ST1 {v8.h}[0], [x5], x10

    // x8 = forward input pointer; x1 = backward input pointer
    // (x1 + x3*4 - 40); x12 = -32 is the backward step in bytes.
    MOV x8, x1
    LSL x12, x3, #2
    ADD x1, x1, x12
    SUB x1, x1, #40
    MOV x12, #-32

// ---------------------------------------------------------------------------
// NEON prologue: computes the first block of results that CORE_LOOP stores
// on its first pass, and pre-loads the operands for that pass.
// ---------------------------------------------------------------------------
PROLOGUE_NEON:

    // Trip count for CORE_LOOP: x3 = (((x3 >> 2) - 4) >> 2) - 2.
    // NOTE(review): CORE_LOOP is bottom-tested (SUBS/BNE), so this value is
    // assumed to be >= 1 on entry - TODO confirm against callers.
    ASR x3, x3, #2
    SUB x3, x3, #4
    ASR x3, x3, #2
    SUB x3, x3, #2

    // Backward input block: de-interleave, split into halfwords, reverse order.
    LD2 { v0.4s, v1.4s}, [x1]
    MOV v2.16b, v1.16b
    ADD x1, x1, x12

    //VUZP.16 D0, D1
    UZP1 v28.8h, v0.8h, v0.8h
    UZP2 v29.8h, v0.8h, v0.8h
    MOV v0.d[0], v28.d[0]
    MOV v0.d[1], v29.d[0]
    //VUZP.16 D2, D3
    UZP1 v28.8h, v2.8h, v2.8h
    UZP2 v29.8h, v2.8h, v2.8h
    MOV v2.d[0], v28.d[0]
    MOV v2.d[1], v29.d[0]

    //rev64 v0.8h, v0.8h
    rev64 v0.8h, v0.8h
    MOV v1.d[0], v0.d[1]
    rev64 v2.8h, v2.8h
    MOV v3.d[0], v2.d[1]

    // 16-bit factor pairs from the x2 table (de-interleaved into v8 / v9).
    LD2 {v8.4h, v9.4h}, [x2]
    ADD x2, x2, #16

    // Forward input block.
    LD2 { v4.4s, v5.4s}, [x8]
    MOV v6.16b, v5.16b
    ADD x8, x8, #32
    uMULL v30.4s, v0.4h, v9.4h

    // VUZP.16 D4, D5
    UZP1 v28.8h, v4.8h, v4.8h
    UZP2 v29.8h, v4.8h, v4.8h
    MOV v4.d[0], v28.d[0]
    MOV v5.d[0], v29.d[0]
    uMULL v28.4s, v2.4h, v8.4h

    // VUZP.16 D6, D7
    UZP1 v26.8h, v6.8h, v6.8h
    UZP2 v27.8h, v6.8h, v6.8h
    MOV v6.d[0], v26.d[0]
    MOV v7.d[0], v27.d[0]
    uMULL v26.4s, v0.4h, v8.4h
    uMULL v24.4s, v2.4h, v9.4h

    // 16-bit pairs from the x6 table, read backwards and lane-reversed.
    LD2 { v10.4s, v11.4s}, [x6]
    MOV v12.16b, v11.16b
    ADD x6, x6, x12
    ushR v30.4s, v30.4s, #16

    //VUZP.16 D10, D11
    UZP1 v22.8h, v10.8h, v10.8h
    UZP2 v23.8h, v10.8h, v10.8h
    MOV v10.d[0], v22.d[0]
    MOV v10.d[1], v23.d[0]
    ushR v28.4s, v28.4s, #16

    //VUZP.16 D12, D13
    UZP1 v22.8h, v12.8h, v12.8h
    UZP2 v23.8h, v12.8h, v12.8h
    MOV v12.d[0], v22.d[0]
    MOV v12.d[1], v23.d[0]
    sMLAL v30.4s, v1.4h, v9.4h

    rev64 v10.8h, v10.8h
    MOV v11.d[0], v10.d[1]
    sMLAL v28.4s, v3.4h, v8.4h
    rev64 v12.8h, v12.8h
    MOV v13.d[0], v12.d[1]
    ushR v26.4s, v26.4s, #16

    ushR v24.4s, v24.4s, #16
    sMLAL v26.4s, v1.4h, v8.4h
    sMLAL v24.4s, v3.4h, v9.4h

    // v30 = -(a*v9 + b*v8), v28 = b*v9 - a*v8 for the backward block.
    ADD v30.4s, v30.4s , v28.4s
    NEG v30.4s, v30.4s

    uMULL v22.4s, v4.4h, v8.4h

    SUB v28.4s, v24.4s , v26.4s

    mov v26.16b, v30.16b
    mov v24.16b, v28.16b

    // VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    // VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    // Cross terms scaled by v18 (the constant 50), >> 16.
    uMULL v2.4s, v24.4h, v18.4h
    uMULL v0.4s, v26.4h, v18.4h

    ushR v22.4s, v22.4s, #16
    sMLAL v22.4s, v5.4h, v8.4h

    ushR v2.4s, v2.4s, #16
    ushR v0.4s, v0.4s, #16
    sMLAL v2.4s, v25.4h, v18.4h
    sMLAL v0.4s, v27.4h, v18.4h

    uMULL v24.4s, v4.4h, v9.4h
    uMULL v26.4s, v6.4h, v8.4h

    NEG v2.4s, v2.4s
    ADD v28.4s, v28.4s , v0.4s
    ADD v30.4s, v30.4s , v2.4s

    uMULL v0.4s, v6.4h, v9.4h
    sshR v24.4s, v24.4s, #16
    sMLAL v24.4s, v5.4h, v9.4h
    sshR v26.4s, v26.4s, #16
    sshR v0.4s, v0.4s, #16
    sMLAL v26.4s, v7.4h, v8.4h
    sMLAL v0.4s, v7.4h, v9.4h

    // Same combination for the forward block (v22, v24).
    ADD v22.4s, v22.4s , v0.4s
    NEG v22.4s, v22.4s
    SUB v24.4s, v26.4s , v24.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    // v28 = sqshl(sat(v28 + w26), w25): the value written back to the x4 buffer.
    MOV w11, w26
    dup v14.4s, w11
    SQADD v28.4s, v28.4s , v14.4s
    //LDR w11, [sp, #116]
    MOV w11, w25
    //sxtw x11,w11
    dup v0.4s, w11
    sQshL v28.4s, v28.4s, v0.4s
    mov v0.16b, v22.16b

    mov v14.16b, v24.16b

    // VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    uMULL v8.4s, v24.4h, v18.4h
    uMULL v26.4s, v22.4h, v18.4h

    NEG v2.4s, v30.4s
    // VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    // VUZP.16 D2, D3
    UZP1 v19.8h, v2.8h, v2.8h
    UZP2 v21.8h, v2.8h, v2.8h
    MOV v2.d[0], v19.d[0]
    MOV v3.d[0], v21.d[0]

    // Multiply by the x6 table halves (v12 / v13).
    uMULL v4.4s, v30.4h, v12.4h
    uMULL v6.4s, v2.4h, v13.4h

    ushR v8.4s, v8.4s, #16
    ushR v26.4s, v26.4s, #16

    sMLAL v8.4s, v25.4h, v18.4h
    sMLAL v26.4s, v23.4h, v18.4h

    ushR v4.4s, v4.4s, #16
    ushR v6.4s, v6.4s, #16

    MOV v19.d[0], v30.d[1]

    sMLAL v4.4s, v19.4h, v12.4h
    sMLAL v6.4s, v3.4h, v13.4h

    NEG v8.4s, v8.4s
    ADD v14.4s, v14.4s , v26.4s
    ADD v0.4s, v0.4s , v8.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    MOV w11, w26
    dup v8.4s, w11
    SQADD v0.4s, v0.4s , v8.4s
    //LDR w11, [sp, #116]
    //sxtw x11,w11
    MOV w11, w25
    dup v26.4s, w11
    sQshL v0.4s, v0.4s, v26.4s

    mov v26.16b, v28.16b

    // Read 8 old words from [x4], then overwrite them with the interleaved
    // new values (v26 / v0).
    LD2 { v28.4s, v29.4s}, [x4]
    MOV v30.16b, v29.16b
    MOV v29.d[0], v28.d[1]

    // VZIP.32 Q13, Q0
    ZIP1 v19.4s, v26.4s, v0.4s
    ZIP2 v0.4s, v26.4s, v0.4s
    MOV v26.16b, v19.16b

    ST1 { v26.4s}, [x4], #16
    ST1 { v0.4s}, [x4], #16

    // Old x4 words times the sign-extended x6 table halves, saturating-
    // narrowed back to 32 bits.
    movi v1.2s, #0
    //VADDL.S16 Q0, D13, D1
    SADDL v0.4s, v13.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v26.2d, v28.2s, v0.2s
    Sqxtn v8.2s, v26.2d
    sMULL v26.2d, v29.2s, v1.2s
    Sqxtn v9.2s, v26.2d
    MOV v8.d[1], v9.d[0]

    movi v1.2s, #0
    // VADDL.S16 Q0, D12, D1
    SADDL v0.4s, v12.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v24.2d, v28.2s, v0.2s
    Sqxtn v26.2s, v24.2d
    sMULL v24.2d, v29.2s, v1.2s
    Sqxtn v27.2s, v24.2d
    MOV v26.d[1], v27.d[0]

    sQshL v4.4s, v4.4s, v16.4s
    sQshL v6.4s, v6.4s, v16.4s

    SQSUB v4.4s, v4.4s , v8.4s
    SQSUB v6.4s, v6.4s , v26.4s

    NEG v26.4s, v14.4s
    // VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    // VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    movi v1.2s, #0
    // VADDL.S16 Q0, D10, D1
    SADDL v0.4s, v10.4h, v1.4h
    // NOTE(review): the other copies of this sequence move v0.d[1]; v1 is not
    // read before the next "movi v1.2s, #0", so the difference has no effect.
    MOV v1.d[0], v0.d[0]

    sMULL v22.2d, v30.2s, v0.2s
    Sqxtn v24.2s, v22.2d
    sMULL2 v22.2d, v30.4s, v0.4s
    Sqxtn v25.2s, v22.2d
    MOV v24.d[1], v25.d[0]

    movi v1.2s, #0
    // VADDL.S16 Q0, D11, D1
    SADDL v0.4s, v11.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v8.2d, v30.2s, v0.2s
    Sqxtn v22.2s, v8.2d
    sMULL2 v8.2d, v30.4s, v0.4s
    Sqxtn v23.2s, v8.2d
    MOV v22.d[1], v23.d[0]

    uMULL v8.4s, v26.4h, v11.4h
    uMULL v30.4s, v14.4h, v10.4h

    // Pre-load the operands for the first CORE_LOOP pass (same load and
    // shuffle pattern as above).
    LD2 { v0.4s, v1.4s}, [x1]
    MOV v2.16b, v1.16b
    ADD x1, x1, x12

    // VUZP.16 D0, D1
    UZP1 v19.8h, v0.8h, v0.8h
    UZP2 v21.8h, v0.8h, v0.8h
    MOV v0.d[0], v19.d[0]
    MOV v0.d[1], v21.d[0]

    // VUZP.16 D2, D3
    UZP1 v19.8h, v2.8h, v2.8h
    UZP2 v21.8h, v2.8h, v2.8h
    MOV v2.d[0], v19.d[0]
    MOV v2.d[1], v21.d[0]

    ushR v8.4s, v8.4s, #16

    rev64 v0.8h, v0.8h
    MOV v1.d[0], v0.d[1]
    ushR v30.4s, v30.4s, #16

    rev64 v2.8h, v2.8h
    MOV v3.d[0], v2.d[1]
    sMLAL v8.4s, v27.4h, v11.4h

    sMLAL v30.4s, v15.4h, v10.4h

    LD2 { v10.4s, v11.4s}, [x6]
    ADD x6, x6, x12
    MOV v12.16b, v11.16b

    sQshL v4.4s, v4.4s, #2

    // VUZP.16 D10, D11
    UZP1 v19.8h, v10.8h, v10.8h
    UZP2 v21.8h, v10.8h, v10.8h
    MOV v10.d[0], v19.d[0]
    MOV v10.d[1], v21.d[0]

    sQshL v6.4s, v6.4s, #2

    // VUZP.16 D12, D13
    UZP1 v19.8h, v12.8h, v12.8h
    UZP2 v21.8h, v12.8h, v12.8h
    MOV v12.d[0], v19.d[0]
    MOV v12.d[1], v21.d[0]

    // Round (add 0x8000) and take the high halfword (>> 16).
    SQADD v14.4s, v4.4s , v20.4s

    rev64 v10.8h, v10.8h
    MOV v11.d[0], v10.d[1]
    SQADD v6.4s, v6.4s , v20.4s

    rev64 v12.8h, v12.8h
    MOV v13.d[0], v12.d[1]
    sshR v14.4s, v14.4s, #16

    // VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    sshR v6.4s, v6.4s, #16

    // VUZP.16 D6, D7
    UZP1 v19.8h, v6.8h, v6.8h
    UZP2 v21.8h, v6.8h, v6.8h
    MOV v6.d[0], v19.d[0]
    MOV v7.d[0], v21.d[0]

    // v14 / v15 now hold the 16-bit samples stored through x0 / x5.
    mov v15.8b, v6.8b
    sQshL v8.4s, v8.4s, v16.4s

    LD2 { v4.4s, v5.4s}, [x8]
    ADD x8, x8, #32
    MOV v6.16b, v5.16b
    sQshL v30.4s, v30.4s, v16.4s

    // VUZP.16 D4, D5
    UZP1 v19.8h, v4.8h, v4.8h
    UZP2 v21.8h, v4.8h, v4.8h
    MOV v4.d[0], v19.d[0]
    MOV v5.d[0], v21.d[0]

    SQSUB v8.4s, v8.4s , v24.4s

    // VUZP.16 D6, D7
    UZP1 v19.8h, v6.8h, v6.8h
    UZP2 v21.8h, v6.8h, v6.8h
    MOV v6.d[0], v19.d[0]
    MOV v7.d[0], v21.d[0]

    SQSUB v22.4s, v30.4s , v22.4s

    sQshL v30.4s, v8.4s, #2

    LD2 {v8.4h, v9.4h}, [x2]
    ADD x2, x2, #16

    sQshL v22.4s, v22.4s, #2
    SQADD v30.4s, v30.4s , v20.4s
    SQADD v22.4s, v22.4s , v20.4s

    sshR v30.4s, v30.4s, #16

    // VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    sshR v22.4s, v22.4s, #16

    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    // v22 / v23 hold the second pair of 16-bit sample vectors.
    mov v23.8b, v30.8b

// ---------------------------------------------------------------------------
// Steady-state loop. The 16 halfword stores at the top write the results of
// the PREVIOUS pass (v14/v22 down through x0, v15/v23 up through x5). They
// are interleaved with the first multiplies of the current pass, so the
// instruction order in this section is significant.
// ---------------------------------------------------------------------------
CORE_LOOP:
    ST1 {v14.h}[0], [x0]
    ADD x0, x0, x9
    uMULL v30.4s, v0.4h, v9.4h

    ST1 {v22.h}[0], [x0]
    ADD x0, x0, x9
    uMULL v28.4s, v2.4h, v8.4h

    ST1 {v14.h}[1], [x0]
    ADD x0, x0, x9
    uMULL v26.4s, v0.4h, v8.4h

    ST1 {v22.h}[1], [x0]
    ADD x0, x0, x9
    uMULL v24.4s, v2.4h, v9.4h

    ST1 {v14.h}[2], [x0]
    ADD x0, x0, x9
    ushR v30.4s, v30.4s, #16

    ST1 {v22.h}[2], [x0]
    ADD x0, x0, x9
    ushR v28.4s, v28.4s, #16

    ST1 {v14.h}[3], [x0]
    ADD x0, x0, x9
    sMLAL v30.4s, v1.4h, v9.4h

    ST1 {v22.h}[3], [x0]
    ADD x0, x0, x9
    sMLAL v28.4s, v3.4h, v8.4h

    ST1 {v15.h}[0], [x5]
    ADD x5, x5, x10
    ushR v26.4s, v26.4s, #16

    ST1 {v23.h}[0], [x5]
    ADD x5, x5, x10
    ushR v24.4s, v24.4s, #16

    ST1 {v15.h}[1], [x5]
    ADD x5, x5, x10
    sMLAL v26.4s, v1.4h, v8.4h

    ST1 {v23.h}[1], [x5]
    ADD x5, x5, x10
    sMLAL v24.4s, v3.4h, v9.4h

    ST1 {v15.h}[2], [x5]
    ADD x5, x5, x10
    ADD v30.4s, v30.4s , v28.4s

    ST1 {v23.h}[2], [x5]
    ADD x5, x5, x10
    NEG v30.4s, v30.4s

    ST1 {v15.h}[3], [x5]
    ADD x5, x5, x10

    ST1 {v23.h}[3], [x5]
    ADD x5, x5, x10
    SUB v28.4s, v24.4s , v26.4s

    mov v26.16b, v30.16b
    uMULL v22.4s, v4.4h, v8.4h

    mov v24.16b, v28.16b

    // VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    // VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    uMULL v2.4s, v24.4h, v18.4h
    uMULL v0.4s, v26.4h, v18.4h

    ushR v22.4s, v22.4s, #16
    sMLAL v22.4s, v5.4h, v8.4h

    ushR v2.4s, v2.4s, #16
    ushR v0.4s, v0.4s, #16
    sMLAL v2.4s, v25.4h, v18.4h
    sMLAL v0.4s, v27.4h, v18.4h

    uMULL v24.4s, v4.4h, v9.4h
    uMULL v26.4s, v6.4h, v8.4h

    NEG v2.4s, v2.4s
    ADD v28.4s, v28.4s , v0.4s
    ADD v30.4s, v30.4s , v2.4s

    uMULL v0.4s, v6.4h, v9.4h
    sshR v24.4s, v24.4s, #16
    sMLAL v24.4s, v5.4h, v9.4h
    sshR v26.4s, v26.4s, #16
    sshR v0.4s, v0.4s, #16
    sMLAL v26.4s, v7.4h, v8.4h
    sMLAL v0.4s, v7.4h, v9.4h

    ADD v22.4s, v22.4s , v0.4s
    NEG v22.4s, v22.4s
    SUB v24.4s, v26.4s , v24.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    MOV w11, w26
    dup v14.4s, w11
    SQADD v28.4s, v28.4s , v14.4s
    //LDR w11, [sp, #116]
    //sxtw x11,w11
    MOV w11, w25
    dup v0.4s, w11
    sQshL v28.4s, v28.4s, v0.4s
    mov v0.16b, v22.16b

    mov v14.16b, v24.16b

    // VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    uMULL v8.4s, v24.4h, v18.4h
    uMULL v26.4s, v22.4h, v18.4h

    NEG v2.4s, v30.4s
    // VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    // VUZP.16 D2, D3
    UZP1 v19.8h, v2.8h, v2.8h
    UZP2 v21.8h, v2.8h, v2.8h
    MOV v2.d[0], v19.d[0]
    MOV v3.d[0], v21.d[0]

    uMULL v4.4s, v30.4h, v12.4h
    uMULL v6.4s, v2.4h, v13.4h

    ushR v8.4s, v8.4s, #16
    ushR v26.4s, v26.4s, #16

    sMLAL v8.4s, v25.4h, v18.4h
    sMLAL v26.4s, v23.4h, v18.4h

    ushR v4.4s, v4.4s, #16
    ushR v6.4s, v6.4s, #16

    MOV v19.d[0], v30.d[1]

    sMLAL v4.4s, v19.4h, v12.4h
    sMLAL v6.4s, v3.4h, v13.4h

    NEG v8.4s, v8.4s
    ADD v14.4s, v14.4s , v26.4s
    ADD v0.4s, v0.4s , v8.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    MOV w11, w26
    dup v8.4s, w11
    SQADD v0.4s, v0.4s , v8.4s
    //LDR w11, [sp, #116]
    //sxtw x11,w11
    MOV w11, w25
    dup v26.4s, w11
    sQshL v0.4s, v0.4s, v26.4s

    mov v26.16b, v28.16b

    // Read 8 old x4 words, then overwrite them in place.
    LD2 { v28.4s, v29.4s}, [x4]
    MOV v30.16b, v29.16b
    MOV v29.d[0], v28.d[1]

    // VZIP.32 Q13, Q0
    ZIP1 v19.4s, v26.4s, v0.4s
    ZIP2 v0.4s, v26.4s, v0.4s
    MOV v26.16b, v19.16b

    ST1 { v26.4s}, [x4]
    ADD x4, x4, #16
    ST1 { v0.4s}, [x4]
    ADD x4, x4, #16

    movi v1.2s, #0
    // VADDL.S16 Q0, D13, D1
    SADDL v0.4s, v13.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v26.2d, v28.2s, v0.2s
    Sqxtn v8.2s, v26.2d
    sMULL v26.2d, v29.2s, v1.2s
    Sqxtn v9.2s, v26.2d
    MOV v8.d[1], v9.d[0]

    movi v1.2s, #0
    //VADDL.S16 Q0, D12, D1
    SADDL v0.4s, v12.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v24.2d, v28.2s, v0.2s
    Sqxtn v26.2s, v24.2d
    sMULL v24.2d, v29.2s, v1.2s
    Sqxtn v27.2s, v24.2d
    MOV v26.d[1], v27.d[0]

    sQshL v4.4s, v4.4s, v16.4s
    sQshL v6.4s, v6.4s, v16.4s

    SQSUB v4.4s, v4.4s , v8.4s
    SQSUB v6.4s, v6.4s , v26.4s

    NEG v26.4s, v14.4s
    // VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    movi v1.2s, #0
    //VADDL.S16 Q0, D10, D1
    SADDL v0.4s, v10.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v22.2d, v30.2s, v0.2s
    Sqxtn v24.2s, v22.2d
    sMULL2 v22.2d, v30.4s, v0.4s
    Sqxtn v25.2s, v22.2d
    MOV v24.d[1], v25.d[0]

    movi v1.2s, #0
    //VADDL.S16 Q0, D11, D1
    SADDL v0.4s, v11.4h, v1.4h

    sMULL v8.2d, v30.2s, v0.2s
    Sqxtn v22.2s, v8.2d
    sMULL2 v8.2d, v30.4s, v0.4s
    Sqxtn v23.2s, v8.2d
    MOV v22.d[1], v23.d[0]

    // VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    uMULL v8.4s, v26.4h, v11.4h
    uMULL v30.4s, v14.4h, v10.4h

    // Loads for the NEXT pass start here.
    LD2 { v0.4s, v1.4s}, [x1]
    MOV v2.16b, v1.16b
    ADD X1, X1, x12

    // VUZP.16 D0, D1
    UZP1 v19.8h, v0.8h, v0.8h
    UZP2 v21.8h, v0.8h, v0.8h
    MOV v0.d[0], v19.d[0]
    MOV v0.d[1], v21.d[0]

    // VUZP.16 D2, D3
    UZP1 v19.8h, v2.8h, v2.8h
    UZP2 v21.8h, v2.8h, v2.8h
    MOV v2.d[0], v19.d[0]
    MOV v2.d[1], v21.d[0]

    ushR v8.4s, v8.4s, #16

    rev64 v0.8h, v0.8h
    MOV v1.d[0], v0.d[1]
    ushR v30.4s, v30.4s, #16

    rev64 v2.8h, v2.8h
    MOV v3.d[0], v2.d[1]
    sMLAL v8.4s, v27.4h, v11.4h

    sMLAL v30.4s, v15.4h, v10.4h

    LD2 { v10.4s, v11.4s}, [x6]
    add X6, x6, x12
    MOV v12.16b, v11.16b

    sQshL v4.4s, v4.4s, #2

    //VUZP.16 D10, D11
    UZP1 v19.8h, v10.8h, v10.8h
    UZP2 v21.8h, v10.8h, v10.8h
    MOV v10.d[0], v19.d[0]
    MOV v10.d[1], v21.d[0]

    sQshL v6.4s, v6.4s, #2

    // VUZP.16 D12, D13
    UZP1 v19.8h, v12.8h, v12.8h
    UZP2 v21.8h, v12.8h, v12.8h
    MOV v12.d[0], v19.d[0]
    MOV v12.d[1], v21.d[0]

    SQADD v14.4s, v4.4s , v20.4s

    rev64 v10.8h, v10.8h
    MOV v11.d[0], v10.d[1]
    SQADD v6.4s, v6.4s , v20.4s

    rev64 v12.8h, v12.8h
    MOV v13.d[0], v12.d[1]
    sshR v14.4s, v14.4s, #16

    // VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    sshR v6.4s, v6.4s, #16

    // VUZP.16 D6, D7
    UZP1 v19.8h, v6.8h, v6.8h
    UZP2 v21.8h, v6.8h, v6.8h
    MOV v6.d[0], v19.d[0]
    MOV v7.d[0], v21.d[0]

    mov v15.8b, v6.8b
    sQshL v8.4s, v8.4s, v16.4s

    LD2 { v4.4s, v5.4s}, [x8]
    ADD x8, x8, #32
    MOV v6.16b, v5.16b
    sQshL v30.4s, v30.4s, v16.4s

    // VUZP.16 D4, D5
    UZP1 v19.8h, v4.8h, v4.8h
    UZP2 v21.8h, v4.8h, v4.8h
    MOV v4.d[0], v19.d[0]
    MOV v5.d[0], v21.d[0]

    SQSUB v8.4s, v8.4s , v24.4s

    // VUZP.16 D6, D7
    UZP1 v19.8h, v6.8h, v6.8h
    UZP2 v21.8h, v6.8h, v6.8h
    MOV v6.d[0], v19.d[0]
    MOV v7.d[0], v21.d[0]

    SQSUB v22.4s, v30.4s , v22.4s

    sQshL v30.4s, v8.4s, #2

    LD2 {v8.4h, v9.4h}, [x2]
    ADD x2, x2, #16

    sQshL v22.4s, v22.4s, #2
    SQADD v30.4s, v30.4s , v20.4s
    SQADD v22.4s, v22.4s , v20.4s

    sshR v30.4s, v30.4s, #16

    // VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    sshR v22.4s, v22.4s, #16

    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    mov v23.8b, v30.8b

    SUBS x3, x3, #1
    BNE CORE_LOOP

// ---------------------------------------------------------------------------
// Pipeline drain: stores the results of the last CORE_LOOP pass, computes
// one more full block from the operands that pass pre-loaded, and stores it.
// No new input loads are issued here.
// ---------------------------------------------------------------------------
EPILOGUE:
    ST1 {v14.h}[0], [x0]
    ADD x0, x0, x9
    uMULL v30.4s, v0.4h, v9.4h

    ST1 {v22.h}[0], [x0]
    ADD x0, x0, x9
    uMULL v28.4s, v2.4h, v8.4h

    ST1 {v14.h}[1], [x0]
    ADD x0, x0, x9
    uMULL v26.4s, v0.4h, v8.4h

    ST1 {v22.h}[1], [x0]
    ADD x0, x0, x9
    uMULL v24.4s, v2.4h, v9.4h

    ST1 {v14.h}[2], [x0]
    ADD x0, x0, x9
    ushR v30.4s, v30.4s, #16

    ST1 {v22.h}[2], [x0]
    ADD x0, x0, x9
    ushR v28.4s, v28.4s, #16

    ST1 {v14.h}[3], [x0]
    ADD x0, x0, x9
    sMLAL v30.4s, v1.4h, v9.4h

    ST1 {v22.h}[3], [x0]
    ADD x0, x0, x9
    sMLAL v28.4s, v3.4h, v8.4h

    ST1 {v15.h}[0], [x5]
    ADD x5, x5, x10
    ushR v26.4s, v26.4s, #16

    ST1 {v23.h}[0], [x5]
    ADD x5, x5, x10
    ushR v24.4s, v24.4s, #16

    ST1 {v15.h}[1], [x5]
    ADD x5, x5, x10
    sMLAL v26.4s, v1.4h, v8.4h

    ST1 {v23.h}[1], [x5]
    ADD x5, x5, x10
    sMLAL v24.4s, v3.4h, v9.4h

    ST1 {v15.h}[2], [x5]
    ADD x5, x5, x10
    ADD v30.4s, v30.4s , v28.4s

    ST1 {v23.h}[2], [x5]
    ADD x5, x5, x10
    NEG v30.4s, v30.4s

    ST1 {v15.h}[3], [x5]
    ADD x5, x5, x10

    ST1 {v23.h}[3], [x5]
    ADD x5, x5, x10
    SUB v28.4s, v24.4s , v26.4s

    uMULL v22.4s, v4.4h, v8.4h

    // NOTE(review): the mov pair below is issued twice back to back; the
    // second pair copies the same unchanged sources and is redundant.
    mov v26.16b, v30.16b
    mov v24.16b, v28.16b

    mov v26.16b, v30.16b
    mov v24.16b, v28.16b

    //VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    // VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    uMULL v2.4s, v24.4h, v18.4h
    uMULL v0.4s, v26.4h, v18.4h

    ushR v22.4s, v22.4s, #16
    sMLAL v22.4s, v5.4h, v8.4h

    ushR v2.4s, v2.4s, #16
    ushR v0.4s, v0.4s, #16
    sMLAL v2.4s, v25.4h, v18.4h
    sMLAL v0.4s, v27.4h, v18.4h

    uMULL v24.4s, v4.4h, v9.4h
    uMULL v26.4s, v6.4h, v8.4h

    NEG v2.4s, v2.4s
    ADD v28.4s, v28.4s , v0.4s
    ADD v30.4s, v30.4s , v2.4s

    uMULL v0.4s, v6.4h, v9.4h
    sshR v24.4s, v24.4s, #16
    sMLAL v24.4s, v5.4h, v9.4h
    sshR v26.4s, v26.4s, #16
    sshR v0.4s, v0.4s, #16
    sMLAL v26.4s, v7.4h, v8.4h
    sMLAL v0.4s, v7.4h, v9.4h

    ADD v22.4s, v22.4s , v0.4s
    NEG v22.4s, v22.4s
    SUB v24.4s, v26.4s , v24.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    MOV w11, w26
    dup v14.4s, w11
    SQADD v28.4s, v28.4s , v14.4s
    //LDR w11, [sp, #116]
    //sxtw x11,w11
    MOV w11, w25
    dup v0.4s, w11
    sQshL v28.4s, v28.4s, v0.4s
    mov v0.16b, v22.16b

    mov v14.16b, v24.16b

    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    // VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    uMULL v8.4s, v24.4h, v18.4h
    uMULL v26.4s, v22.4h, v18.4h

    NEG v2.4s, v30.4s
    // VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    // VUZP.16 D2, D3
    UZP1 v19.8h, v2.8h, v2.8h
    UZP2 v21.8h, v2.8h, v2.8h
    MOV v2.d[0], v19.d[0]
    MOV v3.d[0], v21.d[0]

    uMULL v4.4s, v30.4h, v12.4h
    uMULL v6.4s, v2.4h, v13.4h

    ushR v8.4s, v8.4s, #16
    ushR v26.4s, v26.4s, #16

    sMLAL v8.4s, v25.4h, v18.4h
    sMLAL v26.4s, v23.4h, v18.4h

    ushR v4.4s, v4.4s, #16
    ushR v6.4s, v6.4s, #16

    MOV v19.d[0], v30.d[1]

    sMLAL v4.4s, v19.4h, v12.4h
    sMLAL v6.4s, v3.4h, v13.4h

    NEG v8.4s, v8.4s
    ADD v14.4s, v14.4s , v26.4s
    ADD v0.4s, v0.4s , v8.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    MOV w11, w26
    dup v8.4s, w11
    SQADD v0.4s, v0.4s , v8.4s
    //LDR w11, [sp, #116]
    //sxtw x11,w11
    MOV w11, w25
    dup v26.4s, w11
    sQshL v0.4s, v0.4s, v26.4s

    mov v26.16b, v28.16b

    LD2 { v28.4s, v29.4s}, [x4]
    MOV v30.16b, v29.16b
    MOV v29.d[0], v28.d[1]

    // VZIP.32 Q13, Q0
    ZIP1 v19.4s, v26.4s, v0.4s
    ZIP2 v0.4s, v26.4s, v0.4s
    MOV v26.16b, v19.16b

    ST1 { v26.4s}, [x4], #16
    ST1 { v0.4s}, [x4], #16

    movi v1.2s, #0
    // VADDL.S16 Q0, D13, D1
    SADDL v0.4s, v13.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v26.2d, v28.2s, v0.2s
    Sqxtn v8.2s, v26.2d
    sMULL v26.2d, v29.2s, v1.2s
    Sqxtn v9.2s, v26.2d
    MOV v8.d[1], v9.d[0]

    movi v1.2s, #0
    // VADDL.S16 Q0, D12, D1
    SADDL v0.4s, v12.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v24.2d, v28.2s, v0.2s
    Sqxtn v26.2s, v24.2d
    sMULL v24.2d, v29.2s, v1.2s
    Sqxtn v27.2s, v24.2d
    MOV v26.d[1], v27.d[0]

    sQshL v4.4s, v4.4s, v16.4s
    sQshL v6.4s, v6.4s, v16.4s

    SQSUB v4.4s, v4.4s , v8.4s
    SQSUB v6.4s, v6.4s , v26.4s

    NEG v26.4s, v14.4s
    // VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    // VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    movi v1.2s, #0
    //VADDL.S16 Q0, D10, D1
    SADDL v0.4s, v10.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v22.2d, v30.2s, v0.2s
    Sqxtn v24.2s, v22.2d
    sMULL2 v22.2d, v30.4s, v0.4s
    Sqxtn v25.2s, v22.2d
    MOV v24.d[1], v25.d[0]

    movi v1.2s, #0
    //VADDL.S16 Q0, D11, D1
    SADDL v0.4s, v11.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v8.2d, v30.2s, v0.2s
    Sqxtn v22.2s, v8.2d
    sMULL2 v8.2d, v30.4s, v0.4s
    Sqxtn v23.2s, v8.2d
    MOV v22.d[1], v23.d[0]

    uMULL v8.4s, v26.4h, v11.4h
    uMULL v30.4s, v14.4h, v10.4h

    ushR v8.4s, v8.4s, #16
    ushR v30.4s, v30.4s, #16

    sMLAL v8.4s, v27.4h, v11.4h
    sMLAL v30.4s, v15.4h, v10.4h

    sQshL v4.4s, v4.4s, #2
    sQshL v6.4s, v6.4s, #2

    SQADD v14.4s, v4.4s , v20.4s
    SQADD v6.4s, v6.4s , v20.4s

    sshR v14.4s, v14.4s, #16
    // VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    sshR v6.4s, v6.4s, #16
    // VUZP.16 D6, D7
    UZP1 v19.8h, v6.8h, v6.8h
    UZP2 v21.8h, v6.8h, v6.8h
    MOV v6.d[0], v19.d[0]
    MOV v7.d[0], v21.d[0]

    mov v15.8b, v6.8b
    sQshL v8.4s, v8.4s, v16.4s
    sQshL v30.4s, v30.4s, v16.4s

    SQSUB v8.4s, v8.4s , v24.4s
    SQSUB v22.4s, v30.4s , v22.4s

    sQshL v30.4s, v8.4s, #2
    sQshL v22.4s, v22.4s, #2

    SQADD v30.4s, v30.4s , v20.4s
    SQADD v22.4s, v22.4s , v20.4s

    sshR v30.4s, v30.4s, #16
    //VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    sshR v22.4s, v22.4s, #16
    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    mov v23.8b, v30.8b

    // Store the drained block: 8 halfwords downward via x0, 8 upward via x5.
    ST1 {v14.h}[0], [x0]
    ADD x0, x0, x9
    ST1 {v22.h}[0], [x0]
    ADD x0, x0, x9
    ST1 {v14.h}[1], [x0]
    ADD x0, x0, x9
    ST1 {v22.h}[1], [x0]
    ADD x0, x0, x9
    ST1 {v14.h}[2], [x0]
    ADD x0, x0, x9
    ST1 {v22.h}[2], [x0]
    ADD x0, x0, x9
    ST1 {v14.h}[3], [x0]
    ADD x0, x0, x9
    ST1 {v22.h}[3], [x0]
    ADD x0, x0, x9

    ST1 {v15.h}[0], [x5]
    ADD x5, x5, x10
    ST1 {v23.h}[0], [x5]
    ADD x5, x5, x10
    ST1 {v15.h}[1], [x5]
    ADD x5, x5, x10
    ST1 {v23.h}[1], [x5]
    ADD x5, x5, x10
    ST1 {v15.h}[2], [x5]
    ADD x5, x5, x10
    ST1 {v23.h}[2], [x5]
    ADD x5, x5, x10
    ST1 {v15.h}[3], [x5]
    ADD x5, x5, x10
    ST1 {v23.h}[3], [x5]
    ADD x5, x5, x10

// ---------------------------------------------------------------------------
// Tail block. Same arithmetic as the sections above, but the forward input,
// the x6 table and the x4 buffer are accessed with partial / per-lane loads
// and stores, and only seven halfwords are stored per direction.
// ARM_LOOP is only a label here: no branch to it is visible in this file.
// ---------------------------------------------------------------------------
ARM_EPILOGUE:

ARM_LOOP:

    LD2 { v0.4s, v1.4s}, [x1]
    MOV v2.16b, v1.16b

    //VUZP.16 D0, D1
    UZP1 v19.8h, v0.8h, v0.8h
    UZP2 v21.8h, v0.8h, v0.8h
    MOV v0.d[0], v19.d[0]
    MOV v0.d[1], v21.d[0]

    //VUZP.16 D2, D3
    UZP1 v19.8h, v2.8h, v2.8h
    UZP2 v21.8h, v2.8h, v2.8h
    MOV v2.d[0], v19.d[0]
    MOV v2.d[1], v21.d[0]

    rev64 v0.8h, v0.8h
    MOV v1.d[0], v0.d[1]
    rev64 v2.8h, v2.8h
    MOV v3.d[0], v2.d[1]

    LD2 {v8.4h, v9.4h}, [x2]
    ADD x2, x2, #16

    // Forward input: 16 bytes via LD2, then two single words into zeroed
    // registers.
    LD2 {v4.2s, v5.2s}, [x8]
    ADD x8, x8, #16
    MOV v6.16b, v5.16b
    movi v5.2s, #0x00000000
    movi v7.2s, #0x00000000

    LD1 {v5.s}[0], [x8], #4
    LD1 {v7.s}[0], [x8]

    // NOTE(review): this x12 value is overwritten by "MOV x12, #-4" below
    // before anything reads x12, so the store appears to be dead.
    MOV x12, #16
    MOV v4.d[1], v5.d[0]
    MOV v6.d[1], v7.d[0]

    // VUZP.16 D4, D5
    UZP1 v19.8h, v4.8h, v4.8h
    UZP2 v21.8h, v4.8h, v4.8h
    MOV v4.d[0], v19.d[0]
    MOV v5.d[0], v21.d[0]

    // VUZP.16 D6, D7
    UZP1 v19.8h, v6.8h, v6.8h
    UZP2 v21.8h, v6.8h, v6.8h
    MOV v6.d[0], v19.d[0]
    MOV v7.d[0], v21.d[0]

    // x6 table tail: one LD2 plus three single-word loads stepping back by
    // 4 bytes (x12 = -4).
    ADD x6, x6, #16

    MOV x12, #-4
    LD2 {v11.2s, v12.2s}, [x6]
    ADD x6, x6, x12
    MOV v13.16b, v12.16b
    movi v10.2s, #0x00000000

    LD1 {v12.s}[1], [x6]
    ADD x6, x6, x12
    LD1 {v10.s}[1], [x6]
    ADD x6, x6, x12
    LD1 {v12.s}[0], [x6]
    ADD x6, x6, x12

    MOV v10.d[1], v11.d[0]
    MOV v12.d[1], v13.d[0]

    //VUZP.16 D10, D11
    UZP1 v19.8h, v10.8h, v10.8h
    UZP2 v21.8h, v10.8h, v10.8h
    MOV v10.d[0], v19.d[0]
    MOV v10.d[1], v21.d[0]

    //VUZP.16 D12, D13
    UZP1 v19.8h, v12.8h, v12.8h
    UZP2 v21.8h, v12.8h, v12.8h
    MOV v12.d[0], v19.d[0]
    MOV v12.d[1], v21.d[0]

    rev64 v10.8h, v10.8h
    MOV v11.d[0], v10.d[1]
    rev64 v12.8h, v12.8h
    MOV v13.d[0], v12.d[1]

    uMULL v30.4s, v0.4h, v9.4h
    uMULL v28.4s, v2.4h, v8.4h

    uMULL v26.4s, v0.4h, v8.4h
    uMULL v24.4s, v2.4h, v9.4h

    ushR v30.4s, v30.4s, #16
    ushR v28.4s, v28.4s, #16

    sMLAL v30.4s, v1.4h, v9.4h
    sMLAL v28.4s, v3.4h, v8.4h

    ushR v26.4s, v26.4s, #16
    ushR v24.4s, v24.4s, #16

    sMLAL v26.4s, v1.4h, v8.4h
    sMLAL v24.4s, v3.4h, v9.4h

    ADD v30.4s, v30.4s , v28.4s
    NEG v30.4s, v30.4s

    uMULL v22.4s, v4.4h, v8.4h

    SUB v28.4s, v24.4s , v26.4s

    mov v26.16b, v30.16b
    mov v24.16b, v28.16b

    // VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    //VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    uMULL v2.4s, v24.4h, v18.4h
    uMULL v0.4s, v26.4h, v18.4h

    ushR v22.4s, v22.4s, #16
    sMLAL v22.4s, v5.4h, v8.4h

    ushR v2.4s, v2.4s, #16
    ushR v0.4s, v0.4s, #16
    sMLAL v2.4s, v25.4h, v18.4h
    sMLAL v0.4s, v27.4h, v18.4h

    uMULL v24.4s, v4.4h, v9.4h
    uMULL v26.4s, v6.4h, v8.4h

    NEG v2.4s, v2.4s
    ADD v28.4s, v28.4s , v0.4s
    ADD v30.4s, v30.4s , v2.4s

    uMULL v0.4s, v6.4h, v9.4h
    sshR v24.4s, v24.4s, #16
    sMLAL v24.4s, v5.4h, v9.4h
    sshR v26.4s, v26.4s, #16
    sshR v0.4s, v0.4s, #16
    sMLAL v26.4s, v7.4h, v8.4h
    sMLAL v0.4s, v7.4h, v9.4h

    ADD v22.4s, v22.4s , v0.4s
    NEG v22.4s, v22.4s
    SUB v24.4s, v26.4s , v24.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    MOV w11, w26
    dup v14.4s, w11
    SQADD v28.4s, v28.4s , v14.4s
    //LDR w11, [sp, #116]
    //sxtw x11,w11
    MOV w11, w25
    dup v0.4s, w11
    sQshL v28.4s, v28.4s, v0.4s
    mov v0.16b, v22.16b

    mov v14.16b, v24.16b

    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    // VUZP.16 D24, D25
    UZP1 v19.8h, v24.8h, v24.8h
    UZP2 v21.8h, v24.8h, v24.8h
    MOV v24.d[0], v19.d[0]
    MOV v25.d[0], v21.d[0]

    uMULL v8.4s, v24.4h, v18.4h
    uMULL v26.4s, v22.4h, v18.4h

    NEG v2.4s, v30.4s
    // VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    // VUZP.16 D2, D3
    UZP1 v19.8h, v2.8h, v2.8h
    UZP2 v21.8h, v2.8h, v2.8h
    MOV v2.d[0], v19.d[0]
    MOV v3.d[0], v21.d[0]

    uMULL v4.4s, v30.4h, v12.4h
    uMULL v6.4s, v2.4h, v13.4h

    ushR v8.4s, v8.4s, #16
    ushR v26.4s, v26.4s, #16

    sMLAL v8.4s, v25.4h, v18.4h
    sMLAL v26.4s, v23.4h, v18.4h

    ushR v4.4s, v4.4s, #16
    ushR v6.4s, v6.4s, #16

    MOV v19.d[0], v30.d[1]

    sMLAL v4.4s, v19.4h, v12.4h
    sMLAL v6.4s, v3.4h, v13.4h

    NEG v8.4s, v8.4s
    ADD v14.4s, v14.4s , v26.4s
    ADD v0.4s, v0.4s , v8.4s

    //LDR w11, [sp, #120]
    //sxtw x11,w11
    MOV w11, w26
    dup v8.4s, w11
    SQADD v0.4s, v0.4s , v8.4s
    //LDR w11, [sp, #116]
    //sxtw x11,w11
    MOV w11, w25
    dup v26.4s, w11
    sQshL v0.4s, v0.4s, v26.4s

    mov v26.16b, v28.16b

    // x4 buffer tail: x6 keeps the write pointer while x4 is advanced by the
    // loads. Seven words are read (4 + 3 single lanes) and seven are written.
    MOV x6, x4

    LD1 {v28.2s, v29.2s}, [x4], #16
    movi v19.2s, #0x00000000

    LD1 {v30.s}[0], [x4], #4
    LD1 {v30.s}[1], [x4], #4
    LD1 {v19.s}[0], [x4], #4

    MOV v28.d[1], v29.d[0]
    MOV v30.d[1], v19.d[0]

    //VUZP.32 Q14, Q15
    UZP1 v19.4s, v28.4s, v30.4s
    UZP2 v30.4s, v28.4s, v30.4s
    MOV v28.16b, v19.16b
    MOV v29.d[0], v28.d[1]

    ST1 {v26.s}[0], [x6], #4
    ST1 {v0.s}[0], [x6], #4
    ST1 {v26.s}[1], [x6], #4
    ST1 {v0.s}[1], [x6], #4
    ST1 {v26.s}[2], [x6], #4
    ST1 {v0.s}[2], [x6], #4
    ST1 {v26.s}[3], [x6], #4

    movi v1.2s, #0
    //VADDL.S16 Q0, D13, D1
    SADDL v0.4s, v13.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v26.2d, v28.2s, v0.2s
    Sqxtn v8.2s, v26.2d
    sMULL v26.2d, v29.2s, v1.2s
    Sqxtn v9.2s, v26.2d
    MOV v8.d[1], v9.d[0]

    movi v1.2s, #0
    //VADDL.S16 Q0, D12, D1
    SADDL v0.4s, v12.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v24.2d, v28.2s, v0.2s
    Sqxtn v26.2s, v24.2d
    sMULL v24.2d, v29.2s, v1.2s
    Sqxtn v27.2s, v24.2d
    MOV v26.d[1], v27.d[0]

    sQshL v4.4s, v4.4s, v16.4s
    sQshL v6.4s, v6.4s, v16.4s

    SQSUB v4.4s, v4.4s , v8.4s
    SQSUB v6.4s, v6.4s , v26.4s

    NEG v26.4s, v14.4s
    //VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    // VUZP.16 D26, D27
    UZP1 v19.8h, v26.8h, v26.8h
    UZP2 v21.8h, v26.8h, v26.8h
    MOV v26.d[0], v19.d[0]
    MOV v27.d[0], v21.d[0]

    movi v1.2s, #0
    //VADDL.S16 Q0, D10, D1
    SADDL v0.4s, v10.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v22.2d, v30.2s, v0.2s
    Sqxtn v24.2s, v22.2d
    sMULL2 v22.2d, v30.4s, v0.4s
    Sqxtn v25.2s, v22.2d
    MOV v24.d[1], v25.d[0]

    movi v1.2s, #0
    // VADDL.S16 Q0, D11, D1
    SADDL v0.4s, v11.4h, v1.4h
    MOV v1.d[0], v0.d[1]

    sMULL v8.2d, v30.2s, v0.2s
    Sqxtn v22.2s, v8.2d
    sMULL2 v8.2d, v30.4s, v0.4s
    Sqxtn v23.2s, v8.2d
    MOV v22.d[1], v23.d[0]

    uMULL v8.4s, v26.4h, v11.4h
    uMULL v30.4s, v14.4h, v10.4h

    ushR v8.4s, v8.4s, #16
    ushR v30.4s, v30.4s, #16

    sMLAL v8.4s, v27.4h, v11.4h
    sMLAL v30.4s, v15.4h, v10.4h

    sQshL v4.4s, v4.4s, #2
    sQshL v6.4s, v6.4s, #2

    SQADD v14.4s, v4.4s , v20.4s
    SQADD v6.4s, v6.4s , v20.4s

    sshR v14.4s, v14.4s, #16
    // VUZP.16 D14, D15
    UZP1 v19.8h, v14.8h, v14.8h
    UZP2 v21.8h, v14.8h, v14.8h
    MOV v14.d[0], v19.d[0]
    MOV v15.d[0], v21.d[0]

    sshR v6.4s, v6.4s, #16
    //VUZP.16 D6, D7
    UZP1 v19.8h, v6.8h, v6.8h
    UZP2 v21.8h, v6.8h, v6.8h
    MOV v6.d[0], v19.d[0]
    MOV v7.d[0], v21.d[0]

    mov v15.8b, v6.8b
    sQshL v8.4s, v8.4s, v16.4s
    sQshL v30.4s, v30.4s, v16.4s

    SQSUB v8.4s, v8.4s , v24.4s
    SQSUB v22.4s, v30.4s , v22.4s

    sQshL v30.4s, v8.4s, #2
    sQshL v22.4s, v22.4s, #2

    SQADD v30.4s, v30.4s , v20.4s
    SQADD v22.4s, v22.4s , v20.4s

    sshR v30.4s, v30.4s, #16
    // VUZP.16 D30, D31
    UZP1 v19.8h, v30.8h, v30.8h
    UZP2 v21.8h, v30.8h, v30.8h
    MOV v30.d[0], v19.d[0]
    MOV v30.d[1], v21.d[0]

    sshR v22.4s, v22.4s, #16
    // VUZP.16 D22, D23
    UZP1 v19.8h, v22.8h, v22.8h
    UZP2 v21.8h, v22.8h, v22.8h
    MOV v22.d[0], v19.d[0]
    MOV v23.d[0], v21.d[0]

    mov v23.8b, v30.8b

    // Final stores: seven halfwords per direction (lane 3 of v22 / v23 is
    // not written in this tail).
    ST1 {v14.h}[0], [x0]
    ADD x0, x0, x9
    ST1 {v22.h}[0], [x0]
    ADD x0, x0, x9
    ST1 {v14.h}[1], [x0]
    ADD x0, x0, x9
    ST1 {v22.h}[1], [x0]
    ADD x0, x0, x9
    ST1 {v14.h}[2], [x0]
    ADD x0, x0, x9
    ST1 {v22.h}[2], [x0]
    ADD x0, x0, x9
    ST1 {v14.h}[3], [x0]
    ADD x0, x0, x9

    ST1 {v15.h}[0], [x5]
    ADD x5, x5, x10
    ST1 {v23.h}[0], [x5]
    ADD x5, x5, x10
    ST1 {v15.h}[1], [x5]
    ADD x5, x5, x10
    ST1 {v23.h}[1], [x5]
    ADD x5, x5, x10
    ST1 {v15.h}[2], [x5]
    ADD x5, x5, x10
    ST1 {v23.h}[2], [x5]
    ADD x5, x5, x10
    ST1 {v15.h}[3], [x5]
    ADD x5, x5, x10

    // VPOP {d8 - d15}
    // LDMFD sp!, {x4-x12}
    //ldp x19, x20,[sp],#16
    // Restore everything spilled by push_v_regs and return to the caller.
    pop_v_regs
    ret
    //BX x14