///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Syntax: GNU as, AArch64 (A64 + Advanced SIMD). Comment character is "//".

// push_v_regs: save q8-q15, x8-x17 and x29/x30 on the stack (224 bytes in
// total; sp stays 16-byte aligned after every store).
// NOTE(review): under AAPCS64 only the low 64 bits of v8-v15 are callee-saved
// and x8-x17 are caller-saved, so this frame is larger than strictly needed.
// It is left as-is because it is harmless and pop_v_regs mirrors it exactly.
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!
.endm

// pop_v_regs: restore everything saved by push_v_regs, in reverse order.
.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm

.text
.global ixheaacd_over_lap_add2_armv8

//------------------------------------------------------------------------------
// ixheaacd_over_lap_add2_armv8
//
// Windowed combination of two 32-bit sample buffers with 16-bit window
// coefficients, written to a strided 32-bit output in two halves.
//
// Register roles on entry (as used by the code below; the C prototype is not
// part of this file -- presumably
//   (int32 *a, int32 *b, int32 *out, const int16 *win,
//    int16 q_shift, int32 size, int32 ch_fac)  -- TODO confirm):
//   x0  a    : 32-bit words; first half reads a[size ...] upwards,
//              second half reads a[2*size-1 ...] downwards
//   x1  b    : 32-bit words; first half reads b[size-1 ...] downwards,
//              second half reads b[0 ...] upwards
//   x2  out  : 32-bit words, stored one lane at a time with a byte stride of
//              4*x6; the second half starts at out + 4*(size*ch_fac)
//   x3  win  : 16-bit coefficients, consumed as even/odd pairs (LD2);
//              first half reads win[0 ...] upwards, second half reads
//              win[2*size-1 ...] downwards
//   x4  q    : shift control; shift = q + 1 - 16, applied with SSHL
//              (a negative value shifts right); rounding term added before
//              the shift is 1 << (-(shift + 1)) (register LSL, amount mod 64)
//   x5  size : outputs per half
//   x6  ch_fac : output stride in 32-bit words
//
// Per output lane (x = word from one buffer, y = word from the other,
// w0/w1 = the two window coefficients selected for that lane):
//   acc  = (lo16(x)*w0 - lo16(y)*w1) >> 16     // UMULL/UMLSL, then SSHR
//   acc += hi16(x)*w0 - hi16(y)*w1             // SMLAL/SMLSL
//   out  = SSHL(SQADD(acc, round), shift)
// In the second half x is the saturating negation (SQNEG) of the word read
// from x0.
//
// Both loops are software pipelined: each pass stores the quad finished in the
// previous pass (V24), finishes and stores a second quad (V16), and starts the
// next one. Each half therefore always stores at least 16 outputs (8 inside
// the loop, 8 around it).
// NOTE(review): callers presumably pass size as a multiple of 8 that is >= 16
// -- confirm; smaller values would still store 16 outputs per half.
// NOTE(review): x4/x5/x6 are consumed as full 64-bit registers in several
// places (e.g. LSL X9, X5, #2). If these arguments are 32-bit or narrower in
// the C prototype, AAPCS64 leaves their upper bits unspecified -- verify that
// callers provide extended values.
//
// Clobbers: x0-x17 scratch usage (x8-x17 restored by pop_v_regs), v0-v31
// (q8-q15 restored by pop_v_regs), condition flags.
//------------------------------------------------------------------------------
ixheaacd_over_lap_add2_armv8:

    push_v_regs

    // ---- First half: set-up -------------------------------------------------
    MOV             X8, X5                      // x8 = loop counter (size)
    SUB             X12, X5, #1
    LSL             X9, X5, #2
    LSL             X12, X12, #2
    ADD             X10, X0, X9                 // x10 = &a[size], walks upwards
    ADD             X7, X1, X12                 // x7  = &b[size-1]
    ADD             X4, X4, #1
    LD2             {V0.4H, V1.4H}, [X10], #16  // V0 = lo16, V1 = hi16 of 4 words
    LSL             X11, X6, #2                 // x11 = output stride in bytes
    SUB             X7, X7, #12                 // x7  = &b[size-4], walks downwards
    SUB             X4, X4, #16                 // x4  = shift = q + 1 - 16
    MOV             X12, #-16                   // post-index step for backward reads
    MOV             X13, #1
    ADD             X14, X4, #1
    NEG             X14, X14                    // x14 = -(shift + 1)
    DUP             V21.4S, W4                  // V21 = shift (per lane)
    LD2             {V6.4H, V7.4H}, [X7], X12
    LSL             X4, X13, X14                // x4  = 1 << (-(shift + 1))
    REV64           V4.4H, V6.4H                // reverse lane order of b (lo16)
    DUP             V20.4S, W4                  // V20 = rounding term (per lane)
    REV64           V5.4H, V7.4H                // reverse lane order of b (hi16)
    MOV             X4, X3                      // keep window base for second half
    MOV             X9, X2                      // keep output base for second half
    LD2             {V2.4H, V3.4H}, [X3], #16   // V2 = even, V3 = odd window coeffs

    // First quad of the pipeline (result in V24), next quad's inputs preloaded.
    UMULL           V23.4S, V0.4H, V2.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    LD2             {V8.4H, V9.4H}, [X10], #16
    SSHR            V23.4S, V23.4S, #16
    LD2             {V10.4H, V11.4H}, [X3], #16
    SMLAL           V23.4S, V1.4H, V2.4H
    SMLSL           V23.4S, V5.4H, V3.4H
    LD2             {V14.4H, V15.4H}, [X7], X12
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B
    SUB             X8, X8, #8

    // ---- First half: main loop, 8 outputs per pass --------------------------
LOOP_1:
    LD2             {V0.4H, V1.4H}, [X10], #16
    UMULL           V19.4S, V8.4H, V10.4H
    LD2             {V2.4H, V3.4H}, [X3], #16
    UMLSL           V19.4S, V12.4H, V11.4H
    LD2             {V6.4H, V7.4H}, [X7], X12
    UMULL           V23.4S, V0.4H, V2.4H
    REV64           V4.4H, V6.4H
    UMLSL           V23.4S, V4.4H, V3.4H
    REV64           V5.4H, V7.4H
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[0], [X2], X11       // store quad finished last pass
    SMLAL           V19.4S, V9.4H, V10.4H
    ST1             {V24.S}[1], [X2], X11
    SSHR            V23.4S, V23.4S, #16
    ST1             {V24.S}[2], [X2], X11
    SMLAL           V23.4S, V1.4H, V2.4H
    ST1             {V24.S}[3], [X2], X11
    SMLSL           V19.4S, V13.4H, V11.4H
    SMLSL           V23.4S, V5.4H, V3.4H
    LD2             {V8.4H, V9.4H}, [X10], #16
    LD2             {V10.4H, V11.4H}, [X3], #16
    LD2             {V14.4H, V15.4H}, [X7], X12
    SQADD           V18.4S, V19.4S, V20.4S
    REV64           V12.4H, V14.4H
    REV64           V13.4H, V15.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X2], X11
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B            // carried into the next pass
    SUBS            X8, X8, #8
    ST1             {V16.S}[1], [X2], X11
    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11
    BGT             LOOP_1

    // ---- First half: drain the pipeline; second-half set-up interleaved -----
    ST1             {V24.S}[0], [X2], X11
    UMULL           V19.4S, V8.4H, V10.4H
    UMLSL           V19.4S, V12.4H, V11.4H
    ST1             {V24.S}[1], [X2], X11
    ST1             {V24.S}[2], [X2], X11
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X2], X11
    SMLAL           V19.4S, V9.4H, V10.4H
    SMLSL           V19.4S, V13.4H, V11.4H
    MOV             X12, #12
    MOV             V30.S[0], W5
    MOV             V31.S[0], W6
    SMULL           V29.4S, V30.4H, V31.4H      // lane 0 = lo16(size) * lo16(ch_fac), signed
    MOV             W7, V29.S[0]                // only lane 0 of the product is used
    LSL             W10, W5, #1                 // x10 = 2 * size
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X2], X11
    LSL             X7, X7, #2
    ST1             {V16.S}[1], [X2], X11
    ADD             X7, X7, X9                  // x7 = out + 4 * (size * ch_fac)
    ST1             {V16.S}[2], [X2], X11
    ST1             {V16.S}[3], [X2], X11

    // ---- Second half: set-up ------------------------------------------------
    SUB             X11, X10, #1                // x11 = 2*size - 1
    LSL             X10, X11, #2
    ADD             X10, X0, X10                // &a[2*size-1]
    LSL             X11, X11, #1
    SUB             X10, X10, X12               // x10 = &a[2*size-4], walks downwards
    LSL             X8, X6, #2                  // x8  = output stride in bytes
    MOV             X12, #-16                   // post-index step for backward reads
    ADD             X11, X11, X4                // &win[2*size-1]
    LD1             {V6.4S}, [X10], X12
    SUB             X11, X11, #14               // x11 = &win[2*size-8], walks downwards

    // Negate four words of a and split them into lo16 (V1) / hi16 (V0) with
    // the word order reversed (REV64 + UZP + REV64 gives a full 4-lane reversal
    // in the low 64 bits).
    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12
    REV64           V2.4H, V2.4H
    REV64           V3.4H, V3.4H
    LD2             {V4.4H, V5.4H}, [X1], #16   // b read forwards: V4 = lo16, V5 = hi16

    // First quad of the pipeline (result in V24).
    UMULL           V23.4S, V1.4H, V3.4H
    UMLSL           V23.4S, V4.4H, V2.4H
    SSHR            V23.4S, V23.4S, #16
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V22.4S, V22.4S, V21.4S
    MOV             V24.16B, V22.16B

    // Preload and prepare the inputs of the second quad (V8..V13).
    LD1             {V14.4S}, [X10], X12
    REV64           V8.4S, V14.4S
    SQNEG           V8.4S, V8.4S
    LD2             {V10.4H, V11.4H}, [X11], X12
    LD2             {V12.4H, V13.4H}, [X1], #16
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    REV64           V9.4S, V9.4S
    REV64           V8.4S, V8.4S
    REV64           V10.4H, V10.4H
    REV64           V11.4H, V11.4H
    SUB             X5, X5, #8

    // ---- Second half: main loop, 8 outputs per pass -------------------------
LOOP_2:
    LD1             {V6.4S}, [X10], X12
    UMULL           V19.4S, V9.4H, V11.4H
    REV64           V0.4S, V6.4S
    SQNEG           V0.4S, V0.4S
    UZP1            V1.8H, V0.8H, V0.8H
    UZP2            V0.8H, V0.8H, V0.8H
    REV64           V1.4S, V1.4S
    REV64           V0.4S, V0.4S
    LD2             {V2.4H, V3.4H}, [X11], X12
    REV64           V2.4H, V2.4H                // only the low 64 bits are live
    REV64           V3.4H, V3.4H
    LD2             {V4.4H, V5.4H}, [X1], #16
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[0], [X7], X8        // store quad finished last pass
    UMULL           V23.4S, V1.4H, V3.4H
    ST1             {V24.S}[1], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[2], [X7], X8
    UMLSL           V23.4S, V4.4H, V2.4H
    ST1             {V24.S}[3], [X7], X8
    SMLAL           V19.4S, V8.4H, V11.4H
    LD1             {V14.4S}, [X10], X12
    SSHR            V23.4S, V23.4S, #16
    SMLSL           V19.4S, V13.4H, V10.4H
    LD2             {V10.4H, V11.4H}, [X11], X12
    SMLAL           V23.4S, V0.4H, V3.4H
    SMLSL           V23.4S, V5.4H, V2.4H
    REV64           V8.4S, V14.4S
    LD2             {V12.4H, V13.4H}, [X1], #16
    SQNEG           V8.4S, V8.4S
    REV64           V11.4H, V11.4H
    REV64           V10.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    UZP1            V9.8H, V8.8H, V8.8H
    UZP2            V8.8H, V8.8H, V8.8H
    REV64           V9.4S, V9.4S
    REV64           V8.4S, V8.4S
    SQADD           V22.4S, V23.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    SUBS            X5, X5, #8
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X7], X8
    SSHL            V22.4S, V22.4S, V21.4S
    ST1             {V16.S}[1], [X7], X8
    MOV             V24.16B, V22.16B            // carried into the next pass
    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8
    BGT             LOOP_2

    // ---- Second half: drain the pipeline ------------------------------------
    ST1             {V24.S}[0], [X7], X8
    UMULL           V19.4S, V9.4H, V11.4H
    UMLSL           V19.4S, V12.4H, V10.4H
    ST1             {V24.S}[1], [X7], X8
    ST1             {V24.S}[2], [X7], X8
    SSHR            V19.4S, V19.4S, #16
    ST1             {V24.S}[3], [X7], X8
    SMLAL           V19.4S, V8.4H, V11.4H
    SMLSL           V19.4S, V13.4H, V10.4H
    SQADD           V18.4S, V19.4S, V20.4S
    SSHL            V18.4S, V18.4S, V21.4S
    MOV             V16.16B, V18.16B
    ST1             {V16.S}[0], [X7], X8
    ST1             {V16.S}[1], [X7], X8
    ST1             {V16.S}[2], [X7], X8
    ST1             {V16.S}[3], [X7], X8

    pop_v_regs
    RET