/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// {   // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8\() \arg9\()
    saddl \arg4\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][0] = src[0] + src[2];
    ssubl \arg5\().4s, \arg0\().4h, \arg2\().4h //int32 e[i][1] = src[0] - src[2];
    sshr  \arg8\().4h, \arg1\().4h, #1
    sshr  \arg9\().4h, \arg3\().4h, #1
    ssubl \arg6\().4s, \arg8\().4h, \arg3\().4h //int32 e[i][2] = (src[1]>>1) - src[3];
    saddl \arg7\().4s, \arg1\().4h, \arg9\().4h //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm

.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// used by both the row and the column transform
// {   // output: f_q[0]~[3], input: e_q[0]~[3];
    add \arg0\().4s, \arg4\().4s, \arg7\().4s //int32 f[i][0] = e[i][0] + e[i][3];
    add \arg1\().4s, \arg5\().4s, \arg6\().4s //int32 f[i][1] = e[i][1] + e[i][2];
    sub \arg2\().4s, \arg5\().4s, \arg6\().4s //int32 f[i][2] = e[i][1] - e[i][2];
    sub \arg3\().4s, \arg4\().4s, \arg7\().4s //int32 f[i][3] = e[i][0] - e[i][3];
// }
.endm

.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// {   // input: src_q[0]~[3], output: e_q[0]~[3];
    add  \arg4\().4s, \arg0\().4s, \arg2\().4s //int32 e[0][j] = f[0][j] + f[2][j];
    sub  \arg5\().4s, \arg0\().4s, \arg2\().4s //int32 e[1][j] = f[0][j] - f[2][j];
    sshr \arg6\().4s, \arg1\().4s, #1
    sshr \arg7\().4s, \arg3\().4s, #1
    sub  \arg6\().4s, \arg6\().4s, \arg3\().4s //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
    add  \arg7\().4s, \arg1\().4s, \arg7\().4s //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
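
// The three macros above implement the butterflies of the H.264 4x4 inverse
// integer transform. For orientation, a minimal C sketch of what
// IdctResAddPred_AArch64_neon below computes (illustrative only, not part of
// the build; the function and variable names here are hypothetical):
//
//   #include <stdint.h>
//
//   static void IdctResAddPredRef (uint8_t* pred, int32_t stride, int16_t* rs) {
//     int32_t f[16], e[4];
//     for (int i = 0; i < 4; i++) {                   // row transform
//       e[0] = rs[i * 4 + 0] + rs[i * 4 + 2];
//       e[1] = rs[i * 4 + 0] - rs[i * 4 + 2];
//       e[2] = (rs[i * 4 + 1] >> 1) - rs[i * 4 + 3];
//       e[3] = rs[i * 4 + 1] + (rs[i * 4 + 3] >> 1);
//       f[i * 4 + 0] = e[0] + e[3];
//       f[i * 4 + 1] = e[1] + e[2];
//       f[i * 4 + 2] = e[1] - e[2];
//       f[i * 4 + 3] = e[0] - e[3];
//     }
//     for (int j = 0; j < 4; j++) {                   // column transform
//       e[0] = f[0 * 4 + j] + f[2 * 4 + j];
//       e[1] = f[0 * 4 + j] - f[2 * 4 + j];
//       e[2] = (f[1 * 4 + j] >> 1) - f[3 * 4 + j];
//       e[3] = f[1 * 4 + j] + (f[3 * 4 + j] >> 1);
//       int32_t g[4] = { e[0] + e[3], e[1] + e[2], e[1] - e[2], e[0] - e[3] };
//       for (int i = 0; i < 4; i++) {                 // round, add pred, clip
//         int32_t v = pred[i * stride + j] + ((g[i] + 32) >> 6);
//         pred[i * stride + j] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
//       }
//     }
//   }
//
// The NEON version does the same work four lanes at a time: ld4 de-interleaves
// the residual so each register holds one column of all four rows, the
// trn1/trn2 pairs transpose between the row and column passes, rshrn performs
// the rounding (x + 32) >> 6, and sqxtun saturates the sum into [0, 255].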

// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_AARCH64_FUNC_BEGIN IdctResAddPred_AArch64_neon
    SIGN_EXTENSION x1, w1
    ld4 {v0.4h, v1.4h, v2.4h, v3.4h}, [x2] // cost 3 cycles!
    ROW_TRANSFORM_1_STEP v0, v1, v2, v3, v16, v17, v18, v19, v4, v5
    TRANSFORM_4BYTES v0, v1, v2, v3, v16, v17, v18, v19
    // transpose the 32-bit elements between the row and column passes
    trn1 v16.4s, v0.4s, v1.4s  //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]
    trn2 v17.4s, v0.4s, v1.4s  //[0 1 2 3]+[4 5 6 7]-->[1 5 3 7]
    trn1 v18.4s, v2.4s, v3.4s  //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]
    trn2 v19.4s, v2.4s, v3.4s  //[8 9 10 11]+[12 13 14 15]-->[9 13 11 15]
    trn1 v0.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]
    trn2 v2.2d, v16.2d, v18.2d //[0 4 2 6]+[8 12 10 14]-->[2 6 10 14]
    trn1 v1.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]
    trn2 v3.2d, v17.2d, v19.2d //[1 5 3 7]+[9 13 11 15]-->[3 7 11 15]
    COL_TRANSFORM_1_STEP v0, v1, v2, v3, v16, v17, v18, v19

    TRANSFORM_4BYTES v0, v1, v2, v3, v16, v17, v18, v19
    // add the prediction and clip the result into [0, 255]
    mov x2, x0                 // keep the pred base address for the store pass
    ld1 {v16.s}[0], [x0], x1
    ld1 {v16.s}[1], [x0], x1
    ld1 {v17.s}[0], [x0], x1
    ld1 {v17.s}[1], [x0]

    rshrn  v0.4h, v0.4s, #6    // rounding: (x + 32) >> 6, narrowed to 16 bits
    rshrn2 v0.8h, v1.4s, #6
    rshrn  v1.4h, v2.4s, #6
    rshrn2 v1.8h, v3.4s, #6

    uxtl v2.8h, v16.8b
    uxtl v3.8h, v17.8b
    add  v2.8h, v2.8h, v0.8h
    add  v3.8h, v3.8h, v1.8h

    sqxtun v0.8b, v2.8h        // saturate into [0, 255]
    sqxtun v1.8b, v3.8h

    st1 {v0.s}[0], [x2], x1
    st1 {v0.s}[1], [x2], x1
    st1 {v1.s}[0], [x2], x1
    st1 {v1.s}[1], [x2]
WELS_ASM_AARCH64_FUNC_END

// int16_t *block, const int32_t stride (in int16_t units)
WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero16x16_AArch64_neon
    eor v0.16b, v0.16b, v0.16b
    eor v1.16b, v1.16b, v1.16b
    SIGN_EXTENSION x1, w1
    lsl x1, x1, 1              // convert the int16_t stride to a byte stride
.rept 16
    st1 {v0.16b, v1.16b}, [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END

// int16_t *block, const int32_t stride (in int16_t units)
WELS_ASM_AARCH64_FUNC_BEGIN WelsBlockZero8x8_AArch64_neon
    eor v0.16b, v0.16b, v0.16b
    SIGN_EXTENSION x1, w1
    lsl x1, x1, 1              // convert the int16_t stride to a byte stride
.rept 8
    st1 {v0.16b}, [x0], x1
.endr
WELS_ASM_AARCH64_FUNC_END
#endif
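
// For reference, a minimal C sketch of the two zeroing routines above
// (illustrative only, not part of the build; the helper name is hypothetical):
// each clears a size x size block of int16_t coefficients, which is why the
// element stride is shifted left by 1 to become a byte stride.
//
//   #include <stdint.h>
//   #include <string.h>
//
//   static void BlockZeroRef (int16_t* block, int32_t stride, int32_t size) {
//     for (int32_t i = 0; i < size; i++)
//       memset (block + i * stride, 0, size * sizeof (int16_t)); // one row per iteration
//   }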