/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"

.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// {   // input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8, \arg9
    vaddl.s16       \arg4, \arg0, \arg2     //int32 e[i][0] = src[0] + src[2];
    vsubl.s16       \arg5, \arg0, \arg2     //int32 e[i][1] = src[0] - src[2];
    vshr.s16        \arg8, \arg1, #1        //(src[1]>>1)
    vshr.s16        \arg9, \arg3, #1        //(src[3]>>1)
    vsubl.s16       \arg6, \arg8, \arg3     //int32 e[i][2] = (src[1]>>1) - src[3];
    vaddl.s16       \arg7, \arg1, \arg9     //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm

.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // used by both row & col transform
// {   // input: e_q[0]~[3], output: f_q[0]~[3];
    vadd.s32        \arg0, \arg4, \arg7     //int32 f[i][0] = e[i][0] + e[i][3];
    vadd.s32        \arg1, \arg5, \arg6     //int32 f[i][1] = e[i][1] + e[i][2];
    vsub.s32        \arg2, \arg5, \arg6     //int32 f[i][2] = e[i][1] - e[i][2];
    vsub.s32        \arg3, \arg4, \arg7     //int32 f[i][3] = e[i][0] - e[i][3];
// }
.endm

.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// {   // input: src_q[0]~[3], output: e_q[0]~[3];
    vadd.s32        \arg4, \arg0, \arg2     //int32 e[0][j] = f[0][j] + f[2][j];
    vsub.s32        \arg5, \arg0, \arg2     //int32 e[1][j] = f[0][j] - f[2][j];
    vshr.s32        \arg6, \arg1, #1        //(f[1][j]>>1)
    vshr.s32        \arg7, \arg3, #1        //(f[3][j]>>1)
    vsub.s32        \arg6, \arg6, \arg3     //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
    vadd.s32        \arg7, \arg1, \arg7     //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm

// void IdctResAddPred_neon (uint8_t *pred, const int32_t stride, int16_t *rs)
// r0 = pred, r1 = stride (in bytes), r2 = rs (4x4 residual coefficients)
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon

    vld4.s16        {d0, d1, d2, d3}, [r2]  // cost 3 cycles!
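    // Note: vld4.s16 de-interleaves the 16 coefficients by four, so d0~d3 each
    // hold one column of the row-major 4x4 block: d0 = {rs[0], rs[4], rs[8],
    // rs[12]}, and in general lane n of dj is src[j] of row n. The row transform
    // below therefore processes all four rows of the block in parallel.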

    ROW_TRANSFORM_1_STEP    d0, d1, d2, d3, q8, q9, q10, q11, d4, d5

    TRANSFORM_4BYTES        q0, q1, q2, q3, q8, q9, q10, q11

    // transpose the 4x4 matrix of 32-bit elements
    vtrn.s32        q0, q1                  //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
    vtrn.s32        q2, q3                  //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
    vswp            d1, d4                  //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
    vswp            d3, d6                  //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]

    COL_TRANSFORM_1_STEP    q0, q1, q2, q3, q8, q9, q10, q11

    TRANSFORM_4BYTES        q0, q1, q2, q3, q8, q9, q10, q11

    // load the 4x4 prediction block; the final sum is saturated into [0, 255]
    // by vqmovun below
    mov             r2, r0
    vld1.32         {d20[0]}, [r0], r1
    vld1.32         {d20[1]}, [r0], r1
    vld1.32         {d22[0]}, [r0], r1
    vld1.32         {d22[1]}, [r0]

    vrshrn.s32      d16, q0, #6             // rounding shift: (coeff + 32) >> 6
    vrshrn.s32      d17, q1, #6
    vrshrn.s32      d18, q2, #6
    vrshrn.s32      d19, q3, #6

    vmovl.u8        q0, d20                 // widen prediction to 16 bits
    vmovl.u8        q1, d22
    vadd.s16        q0, q8                  // prediction + residual
    vadd.s16        q1, q9

    vqmovun.s16     d20, q0                 // saturate to [0, 255] and narrow
    vqmovun.s16     d22, q1

    vst1.32         {d20[0]}, [r2], r1
    vst1.32         {d20[1]}, [r2], r1
    vst1.32         {d22[0]}, [r2], r1
    vst1.32         {d22[1]}, [r2]
WELS_ASM_FUNC_END


// void WelsBlockZero16x16_neon (int16_t *block, int32_t stride)
WELS_ASM_FUNC_BEGIN WelsBlockZero16x16_neon
    veor            q0, q0
    veor            q1, q1
    lsl             r1, r1, #1              // stride in int16 elements -> bytes
.rept 16
    vst1.64         {q0, q1}, [r0], r1      // zero one 16-element row (32 bytes)
.endr
WELS_ASM_FUNC_END

// void WelsBlockZero8x8_neon (int16_t *block, int32_t stride)
WELS_ASM_FUNC_BEGIN WelsBlockZero8x8_neon
    veor            q0, q0
    lsl             r1, r1, #1              // stride in int16 elements -> bytes
.rept 8
    vst1.64         {q0}, [r0], r1          // zero one 8-element row (16 bytes)
.endr
WELS_ASM_FUNC_END
#endif
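
// For reference, a plain-C sketch of what IdctResAddPred_neon computes
// (illustrative only, not part of the build; the name IdctResAddPred_ref is
// hypothetical). It is the standard H.264 4x4 inverse transform that the macro
// comments above describe, followed by the rounding shift (x + 32) >> 6, the
// add of the prediction, and the clip to [0, 255] (vrshrn/vadd/vqmovun):
//
//   static void IdctResAddPred_ref (uint8_t* pred, int32_t stride, int16_t* rs) {
//     int32_t e[4], f[4][4], g[4];
//     for (int i = 0; i < 4; i++) {            // horizontal (row) transform
//       const int16_t* s = rs + (i << 2);
//       e[0] = s[0] + s[2];
//       e[1] = s[0] - s[2];
//       e[2] = (s[1] >> 1) - s[3];
//       e[3] = s[1] + (s[3] >> 1);
//       f[i][0] = e[0] + e[3];
//       f[i][1] = e[1] + e[2];
//       f[i][2] = e[1] - e[2];
//       f[i][3] = e[0] - e[3];
//     }
//     for (int j = 0; j < 4; j++) {            // vertical (column) transform
//       e[0] = f[0][j] + f[2][j];
//       e[1] = f[0][j] - f[2][j];
//       e[2] = (f[1][j] >> 1) - f[3][j];
//       e[3] = f[1][j] + (f[3][j] >> 1);
//       g[0] = e[0] + e[3];
//       g[1] = e[1] + e[2];
//       g[2] = e[1] - e[2];
//       g[3] = e[0] - e[3];
//       for (int i = 0; i < 4; i++) {          // round, add prediction, clip
//         int32_t v = pred[i * stride + j] + ((g[i] + 32) >> 6);
//         pred[i * stride + j] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
//       }
//     }
//   }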