;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

SECTION .text

; Renames the row registers so that later code can refer to the rows as
; m0..m3 = a, b, c, d. SWAP is x86inc's register-renaming macro.
%macro REORDER_INPUTS 0
  ; a c d b  to  a b c d
  SWAP 1, 3, 2
%endmacro

; One butterfly pass over four rows of int16 values held in the low halves
; of m0-m3. In scalar terms, per lane:
;   a += c
;   d -= b
;   e  = (a - d) >> 1
;   b  = e - b
;   c  = e - c
;   a -= b
;   d += c
; Clobbers m4 and m5 (they end up holding the old b/c registers after the
; trailing SWAPs).
%macro TRANSFORM_COLS 0
  ; input:
  ; m0 a
  ; m1 b
  ; m2 c
  ; m3 d
  paddw           m0,        m2
  psubw           m3,        m1

  ; wide subtract
  ; Each low word of m0/m3 is placed in the upper half of a dword and then
  ; shifted back down arithmetically, i.e. sign-extended to 32 bits. The
  ; previous contents of m4/m5 are shifted out, so they need no
  ; initialisation. The subtract and halving are then done at 32 bits
  ; before packing back to words.
  punpcklwd       m4,        m0
  punpcklwd       m5,        m3
  psrad           m4,        16
  psrad           m5,        16
  psubd           m4,        m5
  psrad           m4,        1
  packssdw        m4,        m4             ; e

  psubw           m5,        m4,        m1  ; b
  psubw           m4,        m2             ; c
  psubw           m0,        m5
  paddw           m3,        m4
  ; m0 a
  SWAP            1,         5              ; m1 b
  SWAP            2,         4              ; m2 c
  ; m3 d
%endmacro

; Transposes a 4x4 int16 matrix whose rows are in the low halves of m0-m3.
; The result rows are left in the low halves of m0-m3.
%macro TRANSPOSE_4X4 0
  punpcklwd       m0,        m2
  punpcklwd       m1,        m3
  mova            m2,        m0
  punpcklwd       m0,        m1
  punpckhwd       m2,        m1
  ; 0x0e copies the upper 64 bits of the source into the low 64 bits of
  ; the destination.
  pshufd          m1,        m0,        0x0e
  pshufd          m3,        m2,        0x0e
%endmacro

; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
%macro TRANSPOSE_4X4_WIDE 0
  mova            m3,        m0
  punpcklwd       m0,        m1
  punpckhwd       m3,        m1
  mova            m2,        m0
  punpcklwd       m0,        m3
  punpckhwd       m2,        m3
  ; 0x0e copies the upper 64 bits of the source into the low 64 bits of
  ; the destination.
  pshufd          m1,        m0,        0x0e
  pshufd          m3,        m2,        0x0e
%endmacro

; Adds two rows of four int16 values to two rows of four bytes in memory
; and writes the sums back with unsigned saturation.
;   %1, %2: registers holding the int16 rows for [outputq] and
;           [outputq + strideq]; overwritten
;   %3, %4: scratch registers; overwritten
;   %5:     register that must contain all zeros
%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
  movd            m%3,       [outputq]
  movd            m%4,       [outputq + strideq]
  punpcklbw       m%3,       m%5            ; bytes -> words (zero-extend)
  punpcklbw       m%4,       m%5
  paddw           m%1,       m%3
  paddw           m%2,       m%4
  packuswb        m%1,       m%5            ; words -> bytes, clamp to 0..255
  packuswb        m%2,       m%5
  movd            [outputq], m%1
  movd            [outputq + strideq], m%2
%endmacro

;-----------------------------------------------------------------------
; iwht4x4_16_add(input, output, stride)
;
; Reads 16 coefficients from input, shifts each right by 2, runs the
; TRANSFORM_COLS butterfly over both dimensions of the 4x4 block, and
; adds the result to the 4x4 block of bytes at output (rows stride bytes
; apart), storing with unsigned saturation.
;
; cglobal declares 3 arguments, 3 general-purpose registers and 7 vector
; registers (m0-m6).
; Modifies outputq.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
  ; NOTE(review): stride is assumed to be a 32-bit int in the C prototype
  ; (confirm against the declaration). On x86-64 the upper half of the
  ; register carrying a 32-bit argument is unspecified, so sign-extend it
  ; before strideq is used in address arithmetic. No instruction is emitted
  ; when strideq and strided name the same register.
  movsxdifnidn    strideq,   strided

  ; LOAD_TRAN_LOW comes from bitdepth_conversion_sse2.asm; presumably each
  ; call loads eight coefficients as int16 starting at the given
  ; coefficient offset.
  LOAD_TRAN_LOW    0,        inputq, 0
  LOAD_TRAN_LOW    1,        inputq, 8
  psraw           m0,        2
  psraw           m1,        2

  TRANSPOSE_4X4_WIDE
  REORDER_INPUTS
  TRANSFORM_COLS
  TRANSPOSE_4X4
  REORDER_INPUTS
  TRANSFORM_COLS

  ; m0-m3 now hold output rows 0-3; m4 is the zero register for the stores.
  pxor            m4,        m4
  ADD_STORE_4P_2X  0, 1, 5, 6, 4
  lea             outputq,   [outputq + 2 * strideq]
  ADD_STORE_4P_2X  2, 3, 5, 6, 4

  RET