;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Rename registers so that the four working vectors are in a, b, c, d order.
; SWAP is an x86inc assemble-time renaming macro; no instructions are emitted.
%macro REORDER_INPUTS 0
  ; a c d b  to a b c d
  SWAP 1, 3, 2
%endmacro

; One 1-D pass over four 16-bit lanes held in the low half of m0-m3.
;   in:      m0 = a, m1 = b, m2 = c, m3 = d
;   out:     m0 = a, m1 = b, m2 = c, m3 = d (updated values)
;   clobber: m4, m5
; Computation, per lane:
;   a1 = a + c
;   d1 = d - b
;   e  = (a1 - d1) >> 1        (done in 32 bits, see below)
;   b' = e - b
;   c' = e - c
;   a' = a1 - b'
;   d' = d1 + c'
%macro TRANSFORM_COLS 0
  ; input:
  ; m0 a
  ; m1 b
  ; m2 c
  ; m3 d
  paddw           m0,        m2             ; a1 = a + c
  psubw           m3,        m1             ; d1 = d - b

  ; wide subtract
  ; punpcklwd places the source word in the upper half of each dword; the
  ; arithmetic shift by 16 then yields the sign-extended 32-bit value and
  ; discards whatever was previously in m4/m5. The difference a1 - d1 is
  ; taken at 32-bit width so that it cannot wrap before being halved.
  punpcklwd       m4,        m0
  punpcklwd       m5,        m3
  psrad           m4,        16
  psrad           m5,        16
  psubd           m4,        m5
  psrad           m4,        1
  packssdw        m4,        m4             ; e

  psubw           m5,        m4,        m1  ; b
  psubw           m4,        m2             ; c
  psubw           m0,        m5
  paddw           m3,        m4
  ; m0 a
  SWAP            1,         5              ; m1 b
  SWAP            2,         4              ; m2 c
  ; m3 d
%endmacro

; Transpose a 4x4 int16 matrix whose rows are in the low halves of m0-m3.
; The result rows are left in the low halves of m0-m3.
%macro TRANSPOSE_4X4 0
  punpcklwd       m0,        m2
  punpcklwd       m1,        m3
  mova            m2,        m0
  punpcklwd       m0,        m1
  punpckhwd       m2,        m1
  pshufd          m1,        m0,        0x0e ; low half of m1 = high half of m0
  pshufd          m3,        m2,        0x0e ; low half of m3 = high half of m2
%endmacro

; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
%macro TRANSPOSE_4X4_WIDE 0
  mova            m3,        m0
  punpcklwd       m0,        m1
  punpckhwd       m3,        m1
  mova            m2,        m0
  punpcklwd       m0,        m3
  punpckhwd       m2,        m3
  pshufd          m1,        m0,        0x0e ; low half of m1 = high half of m0
  pshufd          m3,        m2,        0x0e ; low half of m3 = high half of m2
%endmacro

; Add two rows of four 16-bit residuals to two rows of four 8-bit pixels at
; [outputq] and [outputq + strideq], and store the results back in place.
;   %1, %2 : registers holding the residual rows (overwritten)
;   %3, %4 : scratch registers (overwritten)
;   %5     : register that must contain all zeros
; The sum is packed with unsigned saturation, so each stored byte is clamped
; to 0..255.
%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
  movd            m%3,       [outputq]
  movd            m%4,       [outputq + strideq]
  punpcklbw       m%3,       m%5            ; zero-extend bytes to words
  punpcklbw       m%4,       m%5
  paddw           m%1,       m%3
  paddw           m%2,       m%4
  packuswb        m%1,       m%5
  packuswb        m%2,       m%5
  movd            [outputq], m%1
  movd            [outputq + strideq], m%2
%endmacro

;-----------------------------------------------------------------------------
; iwht4x4_16_add(input, output, stride)
;
; Reads sixteen 32-bit coefficients from inputq (64 bytes), narrows them to
; 16 bits with signed saturation, shifts each right by 2, applies
; TRANSFORM_COLS along both dimensions of the 4x4 block, and adds the result
; to the 4x4 block of bytes at outputq (rows strideq bytes apart).
;
; In:      inputq  - coefficient block; accessed with mova and with SSE2
;                    memory operands, so it must be 16-byte aligned
;          outputq - top-left byte of the destination block
;          strideq - byte distance between destination rows
; Out:     destination block updated in place; outputq is advanced by
;          2 * strideq
; Clobber: m0-m6
;
; NOTE(review): strideq is used at full register width. If the C prototype
; passes stride as a 32-bit int, its upper half is not guaranteed to be
; sign-extended on x86-64 - confirm against the caller.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
  mova            m0,        [inputq +  0]
  packssdw        m0,        [inputq + 16]  ; m0 = first 8 coefficients as int16
  mova            m1,        [inputq + 32]
  packssdw        m1,        [inputq + 48]  ; m1 = last 8 coefficients as int16
  psraw           m0,        2
  psraw           m1,        2

  TRANSPOSE_4X4_WIDE
  REORDER_INPUTS
  TRANSFORM_COLS
  TRANSPOSE_4X4
  REORDER_INPUTS
  TRANSFORM_COLS

  pxor            m4,        m4             ; zero register for ADD_STORE_4P_2X
  ADD_STORE_4P_2X  0, 1, 5, 6, 4
  lea             outputq,   [outputq + 2 * strideq]
  ADD_STORE_4P_2X  2, 3, 5, 6, 4

  RET