;******************************************************************************
;* TTA DSP SIMD optimizations
;*
;* Copyright (C) 2014 James Almer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Per-lane constants applied to (dl[4..7] >> 30) when rebuilding dx[4..7]:
; each lane is first OR'ed with pd_1224 and then AND'ed with pd_n0113.
; Both tables are used as memory operands of por/pand below.
pd_n0113: dd ~0, ~1, ~1, ~3
pd_1224:  dd  1,  2,  2,  4

SECTION .text

;------------------------------------------------------------------------------
; TTA_FILTER <instruction set>, <number of XMM registers>
;
; Emits one tta_filter_process function for the given instruction set.
;   %1  instruction set name, forwarded to INIT_XMM (ssse3 or sse4 below)
;   %2  XMM register count forwarded to cglobal; the non-SSE4 multiply path
;       additionally uses m7 as scratch, hence 8 for ssse3 and 7 for sse4
;
; Arguments as declared to cglobal (first five are loaded into registers):
;   qm, dx, dl  pointers to arrays that are read and written as two 16-byte
;               halves of 32-bit lanes each (elements [0..3] and [4..7])
;   error       pointer to a 32-bit value: read as the sign selector for the
;               qm update, then overwritten with the incoming *in
;   in          pointer to the 32-bit sample, updated in place
;   shift       32-bit shift count, read through shiftm
;   round       32-bit rounding term, read through roundm
;
; NOTE(review): qm, dx and dl are accessed with mova (aligned move), so they
; are presumably required to be 16-byte aligned by the caller -- confirm.
;
; Lane notation used in the comments below: [lane0, lane1, lane2, lane3].
;------------------------------------------------------------------------------
%macro TTA_FILTER 2
INIT_XMM %1
cglobal tta_filter_process, 5,5,%2, qm, dx, dl, error, in, shift, round
    mova       m2, [qmq       ]     ; m2 = qm[0..3]
    mova       m3, [qmq + 0x10]     ; m3 = qm[4..7]
    mova       m4, [dxq       ]     ; m4 = dx[0..3]
    mova       m5, [dxq + 0x10]     ; m5 = dx[4..7]

    ; Adapt the weights by the sign of the previous error. psignd negates
    ; dx where error < 0, zeroes it where error == 0 and keeps it where
    ; error > 0, so one unconditional add covers all three cases.
    movd       m6, [errorq]         ; if (filter->error < 0) {
    SPLATD     m6                   ;     for (int i = 0; i < 8; i++)
    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
    mova       [qmq       ], m2     ; }
    mova       [qmq + 0x10], m3     ;

    mova       m0, [dlq       ]     ; m0 = dl[0..3]
    mova       m1, [dlq + 0x10]     ; m1 = dl[4..7]

    ; Per-lane 32-bit products: m2 = qm[0..3] * dl[0..3], m3 = qm[4..7] * dl[4..7]
%if cpuflag(sse4)
    pmulld     m2, m0
    pmulld     m3, m1
%else
    ; Emulation of pmulld with pmuludq, which only multiplies lanes 0 and 2
    ; into 64-bit results. Only the low 32 bits of each product are kept.
    pshufd     m6, m0, 0xb1         ; swap adjacent lanes: odd lanes of dl -> lanes 0/2
    pshufd     m7, m2, 0xb1         ; same for qm
    pmuludq    m6, m7               ; 64-bit products of the odd lanes
    pshufd     m6, m6, 0xd8         ; gather their low dwords into lanes 0 and 1
    pmuludq    m2, m0               ; 64-bit products of the even lanes
    pshufd     m2, m2, 0xd8         ; gather their low dwords into lanes 0 and 1
    punpckldq  m2, m6               ; interleave even/odd -> [p0, p1, p2, p3]

    ; Same sequence for the upper halves (elements 4..7)
    pshufd     m6, m1, 0xb1
    pshufd     m7, m3, 0xb1
    pmuludq    m6, m7
    pshufd     m6, m6, 0xd8
    pmuludq    m3, m1
    pshufd     m3, m3, 0xd8
    punpckldq  m3, m6
%endif
    ; Horizontal reduction of the eight products plus round; the final sum
    ; ends up in lane 0 of m6 (the other lanes of m6 are not meaningful).
    ; Using horizontal add (phaddd) seems to be slower than shuffling stuff around
    paddd      m2, m3               ; int sum = filter->round +
                                    ;           filter->dl[0] * filter->qm[0] +
    pshufd     m3, m2, 0xe          ;           filter->dl[1] * filter->qm[1] +
    paddd      m2, m3               ;           filter->dl[2] * filter->qm[2] +
                                    ;           filter->dl[3] * filter->qm[3] +
    movd       m6, roundm           ;           filter->dl[4] * filter->qm[4] +
    paddd      m6, m2               ;           filter->dl[5] * filter->qm[5] +
    pshufd     m2, m2, 0x1          ;           filter->dl[6] * filter->qm[6] +
    paddd      m6, m2               ;           filter->dl[7] * filter->qm[7];

    ; Shift the histories down by one element: m5 = dx[1..4], m2 = dl[1..4]
    palignr    m5, m4, 4            ; filter->dx[0] = filter->dx[1]; filter->dx[1] = filter->dx[2];
                                    ; filter->dx[2] = filter->dx[3]; filter->dx[3] = filter->dx[4];

    palignr    m2, m1, m0, 4        ; filter->dl[0] = filter->dl[1]; filter->dl[1] = filter->dl[2];
                                    ; filter->dl[2] = filter->dl[3]; filter->dl[3] = filter->dl[4];

    ; New dx[4..7], derived from the (still unmodified) dl[4..7] held in m1
    psrad      m4, m1, 30           ; filter->dx[4] = ((filter->dl[4] >> 30) | 1);
    por        m4, [pd_1224 ]       ; filter->dx[5] = ((filter->dl[5] >> 30) | 2) & ~1;
    pand       m4, [pd_n0113]       ; filter->dx[6] = ((filter->dl[6] >> 30) | 2) & ~1;
                                    ; filter->dx[7] = ((filter->dl[7] >> 30) | 4) & ~3;

    mova       [dlq       ], m2     ; store dl[0..3]
    mova       [dxq       ], m5     ; store dx[0..3]
    mova       [dxq + 0x10], m4     ; store dx[4..7]
    movd       m0, [inq]            ; filter->error = *in;
    movd       [errorq], m0         ;

    ; Only lane 0 of m0/m6 carries a meaningful value here; movd stores lane 0.
    movd       m2, shiftm           ; *in += (sum >> filter->shift);
    psrad      m6, m2               ;
    paddd      m0, m6               ;
    movd       [inq], m0            ;

    ; Rebuild dl[4..7] from the old dl[5..7] (m1) and the updated *in (lane 0
    ; of m0). Register contents step by step:
    ;   m1 = [dl5, dl6, dl7, 0]
    ;   m0 = [0, 0, in, in] - m1           = [-dl5, -dl6, in - dl7, in]
    ;   m0 += [m0.1, m0.2, 0, 0]           -> lane1 = dl[5] + dl[6]
    ;   m0 += [m0.2 (pre-add value), 0, 0, 0] -> lane0 = dl[4] + updated dl[5]
    psrldq     m1, 4                ;
    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
    psubd      m0, m1               ; filter->dl[6] = *in - filter->dl[7];
    psrldq     m1, m0, 4            ; filter->dl[7] = *in;
    pshufd     m1, m1, 0xf4         ; filter->dl[5] += filter->dl[6];
    paddd      m0, m1               ; filter->dl[4] += filter->dl[5];
    psrldq     m1, 4                ;
    paddd      m0, m1               ;
    mova       [dlq + 0x10], m0     ; store dl[4..7]
    RET
%endmacro

; Instantiate one function per instruction set (see %2 in the macro header
; for why the register counts differ).
TTA_FILTER ssse3, 8
TTA_FILTER sse4,  7