;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
    HADDD   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

INIT_XMM sse4
; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
;                                     int order, int mul)
cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    SPLATW  m7, m7
    pxor    m6, m6
    add     v1q, orderq
    lea     v2q, [v2q + 2*orderq]
    add     v3q, orderq
    neg     orderq
.loop:
    mova    m3, [v1q + orderq]
    movu    m0, [v2q + 2*orderq]
    pmovsxwd m4, m3
    movu    m1, [v2q + 2*orderq + mmsize]
    movhlps m5, m3
    movu    m2, [v3q + orderq]
    pmovsxwd m5, m5
    pmullw  m2, m7
    pmulld  m0, m4
    pmulld  m1, m5
    paddw   m2, m3
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    add     orderq, 16
    jl .loop
    HADDD   m6, m0
    movd    eax, m6
    RET

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    HADDD   m6, m0
    movd    eax, m6
    RET
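
; Reference only (comment block, not assembled): a minimal scalar C sketch of
; what the int16 kernels above compute, to clarify the prototype comments. The
; function name *_ref is illustrative, and the sketch omits the alignment and
; order-multiple assumptions the SIMD loops rely on; those loops do the same
; work 2*mmsize bytes at a time, with pmaddwd/paddd accumulating the dot
; product and pmullw/paddw performing the in-place madd of v1.
;
;     static int scalarproduct_and_madd_int16_ref(int16_t *v1, int16_t *v2,
;                                                 int16_t *v3, int order,
;                                                 int mul)
;     {
;         int res = 0;
;         while (order--) {
;             res   += *v1 * *v2++;   /* accumulate dot(v1, v2)            */
;             *v1++ += mul * *v3++;   /* v1[i] += mul * v3[i] (low 16 bits) */
;         }
;         return res;
;     }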