;******************************************************************************
;* VC1 motion compensation optimizations
;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

cextern pw_9
cextern pw_128

SECTION .text

%if HAVE_MMX_INLINE

; XXX some of these macros are not used right now, but they will be used
;     in the future when more functions are ported.

; OP_PUT is intentionally empty: the result is simply stored as-is.
%macro OP_PUT 2 ; dst, src
%endmacro

; OP_AVG averages the result with what is already in the destination.
%macro OP_AVG 2 ; dst, src
    pavgb           %1, %2
%endmacro

%macro NORMALIZE_MMX 1 ; shift
    paddw           m3, m7 ; +bias-r
    paddw           m4, m7 ; +bias-r
    psraw           m3, %1
    psraw           m4, %1
%endmacro

%macro TRANSFER_DO_PACK 2 ; op, dst
    packuswb        m3, m4
    %1              m3, [%2]
    mova          [%2], m3
%endmacro

%macro TRANSFER_DONT_PACK 2 ; op, dst
    %1              m3, [%2]
    %1              m4, [%2 + mmsize]
    mova          [%2], m3
    mova [mmsize + %2], m4
%endmacro

; see MSPEL_FILTER13_CORE for use as UNPACK macro
%macro DO_UNPACK 1 ; reg
    punpcklbw       %1, m0
%endmacro
%macro DONT_UNPACK 1 ; reg
%endmacro

; Compute the rounder 32-r or 8-r and unpack it to m7
%macro LOAD_ROUNDER_MMX 1 ; round
    movd      m7, %1
    punpcklwd m7, m7
    punpckldq m7, m7
%endmacro

%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
    paddw          m%3, m%4
    movh           m%2, [srcq + stride_neg2]
    pmullw         m%3, m6
    punpcklbw      m%2, m0
    movh           m%5, [srcq + strideq]
    psubw          m%3, m%2
    punpcklbw      m%5, m0
    paddw          m%3, m7
    psubw          m%3, m%5
    psraw          m%3, shift
    movu   [dstq + %1], m%3
    add           srcq, strideq
%endmacro

INIT_MMX mmx
; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
;                                    x86_reg stride, int rnd, int64_t shift)
; Sacrificing m6 makes it possible to pipeline loads from src
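; For reference, each 16-bit intermediate written below corresponds roughly to
; the following scalar sketch (illustrative only, not the exact C reference):
;   dst[x] = (-src[x - stride] + 9*src[x] + 9*src[x + stride]
;             - src[x + 2*stride] + rounder) >> shift
; i.e. the (-1, 9, 9, -1) tap filter applied vertically, without clamping.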
%if ARCH_X86_32
cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
    DECLARE_REG_TMP     3, 4, 5
    %define rnd r3mp
    %define shift qword r4m
%else ; X86_64
cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    DECLARE_REG_TMP     4, 5, 6
    %define   rnd r3d
    ; We need shift either in memory or in a mm reg as it's used in psraw
    ; On WIN64, the arg is already on the stack
    ; On UNIX64, m5 doesn't seem to be used
%if WIN64
    %define shift r4mp
%else ; UNIX64
    %define shift m5
    mova shift, r4q
%endif ; WIN64
%endif ; X86_32
%define stride_neg2 t0q
%define stride_9minus4 t1q
%define i t2q
    mov       stride_neg2, strideq
    neg       stride_neg2
    add       stride_neg2, stride_neg2
    lea    stride_9minus4, [strideq * 9 - 4]
    mov                 i, 3
    LOAD_ROUNDER_MMX  rnd
    mova               m6, [pw_9]
    pxor               m0, m0
.loop:
    movh               m2, [srcq]
    add              srcq, strideq
    movh               m3, [srcq]
    punpcklbw          m2, m0
    punpcklbw          m3, m0
    SHIFT2_LINE         0, 1, 2, 3, 4
    SHIFT2_LINE        24, 2, 3, 4, 1
    SHIFT2_LINE        48, 3, 4, 1, 2
    SHIFT2_LINE        72, 4, 1, 2, 3
    SHIFT2_LINE        96, 1, 2, 3, 4
    SHIFT2_LINE       120, 2, 3, 4, 1
    SHIFT2_LINE       144, 3, 4, 1, 2
    SHIFT2_LINE       168, 4, 1, 2, 3
    sub              srcq, stride_9minus4
    add              dstq, 8
    dec                 i
        jnz         .loop
    REP_RET
%undef rnd
%undef shift
%undef stride_neg2
%undef stride_9minus4
%undef i

; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
;                                  const int16_t *src, int rnd);
; Data is already unpacked, so some operations can be performed directly from
; memory.
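; A rough scalar sketch of one output pixel (illustrative only):
;   pix    = (-src[x] + 9*src[x + 1] + 9*src[x + 2] - src[x + 3] + rounder) >> 7
;   dst[x] = op(clip_uint8(pix + 128))  ; op: plain store for put, pavgb for avg
; (the +128 undoes the bias folded into the rounder below, which appears to be
;  there to keep the intermediate 16-bit arithmetic from overflowing)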
%macro HOR_16B_SHIFT2 2 ; op, opname
cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
    mov                hq, 8
    sub              srcq, 2
    sub              rndd, (-1+9+9-1) * 1024 ; add -1024 bias
    LOAD_ROUNDER_MMX rndd
    mova               m5, [pw_9]
    mova               m6, [pw_128]
    pxor               m0, m0

.loop:
    mova               m1, [srcq + 2 * 0]
    mova               m2, [srcq + 2 * 0 + mmsize]
    mova               m3, [srcq + 2 * 1]
    mova               m4, [srcq + 2 * 1 + mmsize]
    paddw              m3, [srcq + 2 * 2]
    paddw              m4, [srcq + 2 * 2 + mmsize]
    paddw              m1, [srcq + 2 * 3]
    paddw              m2, [srcq + 2 * 3 + mmsize]
    pmullw             m3, m5
    pmullw             m4, m5
    psubw              m3, m1
    psubw              m4, m2
    NORMALIZE_MMX      7
    ; remove bias
    paddw              m3, m6
    paddw              m4, m6
    TRANSFER_DO_PACK   %1, dstq
    add              srcq, 24
    add              dstq, strideq
    dec                hq
        jnz         .loop

    RET
%endmacro

INIT_MMX mmx
HOR_16B_SHIFT2 OP_PUT, put

INIT_MMX mmxext
HOR_16B_SHIFT2 OP_AVG, avg
%endif ; HAVE_MMX_INLINE

%macro INV_TRANS_INIT 0
    movsxdifnidn linesizeq, linesized
    movd       m0, blockd
    SPLATW     m0, m0
    pxor       m1, m1
    psubw      m1, m0
    packuswb   m0, m0
    packuswb   m1, m1

    DEFINE_ARGS dest, linesize, linesize3
    lea    linesize3q, [linesizeq*3]
%endmacro

%macro INV_TRANS_PROCESS 1 ; mov suffix: h (4-pixel rows) or a (8-pixel rows)
    mov%1                  m2, [destq+linesizeq*0]
    mov%1                  m3, [destq+linesizeq*1]
    mov%1                  m4, [destq+linesizeq*2]
    mov%1                  m5, [destq+linesize3q]
    paddusb                m2, m0
    paddusb                m3, m0
    paddusb                m4, m0
    paddusb                m5, m0
    psubusb                m2, m1
    psubusb                m3, m1
    psubusb                m4, m1
    psubusb                m5, m1
    mov%1 [linesizeq*0+destq], m2
    mov%1 [linesizeq*1+destq], m3
    mov%1 [linesizeq*2+destq], m4
    mov%1 [linesize3q +destq], m5
%endmacro

; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
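; Rough scalar sketch of the 4x4 case below (illustrative only; the other sizes
; differ only in the two scale/round steps):
;   dc = (17 * block[0] +  4) >> 3;
;   dc = (17 * dc       + 64) >> 7;
;   every pixel of the block becomes clip_uint8(pixel + dc);
; INV_TRANS_INIT splits dc into a saturated positive part (m0) and negative
; part (m1) so that paddusb/psubusb together perform the signed, clipped add.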
INIT_MMX mmxext
cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
    movsx         r3d, WORD [blockq]
    mov        blockd, r3d             ; dc
    shl        blockd, 4               ; 16 * dc
    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
    sar        blockd, 3               ; >> 3
    mov           r3d, blockd          ; dc
    shl        blockd, 4               ; 16 * dc
    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
    sar        blockd, 7               ; >> 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS h
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
    movsx         r3d, WORD [blockq]
    mov        blockd, r3d             ; dc
    shl        blockd, 4               ; 16 * dc
    lea        blockd, [blockq+r3+4]   ; 17 * dc + 4
    sar        blockd, 3               ; >> 3
    shl        blockd, 2               ;  4 * dc
    lea        blockd, [blockq*3+64]   ; 12 * dc + 64
    sar        blockd, 7               ; >> 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS h
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS h
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
    movsx      blockd, WORD [blockq]   ; dc
    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
    sar        blockd, 1               ; >> 1
    mov           r3d, blockd          ; dc
    shl        blockd, 4               ; 16 * dc
    lea        blockd, [blockq+r3+64]  ; 17 * dc + 64
    sar        blockd, 7               ; >> 7

    INV_TRANS_INIT

    INV_TRANS_PROCESS a
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
    movsx      blockd, WORD [blockq]   ; dc
    lea        blockd, [blockq*3+1]    ;  3 * dc + 1
    sar        blockd, 1               ; >> 1
    lea        blockd, [blockq*3+16]   ;  3 * dc + 16
    sar        blockd, 5               ; >> 5

    INV_TRANS_INIT

    INV_TRANS_PROCESS a
    lea         destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS a
    RET
