• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;*****************************************************************************
2;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
3;*****************************************************************************
4;* Copyright (C) 2005-2011 x264 project
5;*
6;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
7;*
8;* This file is part of FFmpeg.
9;*
10;* FFmpeg is free software; you can redistribute it and/or
11;* modify it under the terms of the GNU Lesser General Public
12;* License as published by the Free Software Foundation; either
13;* version 2.1 of the License, or (at your option) any later version.
14;*
15;* FFmpeg is distributed in the hope that it will be useful,
16;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18;* Lesser General Public License for more details.
19;*
20;* You should have received a copy of the GNU Lesser General Public
21;* License along with FFmpeg; if not, write to the Free Software
22;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23;******************************************************************************
24
25%include "libavutil/x86/x86util.asm"
26
SECTION_RODATA 32

; 128-bit constant holding 1 in its low qword and 0 in its high qword.
; The setup macros add it to a shift-count register to form log2_denom+1.
sq_1: dq 1, 0

; Word constants defined in another object file.
cextern pw_1
cextern pw_1023

; 1023 is the largest value a 10-bit sample can take; results are clamped
; against this constant.
%define pw_pixel_max pw_1023
35
36SECTION .text
37
;-----------------------------------------------------------------------------
; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
;                           int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
; Common entry sequence of the h264_weight_*_10 functions.
; Requests 6 general-purpose and 8 vector registers from PROLOGUE without
; letting it load any argument, then fetches the arguments it needs itself:
;   r0  = dst, r1d = stride, r2d = height, r4d = weight, r5d = offset
; movifnidn only emits a move when the argument is not already in the
; target register.  log2_denom is deliberately not loaded here; it is read
; later through r3m.
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn r5d, r5m      ; offset
    movifnidn r4d, r4m      ; weight
    movifnidn r2d, r2m      ; height
    movifnidn r1d, r1m      ; stride
    movifnidn  r0, r0mp     ; dst
%endmacro
51
; Loop-invariant setup for the h264_weight_*_10 functions.
; In:   r3m = log2_denom, r4d = weight, r5d = offset
; Out:  m0 = 1<<log2_denom in every word
;       m2 = shift count, log2_denom+1
;       m3 = { low word: weight*2, high word: (offset<<3)+1 } in every dword
;       m4 = pw_pixel_max
;       m7 = 0 (only when SSE4 is not available; used by CLIPW)
; Clobbers r4 and r5.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m      ; log2_denom
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    ; Confine the weight to 15 bits so that weight*2 occupies exactly the low
    ; word.  Without this, the sign extension of a negative weight would be
    ; added into the high word below and cancel the "+1" rounding term.
    and        r4d, 0x7fff
    shl        r5, 19       ; *8, move to upper half of dword
    lea        r5, [r5+r4*2+0x10000]
    movd       m3, r5d      ; low word: weight<<1, high word: 1+(offset<<3)
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro
67
; Weight one batch of samples; the clamped result is left in m5.
;   WEIGHT_OP off        - one full register of words from [r0+off]
;   WEIGHT_OP off0, off1 - 8 bytes from [r0+off0] (low half of the result)
;                          and 8 bytes from [r0+off1] (high half)
; Expects the registers prepared by WEIGHT_SETUP (m0, m2, m3, m4 and, when
; SSE4 is unavailable, m7).  Clobbers m5 and m6.
%macro WEIGHT_OP 1-2
%if %0==1
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0
    punpcklwd   m5, m0
%else
    movq        m5, [r0+%1]
    movq        m6, [r0+%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
%endif
    ; each dword now holds the word pair (sample, 1<<log2_denom); multiplying
    ; by the pair in m3 and summing yields the weighted sample plus the
    ; scaled offset/rounding term
    pmaddwd     m5, m3
    pmaddwd     m6, m3
    psrad       m5, m2
    psrad       m6, m2
%if cpuflag(sse4)
    packusdw    m5, m6
    ; packusdw produces unsigned words (0..65535), so the upper clamp has to
    ; be an unsigned minimum; a signed minimum would treat values >= 0x8000
    ; as negative and let them through unclamped
    pminuw      m5, m4
%else
    packssdw    m5, m6
    CLIPW       m5, m7, m4
%endif
%endmacro
91
; Emits h264_weight_16_10 for the instruction set selected by the preceding
; INIT_XMM.  Each loop iteration weights one row in place, as two full
; registers at [r0] and [r0+16], then advances r0 by the stride in r1.
; r2d is the remaining row count.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP   0
    mova      [r0], m5          ; first half of the row
    WEIGHT_OP   16
    mova   [r0+16], m5          ; second half of the row
    add         r0, r1          ; next row
    sub         r2d, 1
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL
111
112
; Emits h264_weight_8_10 for the instruction set selected by the preceding
; INIT_XMM.  Each loop iteration weights one row in place, as a single full
; register at [r0], then advances r0 by the stride in r1.  r2d is the
; remaining row count.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP   0
    mova      [r0], m5          ; store the weighted row
    add         r0, r1          ; next row
    sub         r2d, 1
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM
130
131
; Emits h264_weight_4_10 for the instruction set selected by the preceding
; INIT_XMM.  Two rows are processed per loop iteration: the row at [r0]
; ends up in the low half of m5 and the row at [r0+r1] in the high half.
; The row count in r2d is therefore halved before the loop.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP                ; reads log2_denom through r3m
    sar         r2d, 1          ; iterations = height/2
    lea         r3, [r1+r1]     ; r3 = 2*stride; set only after WEIGHT_SETUP has read r3m
.nextrow:
    WEIGHT_OP    0, r1
    movh      [r0], m5          ; first row of the pair
    movhps [r0+r1], m5          ; second row of the pair
    add         r0, r3          ; skip both rows
    sub         r2d, 1
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM
152
153
;-----------------------------------------------------------------------------
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
;                             int height, int log2_denom, int weightd,
;                             int weights, int offset);
;-----------------------------------------------------------------------------
; t0 is the temporary register that receives the offset argument:
; r3 on x86-32 builds, r7 otherwise.
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

; Common entry sequence of the h264_biweight_*_10 functions.
; Requests 8 general-purpose and 8 vector registers from PROLOGUE without
; letting it load any argument, then fetches the arguments it needs itself:
;   r0 = dst, r1 = src, r2d = stride, r5d = weightd, r6d = weights,
;   t0d = offset
; height (r3m) and log2_denom (r4m) are not loaded here; BIWEIGHT_SETUP
; picks them up.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn t0d, r7m      ; offset
    movifnidn r6d, r6m      ; weights
    movifnidn r5d, r5m      ; weightd
    movifnidn r2d, r2m      ; stride
    movifnidn  r1, r1mp     ; src
    movifnidn  r0, r0mp     ; dst
%endmacro
175
; Loop-invariant setup for the h264_biweight_*_10 functions.
; In:   r5d = weightd, r6d = weights, t0d = offset,
;       r3m = height, r4m = log2_denom
; Out:  m3 = pw_pixel_max
;       m4 = { low word: weightd, high word: weights } in every dword
;       m5 = (((offset<<2)+1)|1) << log2_denom in every dword
;       m6 = shift count, log2_denom+1
;       m7 = 0 (only when SSE4 is not available; used by CLIPW)
;       r3d = height
; Clobbers r5, r6 and t0.
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1        ; bit 0 is already set by the +1 above
    ; Keep only the low 16 bits of weightd.  A negative weightd is
    ; sign-extended, and OR-ing weights<<16 into it would leave the high
    ; word as 0xffff regardless of the value of weights.
    and        r5d, 0xffff
    shl        r6, 16
    or         r5, r6
    movd       m4, r5d      ; low word: weightd, high word: weights
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    ; height is loaded only now: on x86-32 t0 is r3, so r3 was still
    ; holding the offset until the movd above
    movifnidn r3d, r3m
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro
194
; Blend one batch of samples from [r0+...] and [r1+...]; the clamped result
; is left in m0.
;   BIWEIGHT off        - one full register of words from each of
;                         [r0+off] and [r1+off]
;   BIWEIGHT off0, off1 - 8 bytes from each pointer at off0 (low half of the
;                         result) and 8 bytes at off1 (high half)
; Expects the registers prepared by BIWEIGHT_SETUP (m3, m4, m5, m6 and, when
; SSE4 is unavailable, m7).  Clobbers m0, m1 and m2.
%macro BIWEIGHT 1-2
%if %0==1
    mova       m0, [r0+%1]
    mova       m1, [r1+%1]
    punpckhwd  m2, m0, m1
    punpcklwd  m0, m1
%else
    movq       m0, [r0+%1]
    movq       m1, [r1+%1]
    punpcklwd  m0, m1
    movq       m2, [r0+%2]
    movq       m1, [r1+%2]
    punpcklwd  m2, m1
%endif
    ; each dword holds the word pair ([r0] sample, [r1] sample); with m4 =
    ; (weightd, weights) this computes r0_sample*weightd + r1_sample*weights
    pmaddwd    m0, m4
    pmaddwd    m2, m4
    paddd      m0, m5       ; add the pre-shifted offset/rounding term
    paddd      m2, m5
    psrad      m0, m6
    psrad      m2, m6
%if cpuflag(sse4)
    packusdw   m0, m2
    ; packusdw produces unsigned words (0..65535), so the upper clamp has to
    ; be an unsigned minimum; a signed minimum would treat values >= 0x8000
    ; as negative and let them through unclamped
    pminuw     m0, m3
%else
    packssdw   m0, m2
    CLIPW      m0, m7, m3
%endif
%endmacro
223
; Emits h264_biweight_16_10 for the instruction set selected by the
; preceding INIT_XMM.  Each loop iteration blends one row, as two full
; registers at offsets 0 and 16, and writes it to [r0]; both r0 and r1 then
; advance by the stride in r2.  r3d is the remaining row count.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT    0
    mova      [r0], m0          ; first half of the row
    BIWEIGHT    16
    mova   [r0+16], m0          ; second half of the row
    add         r0, r2          ; next row in both buffers
    add         r1, r2
    sub         r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL
244
; Emits h264_biweight_8_10 for the instruction set selected by the
; preceding INIT_XMM.  Each loop iteration blends one row, as a single full
; register, and writes it to [r0]; both r0 and r1 then advance by the
; stride in r2.  r3d is the remaining row count.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT    0
    mova      [r0], m0          ; store the blended row
    add         r0, r2          ; next row in both buffers
    add         r1, r2
    sub         r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC
263
; Emits h264_biweight_4_10 for the instruction set selected by the
; preceding INIT_XMM.  Two rows are processed per loop iteration: the row
; at offset 0 ends up in the low half of m0 and the row at offset r2 in the
; high half.  The row count in r3d is therefore halved before the loop.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP              ; reads log2_denom through r4m, loads height into r3d
    lea         r4, [r2+r2]     ; r4 = 2*stride; set only after BIWEIGHT_SETUP has read r4m
    sar         r3d, 1          ; iterations = height/2
.nextrow:
    BIWEIGHT     0, r2
    movh   [r0   ], m0          ; first row of the pair
    movhps [r0+r2], m0          ; second row of the pair
    add         r0, r4          ; skip both rows in both buffers
    add         r1, r4
    sub         r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF
285