;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text

; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
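; A minimal C reference sketch of the behavior (illustrative only; the helper
; name below is hypothetical, not an FFmpeg symbol):
;
;   static void put_pixels8_x2_c(uint8_t *block, const uint8_t *pixels,
;                                ptrdiff_t line_size, int h)
;   {
;       for (int i = 0; i < h; i++) {
;           for (int j = 0; j < 8; j++)  // rounded average of left/right pair
;               block[j] = (pixels[j] + pixels[j + 1] + 1) >> 1;
;           block  += line_size;
;           pixels += line_size;
;       }
;   }
;
; pavgb computes exactly this rounded byte average; the loop below is
; unrolled to handle four rows per iteration.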
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2


; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
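; Same rounded average as above, but 16 pixels wide; the MMX version handles
; each row as two 8-byte halves (offsets 0 and 8).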
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2


; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
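; "no_rnd" variants truncate instead of round: dst = (a + b) >> 1. Since
; pavgb always computes (a + b + 1) >> 1, one operand is first decremented
; with a saturating psubusb of 1: (a - 1 + b + 1) >> 1 == (a + b) >> 1.
; This is exact for a >= 1; for a == 0 the saturation makes the result
; round up instead.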
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
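; Vertical halfpel average: dst[x] = (p[x] + p[x + line_size] + 1) >> 1.
; Each source row is both the bottom of one output pair and the top of the
; next, so it is loaded once and carried in a register across iterations
; (note the priming load of m0 and the "sub r0, r2" bias before the loop).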
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2


; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
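; Truncating vertical average using the same psubusb/pavgb trick as the
; no_rnd x2 version: dst[x] = (p[x] + p[x + line_size]) >> 1.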
INIT_MMX mmxext
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]
    mova         m0, [r1]
    sub          r0, r2
.loop:
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET


; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
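; avg variants additionally average the halfpel result with the bytes already
; in the destination, i.e. dst = avg(dst, avg(a, b)), as needed when two
; motion-compensated predictions are combined.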
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]
.loop:
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2


; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
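; Vertical counterpart of avg_pixels8_x2:
; dst = avg(dst, avg(p[x], p[x + line_size])), with the same row-reuse
; pattern as put_pixels8_y2 above.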
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]
    movu         m0, [r1]
    sub          r0, r2
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2

; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note: this is not correctly rounded and is therefore only used for
; non-bitexact output.
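; The exact xy2 result would be (A + B + C + D + 2) >> 2 over the 2x2
; neighborhood. Chaining pavgb (an average of averages) only approximates
; this and biases the result upward; the psubusb of pb_1 below cancels part
; of that bias, which is why the output is not bitexact.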
INIT_MMX mmxext
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]
    mova         m0, [r1]
    PAVGB        m0, [r1+1]
.loop:
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m0, [r1+r4+1]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET

; void ff_{put,avg}_pixels{8,16}_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
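; Exact xy2 interpolation: dst = (A + B + C + D + 2) >> 2 over the 2x2
; neighborhood. The bytes are widened to words (punpck against zeroed m7),
; horizontal pair sums are carried across rows in m4/m5 resp. m0/m1 so each
; row is summed only once, pw_2 supplies the rounding term, and psrlw 2
; performs the divide.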
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor        m7, m7
    mova        m6, [pw_2]
    movu        m0, [r1]
    movu        m4, [r1+1]
    mova        m1, m0
    mova        m5, m4
    punpcklbw   m0, m7
    punpcklbw   m4, m7
    punpckhbw   m1, m7
    punpckhbw   m5, m7
    paddusw     m4, m0
    paddusw     m5, m1
    xor         r4, r4
    add         r1, r2
.loop:
    movu        m0, [r1+r4]
    movu        m2, [r1+r4+1]
    mova        m1, m0
    mova        m3, m2
    punpcklbw   m0, m7
    punpcklbw   m2, m7
    punpckhbw   m1, m7
    punpckhbw   m3, m7
    paddusw     m0, m2
    paddusw     m1, m3
    paddusw     m4, m6
    paddusw     m5, m6
    paddusw     m4, m0
    paddusw     m5, m1
    psrlw       m4, 2
    psrlw       m5, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m4, m5
    PAVGB       m4, m3
%else
    packuswb    m4, m5
%endif
    mova   [r0+r4], m4
    add         r4, r2

    movu        m2, [r1+r4]
    movu        m4, [r1+r4+1]
    mova        m3, m2
    mova        m5, m4
    punpcklbw   m2, m7
    punpcklbw   m4, m7
    punpckhbw   m3, m7
    punpckhbw   m5, m7
    paddusw     m4, m2
    paddusw     m5, m3
    paddusw     m0, m6
    paddusw     m1, m6
    paddusw     m0, m4
    paddusw     m1, m5
    psrlw       m0, 2
    psrlw       m1, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m0, m1
    PAVGB       m0, m3
%else
    packuswb    m0, m1
%endif
    mova   [r0+r4], m0
    add         r4, r2
    sub        r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg

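; SSSE3 version of the xy2 functions. pmaddubsw against pb_1 sums adjacent
; unsigned bytes into words, producing a row's horizontal pair sums in one
; instruction. pmulhrsw with pw_8192 (0.25 in Q15) then computes
; ((x * 8192 >> 14) + 1) >> 1 == (x + 2) >> 2, the rounded divide by 4.
; packuswb leaves even- and odd-indexed results in separate halves, so
; pshufb with a pb_interleave constant restores the natural byte order.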
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; xmm (pixels16) variant
cglobal %1_pixels16_xy2, 4,5,%2
    mova        m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova        m4, [pb_interleave8]
%endif
    mova        m5, [pb_1]
    movu        m0, [r1]
    movu        m1, [r1+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5
    xor         r4, r4
    add         r1, r2
.loop:
    movu        m2, [r1+r4]
    movu        m3, [r1+r4+1]
    pmaddubsw   m2, m5
    pmaddubsw   m3, m5
    paddusw     m0, m2
    paddusw     m1, m3
    pmulhrsw    m0, [pw_8192]
    pmulhrsw    m1, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m0, m1
    pshufb      m0, m4
    pavgb       m0, m6
%else
    packuswb    m0, m1
    pshufb      m0, m4
%endif
    mova   [r0+r4], m0
    add         r4, r2

    movu        m0, [r1+r4]
    movu        m1, [r1+r4+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5
    paddusw     m2, m0
    paddusw     m3, m1
    pmulhrsw    m2, [pw_8192]
    pmulhrsw    m3, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m2, m3
    pshufb      m2, m4
    pavgb       m2, m6
%else
    packuswb    m2, m3
    pshufb      m2, m4
%endif
    mova   [r0+r4], m2
    add         r4, r2
    sub        r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7