; NOTE: this file was extracted from a web source viewer; the viewer's
; navigation chrome (Home / Line# / Scopes / Navigate / Raw / Download)
; has been removed.
;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c)      Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
27
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_1
cextern pw_2
; Byte-shuffle masks used by the SSSE3 xy2 kernels below to restore pixel
; order after packuswb of the low/high word halves.
pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_interleave8:  db 0, 4, 1, 5, 2, 6, 3, 7

cextern pw_8192

SECTION .text
39
; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Rounded average of each source pixel with its right neighbour.
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h.
; Loop is unrolled to process 4 rows per iteration, so h must be a multiple of 4.
; The sse2 instance is a 16-pixel-wide variant (put_pixels16_x2).
%macro PUT_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal put_pixels16_x2, 4,5,4
%else
cglobal put_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]         ; r4 = 2 * line_size
.loop:
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    ; sse2 pavgb requires aligned memory operands, so load unaligned first
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    movu         m0, [r1+1]
    movu         m1, [r1+r2+1]
%if cpuflag(sse2)
    movu         m2, [r1]
    movu         m3, [r1+r2]
    pavgb        m0, m2
    pavgb        m1, m3
%else
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
%endif
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
88
89
; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; 16-pixel-wide horizontal-halfpel put, done as two 8-byte MMX halves per row.
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
    lea          r4, [r2*2]         ; r4 = 2 * line_size
.loop:
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]         ; high 8 bytes of the row
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r1, r4
    add          r0, r4
    mova         m0, [r1]
    mova         m1, [r1+r2]
    mova         m2, [r1+8]
    mova         m3, [r1+r2+8]
    PAVGB        m0, [r1+1]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+9]
    PAVGB        m3, [r1+r2+9]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    mova     [r0+8], m2
    mova  [r0+r2+8], m3
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2
136
; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; No-rounding variant: psubusb by pb_1 before PAVGB turns the rounded average
; into a truncating one, i.e. (a + b) >> 1 instead of (a + b + 1) >> 1.
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]         ; r4 = 2 * line_size
.loop:
    mova         m0, [r1]
    mova         m2, [r1+r2]
    mova         m1, [r1+1]
    mova         m3, [r1+r2+1]
    add          r1, r4
    psubusb      m0, m6             ; compensate PAVGB's +1 rounding bias
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    mova         m0, [r1]
    mova         m1, [r1+1]
    mova         m2, [r1+r2]
    mova         m3, [r1+r2+1]
    add          r0, r4
    add          r1, r4
    psubusb      m0, m6
    psubusb      m2, m6
    PAVGB        m0, m1
    PAVGB        m2, m3
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
176
177
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Vertical-halfpel put: each output row is the average of two adjacent source
; rows. One row load is carried in a register across iterations so each source
; row is loaded only once.
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
; The sse2 instance is a 16-pixel-wide variant (put_pixels16_y2).
%macro PUT_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal put_pixels16_y2, 4,5,3
%else
cglobal put_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]         ; r4 = 2 * line_size
    movu         m0, [r1]           ; prime the carried row
    sub          r0, r2             ; bias dst so stores use [r0+r2]/[r0+r4]
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]        ; becomes the carried row of the next pass
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
; actually, put_pixels16_y2_sse2
INIT_XMM sse2
PUT_PIXELS8_Y2
217
218
; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Vertical-halfpel put without rounding: psubusb by pb_1 on one operand before
; PAVGB cancels the +1 rounding bias.
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2+r2]        ; r4 = 2 * line_size
    mova         m0, [r1]           ; prime the carried row
    sub          r0, r2             ; bias dst so stores use [r0+r2]/[r0+r4]
.loop:
    mova         m1, [r1+r2]
    mova         m2, [r1+r4]
    add          r1, r4
    psubusb      m1, m6             ; compensate PAVGB's +1 rounding bias
    PAVGB        m0, m1
    PAVGB        m1, m2
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]        ; becomes the carried row of the next pass
    add          r0, r4
    add          r1, r4
    psubusb      m1, m6
    PAVGB        m2, m1
    PAVGB        m1, m0
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2
254
255
; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Average the source block into the destination (dst = avg(dst, src)).
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
    lea          r4, [r2*2]         ; r4 = 2 * line_size
.loop:
    mova         m0, [r0]
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r1, r4
    add          r0, r4
    mova         m0, [r0]
    mova         m1, [r0+r2]
    PAVGB        m0, [r1]
    PAVGB        m1, [r1+r2]
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX 3dnow
AVG_PIXELS8
284
285
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Horizontal-halfpel average into dst: dst = avg(dst, avg(src, src+1)).
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
; The sse2 instance is a 16-pixel-wide variant (avg_pixels16_x2).
%macro AVG_PIXELS8_X2 0
%if cpuflag(sse2)
cglobal avg_pixels16_x2, 4,5,4
%else
cglobal avg_pixels8_x2, 4,5
%endif
    lea          r4, [r2*2]         ; r4 = 2 * line_size
%if notcpuflag(mmxext)
    ; plain MMX has no pavgb; build the 0xFE byte mask (all-ones doubled)
    ; that the PAVGB emulation macro takes as its 4th operand
    pcmpeqd      m5, m5
    paddb        m5, m5
%endif
.loop:
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    ; sse2 pavgb requires aligned memory operands, so load unaligned first
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    add          r1, r4
    mova       [r0], m0
    mova    [r0+r2], m2
    movu         m0, [r1]
    movu         m2, [r1+r2]
%if cpuflag(sse2)
    movu         m1, [r1+1]
    movu         m3, [r1+r2+1]
    pavgb        m0, m1
    pavgb        m2, m3
%else
    PAVGB        m0, [r1+1], m3, m5
    PAVGB        m2, [r1+r2+1], m4, m5
%endif
    add          r0, r4
    add          r1, r4
    PAVGB        m0, [r0], m3, m5
    PAVGB        m2, [r0+r2], m4, m5
    mova       [r0], m0
    mova    [r0+r2], m2
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmx
AVG_PIXELS8_X2
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
; actually avg_pixels16_x2
INIT_XMM sse2
AVG_PIXELS8_X2
347
348
; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Vertical-halfpel average into dst: dst = avg(dst, avg(src, src+line_size)).
; One source row is carried in a register across iterations.
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
; The sse2 instance is a 16-pixel-wide variant (avg_pixels16_y2).
%macro AVG_PIXELS8_Y2 0
%if cpuflag(sse2)
cglobal avg_pixels16_y2, 4,5,3
%else
cglobal avg_pixels8_y2, 4,5
%endif
    lea          r4, [r2*2]         ; r4 = 2 * line_size
    movu         m0, [r1]           ; prime the carried row
    sub          r0, r2             ; bias dst so accesses use [r0+r2]/[r0+r4]
.loop:
    movu         m1, [r1+r2]
    movu         m2, [r1+r4]
    add          r1, r4
    PAVGB        m0, m1
    PAVGB        m1, m2
    PAVGB        m0, [r0+r2]        ; fold in the existing destination
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m0
    mova    [r0+r4], m1
    movu         m1, [r1+r2]
    movu         m0, [r1+r4]        ; becomes the carried row of the next pass
    PAVGB        m2, m1
    PAVGB        m1, m0
    add          r0, r4
    add          r1, r4
    PAVGB        m2, [r0+r2]
    PAVGB        m1, [r0+r4]
    mova    [r0+r2], m2
    mova    [r0+r4], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
; actually avg_pixels16_y2
INIT_XMM sse2
AVG_PIXELS8_Y2
392
393
; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Note this is not correctly rounded, and is therefore used for
; not-bitexact output: the 4-tap average is approximated by cascaded PAVGBs
; with a pb_1 bias correction, then averaged into the destination.
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 4).
%macro AVG_APPROX_PIXELS8_XY2 0
cglobal avg_approx_pixels8_xy2, 4,5
    mova         m6, [pb_1]
    lea          r4, [r2*2]         ; r4 = 2 * line_size
    mova         m0, [r1]
    PAVGB        m0, [r1+1]         ; carried horizontal average of row 0
.loop:
    mova         m2, [r1+r4]
    mova         m1, [r1+r2]
    psubusb      m2, m6             ; partial bias correction for the cascade
    PAVGB        m1, [r1+r2+1]
    PAVGB        m2, [r1+r4+1]
    add          r1, r4
    PAVGB        m0, m1             ; combine vertically
    PAVGB        m1, m2
    PAVGB        m0, [r0]           ; fold in the existing destination
    PAVGB        m1, [r0+r2]
    mova       [r0], m0
    mova    [r0+r2], m1
    mova         m1, [r1+r2]
    mova         m0, [r1+r4]
    PAVGB        m1, [r1+r2+1]
    PAVGB        m0, [r1+r4+1]
    add          r0, r4
    add          r1, r4
    PAVGB        m2, m1
    PAVGB        m1, m0
    PAVGB        m2, [r0]
    PAVGB        m1, [r0+r2]
    mova       [r0], m2
    mova    [r0+r2], m1
    add          r0, r4
    sub         r3d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
AVG_APPROX_PIXELS8_XY2
INIT_MMX 3dnow
AVG_APPROX_PIXELS8_XY2
438
439
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
; Exact 2D-halfpel interpolation: (a + b + c + d + 2) >> 2 computed in 16-bit
; precision (unpack to words, sum, add pw_2, shift right by 2). The horizontal
; pair-sum of one row is carried across loop iterations so each source row is
; processed once. %1 selects put or avg (avg folds in the destination).
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 2).
; The sse2 instance is a 16-pixel-wide variant (%1_pixels16_xy2).
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
%else
cglobal %1_pixels8_xy2, 4,5
%endif
    pxor        m7, m7              ; zero register for byte->word unpacking
    mova        m6, [pw_2]          ; rounding constant for the >>2 below
    movu        m0, [r1]
    movu        m4, [r1+1]
    mova        m1, m0
    mova        m5, m4
    punpcklbw   m0, m7
    punpcklbw   m4, m7
    punpckhbw   m1, m7
    punpckhbw   m5, m7
    paddusw     m4, m0              ; m4/m5 = carried pair-sum of row 0
    paddusw     m5, m1
    xor         r4, r4              ; r4 = running row offset
    add         r1, r2
.loop:
    movu        m0, [r1+r4]
    movu        m2, [r1+r4+1]
    mova        m1, m0
    mova        m3, m2
    punpcklbw   m0, m7
    punpcklbw   m2, m7
    punpckhbw   m1, m7
    punpckhbw   m3, m7
    paddusw     m0, m2              ; pair-sum of the current row
    paddusw     m1, m3
    paddusw     m4, m6              ; + 2 (rounding)
    paddusw     m5, m6
    paddusw     m4, m0              ; + previous row's pair-sum
    paddusw     m5, m1
    psrlw       m4, 2               ; (a+b+c+d+2) >> 2
    psrlw       m5, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m4, m5
    PAVGB       m4, m3              ; average with the existing destination
%else
    packuswb    m4, m5
%endif
    mova   [r0+r4], m4
    add         r4, r2

    ; second row of the pair; m0/m1 now hold the carried pair-sum
    movu        m2, [r1+r4]
    movu        m4, [r1+r4+1]
    mova        m3, m2
    mova        m5, m4
    punpcklbw   m2, m7
    punpcklbw   m4, m7
    punpckhbw   m3, m7
    punpckhbw   m5, m7
    paddusw     m4, m2
    paddusw     m5, m3
    paddusw     m0, m6
    paddusw     m1, m6
    paddusw     m0, m4
    paddusw     m1, m5
    psrlw       m0, 2
    psrlw       m1, 2
%ifidn %1, avg
    mova        m3, [r0+r4]
    packuswb    m0, m1
    PAVGB       m0, m3
%else
    packuswb    m0, m1
%endif
    mova   [r0+r4], m0
    add         r4, r2
    sub        r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX mmxext
SET_PIXELS_XY2 avg
INIT_MMX 3dnow
SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
525
; SSSE3 2D-halfpel put/avg.
; pmaddubsw with a pb_1 multiplier sums each horizontal byte pair into a word;
; adding two such row sums and multiplying by pw_8192 with pmulhrsw yields
; ((sum << 13) + 0x4000) >> 15 = (sum + 2) >> 2, i.e. the exact rounded 4-tap
; average. pshufb with pb_interleave* restores pixel order after packuswb.
; %1 = put/avg; optional %2 = xmm register count (present only for the
; 16-pixel-wide XMM instances).
; r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h (multiple of 2).
%macro SSSE3_PIXELS_XY2 1-2
%if %0 == 2 ; sse2
cglobal %1_pixels16_xy2, 4,5,%2
    mova        m4, [pb_interleave16]
%else
cglobal %1_pixels8_xy2, 4,5
    mova        m4, [pb_interleave8]
%endif
    mova        m5, [pb_1]          ; pmaddubsw multiplier: sum adjacent bytes
    movu        m0, [r1]
    movu        m1, [r1+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5              ; m0/m1 = carried pair-sums of row 0
    xor         r4, r4              ; r4 = running row offset
    add         r1, r2
.loop:
    movu        m2, [r1+r4]
    movu        m3, [r1+r4+1]
    pmaddubsw   m2, m5
    pmaddubsw   m3, m5
    paddusw     m0, m2              ; previous + current row pair-sums
    paddusw     m1, m3
    pmulhrsw    m0, [pw_8192]       ; rounded >> 2
    pmulhrsw    m1, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m0, m1
    pshufb      m0, m4              ; de-interleave back to pixel order
    pavgb       m0, m6              ; average with the existing destination
%else
    packuswb    m0, m1
    pshufb      m0, m4
%endif
    mova   [r0+r4], m0
    add         r4, r2

    ; second row of the pair; m2/m3 now hold the carried pair-sums
    movu        m0, [r1+r4]
    movu        m1, [r1+r4+1]
    pmaddubsw   m0, m5
    pmaddubsw   m1, m5
    paddusw     m2, m0
    paddusw     m3, m1
    pmulhrsw    m2, [pw_8192]
    pmulhrsw    m3, [pw_8192]
%ifidn %1, avg
    mova        m6, [r0+r4]
    packuswb    m2, m3
    pshufb      m2, m4
    pavgb       m2, m6
%else
    packuswb    m2, m3
    pshufb      m2, m4
%endif
    mova   [r0+r4], m2
    add         r4, r2
    sub        r3d, 2
    jnz .loop
    REP_RET
%endmacro

INIT_MMX ssse3
SSSE3_PIXELS_XY2 put
SSSE3_PIXELS_XY2 avg
INIT_XMM ssse3
SSSE3_PIXELS_XY2 put, 6
SSSE3_PIXELS_XY2 avg, 7