• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;******************************************************************************
2;* mpeg4 qpel
3;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
4;* Copyright (c) 2008 Loren Merritt
5;* Copyright (c) 2013 Daniel Kang
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
; Constants defined in another object file and imported with cextern.
;   pw_3, pw_20  - memory operands of pmullw in the lowpass filters below
;   pw_15, pw_16 - selected through the PW_ROUND define as the term added
;                  before the final arithmetic shift right by 5
;   pb_1         - not referenced in the part of the file visible here
; NOTE(review): the names suggest packed words holding 3/15/16/20 and packed
; bytes holding 1; the actual contents are defined elsewhere - confirm there.
26SECTION_RODATA
27cextern pb_1
28cextern pw_3
29cextern pw_15
30cextern pw_16
31cextern pw_20
32
33
34SECTION .text
35
36; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; Writes h rows of 8 bytes to dst; each row is the byte-wise average of one
; src1 row and one src2 row.
;   r0  = dst   (advanced by r3 = dstStride after every row)
;   r1  = src1  (advanced by r4 = src1Stride after every row)
;   r2  = src2  (rows are contiguous: advanced by 8 bytes per row)
;   r5d = h     (remaining-row counter)
; Rounding: both inputs are XORed with m6 (all ones), averaged with PAVGB and
; the result is XORed with m6 again, i.e. ~avg(~a, ~b). PAVGB is a macro
; defined outside this file; assuming it has the round-up semantics of the
; pavgb instruction, the net result is the average rounded down.
; Row count: when h is odd a single row is processed first. The main loop then
; handles 4 rows per iteration and exits only when the counter is exactly
; zero, so the rows remaining at .loop must be a non-zero multiple of 4.
37%macro PUT_NO_RND_PIXELS8_L2 0
38cglobal put_no_rnd_pixels8_l2, 6,6
    ; widen the two int strides for use in pointer arithmetic
    ; (movsxdifnidn comes from the included helper macros - presumably a
    ; sign extension that is skipped when not needed; confirm in x86inc)
39    movsxdifnidn r4, r4d
40    movsxdifnidn r3, r3d
    ; m6 = all ones: the complement mask
41    pcmpeqb      m6, m6
    ; even h: skip the single-row step
42    test        r5d, 1
43    je .loop
    ; odd h: process one row on its own
44    mova         m0, [r1]
45    mova         m1, [r2]
46    add          r1, r4
47    add          r2, 8
48    pxor         m0, m6
49    pxor         m1, m6
50    PAVGB        m0, m1
51    pxor         m0, m6
52    mova       [r0], m0
53    add          r0, r3
54    dec r5d
55.loop:
    ; rows 0 and 1 of this iteration: src1 rows in m0/m1, src2 rows in m2/m3
56    mova         m0, [r1]
57    add          r1, r4
58    mova         m1, [r1]
59    add          r1, r4
60    mova         m2, [r2]
61    mova         m3, [r2+8]
62    pxor         m0, m6
63    pxor         m1, m6
64    pxor         m2, m6
65    pxor         m3, m6
66    PAVGB        m0, m2
67    PAVGB        m1, m3
68    pxor         m0, m6
69    pxor         m1, m6
70    mova       [r0], m0
71    add          r0, r3
72    mova       [r0], m1
73    add          r0, r3
    ; rows 2 and 3 of this iteration (src2 offsets 16 and 24)
74    mova         m0, [r1]
75    add          r1, r4
76    mova         m1, [r1]
77    add          r1, r4
78    mova         m2, [r2+16]
79    mova         m3, [r2+24]
80    pxor         m0, m6
81    pxor         m1, m6
82    pxor         m2, m6
83    pxor         m3, m6
84    PAVGB        m0, m2
85    PAVGB        m1, m3
86    pxor         m0, m6
87    pxor         m1, m6
88    mova       [r0], m0
89    add          r0, r3
90    mova       [r0], m1
91    add          r0, r3
    ; four src2 rows of 8 bytes consumed; four rows done
92    add          r2, 32
93    sub         r5d, 4
94    jne .loop
95    REP_RET
96%endmacro
97
; Emit the function once, for the MMX register set with the mmxext suffix.
98INIT_MMX mmxext
99PUT_NO_RND_PIXELS8_L2
100
101
102; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
;
; Writes h rows of 16 bytes to dst; each row is the byte-wise average of one
; src1 row and one src2 row. A row is handled as two 8-byte halves.
;   r0  = dst   (advanced by r3 = dstStride after every row)
;   r1  = src1  (advanced by r4 = src1Stride after every row)
;   r2  = src2  (rows are contiguous: advanced by 16 bytes per row)
;   r5d = h     (remaining-row counter)
; Rounding: inputs are XORed with m6 (all ones), averaged with PAVGB and the
; result is XORed with m6 again, i.e. ~avg(~a, ~b). PAVGB is a macro defined
; outside this file; assuming it has the round-up semantics of the pavgb
; instruction, the net result is the average rounded down.
; Row count: when h is odd a single row is processed first. The main loop then
; handles 2 rows per iteration and exits only when the counter is exactly
; zero, so the rows remaining at .loop must be a non-zero multiple of 2.
; NOTE(review): this macro name ends in a lowercase "l2" whereas the 8-wide
; sibling is PUT_NO_RND_PIXELS8_L2; the definition and its single invocation
; below agree with each other, so this is only a naming inconsistency.
103%macro PUT_NO_RND_PIXELS16_l2 0
104cglobal put_no_rnd_pixels16_l2, 6,6
    ; widen the two int strides for use in pointer arithmetic
105    movsxdifnidn r3, r3d
106    movsxdifnidn r4, r4d
    ; m6 = all ones: the complement mask
107    pcmpeqb      m6, m6
    ; even h: skip the single-row step
108    test        r5d, 1
109    je .loop
    ; odd h: process one 16-byte row on its own
110    mova         m0, [r1]
111    mova         m1, [r1+8]
112    mova         m2, [r2]
113    mova         m3, [r2+8]
114    pxor         m0, m6
115    pxor         m1, m6
116    pxor         m2, m6
117    pxor         m3, m6
118    PAVGB        m0, m2
119    PAVGB        m1, m3
120    pxor         m0, m6
121    pxor         m1, m6
122    add          r1, r4
123    add          r2, 16
124    mova       [r0], m0
125    mova     [r0+8], m1
126    add          r0, r3
127    dec r5d
128.loop:
    ; first row of this iteration (src2 offsets 0 and 8)
129    mova         m0, [r1]
130    mova         m1, [r1+8]
131    add          r1, r4
132    mova         m2, [r2]
133    mova         m3, [r2+8]
134    pxor         m0, m6
135    pxor         m1, m6
136    pxor         m2, m6
137    pxor         m3, m6
138    PAVGB        m0, m2
139    PAVGB        m1, m3
140    pxor         m0, m6
141    pxor         m1, m6
142    mova       [r0], m0
143    mova     [r0+8], m1
144    add          r0, r3
    ; second row of this iteration (src2 offsets 16 and 24)
145    mova         m0, [r1]
146    mova         m1, [r1+8]
147    add          r1, r4
148    mova         m2, [r2+16]
149    mova         m3, [r2+24]
150    pxor         m0, m6
151    pxor         m1, m6
152    pxor         m2, m6
153    pxor         m3, m6
154    PAVGB        m0, m2
155    PAVGB        m1, m3
156    pxor         m0, m6
157    pxor         m1, m6
158    mova       [r0], m0
159    mova     [r0+8], m1
160    add          r0, r3
    ; two src2 rows of 16 bytes consumed; two rows done
161    add          r2, 32
162    sub         r5d, 2
163    jne .loop
164    REP_RET
165%endmacro
166
; Emit the function once, for the MMX register set with the mmxext suffix.
167INIT_MMX mmxext
168PUT_NO_RND_PIXELS16_l2
169
; MPEG4_QPEL16_H_LOWPASS prefix
; Emits <prefix>_mpeg4_qpel16_h_lowpass: an 8-tap horizontal lowpass filter
; producing 16 output bytes per row.
;   r0  = dst, advanced by r2 after every row
;   r1  = src, advanced by r3 after every row; bytes r1 .. r1+16 are read
;   r4d = number of rows; the loop runs until it reaches exactly zero
; With p[] the 17 source pixels of a row, output i (0..15) is
;   (20*(p[i]  +p[i+1]) - (p[i-3]+p[i+4])
;   + 3*((p[i-2]+p[i+3]) - 2*(p[i-1]+p[i+2])) + PW_ROUND) >> 5
; computed on 16-bit words (bytes unpacked against m7 = 0), then packed back
; to bytes with unsigned saturation and written through OP_MOV.
; Indices outside 0..16 are replaced by reflected in-range pixels; the pshufw
; immediates build those reflected vectors at the left and right edges.
; PW_ROUND and OP_MOV must be %defined before this macro is invoked.
; Outputs 0-3 are parked in the stack slot at [rsp+8] while outputs 4-7 are
; computed, because all eight MMX registers are in use.
170%macro MPEG4_QPEL16_H_LOWPASS 1
171cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
    ; widen the two strides for use in pointer arithmetic
172    movsxdifnidn r2, r2d
173    movsxdifnidn r3, r3d
    ; m7 = 0: zero source for byte -> word unpacking
174    pxor         m7, m7
175.loop:
    ; ---- outputs 0-3 -------------------------------------------------------
    ; m0 = p0..p3, m1 = p4..p7 (words); m2 keeps the raw 8 bytes
176    mova         m0, [r1]
177    mova         m1, m0
178    mova         m2, m0
179    punpcklbw    m0, m7
180    punpckhbw    m1, m7
    ; left-edge reflection: m5 = p0,p0,p1,p2   m6 = p1,p0,p0,p1
181    pshufw       m5, m0, 0x90
182    pshufw       m6, m0, 0x41
    ; byte shifts + high unpack: m2 = p3..p6, m3 = p2..p5, m4 = p1..p4
183    mova         m3, m2
184    mova         m4, m2
185    psllq        m2, 8
186    psllq        m3, 16
187    psllq        m4, 24
188    punpckhbw    m2, m7
189    punpckhbw    m3, m7
190    punpckhbw    m4, m7
    ; m6 = (p[i-2]+p[i+3]) - 2*(p[i-1]+p[i+2])
191    paddw        m5, m3
192    paddw        m6, m2
193    paddw        m5, m5
194    psubw        m6, m5
    ; m5 = p2,p1,p0,p0 (reflected p[i-3]); taken before m0 is modified
195    pshufw       m5, m0, 6
196    pmullw       m6, [pw_3]
197    paddw        m0, m4
198    paddw        m5, m1
199    pmullw       m0, [pw_20]
200    psubw        m0, m5
201    paddw        m6, [PW_ROUND]
202    paddw        m0, m6
203    psraw        m0, 5
    ; park outputs 0-3 (as words) on the stack
204    mova    [rsp+8], m0
    ; ---- outputs 4-7 -------------------------------------------------------
    ; load p5..p12; after shifts: m0 = p6..p9, m5 = p7..p10, m6 = raw copy
205    mova         m0, [r1+5]
206    mova         m5, m0
207    mova         m6, m0
208    psrlq        m0, 8
209    psrlq        m5, 16
210    punpcklbw    m0, m7
211    punpcklbw    m5, m7
212    paddw        m2, m0
213    paddw        m3, m5
214    paddw        m2, m2
215    psubw        m3, m2
    ; m2 = p5..p8, m6 = p8..p11
216    mova         m2, m6
217    psrlq        m6, 24
218    punpcklbw    m2, m7
219    punpcklbw    m6, m7
220    pmullw       m3, [pw_3]
221    paddw        m1, m2
222    paddw        m4, m6
223    pmullw       m1, [pw_20]
224    psubw        m3, m4
225    paddw        m1, [PW_ROUND]
226    paddw        m3, m1
227    psraw        m3, 5
    ; reload outputs 0-3, pack all eight to bytes and write the left half
228    mova         m1, [rsp+8]
229    packuswb     m1, m3
230    OP_MOV     [r0], m1, m4
    ; ---- outputs 8-11 ------------------------------------------------------
    ; load p9..p16; after shifts: m1 = p10..p13, m4 = p11..p14, m3 = raw copy
231    mova         m1, [r1+9]
232    mova         m4, m1
233    mova         m3, m1
234    psrlq        m1, 8
235    psrlq        m4, 16
236    punpcklbw    m1, m7
237    punpcklbw    m4, m7
238    paddw        m5, m1
239    paddw        m0, m4
240    paddw        m5, m5
241    psubw        m0, m5
242    mova         m5, m3
    ; m3 = p12..p15
243    psrlq        m3, 24
244    pmullw       m0, [pw_3]
245    punpcklbw    m3, m7
246    paddw        m2, m3
247    psubw        m0, m2
    ; m2 = p9..p12, m5 = p13..p16
248    mova         m2, m5
249    punpcklbw    m2, m7
250    punpckhbw    m5, m7
251    paddw        m6, m2
252    pmullw       m6, [pw_20]
253    paddw        m0, [PW_ROUND]
254    paddw        m0, m6
255    psraw        m0, 5
    ; ---- outputs 12-15 -----------------------------------------------------
    ; m3 = p[i]+p[i+1]; right-edge reflection of m5 = p13..p16:
    ;   0xf9 -> p14,p15,p16,p16   0xbe -> p15,p16,p16,p15   0x6f -> p16,p16,p15,p14
256    paddw        m3, m5
257    pshufw       m6, m5, 0xf9
258    paddw        m6, m4
259    pshufw       m4, m5, 0xbe
260    pshufw       m5, m5, 0x6f
261    paddw        m4, m1
262    paddw        m5, m2
263    paddw        m6, m6
264    psubw        m4, m6
265    pmullw       m3, [pw_20]
266    pmullw       m4, [pw_3]
267    psubw        m3, m5
268    paddw        m4, [PW_ROUND]
269    paddw        m4, m3
270    psraw        m4, 5
    ; pack outputs 8-15 to bytes and write the right half
271    packuswb     m0, m4
272    OP_MOV   [r0+8], m0, m4
    ; next row
273    add          r1, r3
274    add          r0, r2
275    dec r4d
276    jne .loop
277    REP_RET
278%endmacro
279
; PUT_OP dst, src [, tmp]
; Plain store: moves src to dst with mova. The optional third argument is
; accepted so the macro is call-compatible with AVG_OP, but it is not used.
280%macro PUT_OP 2-3
281    mova %1, %2
282%endmacro
283
; AVG_OP dst, src, tmp
; Averaging store: loads the current contents of dst into tmp, averages src
; with it using pavgb, and writes the result back to dst.
; Both src (%2) and tmp (%3) are overwritten. The third argument is declared
; optional (2-3) but the body always references it.
284%macro AVG_OP 2-3
285    mova  %3, %1
286    pavgb %2, %3
287    mova  %1, %2
288%endmacro
289
; Instantiate the 16-wide horizontal filter three times. PW_ROUND selects the
; constant added before the shift and OP_MOV selects how results are written:
;   put         - PW_ROUND = pw_16, plain store (PUT_OP)
;   avg         - PW_ROUND = pw_16, averaged with the destination (AVG_OP)
;   put_no_rnd  - PW_ROUND = pw_15, plain store (PUT_OP)
290INIT_MMX mmxext
291%define PW_ROUND pw_16
292%define OP_MOV PUT_OP
293MPEG4_QPEL16_H_LOWPASS put
294%define PW_ROUND pw_16
295%define OP_MOV AVG_OP
296MPEG4_QPEL16_H_LOWPASS avg
297%define PW_ROUND pw_15
298%define OP_MOV PUT_OP
299MPEG4_QPEL16_H_LOWPASS put_no_rnd
300
301
302
; MPEG4_QPEL8_H_LOWPASS prefix
; Emits <prefix>_mpeg4_qpel8_h_lowpass: an 8-tap horizontal lowpass filter
; producing 8 output bytes per row.
;   r0  = dst, advanced by r2 after every row
;   r1  = src, advanced by r3 after every row
;   r4d = number of rows; the loop runs until it reaches exactly zero
; Per row an 8-byte load at r1 and a movh load at r1+5 are issued; the code
; uses pixels p0..p8 from them.
; Output i (0..7) is
;   (20*(p[i]  +p[i+1]) - (p[i-3]+p[i+4])
;   + 3*((p[i-2]+p[i+3]) - 2*(p[i-1]+p[i+2])) + PW_ROUND) >> 5
; computed on 16-bit words (bytes unpacked against m7 = 0), then packed back
; to bytes with unsigned saturation and written through OP_MOV.
; Indices outside 0..8 are replaced by reflected in-range pixels; the pshufw
; immediates build those reflected vectors.
; PW_ROUND and OP_MOV must be %defined before this macro is invoked.
; NOTE(review): cglobal requests 8 bytes of stack, but nothing in this body
; addresses rsp.
303%macro MPEG4_QPEL8_H_LOWPASS 1
304cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
    ; widen the two strides for use in pointer arithmetic
305    movsxdifnidn r2, r2d
306    movsxdifnidn r3, r3d
    ; m7 = 0: zero source for byte -> word unpacking
307    pxor         m7, m7
308.loop:
    ; ---- outputs 0-3 -------------------------------------------------------
    ; m0 = p0..p3, m1 = p4..p7 (words); m2 keeps the raw 8 bytes
309    mova         m0, [r1]
310    mova         m1, m0
311    mova         m2, m0
312    punpcklbw    m0, m7
313    punpckhbw    m1, m7
    ; left-edge reflection: m5 = p0,p0,p1,p2   m6 = p1,p0,p0,p1
314    pshufw       m5, m0, 0x90
315    pshufw       m6, m0, 0x41
    ; byte shifts + high unpack: m2 = p3..p6, m3 = p2..p5, m4 = p1..p4
316    mova         m3, m2
317    mova         m4, m2
318    psllq        m2, 8
319    psllq        m3, 16
320    psllq        m4, 24
321    punpckhbw    m2, m7
322    punpckhbw    m3, m7
323    punpckhbw    m4, m7
    ; m6 = (p[i-2]+p[i+3]) - 2*(p[i-1]+p[i+2])
324    paddw        m5, m3
325    paddw        m6, m2
326    paddw        m5, m5
327    psubw        m6, m5
    ; m5 = p2,p1,p0,p0 (reflected p[i-3]); taken before m0 is modified
328    pshufw       m5, m0, 0x6
329    pmullw       m6, [pw_3]
330    paddw        m0, m4
331    paddw        m5, m1
332    pmullw       m0, [pw_20]
333    psubw        m0, m5
334    paddw        m6, [PW_ROUND]
335    paddw        m0, m6
336    psraw        m0, 5
    ; ---- outputs 4-7 -------------------------------------------------------
    ; m5 = p5..p8; right-edge reflection of it:
    ;   0xf9 -> p6,p7,p8,p8   0xbe -> p7,p8,p8,p7   0x6f -> p8,p8,p7,p6
337    movh         m5, [r1+5]
338    punpcklbw    m5, m7
339    pshufw       m6, m5, 0xf9
340    paddw        m1, m5
341    paddw        m2, m6
342    pshufw       m6, m5, 0xbe
343    pshufw       m5, m5, 0x6f
344    paddw        m3, m6
345    paddw        m4, m5
346    paddw        m2, m2
347    psubw        m3, m2
348    pmullw       m1, [pw_20]
349    pmullw       m3, [pw_3]
350    psubw        m3, m4
351    paddw        m1, [PW_ROUND]
352    paddw        m3, m1
353    psraw        m3, 5
    ; pack all eight outputs to bytes and write the row
354    packuswb     m0, m3
355    OP_MOV     [r0], m0, m4
    ; next row
356    add          r1, r3
357    add          r0, r2
358    dec r4d
359    jne .loop
360    REP_RET
361%endmacro
362
; Instantiate the 8-wide horizontal filter three times. PW_ROUND selects the
; constant added before the shift and OP_MOV selects how results are written:
;   put         - PW_ROUND = pw_16, plain store (PUT_OP)
;   avg         - PW_ROUND = pw_16, averaged with the destination (AVG_OP)
;   put_no_rnd  - PW_ROUND = pw_15, plain store (PUT_OP)
363INIT_MMX mmxext
364%define PW_ROUND pw_16
365%define OP_MOV PUT_OP
366MPEG4_QPEL8_H_LOWPASS put
367%define PW_ROUND pw_16
368%define OP_MOV AVG_OP
369MPEG4_QPEL8_H_LOWPASS avg
370%define PW_ROUND pw_15
371%define OP_MOV PUT_OP
372MPEG4_QPEL8_H_LOWPASS put_no_rnd
373
374
375
; QPEL_V_LOW a, b, c, d, dst
; One output row (4 pixels) of the vertical 8-tap lowpass filter.
; On entry m0..m3 hold four consecutive rows as 16-bit words; the remaining
; four taps come from the operands:
;   %1 (a)   - combined with %4 and subtracted once
;   %2 (b)   - added to m3, weighted by 3
;   %3 (c)   - added to m2, doubled and subtracted inside the *3 term
;   %4 (d)   - new row; also loaded into m0 for use by later invocations
;   %5 (dst) - destination operand handed to OP_MOV
; Result:
;   (20*(m0+m1) - (%1+%4) + 3*((%2+m3) - 2*(%3+m2)) + PW_ROUND) >> 5
; packed to bytes with unsigned saturation and written through OP_MOV.
; Clobbers m4, m5, m6; m7 is passed to OP_MOV as its scratch operand.
; SWAP is defined outside this file; presumably it rotates the m0-m3 names so
; the window of four rows advances by one for the next invocation - confirm
; in x86inc.
; PW_ROUND and OP_MOV must be %defined before the enclosing macro is invoked.
376%macro QPEL_V_LOW 5
    ; m4 = 20 * (m0 + m1)
377    paddw      m0, m1
378    mova       m4, [pw_20]
379    pmullw     m4, m0
    ; m0 = new row (%4); m4 -= %1 + %4
380    mova       m0, %4
381    mova       m5, %1
382    paddw      m5, m0
383    psubw      m4, m5
    ; m5 = 3 * ((%2 + m3) - 2*(%3 + m2))
384    mova       m5, %2
385    mova       m6, %3
386    paddw      m5, m3
387    paddw      m6, m2
388    paddw      m6, m6
389    psubw      m5, m6
390    pmullw     m5, [pw_3]
    ; add rounding term, combine, scale down and pack to bytes
391    paddw      m4, [PW_ROUND]
392    paddw      m5, m4
393    psraw      m5, 5
394    packuswb   m5, m5
395    OP_MOV     %5, m5, m7
396    SWAP 0,1,2,3
397%endmacro
398
; MPEG4_QPEL16_V_LOWPASS prefix
; Emits <prefix>_mpeg4_qpel16_v_lowpass: vertical 8-tap lowpass filter that
; writes 16 rows of 16 bytes from 17 source rows.
;   r0 = dst            r1 = src (first loop only, then reused as a constant)
;   r2 = dst stride     r3 = src stride
; Stage 1 (.looph): each of the 17 source rows is unpacked to four groups of
; four 16-bit words and stored to the 544-byte stack buffer. Group k of row y
; lives at rsp + k*0x88 + y*8, so every group is a contiguous column strip of
; 17 rows.
; Stage 2 (.loopv): runs once per group (4 times); each pass emits 16 output
; rows of 4 pixels using QPEL_V_LOW, then moves dst back to the top row and
; 4 bytes to the right.
; The operand tables of the first and last QPEL_V_LOW invocations reuse
; in-range row offsets in place of rows above row 0 and below row 16 (0x80).
; PW_ROUND and OP_MOV must be %defined before this macro is invoked.
399%macro MPEG4_QPEL16_V_LOWPASS 1
400cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
    ; widen the two strides for use in pointer arithmetic
401    movsxdifnidn r2, r2d
402    movsxdifnidn r3, r3d
403
    ; r4d = source rows to unpack, r5 = write cursor in the stack buffer
404    mov         r4d, 17
405    mov          r5, rsp
406    pxor         m7, m7
407.looph:
    ; unpack 16 source bytes into 4 x 4 words, one store per column group
408    mova         m0, [r1]
409    mova         m1, [r1]
410    mova         m2, [r1+8]
411    mova         m3, [r1+8]
412    punpcklbw    m0, m7
413    punpckhbw    m1, m7
414    punpcklbw    m2, m7
415    punpckhbw    m3, m7
416    mova       [r5], m0
417    mova  [r5+0x88], m1
418    mova [r5+0x110], m2
419    mova [r5+0x198], m3
420    add          r5, 8
421    add          r1, r3
422    dec r4d
423    jne .looph
424
425
    ; r1 is no longer the source pointer: it becomes the per-group dst fix-up.
    ; Each .loopv pass advances r0 by 14 strides (7 x lea r0, [r0+r2*2]), so
    ; adding 4 - 14*stride returns to the top row, 4 bytes further right.
426    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
    ; r4d = number of 4-pixel column groups
427    mov         r4d, 4
428    mov          r1, 4
429    neg          r2
430    lea          r1, [r1+r2*8]
431    lea          r1, [r1+r2*4]
432    lea          r1, [r1+r2*2]
433    neg          r2
434    mov          r5, rsp
435.loopv:
436    pxor         m7, m7
    ; prime the 4-row window with rows 0..3 of this column group
437    mova         m0, [r5+ 0x0]
438    mova         m1, [r5+ 0x8]
439    mova         m2, [r5+0x10]
440    mova         m3, [r5+0x18]
    ; output rows 0..15, two per dst step; offsets are row_index * 8
441    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
442    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
443    lea    r0, [r0+r2*2]
444    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
445    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
446    lea    r0, [r0+r2*2]
447    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
448    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
449    lea    r0, [r0+r2*2]
450    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
451    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
452    lea    r0, [r0+r2*2]
453    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
454    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
455    lea    r0, [r0+r2*2]
456    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
457    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
458    lea    r0, [r0+r2*2]
459    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
460    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
461    lea    r0, [r0+r2*2]
462    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
463    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
464
    ; next column group in the buffer; dst back to the top, 4 bytes right
465    add    r5, 0x88
466    add    r0, r1
467    dec r4d
468    jne .loopv
469    REP_RET
470%endmacro
471
; PUT_OPH dst, src [, tmp]
; Plain store using movh instead of mova. The optional third argument is
; accepted so the macro is call-compatible with AVG_OPH, but it is not used.
; NOTE(review): movh is defined outside this file; under INIT_MMX it is
; presumably a 4-byte move - confirm in x86inc.
472%macro PUT_OPH 2-3
473    movh %1, %2
474%endmacro
475
; AVG_OPH dst, src, tmp
; Averaging store using movh: loads the current contents of dst into tmp,
; averages src with it using pavgb, and writes the result back to dst.
; Both src (%2) and tmp (%3) are overwritten. The third argument is declared
; optional (2-3) but the body always references it.
476%macro AVG_OPH 2-3
477    movh  %3, %1
478    pavgb %2, %3
479    movh  %1, %2
480%endmacro
481
; Instantiate the 16-wide vertical filter three times. PW_ROUND selects the
; constant added before the shift and OP_MOV selects how results are written:
;   put         - PW_ROUND = pw_16, plain store (PUT_OPH)
;   avg         - PW_ROUND = pw_16, averaged with the destination (AVG_OPH)
;   put_no_rnd  - PW_ROUND = pw_15, plain store (PUT_OPH)
482INIT_MMX mmxext
483%define PW_ROUND pw_16
484%define OP_MOV PUT_OPH
485MPEG4_QPEL16_V_LOWPASS put
486%define PW_ROUND pw_16
487%define OP_MOV AVG_OPH
488MPEG4_QPEL16_V_LOWPASS avg
489%define PW_ROUND pw_15
490%define OP_MOV PUT_OPH
491MPEG4_QPEL16_V_LOWPASS put_no_rnd
492
493
494
; MPEG4_QPEL8_V_LOWPASS prefix
; Emits <prefix>_mpeg4_qpel8_v_lowpass: vertical 8-tap lowpass filter that
; writes 8 rows of 8 bytes from 9 source rows.
;   r0 = dst            r1 = src (first loop only, then reused as a constant)
;   r2 = dst stride     r3 = src stride
; Stage 1 (.looph): each of the 9 source rows is unpacked to two groups of
; four 16-bit words and stored to the stack buffer. Group k of row y lives at
; rsp + k*0x48 + y*8, so every group is a contiguous column strip of 9 rows.
; Stage 2 (.loopv): runs once per group (2 times); each pass emits 8 output
; rows of 4 pixels using QPEL_V_LOW, then moves dst back to the top row and
; 4 bytes to the right.
; The operand tables of the first and last QPEL_V_LOW invocations reuse
; in-range row offsets in place of rows above row 0 and below row 8 (0x40).
; PW_ROUND and OP_MOV must be %defined before this macro is invoked.
; NOTE(review): cglobal reserves 288 bytes of stack, while this body only
; addresses rsp+0 .. rsp+0x8f (2 groups x 9 rows x 8 bytes = 144 bytes).
495%macro MPEG4_QPEL8_V_LOWPASS 1
496cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
    ; widen the two strides for use in pointer arithmetic
497    movsxdifnidn r2, r2d
498    movsxdifnidn r3, r3d
499
    ; r4d = source rows to unpack, r5 = write cursor in the stack buffer
500    mov         r4d, 9
501    mov          r5, rsp
502    pxor         m7, m7
503.looph:
    ; unpack 8 source bytes into 2 x 4 words, one store per column group
504    mova         m0, [r1]
505    mova         m1, [r1]
506    punpcklbw    m0, m7
507    punpckhbw    m1, m7
508    mova       [r5], m0
509    mova  [r5+0x48], m1
510    add          r5, 8
511    add          r1, r3
512    dec r4d
513    jne .looph
514
515
    ; r1 is no longer the source pointer: it becomes the per-group dst fix-up.
    ; Each .loopv pass advances r0 by 6 strides (3 x lea r0, [r0+r2*2]), so
    ; adding 4 - 6*stride returns to the top row, 4 bytes further right.
516    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
    ; r4d = number of 4-pixel column groups
517    mov         r4d, 2
518    mov          r1, 4
519    neg          r2
520    lea          r1, [r1+r2*4]
521    lea          r1, [r1+r2*2]
522    neg          r2
523    mov          r5, rsp
524.loopv:
525    pxor         m7, m7
    ; prime the 4-row window with rows 0..3 of this column group
526    mova         m0, [r5+ 0x0]
527    mova         m1, [r5+ 0x8]
528    mova         m2, [r5+0x10]
529    mova         m3, [r5+0x18]
    ; output rows 0..7, two per dst step; offsets are row_index * 8
530    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
531    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
532    lea    r0, [r0+r2*2]
533    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
534    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
535    lea    r0, [r0+r2*2]
536    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
537    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
538    lea    r0, [r0+r2*2]
539    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
540    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
541
    ; next column group in the buffer; dst back to the top, 4 bytes right
542    add    r5, 0x48
543    add    r0, r1
544    dec r4d
545    jne .loopv
546    REP_RET
547%endmacro
548
; Instantiate the 8-wide vertical filter three times. PW_ROUND selects the
; constant added before the shift and OP_MOV selects how results are written:
;   put         - PW_ROUND = pw_16, plain store (PUT_OPH)
;   avg         - PW_ROUND = pw_16, averaged with the destination (AVG_OPH)
;   put_no_rnd  - PW_ROUND = pw_15, plain store (PUT_OPH)
549INIT_MMX mmxext
550%define PW_ROUND pw_16
551%define OP_MOV PUT_OPH
552MPEG4_QPEL8_V_LOWPASS put
553%define PW_ROUND pw_16
554%define OP_MOV AVG_OPH
555MPEG4_QPEL8_V_LOWPASS avg
556%define PW_ROUND pw_15
557%define OP_MOV PUT_OPH
558MPEG4_QPEL8_V_LOWPASS put_no_rnd
559