;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define pw_pixel_max pw_1023
cextern pd_32
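; pw_1023 = (1 << 10) - 1 is the 10-bit pixel clip limit; pd_32 is the
; rounding term added before the final >>6 shift of the inverse transform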

;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
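; STORE_DIFFx2 coefs0, coefs1, tmp, zero, dst, stride
; shifts two rows of dword coefficients down by 6, packs them to words, adds
; two 4-pixel rows from dst and dst+stride and clips to [0, pw_pixel_max]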
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]
    movhps      %3, [%5+%6]
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro

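; STORE_DIFF16 coefs0, coefs1, zero, max, dst
; same as above, but adds one full register-width row in place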
%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

; IDCT4_ADD_10 dst, in, stride
; 4x4 inverse transform in two 1-D passes; adds the rounded result to dst
; and clears the coefficient block
%macro IDCT4_ADD_10 3
    mova  m0, [%2+ 0]
    mova  m1, [%2+16]
    mova  m2, [%2+32]
    mova  m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    mova [%2+ 0], m5
    mova [%2+16], m5
    mova [%2+32], m5
    mova [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea   %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    movsxdifnidn r2, r2d
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
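; add4x4_idct: called with r0 = dst base, r2 = coefficients, r3 = stride and
; r5 = the block's offset into dst; clobbers r5 and clears the block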
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add   r5, r0
    mova  m0, [r2+ 0]
    mova  m1, [r2+16]
    mova  m2, [r2+32]
    mova  m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    mova  [r2+ 0], m5
    mova  [r2+16], m5
    mova  [r2+32], m5
    mova  [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea   r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

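; ADD16_OP idx, nnzc_offset
; transforms 4x4 block idx only when its scan8-indexed nonzero flag in nnzc
; is set; r2 advances 64 bytes (16 dword coefficients) per block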
%macro ADD16_OP 2
    cmp          byte [r4+%2], 0
    jz .skipblock%1
    mov         r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add          r2, 64
%endif
%endmacro

%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    movsxdifnidn r3, r3d
    ADD16_OP 0, 4+1*8
    ADD16_OP 1, 5+1*8
    ADD16_OP 2, 4+2*8
    ADD16_OP 3, 5+2*8
    ADD16_OP 4, 6+1*8
    ADD16_OP 5, 7+1*8
    ADD16_OP 6, 6+2*8
    ADD16_OP 7, 7+2*8
    ADD16_OP 8, 4+3*8
    ADD16_OP 9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
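; IDCT_DC_ADD_OP_10 dst, stride, 3*stride
; adds the DC value splatted across m0 to four rows of dst and clips them to
; [0, pw_pixel_max]; m6 must already hold the clip limit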
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movsxdifnidn r2, r2d
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movsxdifnidn r2, r2d
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
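; AC idx: runs the full 4x4 transform on the block pair (idx, idx+1), then
; jumps back behind the matching ADD16_OP_INTRA site; the AC bodies live
; after REP_RET so the common DC-only path falls straight through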
%macro AC 1
.ac%1:
    mov  r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov  r5d, [r1+(%1+1)*4]
    add  r2, 64
    call add4x4_idct %+ SUFFIX
    add  r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 2
    cmp      word [r4+%2], 0
    jnz .ac%1
    mov      r5d, [r2+ 0]
    or       r5d, [r2+64]
    jz .skipblock%1
    mov      r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add       r2, 128
%endif
.skipadd%1:
%endmacro

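; idct_dc_add: handles a block pair whose only nonzero coefficients are the
; two DCs; both are gathered into m0, rounded, splatted across their halves
; and added over four 8-pixel rows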
%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    mov dword [r2+ 0], 0
    mov dword [r2+64], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10,5,7,8
    movsxdifnidn r3, r3d
    ADD16_OP_INTRA 0, 4+1*8
    ADD16_OP_INTRA 2, 4+2*8
    ADD16_OP_INTRA 4, 6+1*8
    ADD16_OP_INTRA 6, 6+2*8
    ADD16_OP_INTRA 8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC 8
    AC 10
    AC 12
    AC 14
    AC 0
    AC 2
    AC 4
    AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
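; dst is an array of per-plane pointers here; the chroma coefficients start
; 1024 bytes (16 blocks) into the buffer, and the second plane's pointer is
; reloaded from [dst+gprsize]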
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov      r7, r0
%endif
    add      r2, 1024
    mov      r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add      r2, 1024-128*2
%if ARCH_X86_64
    mov      r0, [r7+gprsize]
%else
    mov      r0, r0m
    mov      r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%assign last_block 44

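; 4:2:2 chroma has twice as many 4x4 blocks per plane as 4:2:0; the extra
; blocks (marked "i+4" below) take their nonzero flags from the following
; rows of the nnzc table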
%macro IDCT_ADD8_422 0

cglobal h264_idct_add8_422_10, 5, 8, 7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov      r7, r0
%endif

    add      r2, 1024
    mov      r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
    ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
    add      r2, 1024-128*4

%if ARCH_X86_64
    mov      r0, [r7+gprsize]
%else
    mov      r0, r0m
    mov      r0, [r0+gprsize]
%endif

    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    ADD16_OP_INTRA 40, 4+13*8 ; i+4
    ADD16_OP_INTRA 42, 4+14*8 ; i+4
    REP_RET
    AC 16
    AC 18
    AC 24 ; i+4
    AC 26 ; i+4
    AC 32
    AC 34
    AC 40 ; i+4
    AC 42 ; i+4

%endmacro

INIT_XMM sse2
IDCT_ADD8_422
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8_422
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
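; IDCT8_1D in0, in4
; one 8-point transform pass on dword coefficients; rows 1,2,3,5,6,7 are
; expected in m1,m2,m3,m5,m6,m7 while rows 0 and 4 come from the two memory
; operands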
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112*2]
    mova         m6, [%1+ 96*2]
    mova         m5, [%1+ 80*2]
    mova         m3, [%1+ 48*2]
    mova         m2, [%1+ 32*2]
    mova         m1, [%1+ 16*2]
    IDCT8_1D   [%1], [%1+ 64*2]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
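; first 1-D pass over one 8x4 half of the block, transposed into the scratch
; buffer; on x86_32 m7 is spilled to the block so the transpose has a free
; temporary register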
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D  0,1,2,3,8
    mova    [%2    ], m0
    TRANSPOSE4x4D  4,5,6,7,8
    mova    [%2+8*2], m4
%else
    mova         [%1], m7
    TRANSPOSE4x4D   0,1,2,3,7
    mova           m7, [%1]
    mova    [%2     ], m0
    mova    [%2+16*2], m1
    mova    [%2+32*2], m2
    mova    [%2+48*2], m3
    TRANSPOSE4x4D   4,5,6,7,3
    mova    [%2+ 8*2], m4
    mova    [%2+24*2], m5
    mova    [%2+40*2], m6
    mova    [%2+56*2], m7
%endif
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
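; second 1-D pass down the transposed columns; m6/m7 are parked in the
; scratch buffer to free registers, then the rounded results are added to
; eight rows of dst via STORE_DIFFx2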
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova  [%2     ], m6
    mova  [%2+16*2], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova         m0, [%2     ]
    mova         m1, [%2+16*2]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
    movsxdifnidn r2, r2d
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub  rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add  rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub          rsp, pad
    add   dword [r1], 32

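; on x86_64 the 16 xmm registers hold both transposed halves at once (only
; rows 0 and 4 of each half round-trip through the scratch buffer); x86_32
; takes the stack-based IDCT8_ADD_SSE_START/END path below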
%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP 0,  8
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    psrad         m8, 6
    psrad         m0, 6
    packssdw      m8, m0
    paddsw        m8, [r0]
    pxor          m0, m0
    mova    [r1+  0], m0
    mova    [r1+ 16], m0
    mova    [r1+ 32], m0
    mova    [r1+ 48], m0
    mova    [r1+ 64], m0
    mova    [r1+ 80], m0
    mova    [r1+ 96], m0
    mova    [r1+112], m0
    mova    [r1+128], m0
    mova    [r1+144], m0
    mova    [r1+160], m0
    mova    [r1+176], m0
    mova    [r1+192], m0
    mova    [r1+208], m0
    mova    [r1+224], m0
    mova    [r1+240], m0
    CLIPW         m8, m0, [pw_pixel_max]
    mova        [r0], m8
    mova          m8, [pw_pixel_max]
    STORE_DIFF16  m9, m1, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea           r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova    [r1+  0], m7
    mova    [r1+ 16], m7
    mova    [r1+ 32], m7
    mova    [r1+ 48], m7
    mova    [r1+ 64], m7
    mova    [r1+ 80], m7
    mova    [r1+ 96], m7
    mova    [r1+112], m7
    mova    [r1+128], m7
    mova    [r1+144], m7
    mova    [r1+160], m7
    mova    [r1+176], m7
    mova    [r1+192], m7
    mova    [r1+208], m7
    mova    [r1+224], m7
    mova    [r1+240], m7
%endif ; ARCH_X86_64

    add          rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
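; IDCT8_ADD4_OP idx, nnzc_offset
; the scan8 offsets step by two rows/columns since each 8x8 block covers a
; 2x2 group of 4x4 positions; r1 advances 256 bytes (one 8x8 dword block)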
%macro IDCT8_ADD4_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov      r0d, [r6+%1*4]
    add       r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add       r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    movsxdifnidn r3, r3d
    %assign pad 16-gprsize-(stack_offset&15)
    SUB      rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov      r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif