;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_2
cextern pw_3
cextern pw_4
cextern pw_1023
%define pw_pixel_max pw_1023

; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
    psubusw %5, %2, %1
    psubusw %4, %1, %2
    por     %4, %5
    psubw   %4, %3
%endmacro

; out: %4 = |%1-%2|<%3
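;      (%4 is a full word mask: 0xFFFF where the condition holds, 0 otherwise)
; clobbers: %5 (left holding |%1-%2|-%3)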
%macro DIFF_LT   5
    psubusw %4, %2, %1
    psubusw %5, %1, %2
    por     %5, %4 ; |%1-%2|
    pxor    %4, %4
    psubw   %5, %3 ; |%1-%2|-%3
    pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro

%macro LOAD_AB 4
    movd       %1, %3
    movd       %2, %4
    SPLATW     %1, %1
    SPLATW     %2, %2
%endmacro

; in:  %2=tc reg
; out: %1=splatted tc
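; Note: tc0 is a vector of int8_t. Duplicating each byte into both halves of a
; word and then doing an arithmetic >>6 yields 4*tc0 (the 8-bit value scaled to
; the 10-bit range), while tc0 = -1 (0xFF) stays negative and still signals
; "don't filter".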
%macro LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
%if mmsize == 8
    pshufw      %1, %1, 0
%else
    pshuflw     %1, %1, 01010000b
    pshufd      %1, %1, 01010000b
%endif
    psraw       %1, 6
%endmacro

; in: %1=p1, %2=p0, %3=q0, %4=q1
;     %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
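; The mask is all-ones where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta.
; Only the sign bits of the three "difference minus threshold" words matter,
; so two PANDs plus a single compare against zero test all three conditions.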
%macro LOAD_MASK 9
    ABS_SUB     %2, %3, %5, %8, %7 ; |p0-q0| - alpha
    ABS_SUB     %1, %2, %6, %9, %7 ; |p1-p0| - beta
    pand        %8, %9
    ABS_SUB     %3, %4, %6, %9, %7 ; |q1-q0| - beta
    pxor        %7, %7
    pand        %8, %9
    pcmpgtw     %7, %8
%endmacro

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
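; What the code below computes (the standard H.264 normal-filter step on
; 10-bit samples); %5 holds tc already ANDed with the filter mask, so
; masked-off positions get delta = 0:
;   delta = clip3(-tc, tc, ((q0 - p0)*4 + (p1 - q1) + 4) >> 3)
;   p0'   = clip(p0 + delta, 0, pixel_max)
;   q0'   = clip(q0 - delta, 0, pixel_max)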
%macro DEBLOCK_P0_Q0 7
    psubw   %3, %4
    pxor    %7, %7
    paddw   %3, [pw_4]
    psubw   %7, %5
    psubw   %6, %2, %1
    psllw   %6, 2
    paddw   %3, %6
    psraw   %3, 3
    mova    %6, [pw_pixel_max]
    CLIPW   %3, %7, %5
    pxor    %7, %7
    paddw   %1, %3
    psubw   %2, %3
    CLIPW   %1, %7, %6
    CLIPW   %2, %7, %6
%endmacro

; in: %1=x2, %2=x1, %3=p0, %4=q0, %5=mask&tc, %6=tmp
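; out: %1 = x1 + clip3(-tc, tc, ((x2 + ((p0 + q0 + 1) >> 1)) >> 1) - x1)
;      i.e. the p1'/q1' update of the normal filter, applied to whichever side
;      (x2 = p2 or q2, x1 = p1 or q1) the caller passes in.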
%macro LUMA_Q1 6
    pavgw       %6, %3, %4      ; (p0+q0+1)>>1
    paddw       %1, %6
    pxor        %6, %6
    psraw       %1, 1
    psubw       %6, %5
    psubw       %1, %2
    CLIPW       %1, %6, %5
    paddw       %1, %2
%endmacro

%macro LUMA_DEBLOCK_ONE 3
    DIFF_LT     m5, %1, bm, m4, m6
    pxor        m6, m6
    mova        %3, m4
    pcmpgtw     m6, tcm
    pand        m4, tcm
    pandn       m6, m7
    pand        m4, m6
    LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro

%macro LUMA_H_STORE 2
%if mmsize == 8
    movq        [r0-4], m0
    movq        [r0+r1-4], m1
    movq        [r0+r1*2-4], m2
    movq        [r0+%2-4], m3
%else
    movq        [r0-4], m0
    movhps      [r0+r1-4], m0
    movq        [r0+r1*2-4], m1
    movhps      [%1-4], m1
    movq        [%1+r1-4], m2
    movhps      [%1+r1*2-4], m2
    movq        [%1+%2-4], m3
    movhps      [%1+r1*4-4], m3
%endif
%endmacro

%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta,
;                           int8_t *tc0)
;-----------------------------------------------------------------------------
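; Note: alpha and beta arrive in the 8-bit range; the shl r2d/r3d, 2 below
; scales them by 4 to match 10-bit pixel values, and LOAD_TC scales tc0 by the
; same factor.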
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
    %assign pad 5*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define am  [rsp+mmsize*3]
    %define bm  [rsp+mmsize*4]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2d, r3d
    mov         r3, 32/mmsize
    mov         r2, r0
    sub         r0, r1
    mova        am, m4
    sub         r0, r1
    mova        bm, m5
    sub         r0, r1
.loop:
    mova        m0, [r0+r1]
    mova        m1, [r0+r1*2]
    mova        m2, [r2]
    mova        m3, [r2+r1]

    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    mova        m5, [r0]
    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova   [r0+r1], m5

    mova        m5, [r2+r1*2]
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova   [r2+r1], m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova [r0+r1*2], m1
    mova      [r2], m2

    add         r0, mmsize
    add         r2, mmsize
    add         r4, mmsize/8
    dec         r3
    jg .loop
    ADD         rsp, pad
    RET

cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
    %assign pad 7*mmsize+12-(stack_offset&15)
    %define tcm [rsp]
    %define ms1 [rsp+mmsize]
    %define ms2 [rsp+mmsize*2]
    %define p1m [rsp+mmsize*3]
    %define p2m [rsp+mmsize*4]
    %define am  [rsp+mmsize*5]
    %define bm  [rsp+mmsize*6]
    SUB        rsp, pad
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB     m4, m5, r2d, r3d
    mov         r3, r1
    mova        am, m4
    add         r3, r1
    mov         r5, 32/mmsize
    mova        bm, m5
    add         r3, r1
%if mmsize == 16
    mov         r2, r0
    add         r2, r3
%endif
.loop:
%if mmsize == 8
    movq        m2, [r0-8]     ; y q2 q1 q0
    movq        m7, [r0+0]
    movq        m5, [r0+r1-8]
    movq        m3, [r0+r1+0]
    movq        m0, [r0+r1*2-8]
    movq        m6, [r0+r1*2+0]
    movq        m1, [r0+r3-8]
    TRANSPOSE4x4W 2, 5, 0, 1, 4
    SWAP         2, 7
    movq        m7, [r0+r3]
    TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
    movu        m5, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m3, [r2-8]
    TRANSPOSE4x4W 5, 0, 2, 3, 6
    mova       tcm, m3

    movu        m4, [r2+r1-8]
    movu        m1, [r2+r1*2-8]
    movu        m3, [r2+r3-8]
    movu        m7, [r2+r1*4-8]
    TRANSPOSE4x4W 4, 1, 3, 7, 6

    mova        m6, tcm
    punpcklqdq  m6, m7
    punpckhqdq  m5, m4
    SBUTTERFLY qdq, 0, 1, 7
    SBUTTERFLY qdq, 2, 3, 7
%endif

    mova       p2m, m6
    LOAD_MASK   m0, m1, m2, m3, am, bm, m7, m4, m6
    LOAD_TC     m6, r4
    mova       tcm, m6

    LUMA_DEBLOCK_ONE m1, m0, ms1
    mova       p1m, m5

    mova        m5, p2m
    LUMA_DEBLOCK_ONE m2, m3, ms2
    mova       p2m, m5

    pxor        m5, m5
    mova        m6, tcm
    pcmpgtw     m5, tcm
    psubw       m6, ms1
    pandn       m5, m7
    psubw       m6, ms2
    pand        m5, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
    mova        m0, p1m
    mova        m3, p2m
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r2, r3

    add         r4, mmsize/8
    lea         r0, [r0+r1*(mmsize/2)]
    lea         r2, [r2+r1*(mmsize/2)]
    dec         r5
    jg .loop
    ADD        rsp, pad
    RET
%endmacro

%if ARCH_X86_64
; in:  m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
;      m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
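; This is the inter-luma filter body for the x86-64 path: with 16 XMM registers
; available, p2/q2 and the extra masks stay in registers instead of spilling to
; the stack as the 32-bit version above does.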
%macro DEBLOCK_LUMA_INTER_SSE2 0
    LOAD_MASK   m0, m1, m2, m3, m12, m13, m7, m4, m6
    LOAD_TC     m6, r4
    DIFF_LT     m8, m1, m13, m10, m4
    DIFF_LT     m9, m2, m13, m11, m4
    pand        m6, m7

    mova       m14, m6
    pxor        m4, m4
    pcmpgtw     m6, m4
    pand        m6, m14

    mova        m5, m10
    pand        m5, m6
    LUMA_Q1 m8, m0, m1, m2, m5, m4

    mova        m5, m11
    pand        m5, m6
    LUMA_Q1 m9, m3, m1, m2, m5, m4

    pxor        m4, m4
    psubw       m6, m10
    pcmpgtw     m4, m14
    pandn       m4, m7
    psubw       m6, m11
    pand        m4, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6

    SWAP         0, 8
    SWAP         3, 9
%endmacro

%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
    %define p2 m8
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define q2 m9
    %define mask0 m7
    %define mask1 m10
    %define mask2 m11
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2d, r3d
    mov         r2, r0
    sub         r0, r1
    sub         r0, r1
    sub         r0, r1
    mov         r3, 2
.loop:
    mova        p2, [r0]
    mova        p1, [r0+r1]
    mova        p0, [r0+r1*2]
    mova        q0, [r2]
    mova        q1, [r2+r1]
    mova        q2, [r2+r1*2]
    DEBLOCK_LUMA_INTER_SSE2
    mova   [r0+r1], p1
    mova [r0+r1*2], p0
    mova      [r2], q0
    mova   [r2+r1], q1
    add         r0, mmsize
    add         r2, mmsize
    add         r4, 2
    dec         r3
    jg .loop
    REP_RET

cglobal deblock_h_luma_10, 5,7,15
    shl        r2d, 2
    shl        r3d, 2
    LOAD_AB    m12, m13, r2d, r3d
    mov         r2, r1
    add         r2, r1
    add         r2, r1
    mov         r5, r0
    add         r5, r2
    mov         r6, 2
.loop:
    movu        m8, [r0-8]     ; y q2 q1 q0 p0 p1 p2 x
    movu        m0, [r0+r1-8]
    movu        m2, [r0+r1*2-8]
    movu        m9, [r5-8]
    movu        m5, [r5+r1-8]
    movu        m1, [r5+r1*2-8]
    movu        m3, [r5+r2-8]
    movu        m7, [r5+r1*4-8]

    TRANSPOSE4x4W 8, 0, 2, 9, 10
    TRANSPOSE4x4W 5, 1, 3, 7, 10

    punpckhqdq  m8, m5
    SBUTTERFLY qdq, 0, 1, 10
    SBUTTERFLY qdq, 2, 3, 10
    punpcklqdq  m9, m7

    DEBLOCK_LUMA_INTER_SSE2

    TRANSPOSE4x4W 0, 1, 2, 3, 4
    LUMA_H_STORE r5, r2
    add         r4, 2
    lea         r0, [r0+r1*8]
    lea         r5, [r5+r1*8]
    dec         r6
    jg .loop
    REP_RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif

%macro SWAPMOVA 2
%ifid %1
    SWAP %1, %2
%else
    mova %1, %2
%endif
%endmacro

; in: t0-t2: tmp registers
;     %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
;     %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
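; The three outputs follow the H.264 strong (intra) filter:
;   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3   where mask1p holds,
;         (2*p1 + p0 + q1 + 2) >> 2                 elsewhere (mask0 only)
;   p1' = (p2 + p1 + p0 + q0 + 2) >> 2              where mask1p holds
;   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3     where mask1p holds
; The same macro is reused for the q side by swapping the arguments.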
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
    paddw     t0, %3, %2
    mova      t2, %4
    paddw     t2, %3
%else
    mova      t0, %3
    mova      t2, %4
    paddw     t0, %2
    paddw     t2, %3
%endif
    paddw     t0, %1
    paddw     t2, t2
    paddw     t0, %5
    paddw     t2, %9
    paddw     t0, %9    ; (p2 + p1 + p0 + q0 + 2)
    paddw     t2, t0    ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)

    psrlw     t2, 3
    psrlw     t1, t0, 2
    psubw     t2, %3
    psubw     t1, %2
    pand      t2, %8
    pand      t1, %8
    paddw     t2, %3
    paddw     t1, %2
    SWAPMOVA %11, t1

    psubw     t1, t0, %3
    paddw     t0, t0
    psubw     t1, %5
    psubw     t0, %3
    paddw     t1, %6
    paddw     t1, %2
    paddw     t0, %6
    psrlw     t1, 2     ; (2*p1 + p0 + q1 + 2)/4
    psrlw     t0, 3     ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3

    pxor      t0, t1
    pxor      t1, %1
    pand      t0, %8
    pand      t1, %7
    pxor      t0, t1
    pxor      t0, %1
    SWAPMOVA %10, t0
    SWAPMOVA %12, t2
%endmacro

%macro LUMA_INTRA_INIT 1
    %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
    %assign i 4
%rep %1
    CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
    %assign i i+1
%endrep
    SUB    rsp, pad
%endmacro

; in: %1-%3=tmp, %4=p2, %5=q2
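; Computes the three masks used by the strong filter:
;   %2 (mask0)  = the basic alpha/beta edge test from LOAD_MASK
;   %1 (mask1p) = mask0 && |p0-q0| < alpha/4+2 && |p2-p0| < beta
;   %3 (mask1q) = mask0 && |p0-q0| < alpha/4+2 && |q2-q0| < beta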
%macro LUMA_INTRA_INTER 5
    LOAD_AB t0, t1, r2d, r3d
    mova    %1, t0
    LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
    mova    %2, t0        ; mask0
    psrlw   t3, %1, 2
%else
    mova    t3, %1
    mova    %2, t0        ; mask0
    psrlw   t3, 2
%endif
    paddw   t3, [pw_2]    ; alpha/4+2
    DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
    pand    t2, %2
    mova    t3, %5        ; q2
    mova    %1, t2        ; mask1
    DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
    pand    t2, %1
    mova    t3, %4        ; p2
    mova    %3, t2        ; mask1q
    DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
    pand    t2, %1
    mova    %1, t2        ; mask1p
%endmacro

%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r4-8]
    TRANSPOSE4x4W 4, 5, 0, 1, 2
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2

    movu    m2, [r0]
    movu    m3, [r0+r1]
    movu    t0, [r0+r1*2]
    movu    t1, [r0+r4]
    TRANSPOSE4x4W 2, 3, 4, 5, 6
    mova    t6, t0        ; q2
    mova    t7, t1        ; q3
%else
    movu    t0, [r0-8]
    movu    t1, [r0+r1-8]
    movu    m0, [r0+r1*2-8]
    movu    m1, [r0+r5-8]
    movu    m2, [r4-8]
    movu    m3, [r4+r1-8]
    movu    t2, [r4+r1*2-8]
    movu    t3, [r4+r5-8]
    TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
    mova    t4, t0        ; p3
    mova    t5, t1        ; p2
    mova    t6, t2        ; q2
    mova    t7, t3        ; q3
%endif
%endmacro

; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
    TRANSPOSE4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r4-8], m%4
    movq       m%1, %8
    TRANSPOSE4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r4], m%1
%else
    TRANSPOSE2x4x4W %1, %2, %3, %4, %9
    movq       [r0-8], m%1
    movq       [r0+r1-8], m%2
    movq       [r0+r1*2-8], m%3
    movq       [r0+r5-8], m%4
    movhps     [r4-8], m%1
    movhps     [r4+r1-8], m%2
    movhps     [r4+r1*2-8], m%3
    movhps     [r4+r5-8], m%4
%ifnum %8
    SWAP       %1, %8
%else
    mova       m%1, %8
%endif
    TRANSPOSE2x4x4W %5, %6, %7, %1, %9
    movq       [r0], m%5
    movq       [r0+r1], m%6
    movq       [r0+r1*2], m%7
    movq       [r0+r5], m%1
    movhps     [r4], m%5
    movhps     [r4+r1], m%6
    movhps     [r4+r1*2], m%7
    movhps     [r4+r5], m%1
%endif
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
    %define t0 m1
    %define t1 m2
    %define t2 m4
    %define p2 m8
    %define p1 m9
    %define p0 m10
    %define q0 m11
    %define q1 m12
    %define q2 m13
    %define aa m5
    %define bb m14
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    neg     r4
    add     r4, r0     ; pix-4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
    LOAD_AB aa, bb, r2d, r3d
.loop:
    mova    p2, [r4+r1]
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
    mova    q2, [r0+2*r1]

    LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
    mova    t2, aa
    psrlw   t2, 2
    paddw   t2, m0 ; alpha/4+2
    DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
    DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1
    LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
    LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
    %define t0 m15
    %define t1 m14
    %define t2 m2
    %define q3 m5
    %define q2 m8
    %define q1 m9
    %define q0 m10
    %define p0 m11
    %define p1 m12
    %define p2 m13
    %define p3 m4
    %define spill [rsp]
    %assign pad 24-(stack_offset&15)
    SUB     rsp, pad
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 2
    mova    m0, [pw_2]
    shl    r2d, 2
    shl    r3d, 2
.loop:
    movu    q3, [r0-8]
    movu    q2, [r0+r1-8]
    movu    q1, [r0+r1*2-8]
    movu    q0, [r0+r5-8]
    movu    p0, [r4-8]
    movu    p1, [r4+r1-8]
    movu    p2, [r4+r1*2-8]
    movu    p3, [r4+r5-8]
    TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1

    LOAD_AB m1, m2, r2d, r3d
    LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
    psrlw   m1, 2
    paddw   m1, m0 ; alpha/4+2
    DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
    DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
    DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
    pand    m6, m3
    pand    m7, m6
    pand    m6, t1

    mova spill, q3
    LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
    LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
    mova    m7, spill

    LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14

    lea     r0, [r0+r1*8]
    lea     r4, [r4+r1*8]
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif

%endif

%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 3
    lea     r4, [r1*4]
    lea     r5, [r1*3]
    neg     r4
    add     r4, r0
    mov     r6, 32/mmsize
    shl    r2d, 2
    shl    r3d, 2
.loop:
    mova    m0, [r4+r1*2] ; p1
    mova    m1, [r4+r5]   ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
    LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
    mova    t3, [r0+r1*2] ; q2
    LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
    add     r0, mmsize
    add     r4, mmsize
    dec     r6
    jg .loop
    ADD    rsp, pad
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha,
;                                 int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
    LUMA_INTRA_INIT 8
%if mmsize == 8
    lea     r4, [r1*3]
    mov     r5, 32/mmsize
%else
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    add     r4, r0     ; pix+4*stride
    mov     r6, 32/mmsize
%endif
    shl    r2d, 2
    shl    r3d, 2
.loop:
    LUMA_H_INTRA_LOAD
    LUMA_INTRA_INTER t8, t9, t10, t5, t6

    LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
    mova    t3, t6     ; q2
    LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5

    mova    m2, t4
    mova    m0, t11
    mova    m1, t5
    mova    m3, t8
    mova    m6, t6

    LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7

    lea     r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
    dec     r5
%else
    lea     r4, [r4+r1*(mmsize/2)]
    dec     r6
%endif
    jg .loop
    ADD    rsp, pad
    RET
%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif

; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
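; This is the chroma intra filter:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2
;   q0' = (2*q1 + q0 + p1 + 2) >> 2
; applied only where the mask (%5) is set.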
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
    mova    %6, [pw_2]
    paddw   %6, %3
    paddw   %6, %4
    paddw   %7, %6, %2
    paddw   %6, %1
    paddw   %6, %3
    paddw   %7, %4
    psraw   %6, 2
    psraw   %7, 2
    psubw   %6, %1
    psubw   %7, %2
    pand    %6, %5
    pand    %7, %5
    paddw   %1, %6
    paddw   %2, %7
%endmacro

%macro CHROMA_V_LOAD 1
    mova        m0, [r0]    ; p1
    mova        m1, [r0+r1] ; p0
    mova        m2, [%1]    ; q0
    mova        m3, [%1+r1] ; q1
%endmacro

%macro CHROMA_V_STORE 0
    mova [r0+1*r1], m1
    mova [r0+2*r1], m2
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
%macro TRANSPOSE8x4W_STORE 8
    TRANSPOSE4x4W     0, 1, 2, 3, 4
    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; %1 = base + 3*stride
; %2 = 3*stride (unused on mmx)
; %3, %4 = place to store p1 and q1 values
%macro CHROMA_H_LOAD 4
    %if mmsize == 8
        movq m0, [pix_q - 4]
        movq m1, [pix_q +   stride_q - 4]
        movq m2, [pix_q + 2*stride_q - 4]
        movq m3, [%1 - 4]
        TRANSPOSE4x4W 0, 1, 2, 3, 4
    %else
        TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
    %endif
    mova %3, m0
    mova %4, m3
%endmacro

; %1 = base + 3*stride
; %2 = 3*stride (unused on mmx)
; %3, %4 = place to load p1 and q1 values
%macro CHROMA_H_STORE 4
    mova m0, %3
    mova m3, %4
    %if mmsize == 8
        TRANSPOSE4x4W 0, 1, 2, 3, 4
        movq [pix_q - 4],              m0
        movq [pix_q +   stride_q - 4], m1
        movq [pix_q + 2*stride_q - 4], m2
        movq [%1 - 4],                 m3
    %else
        TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2)
    %endif
%endmacro

%macro CHROMA_V_LOAD_TC 2
    movd        %1, [%2]
    punpcklbw   %1, %1
    punpcklwd   %1, %1
    psraw       %1, 6
%endmacro

%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
    mov         r5, r0
    sub         r0, r1
    sub         r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov         r6, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r5
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    pxor        m4, m4
    CHROMA_V_LOAD_TC m6, r4
    psubw       m6, [pw_3]
    pmaxsw      m6, m4
    pand        m7, m6
    DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add         r0, mmsize
    add         r5, mmsize
    add         r4, mmsize/4
    dec         r6
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha,
;                                   int beta)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
    mov         r4, r0
    sub         r0, r1
    sub         r0, r1
    shl        r2d, 2
    shl        r3d, 2
%if mmsize < 16
    mov         r5, 16/mmsize
.loop:
%endif
    CHROMA_V_LOAD r4
    LOAD_AB     m4, m5, r2d, r3d
    LOAD_MASK   m0, m1, m2, m3, m4, m5, m7, m6, m4
    CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
    CHROMA_V_STORE
%if mmsize < 16
    add         r0, mmsize
    add         r4, mmsize
    dec         r5
    jg .loop
    REP_RET
%else
    RET
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta,
;                             int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, tc0_
    shl alpha_d,  2
    shl beta_d,   2
    mov r5,       pix_q
    lea r6,      [3*stride_q]
    add r5,       r6
%if mmsize == 8
    mov r6d,      2
    .loop:
%endif

        CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize]
        LOAD_AB          m4,  m5, alpha_d, beta_d
        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
        pxor             m4,  m4
        CHROMA_V_LOAD_TC m6,  tc0_q
        psubw            m6, [pw_3]
        pmaxsw           m6,  m4
        pand             m7,  m6
        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
        CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize]

%if mmsize == 8
        lea pix_q, [pix_q + 4*stride_q]
        lea r5,    [r5 + 4*stride_q]
        add tc0_q,  2
        dec r6d
    jg .loop
%endif
RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta,
;                                int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma422_10, 5, 7, 8, 0-3*mmsize, pix_, stride_, alpha_, beta_, tc0_
    shl alpha_d,  2
    shl beta_d,   2

    movd m0, [tc0_q]
    punpcklbw m0, m0
    psraw m0, 6
    movq [rsp], m0

    mov r5,       pix_q
    lea r6,      [3*stride_q]
    add r5,       r6

    mov r4, -8
    .loop:

        CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]
        LOAD_AB          m4,  m5, alpha_d, beta_d
        LOAD_MASK        m0,  m1, m2, m3, m4, m5, m7, m6, m4
        pxor             m4,  m4
        movd             m6, [rsp + r4 + 8]
        punpcklwd        m6,  m6
        punpcklwd        m6,  m6
        psubw            m6, [pw_3]
        pmaxsw           m6,  m4
        pand             m7,  m6
        DEBLOCK_P0_Q0    m1,  m2, m0, m3, m7, m5, m6
        CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize]

        lea pix_q, [pix_q + (mmsize/2)*stride_q]
        lea r5,    [r5 +    (mmsize/2)*stride_q]
        add r4, (mmsize/4)
    jl .loop
RET

%endmacro

%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif