;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to the 8 row addresses used by the transpose macros; the 4-arg
; base form (missing here) is restored so the 5-arg form has something to
; forward to
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro
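
; A scalar view of the two macros above (illustrative C sketch only, not
; assembled): both are 8x4 <-> 4x8 byte transposes built from punpck
; interleaves, where each level (bw -> wd -> dq) doubles the element size.
;
;   for (int i = 0; i < 8; i++)
;       for (int j = 0; j < 4; j++)
;           dst[j][i] = src[i][j];   // LOAD; STORE is the inverse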

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = nonzero where |%1-%2| > %3 (a saturated difference, not a 0xFF mask)
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = 0xFF where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
;      (inverted sense relative to DIFF_GT: all-ones marks bytes that pass)
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
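
; Both comparisons exploit unsigned saturating subtraction: for bytes,
; satsub(a,b) | satsub(b,a) == |a-b|.  Per-byte C sketch of the two
; outputs (illustrative reference only; function names are hypothetical):
;
;   static uint8_t diff_gt(uint8_t a, uint8_t b, uint8_t c)
;   {
;       unsigned d = a > b ? a - b : b - a; // |a-b|
;       return d > c ? d - c : 0;           // nonzero iff |a-b| > c
;   }
;   static uint8_t diff_gt2(uint8_t a, uint8_t b, uint8_t c)
;   {
;       // pcmpeqb of the two saturated differences: both clamp to zero
;       // exactly when |a-b| <= c, yielding an all-ones byte mask.
;       return (unsigned)(a > b ? a - b : b - a) <= c ? 0xFF : 0x00;
;   }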

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
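
; LOAD_MASK builds the standard per-pixel filter gate; scalar sketch
; (0xFF = the edge may be filtered):
;
;   static uint8_t filter_mask(uint8_t p1, uint8_t p0, uint8_t q0,
;                              uint8_t q1, int alpha, int beta)
;   {
;       return (abs(p0 - q0) < alpha &&
;               abs(p1 - p0) < beta  &&
;               abs(q1 - q0) < beta) ? 0xFF : 0x00;
;   }
;
; The asm works with alpha-1/beta-1 so the strict '<' becomes the
; saturating '<=' test performed by DIFF_GT.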

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
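
; What this computes, in scalar form (reference sketch using FFmpeg's
; av_clip helpers): the normal-filter p0/q0 update,
;
;   int delta = av_clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
;   p0 = av_clip_uint8(p0 + delta);
;   q0 = av_clip_uint8(q0 - delta);
;
; The pavgb chain builds delta biased by 128+33 (hence pb_A1 = 0xA1) so
; the clip against tc reduces to two pminub/psubusb pairs.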

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
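
; Scalar reference for LUMA_Q1 (sketch):
;
;   int avg = (p0 + q0 + 1) >> 1;
;   q1 = av_clip((q2 + avg) >> 1, q1 - tc0, q1 + tc0);
;
; The pxor/pand/psubusb triple removes the double round-up of the two
; pavgb steps; the clip is done with pmaxub/pminub against q1 -/+ tc0.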

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
    movd    m8, [r4] ; tc0
    lea     r4, [stride_q*3]
    dec     alpha_d        ; alpha-1
    neg     r4
    dec     beta_d         ; beta-1
    add     base3_q, pix_q ; pix-3*stride

    mova    m0, [base3_q + stride_q]   ; p1
    mova    m1, [base3_q + 2*stride_q] ; p0
    mova    m2, [pix_q]                ; q0
    mova    m3, [pix_q + stride_q]     ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [base3_q] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4

    movdqa  m4, [pix_q + 2*stride_q] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [pix_q + stride_q]
    LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6

    DEBLOCK_P0_Q0
    mova    [base3_q + 2*stride_q], m1
    mova    [pix_q], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7,  r1d
    lea    r8,  [r7+r7*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro
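
; The horizontal luma filter above reuses the vertical one: the block of
; columns is transposed into a 16-byte-stride scratch buffer, filtered
; with deblock_v_luma_8 (stride 0x10), and the four modified middle rows
; are transposed back.  In pseudo-C (sketch; helper names hypothetical):
;
;   transpose_to_tmp(tmp, pix - 4, stride);          // 6x16 is enough
;   ff_deblock_v_luma(tmp + 3*16, 16, alpha, beta, tc0);
;   transpose_back(pix - 2, tmp + 16, stride);       // rows p1..q1 only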

%macro DEBLOCK_H_LUMA_MBAFF 0

cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
    movsxd stride_q,   stride_d
    dec    alpha_d
    dec    beta_d
    mov    base3_q,    pix_q
    lea    stride3_q, [3*stride_q]
    add    base3_q,    stride3_q

    movq m0, [pix_q - 4]
    movq m1, [pix_q + stride_q - 4]
    movq m2, [pix_q + 2*stride_q - 4]
    movq m3, [base3_q - 4]
    movq m4, [base3_q + stride_q - 4]
    movq m5, [base3_q + 2*stride_q - 4]
    movq m6, [base3_q + stride3_q - 4]
    movq m7, [base3_q + 4*stride_q - 4]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7

    %assign i 0
    %rep 8
        movq [rsp + 16*i], m %+ i
        %assign i i+1
    %endrep

    ; p2 = m1 [rsp + 16]
    ; p1 = m2 [rsp + 32]
    ; p0 = m3 [rsp + 48]
    ; q0 = m4 [rsp + 64]
    ; q1 = m5 [rsp + 80]
    ; q2 = m6 [rsp + 96]

    SWAP 0, 2
    SWAP 1, 3
    SWAP 2, 4
    SWAP 3, 5

    LOAD_MASK alpha_d, beta_d
    movd m8, [tc0_q]
    punpcklbw m8, m8
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [rsp + 16] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4

    movdqa  m4, [rsp + 96] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [rsp + 80]
    LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6

    DEBLOCK_P0_Q0
    SWAP 1, 3
    SWAP 2, 4
    movq m0, [rsp]
    movq m1, [rsp + 16]
    movq m2, [rsp + 32]
    movq m5, [rsp + 80]
    movq m6, [rsp + 96]
    movq m7, [rsp + 112]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
    movq [pix_q - 4], m0
    movq [pix_q + stride_q - 4], m1
    movq [pix_q + 2*stride_q - 4], m2
    movq [base3_q - 4], m3
    movq [base3_q + stride_q - 4], m4
    movq [base3_q + 2*stride_q - 4], m5
    movq [base3_q + stride3_q - 4], m6
    movq [base3_q + 4*stride_q - 4], m7

RET

%endmacro

INIT_XMM sse2
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
%endif

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova   [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
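
; Scalar reference for LUMA_INTRA_P012 (sketch; the new values are
; computed from the original samples, hence the temporaries):
;
;   if (strong) {  // |p2-p0| < beta, and |p0-q0| < (alpha >> 2) + 2
;       int p0n = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
;       int p1n = (p2 + p1 + p0 + q0 + 2) >> 2;
;       int p2n = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
;       p0 = p0n; p1 = p1n; p2 = p2n;
;   } else {
;       p0 = (2*p1 + p0 + q1 + 2) >> 2;   // p1, p2 unchanged
;   }
;
; mask0/mask1p select between the two paths per pixel via pxor/pand.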

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7,  r1d
    lea    r8,  [r7*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub   rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add   rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret

%define t5 r4
%define t6 r5

cglobal deblock_h_chroma422_8, 5, 6
    SUB rsp, (1+ARCH_X86_64*2)*mmsize
    %if ARCH_X86_64
        %define buf0 [rsp+16]
        %define buf1 [rsp+8]
    %else
        %define buf0 r0m
        %define buf1 r2m
    %endif

    movd m6, [r4]
    punpcklbw m6, m6
    movq [rsp], m6
    CHROMA_H_START

    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
    movq buf0, m0
    movq buf1, m3
    LOAD_MASK r2d, r3d
    movd m6, [rsp]
    punpcklwd m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    movq m0, buf0
    movq m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)

    lea r0, [r0+r1*8]
    lea t5, [t5+r1*8]

    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
    movq buf0, m0
    movq buf1, m3
    LOAD_MASK r2d, r3d
    movd m6, [rsp+4]
    punpcklwd m6, m6
    pand m7, m6
    DEBLOCK_P0_Q0
    movq m0, buf0
    movq m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    ADD rsp, (1+ARCH_X86_64*2)*mmsize
RET

; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
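
; How CHROMA_INTRA_P0 matches the spec expression exactly (sketch):
;
;   int a = (p0 + q1 + 1) >> 1;   // pavgb rounds up
;   a    -= (p0 ^ q1) & 1;        // back to the floored average
;   p0    = (p1 + a + 1) >> 1;    // == (p0 + q1 + 2*p1 + 2) >> 2
;
; Subtracting the xor bit removes the first round-up, so the two averages
; never accumulate rounding error.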

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

cglobal deblock_h_chroma422_intra_8, 4, 6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)

    lea r0, [r0+r1*8]
    lea t5, [t5+r1*8]

    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret

%macro LOAD_8_ROWS 8
    movd m0, %1
    movd m1, %2
    movd m2, %3
    movd m3, %4
    movd m4, %5
    movd m5, %6
    movd m6, %7
    movd m7, %8
%endmacro

%macro STORE_8_ROWS 8
    movd %1, m0
    movd %2, m1
    movd %3, m2
    movd %4, m3
    movd %5, m4
    movd %6, m5
    movd %7, m6
    movd %8, m7
%endmacro

%macro TRANSPOSE_8x4B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
    punpcklwd m0, m2
    punpcklwd m4, m6
    punpckhdq m2, m0, m4
    punpckldq m0, m4
    MOVHL m1, m0
    MOVHL m3, m2
%endmacro

%macro TRANSPOSE_4x8B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpckhwd m4, m0, m2
    punpcklwd m0, m2
    MOVHL m6, m4
    MOVHL m2, m0
    pshufd m1, m0, 1
    pshufd m3, m2, 1
    pshufd m5, m4, 1
    pshufd m7, m6, 1
%endmacro

%macro CHROMA_INTER_BODY_XMM 1
    LOAD_MASK alpha_d, beta_d
    movd m6, [tc0_q]
    %rep %1
        punpcklbw m6, m6
    %endrep
    pand m7, m6
    DEBLOCK_P0_Q0
%endmacro

%macro CHROMA_INTRA_BODY_XMM 0
    LOAD_MASK alpha_d, beta_d
    mova    m5,  m1
    mova    m6,  m2
    pxor    m4,  m1, m3
    pand    m4, [pb_1]
    pavgb   m1,  m3
    psubusb m1,  m4
    pavgb   m1,  m0
    pxor    m4,  m2, m0
    pand    m4, [pb_1]
    pavgb   m2,  m0
    psubusb m2,  m4
    pavgb   m2,  m3
    psubb   m1,  m5
    psubb   m2,  m6
    pand    m1,  m7
    pand    m2,  m7
    paddb   m1,  m5
    paddb   m2,  m6
%endmacro
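
; CHROMA_INTRA_BODY_XMM is the same filter as ff_chroma_intra_body_mmxext
; above, in scalar terms (applied only where the LOAD_MASK gate is set):
;
;   p0 = (2*p1 + p0 + q1 + 2) >> 2;
;   q0 = (2*q1 + q0 + p1 + 2) >> 2;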

%macro CHROMA_V_START_XMM 1
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    mov %1, pix_q
    sub %1, stride_q
    sub %1, stride_q
%endmacro

%macro CHROMA_H_START_XMM 2
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    lea %2, [3*stride_q]
    mov %1,  pix_q
    add %1,  %2
%endmacro

%macro DEBLOCK_CHROMA_XMM 1

INIT_XMM %1

cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_V_START_XMM r5
    movq m0, [r5]
    movq m1, [r5 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTER_BODY_XMM 1
    movq [r5 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 1
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)

    lea pix_q, [pix_q + 8*stride_q]
    lea r5,    [r5    + 8*stride_q]
    add tc0_q,  2

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
    CHROMA_V_START_XMM r4
    movq m0, [r4]
    movq m1, [r4 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTRA_BODY_XMM
    movq [r4 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)

    lea pix_q, [pix_q + 8*stride_q]
    lea r4,    [r4    + 8*stride_q]

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

%endmacro ; DEBLOCK_CHROMA_XMM

DEBLOCK_CHROMA_XMM sse2
DEBLOCK_CHROMA_XMM avx

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir,    int edges,    int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
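
; Per 4x4 edge, the boundary strength computed below is, in scalar form
; (sketch; the exact mv limits follow the constants loaded into m5/m6 from
; pb_3 / pb_3_1, and the helper names here are hypothetical):
;
;   int bs(int b, int bn) {
;       if (nnz[b] | nnz[bn])
;           return 2;                   // either block has coefficients
;       return refs_differ(b, bn) ||    // different reference pictures
;              mv_delta_ge(b, bn);      // any |mv diff| >= 4 (qpel units);
;                                       // >= 2 vertically in field mode
;   }
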
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd    %1
%define stepd     %2
%define mask_mvd  %3
%define dir       %4
%define d_idx     %5
%define mask_dir  %6
%define bidir     %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor             m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd             m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq        m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw           m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
    pshufw           m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
    pshufw           m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb            m0, m2                     ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb            m1, m3                     ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    pshufw           m1, m1, 0x4E
    por              m0, m1
    pshufw           m1, m0, 0x4E
    pminub           m0, m1
%else ; bidir == 0
    movd             m0, [refq+b_idxq+12]
    psubb            m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova             m1, [mvq+b_idxq*4+12*4]
    mova             m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw            m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb         m1, m2
    paddb            m1, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb         m1, m1
    por              m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd             m1, [nnzq+b_idxq+12]
    por              m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub           m1, m7
    pminub           m0, m7
    psllw            m1, 1
    pxor             m2, m2
    pmaxub           m1, m0
    punpcklbw        m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp    dword fieldm, 0
    mova             m7, [pb_1]
    mova             m5, [pb_3]
    je .nofield
    mova             m5, [pb_3_1]
.nofield:
    mova             m6, m5
    paddb            m5, m5

    shl     dword stepd, 3
    shl    dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp    dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET