;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3
; expands to the 8 row addresses [base] .. [base+7*stride] (base3 = base+3*stride)
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro
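
; With the byte/word/dword suffixes used by TRANSPOSE4x8B_LOAD further down,
; the three butterfly stages above amount to an 8x4 -> 4x8 byte transpose;
; a rough scalar sketch:
;   // in: 8 rows of 4 bytes, out: 4 rows of 8 bytes (m0..m3)
;   for (r = 0; r < 4; r++)
;       for (c = 0; c < 8; c++)
;           out[r][c] = in[c][r];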

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = nonzero where |%1-%2| > %3 (bytewise, unsigned)
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro
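
; DIFF_GT works around the missing packed unsigned byte compare with
; saturating subtraction; a rough C sketch (sat_sub is an illustrative helper):
;   uint8_t absdiff = sat_sub(a, b) | sat_sub(b, a); // sat_sub(x,y) = x>y ? x-y : 0
;   uint8_t gt      = sat_sub(absdiff, c);           // nonzero iff |a-b| > c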

; out: %4 = 0xFF where |%1-%2| <= %3, 0x00 where |%1-%2| > %3
; (opposite polarity to DIFF_GT: pcmpeqb sets the mask where both saturated
; differences are zero after %3 is subtracted)
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
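
; m7 ends up 0xFF exactly where all three tests fail, which is the H.264
; filterSamplesFlag; roughly:
;   mask = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
; (the macro is handed alpha-1 and beta-1, so each "<=" test against them
; is equivalent to "<" against alpha/beta)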

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
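
; Reference sketch of what this computes per sample (the H.264 normal-filter
; p0/q0 update; clip and clip_uint8 are illustrative helpers):
;   int delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
;   p0' = clip_uint8(p0 + delta);
;   q0' = clip_uint8(q0 - delta);
; The pavgb chain accumulates the biased delta d+128+33 in m3; the two
; psubusb against pb_A1 (0xA1 = 128+33) split it into the positive (m3)
; and negative (m6) magnitudes, each clipped to tc&mask by pminub.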

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
    movd    m8, [r4] ; tc0
    lea     r4, [stride_q*3]
    dec     alpha_d        ; alpha-1
    neg     r4
    dec     beta_d        ; beta-1
    add     base3_q, pix_q     ; pix-3*stride

    mova    m0, [base3_q + stride_q]   ; p1
    mova    m1, [base3_q + 2*stride_q] ; p0
    mova    m2, [pix_q]      ; q0
    mova    m3, [pix_q + stride_q]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [base3_q] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4

    movdqa  m4, [pix_q + 2*stride_q] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [pix_q + stride_q]
    LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6

    DEBLOCK_P0_Q0
    mova    [base3_q + 2*stride_q], m1
    mova    [pix_q], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7,  r1d
    lea    r8,  [r7+r7*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro
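
; Note: there is no dedicated horizontal kernel above; deblock_h_luma_8
; transposes the 6x16 strip into a contiguous stride-16 scratch buffer,
; reuses deblock_v_luma_8 on it, then transposes back only the middle four
; rows (p1/p0/q0/q1), since those are the only ones the filter can modify.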

%macro DEBLOCK_H_LUMA_MBAFF 0

cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
    movsxd stride_q,   stride_d
    dec    alpha_d
    dec    beta_d
    mov    base3_q,    pix_q
    lea    stride3_q, [3*stride_q]
    add    base3_q,    stride3_q

    movq m0, [pix_q - 4]
    movq m1, [pix_q + stride_q - 4]
    movq m2, [pix_q + 2*stride_q - 4]
    movq m3, [base3_q - 4]
    movq m4, [base3_q + stride_q - 4]
    movq m5, [base3_q + 2*stride_q - 4]
    movq m6, [base3_q + stride3_q - 4]
    movq m7, [base3_q + 4*stride_q - 4]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7

    %assign i 0
    %rep 8
        movq [rsp + 16*i], m %+ i
        %assign i i+1
    %endrep

    ; p2 = m1 [rsp + 16]
    ; p1 = m2 [rsp + 32]
    ; p0 = m3 [rsp + 48]
    ; q0 = m4 [rsp + 64]
    ; q1 = m5 [rsp + 80]
    ; q2 = m6 [rsp + 96]

    SWAP 0, 2
    SWAP 1, 3
    SWAP 2, 4
    SWAP 3, 5

    LOAD_MASK alpha_d, beta_d
    movd m8, [tc0_q]
    punpcklbw m8, m8
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [rsp + 16] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4

    movdqa  m4, [rsp + 96] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [rsp + 80]
    LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6

    DEBLOCK_P0_Q0
    SWAP 1, 3
    SWAP 2, 4
    movq m0, [rsp]
    movq m1, [rsp + 16]
    movq m2, [rsp + 32]
    movq m5, [rsp + 80]
    movq m6, [rsp + 96]
    movq m7, [rsp + 112]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
    movq [pix_q - 4], m0
    movq [pix_q + stride_q - 4], m1
    movq [pix_q + 2*stride_q - 4], m2
    movq [base3_q - 4], m3
    movq [base3_q + stride_q - 4], m4
    movq [base3_q + 2*stride_q - 4], m5
    movq [base3_q + stride3_q - 4], m6
    movq [base3_q + 4*stride_q - 4], m7

RET

%endmacro

INIT_XMM sse2
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
%endif

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova   [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
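
; Reference sketch of the strong (intra) filter computed above for the p side
; (the q side is identical after LUMA_INTRA_SWAP_PQ); per sample, roughly:
;   if (|p2-p0| < beta && |p0-q0| < (alpha>>2) + 2) {   // mask1p
;       p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
;       p1' = (p2 + p1 + p0 + q0 + 2) >> 2;
;       p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
;   } else {
;       p0' = (2*p1 + p0 + q1 + 2) >> 2;                // mask0 only
;   }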

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7,  r1d
    lea    r8,  [r7*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif

%macro LOAD_8_ROWS 8
    movd m0, %1
    movd m1, %2
    movd m2, %3
    movd m3, %4
    movd m4, %5
    movd m5, %6
    movd m6, %7
    movd m7, %8
%endmacro

%macro STORE_8_ROWS 8
    movd %1, m0
    movd %2, m1
    movd %3, m2
    movd %4, m3
    movd %5, m4
    movd %6, m5
    movd %7, m6
    movd %8, m7
%endmacro

%macro TRANSPOSE_8x4B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
    punpcklwd m0, m2
    punpcklwd m4, m6
    punpckhdq m2, m0, m4
    punpckldq m0, m4
    MOVHL m1, m0
    MOVHL m3, m2
%endmacro

%macro TRANSPOSE_4x8B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpckhwd m4, m0, m2
    punpcklwd m0, m2
    MOVHL m6, m4
    MOVHL m2, m0
    pshufd m1, m0, 1
    pshufd m3, m2, 1
    pshufd m5, m4, 1
    pshufd m7, m6, 1
%endmacro

%macro CHROMA_INTER_BODY_XMM 1
    LOAD_MASK alpha_d, beta_d
    movd m6, [tc0_q]
    %rep %1
        punpcklbw m6, m6
    %endrep
    pand m7, m6
    DEBLOCK_P0_Q0
%endmacro

%macro CHROMA_INTRA_BODY_XMM 0
    LOAD_MASK alpha_d, beta_d
    mova    m5,  m1
    mova    m6,  m2
    pxor    m4,  m1, m3
    pand    m4, [pb_1]
    pavgb   m1,  m3
    psubusb m1,  m4
    pavgb   m1,  m0
    pxor    m4,  m2, m0
    pand    m4, [pb_1]
    pavgb   m2,  m0
    psubusb m2,  m4
    pavgb   m2,  m3
    psubb   m1,  m5
    psubb   m2,  m6
    pand    m1,  m7
    pand    m2,  m7
    paddb   m1,  m5
    paddb   m2,  m6
%endmacro
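
; Reference sketch of the chroma intra update above (applied only where the
; LOAD_MASK result m7 is set); per sample, roughly:
;   p0' = (2*p1 + p0 + q1 + 2) >> 2;
;   q0' = (2*q1 + q0 + p1 + 2) >> 2;
; built from two pavgb steps with a pb_1 parity correction on the first.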

%macro CHROMA_V_START_XMM 1
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    mov %1, pix_q
    sub %1, stride_q
    sub %1, stride_q
%endmacro

%macro CHROMA_H_START_XMM 2
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    lea %2, [3*stride_q]
    mov %1,  pix_q
    add %1,  %2
%endmacro

%macro DEBLOCK_CHROMA_XMM 1

INIT_XMM %1

cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_V_START_XMM r5
    movq m0, [r5]
    movq m1, [r5 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTER_BODY_XMM 1
    movq [r5 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 1
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)

    lea pix_q, [pix_q + 8*stride_q]
    lea r5,    [r5    + 8*stride_q]
    add tc0_q,  2

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
    CHROMA_V_START_XMM r4
    movq m0, [r4]
    movq m1, [r4 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTRA_BODY_XMM
    movq [r4 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)

    lea pix_q, [pix_q + 8*stride_q]
    lea r4,    [r4    + 8*stride_q]

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

%endmacro ; DEBLOCK_CHROMA_XMM

DEBLOCK_CHROMA_XMM sse2
DEBLOCK_CHROMA_XMM avx

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir,    int edges,    int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
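; Per 4x4 edge this fills bs[] with the usual H.264 boundary-strength
; decision; a rough sketch (mvy_limit is 4, or 2 in field mode, matching
; the pb_3/pb_3_1 constants loaded below):
;   if (nnz[b] || nnz[bn])
;       bs = 2;
;   else if (ref[b] != ref[bn] ||
;            abs(mv[b][0] - mv[bn][0]) >= 4 ||
;            abs(mv[b][1] - mv[bn][1]) >= mvy_limit)
;       bs = 1;
;   else
;       bs = 0;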
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd    %1
%define stepd     %2
%define mask_mvd  %3
%define dir       %4
%define d_idx     %5
%define mask_dir  %6
%define bidir     %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor             m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd             m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq        m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw           m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
    pshufw           m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
    pshufw           m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb            m0, m2                     ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb            m1, m3                     ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    pshufw           m1, m1, 0x4E
    por              m0, m1
    pshufw           m1, m0, 0x4E
    pminub           m0, m1
%else ; bidir == 0
    movd             m0, [refq+b_idxq+12]
    psubb            m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova             m1, [mvq+b_idxq*4+12*4]
    mova             m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw            m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb         m1, m2
    paddb            m1, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb         m1, m1
    por              m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd             m1, [nnzq+b_idxq+12]
    por              m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub           m1, m7
    pminub           m0, m7
    psllw            m1, 1
    pxor             m2, m2
    pmaxub           m1, m0
    punpcklbw        m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp    dword fieldm, 0
    mova             m7, [pb_1]
    mova             m5, [pb_3]
    je .nofield
    mova             m5, [pb_3_1]
.nofield:
    mova             m6, m5
    paddb            m5, m5

    shl     dword stepd, 3
    shl    dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp    dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET