• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
13SECTION_RODATA
; Rounding term for the bilinear filters: eight words of 8, added (as
; filter_rnd) before the filtered sums are shifted right by 4.
14pw_8: times  8 dw  8
; Bilinear tap table used when SSSE3 is not available. Eight entries, each
; made of two rows of eight words: the first row is the tap applied to the
; first sample (loaded as filter_*_a from entry+0), the second row the tap
; applied to the second sample (filter_*_b from entry+16). The two taps of
; every entry add up to 16. One entry is 32 bytes, which is why
; filter_idx_shift is 5 for this table.
; NOTE(review): only scaled offsets 0..7 index inside these 8 entries, while
; the dispatch code in SUBPEL_VARIANCE treats an offset of 8 as the half-pel
; case - confirm the offset range callers actually pass.
15bilin_filter_m_sse2: times  8 dw 16
16                     times  8 dw  0
17                     times  8 dw 14
18                     times  8 dw  2
19                     times  8 dw 12
20                     times  8 dw  4
21                     times  8 dw 10
22                     times  8 dw  6
23                     times 16 dw  8 ; both rows of the (8, 8) entry at once
24                     times  8 dw  6
25                     times  8 dw 10
26                     times  8 dw  4
27                     times  8 dw 12
28                     times  8 dw  2
29                     times  8 dw 14
30
; Same eight tap pairs for the SSSE3 path, stored as interleaved bytes
; (tap_a, tap_b repeated eight times) so that a single pmaddubsw on
; byte-interleaved samples applies both taps. One entry is 16 bytes, which is
; why filter_idx_shift is 4 for this table.
31bilin_filter_m_ssse3: times  8 db 16,  0
32                      times  8 db 14,  2
33                      times  8 db 12,  4
34                      times  8 db 10,  6
35                      times 16 db  8 ; the (8, 8) entry
36                      times  8 db  6, 10
37                      times  8 db  4, 12
38                      times  8 db  2, 14
39
40SECTION .text
41
42; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
43;                               int x_offset, int y_offset,
44;                               const uint8_t *dst, ptrdiff_t dst_stride,
45;                               int height, unsigned int *sse);
46;
47; This function returns the SE (sum of the src - dst differences) and stores
; the SSE (sum of the squared differences) through the given sse pointer.
48
49%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
; Accumulates difference statistics for two registers of 16-bit samples:
;   %5 (sum) += (%1 - %2) + (%3 - %4)        as packed words
;   %6 (sse) += squares of both differences, as packed dwords (pmaddwd adds
;               each adjacent pair of squared words into one dword)
; %1 and %3 are destroyed (they end up holding the pmaddwd results); %2 and
; %4 are only read. The two independent chains are interleaved.
50  psubw                %3, %4           ; %3 = src2 - dst2
51  psubw                %1, %2           ; %1 = src1 - dst1
52  paddw                %5, %3           ; sum += diff2 (before it is squared)
53  pmaddwd              %3, %3           ; %3 = pairwise sums of diff2^2
54  paddw                %5, %1           ; sum += diff1 (before it is squared)
55  pmaddwd              %1, %1           ; %1 = pairwise sums of diff1^2
56  paddd                %6, %3           ; sse += squared diff2
57  paddd                %6, %1           ; sse += squared diff1
58%endmacro
59
60%macro STORE_AND_RET 0
; Horizontally reduces the two accumulators, writes the results and returns.
; Expected on entry:
;   m6 = packed signed word partial sums
;   m7 = packed dword partial SSE
;   m5 = all zero, so that the pcmpgtw below produces the sign mask of m6
; On exit the total SSE has been stored through the pointer read from ssem
; and the total sum is in raxd. Overwrites m3-m7 and r1.
61%if mmsize == 16
62  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
63  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
64  ; We have to sign-extend it before adding the words within the register
65  ; and outputting to a dword.
66  pcmpgtw              m5, m6           ; mask for 0 > x
67  movhlps              m3, m7           ; m3 = upper two sse dwords
68  punpcklwd            m4, m6, m5       ; m4 = low 4 sums sign-extended to dwords
69  punpckhwd            m6, m5           ; sign-extend m6 word->dword
70  paddd                m7, m3           ; sse: fold upper half onto lower half
71  paddd                m6, m4           ; sum: 8 values -> 4 dwords
72  pshufd               m3, m7, 0x1      ; bring sse dword 1 down to lane 0
73  movhlps              m4, m6           ; m4 = upper two sum dwords
74  paddd                m7, m3           ; lane 0 of m7 = total sse
75  paddd                m6, m4           ; sum: 4 dwords -> 2 dwords
76  mov                  r1, ssem         ; r1 = unsigned int *sse
77  pshufd               m4, m6, 0x1      ; bring sum dword 1 down to lane 0
78  movd               [r1], m7           ; store sse
79  paddd                m6, m4           ; lane 0 of m6 = total sum
80  movd               raxd, m6           ; store sum as return value
81%else ; mmsize == 8
82  pshufw               m4, m6, 0xe      ; upper two sum words -> low two words
83  pshufw               m3, m7, 0xe      ; upper sse dword -> low dword
84  paddw                m6, m4           ; sum: 4 words -> 2 words (still 16-bit)
85  paddd                m7, m3           ; low dword of m7 = total sse
86  pcmpgtw              m5, m6           ; mask for 0 > x
87  mov                  r1, ssem         ; r1 = unsigned int *sse
88  punpcklwd            m6, m5           ; sign-extend m6 word->dword
89  movd               [r1], m7           ; store sse
90  pshufw               m4, m6, 0xe      ; upper sum dword -> low dword
91  paddd                m6, m4           ; low dword of m6 = total sum
92  movd               raxd, m6           ; store sum as return value
93%endif
94  RET
95%endmacro
96
97%macro INC_SRC_BY_SRC_STRIDE  0
; Advances srcq by src_stride, i.e. moves the source pointer down one row.
; On 32-bit PIC builds (ARCH_X86=1 && CONFIG_PIC=1) the stride is taken from
; the src_stridemp operand instead of the src_strideq register.
; NOTE(review): presumably the stride register is not kept live in that
; configuration because registers are needed for the GOT-relative table
; pointers - confirm against the callers of this macro.
98%if ARCH_X86=1 && CONFIG_PIC=1
99  add                srcq, src_stridemp
100%else
101  add                srcq, src_strideq
102%endif
103%endmacro
104
105%macro SUBPEL_VARIANCE 1-2 0 ; W
106%if cpuflag(ssse3)
107%define bilin_filter_m bilin_filter_m_ssse3
108%define filter_idx_shift 4
109%else
110%define bilin_filter_m bilin_filter_m_sse2
111%define filter_idx_shift 5
112%endif
113; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
114; 11, not 13, if the registers are ordered correctly. May make a minor speed
115; difference on Win64
116
117%ifdef PIC    ; 64bit PIC
118  %if %2 == 1 ; avg
119    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
120                                      x_offset, y_offset, \
121                                      dst, dst_stride, \
122                                      sec, sec_stride, height, sse
123    %define sec_str sec_strideq
124  %else
125    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
126                                  y_offset, dst, dst_stride, height, sse
127  %endif
128  %define block_height heightd
129  %define bilin_filter sseq
130%else
131  %if ARCH_X86=1 && CONFIG_PIC=1
132    %if %2 == 1 ; avg
133      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
134                                  x_offset, y_offset, \
135                                  dst, dst_stride, \
136                                  sec, sec_stride, \
137                                  height, sse, g_bilin_filter, g_pw_8
138      %define block_height dword heightm
139      %define sec_str sec_stridemp
140
141      ;Store bilin_filter and pw_8 location in stack
142      GET_GOT eax
143      add esp, 4                ; restore esp
144
145      lea ecx, [GLOBAL(bilin_filter_m)]
146      mov g_bilin_filterm, ecx
147
148      lea ecx, [GLOBAL(pw_8)]
149      mov g_pw_8m, ecx
150
151      LOAD_IF_USED 0, 1         ; load eax, ecx back
152    %else
153      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
154                                y_offset, dst, dst_stride, height, sse, \
155                                g_bilin_filter, g_pw_8
156      %define block_height heightd
157
158      ;Store bilin_filter and pw_8 location in stack
159      GET_GOT eax
160      add esp, 4                ; restore esp
161
162      lea ecx, [GLOBAL(bilin_filter_m)]
163      mov g_bilin_filterm, ecx
164
165      lea ecx, [GLOBAL(pw_8)]
166      mov g_pw_8m, ecx
167
168      LOAD_IF_USED 0, 1         ; load eax, ecx back
169    %endif
170  %else
171    %if %2 == 1 ; avg
172      cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
173                        7 + 2 * ARCH_X86_64, 13, src, src_stride, \
174                                             x_offset, y_offset, \
175                                             dst, dst_stride, \
176                                             sec, sec_stride, \
177                                             height, sse
178      %if ARCH_X86_64
179      %define block_height heightd
180      %define sec_str sec_strideq
181      %else
182      %define block_height dword heightm
183      %define sec_str sec_stridemp
184      %endif
185    %else
186      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
187                              y_offset, dst, dst_stride, height, sse
188      %define block_height heightd
189    %endif
190
191    %define bilin_filter bilin_filter_m
192  %endif
193%endif
194
195  ASSERT               %1 <= 16         ; m6 overflows if w > 16
196  pxor                 m6, m6           ; sum
197  pxor                 m7, m7           ; sse
198  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
199  ; could perhaps use it for something more productive then
200  pxor                 m5, m5           ; dedicated zero register
201%if %1 < 16
202  sar                   block_height, 1
203%if %2 == 1 ; avg
204  shl             sec_str, 1
205%endif
206%endif
207
208  ; FIXME(rbultje) replace by jumptable?
209  test          x_offsetd, x_offsetd
210  jnz .x_nonzero
211  ; x_offset == 0
212  test          y_offsetd, y_offsetd
213  jnz .x_zero_y_nonzero
214
215  ; x_offset == 0 && y_offset == 0
216.x_zero_y_zero_loop:
217%if %1 == 16
218  movu                 m0, [srcq]
219  mova                 m1, [dstq]
220%if %2 == 1 ; avg
221  pavgb                m0, [secq]
222  punpckhbw            m3, m1, m5
223  punpcklbw            m1, m5
224%endif
225  punpckhbw            m2, m0, m5
226  punpcklbw            m0, m5
227%if %2 == 0 ; !avg
228  punpckhbw            m3, m1, m5
229  punpcklbw            m1, m5
230%endif
231  SUM_SSE              m0, m1, m2, m3, m6, m7
232
233  add                srcq, src_strideq
234  add                dstq, dst_strideq
235%else ; %1 < 16
236  movh                 m0, [srcq]
237%if %2 == 1 ; avg
238%if mmsize == 16
239  movhps               m0, [srcq+src_strideq]
240%else ; mmsize == 8
241  punpckldq            m0, [srcq+src_strideq]
242%endif
243%else ; !avg
244  movh                 m2, [srcq+src_strideq]
245%endif
246  movh                 m1, [dstq]
247  movh                 m3, [dstq+dst_strideq]
248%if %2 == 1 ; avg
249  pavgb                m0, [secq]
250  punpcklbw            m3, m5
251  punpcklbw            m1, m5
252  punpckhbw            m2, m0, m5
253  punpcklbw            m0, m5
254%else ; !avg
255  punpcklbw            m0, m5
256  punpcklbw            m2, m5
257  punpcklbw            m3, m5
258  punpcklbw            m1, m5
259%endif
260  SUM_SSE              m0, m1, m2, m3, m6, m7
261
262  lea                srcq, [srcq+src_strideq*2]
263  lea                dstq, [dstq+dst_strideq*2]
264%endif
265%if %2 == 1 ; avg
266  add                secq, sec_str
267%endif
268  dec                   block_height
269  jg .x_zero_y_zero_loop
270  STORE_AND_RET
271
272.x_zero_y_nonzero:
273  cmp           y_offsetd, 8
274  jne .x_zero_y_nonhalf
275
276  ; x_offset == 0 && y_offset == 0.5
277.x_zero_y_half_loop:
278%if %1 == 16
279  movu                 m0, [srcq]
280  movu                 m4, [srcq+src_strideq]
281  mova                 m1, [dstq]
282  pavgb                m0, m4
283  punpckhbw            m3, m1, m5
284%if %2 == 1 ; avg
285  pavgb                m0, [secq]
286%endif
287  punpcklbw            m1, m5
288  punpckhbw            m2, m0, m5
289  punpcklbw            m0, m5
290  SUM_SSE              m0, m1, m2, m3, m6, m7
291
292  add                srcq, src_strideq
293  add                dstq, dst_strideq
294%else ; %1 < 16
295  movh                 m0, [srcq]
296  movh                 m2, [srcq+src_strideq]
297%if %2 == 1 ; avg
298%if mmsize == 16
299  movhps               m2, [srcq+src_strideq*2]
300%else ; mmsize == 8
301%if %1 == 4
302  movh                 m1, [srcq+src_strideq*2]
303  punpckldq            m2, m1
304%else
305  punpckldq            m2, [srcq+src_strideq*2]
306%endif
307%endif
308  movh                 m1, [dstq]
309%if mmsize == 16
310  movlhps              m0, m2
311%else ; mmsize == 8
312  punpckldq            m0, m2
313%endif
314  movh                 m3, [dstq+dst_strideq]
315  pavgb                m0, m2
316  punpcklbw            m1, m5
317  pavgb                m0, [secq]
318  punpcklbw            m3, m5
319  punpckhbw            m2, m0, m5
320  punpcklbw            m0, m5
321%else ; !avg
322  movh                 m4, [srcq+src_strideq*2]
323  movh                 m1, [dstq]
324  pavgb                m0, m2
325  movh                 m3, [dstq+dst_strideq]
326  pavgb                m2, m4
327  punpcklbw            m0, m5
328  punpcklbw            m2, m5
329  punpcklbw            m3, m5
330  punpcklbw            m1, m5
331%endif
332  SUM_SSE              m0, m1, m2, m3, m6, m7
333
334  lea                srcq, [srcq+src_strideq*2]
335  lea                dstq, [dstq+dst_strideq*2]
336%endif
337%if %2 == 1 ; avg
338  add                secq, sec_str
339%endif
340  dec                   block_height
341  jg .x_zero_y_half_loop
342  STORE_AND_RET
343
344.x_zero_y_nonhalf:
345  ; x_offset == 0 && y_offset == bilin interpolation
346%ifdef PIC
347  lea        bilin_filter, [bilin_filter_m]
348%endif
349  shl           y_offsetd, filter_idx_shift
350%if ARCH_X86_64 && mmsize == 16
351  mova                 m8, [bilin_filter+y_offsetq]
352%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
353  mova                 m9, [bilin_filter+y_offsetq+16]
354%endif
355  mova                m10, [pw_8]
356%define filter_y_a m8
357%define filter_y_b m9
358%define filter_rnd m10
359%else ; x86-32 or mmx
360%if ARCH_X86=1 && CONFIG_PIC=1
361; x_offset == 0, reuse x_offset reg
362%define tempq x_offsetq
363  add y_offsetq, g_bilin_filterm
364%define filter_y_a [y_offsetq]
365%define filter_y_b [y_offsetq+16]
366  mov tempq, g_pw_8m
367%define filter_rnd [tempq]
368%else
369  add           y_offsetq, bilin_filter
370%define filter_y_a [y_offsetq]
371%define filter_y_b [y_offsetq+16]
372%define filter_rnd [pw_8]
373%endif
374%endif
375
376.x_zero_y_other_loop:
377%if %1 == 16
378  movu                 m0, [srcq]
379  movu                 m4, [srcq+src_strideq]
380  mova                 m1, [dstq]
381%if cpuflag(ssse3)
382  punpckhbw            m2, m0, m4
383  punpcklbw            m0, m4
384  pmaddubsw            m2, filter_y_a
385  pmaddubsw            m0, filter_y_a
386  paddw                m2, filter_rnd
387  paddw                m0, filter_rnd
388%else
389  punpckhbw            m2, m0, m5
390  punpckhbw            m3, m4, m5
391  punpcklbw            m0, m5
392  punpcklbw            m4, m5
393  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
394  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
395  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
396  ; slightly faster because of pmullw latency. It would also cut our rodata
397  ; tables in half for this function, and save 1-2 registers on x86-64.
398  pmullw               m2, filter_y_a
399  pmullw               m3, filter_y_b
400  paddw                m2, filter_rnd
401  pmullw               m0, filter_y_a
402  pmullw               m4, filter_y_b
403  paddw                m0, filter_rnd
404  paddw                m2, m3
405  paddw                m0, m4
406%endif
407  psraw                m2, 4
408  psraw                m0, 4
409%if %2 == 1 ; avg
410  ; FIXME(rbultje) pipeline
411  packuswb             m0, m2
412  pavgb                m0, [secq]
413  punpckhbw            m2, m0, m5
414  punpcklbw            m0, m5
415%endif
416  punpckhbw            m3, m1, m5
417  punpcklbw            m1, m5
418  SUM_SSE              m0, m1, m2, m3, m6, m7
419
420  add                srcq, src_strideq
421  add                dstq, dst_strideq
422%else ; %1 < 16
423  movh                 m0, [srcq]
424  movh                 m2, [srcq+src_strideq]
425  movh                 m4, [srcq+src_strideq*2]
426  movh                 m3, [dstq+dst_strideq]
427%if cpuflag(ssse3)
428  movh                 m1, [dstq]
429  punpcklbw            m0, m2
430  punpcklbw            m2, m4
431  pmaddubsw            m0, filter_y_a
432  pmaddubsw            m2, filter_y_a
433  punpcklbw            m3, m5
434  paddw                m2, filter_rnd
435  paddw                m0, filter_rnd
436%else
437  punpcklbw            m0, m5
438  punpcklbw            m2, m5
439  punpcklbw            m4, m5
440  pmullw               m0, filter_y_a
441  pmullw               m1, m2, filter_y_b
442  punpcklbw            m3, m5
443  paddw                m0, filter_rnd
444  pmullw               m2, filter_y_a
445  pmullw               m4, filter_y_b
446  paddw                m0, m1
447  paddw                m2, filter_rnd
448  movh                 m1, [dstq]
449  paddw                m2, m4
450%endif
451  psraw                m0, 4
452  psraw                m2, 4
453%if %2 == 1 ; avg
454  ; FIXME(rbultje) pipeline
455  packuswb             m0, m2
456  pavgb                m0, [secq]
457  punpckhbw            m2, m0, m5
458  punpcklbw            m0, m5
459%endif
460  punpcklbw            m1, m5
461  SUM_SSE              m0, m1, m2, m3, m6, m7
462
463  lea                srcq, [srcq+src_strideq*2]
464  lea                dstq, [dstq+dst_strideq*2]
465%endif
466%if %2 == 1 ; avg
467  add                secq, sec_str
468%endif
469  dec                   block_height
470  jg .x_zero_y_other_loop
471%undef filter_y_a
472%undef filter_y_b
473%undef filter_rnd
474  STORE_AND_RET
475
476.x_nonzero:
477  cmp           x_offsetd, 8
478  jne .x_nonhalf
479  ; x_offset == 0.5
480  test          y_offsetd, y_offsetd
481  jnz .x_half_y_nonzero
482
483  ; x_offset == 0.5 && y_offset == 0
484.x_half_y_zero_loop:
485%if %1 == 16
486  movu                 m0, [srcq]
487  movu                 m4, [srcq+1]
488  mova                 m1, [dstq]
489  pavgb                m0, m4
490  punpckhbw            m3, m1, m5
491%if %2 == 1 ; avg
492  pavgb                m0, [secq]
493%endif
494  punpcklbw            m1, m5
495  punpckhbw            m2, m0, m5
496  punpcklbw            m0, m5
497  SUM_SSE              m0, m1, m2, m3, m6, m7
498
499  add                srcq, src_strideq
500  add                dstq, dst_strideq
501%else ; %1 < 16
502  movh                 m0, [srcq]
503  movh                 m4, [srcq+1]
504%if %2 == 1 ; avg
505%if mmsize == 16
506  movhps               m0, [srcq+src_strideq]
507  movhps               m4, [srcq+src_strideq+1]
508%else ; mmsize == 8
509  punpckldq            m0, [srcq+src_strideq]
510  punpckldq            m4, [srcq+src_strideq+1]
511%endif
512  movh                 m1, [dstq]
513  movh                 m3, [dstq+dst_strideq]
514  pavgb                m0, m4
515  punpcklbw            m3, m5
516  pavgb                m0, [secq]
517  punpcklbw            m1, m5
518  punpckhbw            m2, m0, m5
519  punpcklbw            m0, m5
520%else ; !avg
521  movh                 m2, [srcq+src_strideq]
522  movh                 m1, [dstq]
523  pavgb                m0, m4
524  movh                 m4, [srcq+src_strideq+1]
525  movh                 m3, [dstq+dst_strideq]
526  pavgb                m2, m4
527  punpcklbw            m0, m5
528  punpcklbw            m2, m5
529  punpcklbw            m3, m5
530  punpcklbw            m1, m5
531%endif
532  SUM_SSE              m0, m1, m2, m3, m6, m7
533
534  lea                srcq, [srcq+src_strideq*2]
535  lea                dstq, [dstq+dst_strideq*2]
536%endif
537%if %2 == 1 ; avg
538  add                secq, sec_str
539%endif
540  dec                   block_height
541  jg .x_half_y_zero_loop
542  STORE_AND_RET
543
544.x_half_y_nonzero:
545  cmp           y_offsetd, 8
546  jne .x_half_y_nonhalf
547
548  ; x_offset == 0.5 && y_offset == 0.5
549%if %1 == 16
550  movu                 m0, [srcq]
551  movu                 m3, [srcq+1]
552  add                srcq, src_strideq
553  pavgb                m0, m3
554.x_half_y_half_loop:
555  movu                 m4, [srcq]
556  movu                 m3, [srcq+1]
557  mova                 m1, [dstq]
558  pavgb                m4, m3
559  punpckhbw            m3, m1, m5
560  pavgb                m0, m4
561%if %2 == 1 ; avg
562  punpcklbw            m1, m5
563  pavgb                m0, [secq]
564  punpckhbw            m2, m0, m5
565  punpcklbw            m0, m5
566%else
567  punpckhbw            m2, m0, m5
568  punpcklbw            m0, m5
569  punpcklbw            m1, m5
570%endif
571  SUM_SSE              m0, m1, m2, m3, m6, m7
572  mova                 m0, m4
573
574  add                srcq, src_strideq
575  add                dstq, dst_strideq
576%else ; %1 < 16
577  movh                 m0, [srcq]
578  movh                 m3, [srcq+1]
579  add                srcq, src_strideq
580  pavgb                m0, m3
581.x_half_y_half_loop:
582  movh                 m2, [srcq]
583  movh                 m3, [srcq+1]
584%if %2 == 1 ; avg
585%if mmsize == 16
586  movhps               m2, [srcq+src_strideq]
587  movhps               m3, [srcq+src_strideq+1]
588%else
589%if %1 == 4
590  movh                 m1, [srcq+src_strideq]
591  punpckldq            m2, m1
592  movh                 m1, [srcq+src_strideq+1]
593  punpckldq            m3, m1
594%else
595  punpckldq            m2, [srcq+src_strideq]
596  punpckldq            m3, [srcq+src_strideq+1]
597%endif
598%endif
599  pavgb                m2, m3
600%if mmsize == 16
601  movlhps              m0, m2
602  movhlps              m4, m2
603%else ; mmsize == 8
604  punpckldq            m0, m2
605  pshufw               m4, m2, 0xe
606%endif
607  movh                 m1, [dstq]
608  pavgb                m0, m2
609  movh                 m3, [dstq+dst_strideq]
610  pavgb                m0, [secq]
611  punpcklbw            m3, m5
612  punpcklbw            m1, m5
613  punpckhbw            m2, m0, m5
614  punpcklbw            m0, m5
615%else ; !avg
616  movh                 m4, [srcq+src_strideq]
617  movh                 m1, [srcq+src_strideq+1]
618  pavgb                m2, m3
619  pavgb                m4, m1
620  pavgb                m0, m2
621  pavgb                m2, m4
622  movh                 m1, [dstq]
623  movh                 m3, [dstq+dst_strideq]
624  punpcklbw            m0, m5
625  punpcklbw            m2, m5
626  punpcklbw            m3, m5
627  punpcklbw            m1, m5
628%endif
629  SUM_SSE              m0, m1, m2, m3, m6, m7
630  mova                 m0, m4
631
632  lea                srcq, [srcq+src_strideq*2]
633  lea                dstq, [dstq+dst_strideq*2]
634%endif
635%if %2 == 1 ; avg
636  add                secq, sec_str
637%endif
638  dec                   block_height
639  jg .x_half_y_half_loop
640  STORE_AND_RET
641
642.x_half_y_nonhalf:
643  ; x_offset == 0.5 && y_offset == bilin interpolation
644%ifdef PIC
645  lea        bilin_filter, [bilin_filter_m]
646%endif
647  shl           y_offsetd, filter_idx_shift
648%if ARCH_X86_64 && mmsize == 16
649  mova                 m8, [bilin_filter+y_offsetq]
650%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
651  mova                 m9, [bilin_filter+y_offsetq+16]
652%endif
653  mova                m10, [pw_8]
654%define filter_y_a m8
655%define filter_y_b m9
656%define filter_rnd m10
657%else  ;x86_32
658%if ARCH_X86=1 && CONFIG_PIC=1
659; x_offset == 0.5. We can reuse x_offset reg
660%define tempq x_offsetq
661  add y_offsetq, g_bilin_filterm
662%define filter_y_a [y_offsetq]
663%define filter_y_b [y_offsetq+16]
664  mov tempq, g_pw_8m
665%define filter_rnd [tempq]
666%else
667  add           y_offsetq, bilin_filter
668%define filter_y_a [y_offsetq]
669%define filter_y_b [y_offsetq+16]
670%define filter_rnd [pw_8]
671%endif
672%endif
673
674%if %1 == 16
675  movu                 m0, [srcq]
676  movu                 m3, [srcq+1]
677  add                srcq, src_strideq
678  pavgb                m0, m3
679.x_half_y_other_loop:
680  movu                 m4, [srcq]
681  movu                 m2, [srcq+1]
682  mova                 m1, [dstq]
683  pavgb                m4, m2
684%if cpuflag(ssse3)
685  punpckhbw            m2, m0, m4
686  punpcklbw            m0, m4
687  pmaddubsw            m2, filter_y_a
688  pmaddubsw            m0, filter_y_a
689  paddw                m2, filter_rnd
690  paddw                m0, filter_rnd
691  psraw                m2, 4
692%else
693  punpckhbw            m2, m0, m5
694  punpckhbw            m3, m4, m5
695  pmullw               m2, filter_y_a
696  pmullw               m3, filter_y_b
697  paddw                m2, filter_rnd
698  punpcklbw            m0, m5
699  paddw                m2, m3
700  punpcklbw            m3, m4, m5
701  pmullw               m0, filter_y_a
702  pmullw               m3, filter_y_b
703  paddw                m0, filter_rnd
704  psraw                m2, 4
705  paddw                m0, m3
706%endif
707  punpckhbw            m3, m1, m5
708  psraw                m0, 4
709%if %2 == 1 ; avg
710  ; FIXME(rbultje) pipeline
711  packuswb             m0, m2
712  pavgb                m0, [secq]
713  punpckhbw            m2, m0, m5
714  punpcklbw            m0, m5
715%endif
716  punpcklbw            m1, m5
717  SUM_SSE              m0, m1, m2, m3, m6, m7
718  mova                 m0, m4
719
720  add                srcq, src_strideq
721  add                dstq, dst_strideq
722%else ; %1 < 16
723  movh                 m0, [srcq]
724  movh                 m3, [srcq+1]
725  add                srcq, src_strideq
726  pavgb                m0, m3
727%if notcpuflag(ssse3)
728  punpcklbw            m0, m5
729%endif
730.x_half_y_other_loop:
731  movh                 m2, [srcq]
732  movh                 m1, [srcq+1]
733  movh                 m4, [srcq+src_strideq]
734  movh                 m3, [srcq+src_strideq+1]
735  pavgb                m2, m1
736  pavgb                m4, m3
737  movh                 m3, [dstq+dst_strideq]
738%if cpuflag(ssse3)
739  movh                 m1, [dstq]
740  punpcklbw            m0, m2
741  punpcklbw            m2, m4
742  pmaddubsw            m0, filter_y_a
743  pmaddubsw            m2, filter_y_a
744  punpcklbw            m3, m5
745  paddw                m0, filter_rnd
746  paddw                m2, filter_rnd
747%else
748  punpcklbw            m2, m5
749  punpcklbw            m4, m5
750  pmullw               m0, filter_y_a
751  pmullw               m1, m2, filter_y_b
752  punpcklbw            m3, m5
753  paddw                m0, filter_rnd
754  pmullw               m2, filter_y_a
755  paddw                m0, m1
756  pmullw               m1, m4, filter_y_b
757  paddw                m2, filter_rnd
758  paddw                m2, m1
759  movh                 m1, [dstq]
760%endif
761  psraw                m0, 4
762  psraw                m2, 4
763%if %2 == 1 ; avg
764  ; FIXME(rbultje) pipeline
765  packuswb             m0, m2
766  pavgb                m0, [secq]
767  punpckhbw            m2, m0, m5
768  punpcklbw            m0, m5
769%endif
770  punpcklbw            m1, m5
771  SUM_SSE              m0, m1, m2, m3, m6, m7
772  mova                 m0, m4
773
774  lea                srcq, [srcq+src_strideq*2]
775  lea                dstq, [dstq+dst_strideq*2]
776%endif
777%if %2 == 1 ; avg
778  add                secq, sec_str
779%endif
780  dec                   block_height
781  jg .x_half_y_other_loop
782%undef filter_y_a
783%undef filter_y_b
784%undef filter_rnd
785  STORE_AND_RET
786
787.x_nonhalf:
788  test          y_offsetd, y_offsetd
789  jnz .x_nonhalf_y_nonzero
790
791  ; x_offset == bilin interpolation && y_offset == 0
792%ifdef PIC
793  lea        bilin_filter, [bilin_filter_m]
794%endif
795  shl           x_offsetd, filter_idx_shift
796%if ARCH_X86_64 && mmsize == 16
797  mova                 m8, [bilin_filter+x_offsetq]
798%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
799  mova                 m9, [bilin_filter+x_offsetq+16]
800%endif
801  mova                m10, [pw_8]
802%define filter_x_a m8
803%define filter_x_b m9
804%define filter_rnd m10
805%else    ; x86-32
806%if ARCH_X86=1 && CONFIG_PIC=1
807;y_offset == 0. We can reuse y_offset reg.
808%define tempq y_offsetq
809  add x_offsetq, g_bilin_filterm
810%define filter_x_a [x_offsetq]
811%define filter_x_b [x_offsetq+16]
812  mov tempq, g_pw_8m
813%define filter_rnd [tempq]
814%else
815  add           x_offsetq, bilin_filter
816%define filter_x_a [x_offsetq]
817%define filter_x_b [x_offsetq+16]
818%define filter_rnd [pw_8]
819%endif
820%endif
821
822.x_other_y_zero_loop:
823%if %1 == 16
824  movu                 m0, [srcq]
825  movu                 m4, [srcq+1]
826  mova                 m1, [dstq]
827%if cpuflag(ssse3)
828  punpckhbw            m2, m0, m4
829  punpcklbw            m0, m4
830  pmaddubsw            m2, filter_x_a
831  pmaddubsw            m0, filter_x_a
832  paddw                m2, filter_rnd
833  paddw                m0, filter_rnd
834%else
835  punpckhbw            m2, m0, m5
836  punpckhbw            m3, m4, m5
837  punpcklbw            m0, m5
838  punpcklbw            m4, m5
839  pmullw               m2, filter_x_a
840  pmullw               m3, filter_x_b
841  paddw                m2, filter_rnd
842  pmullw               m0, filter_x_a
843  pmullw               m4, filter_x_b
844  paddw                m0, filter_rnd
845  paddw                m2, m3
846  paddw                m0, m4
847%endif
848  psraw                m2, 4
849  psraw                m0, 4
850%if %2 == 1 ; avg
851  ; FIXME(rbultje) pipeline
852  packuswb             m0, m2
853  pavgb                m0, [secq]
854  punpckhbw            m2, m0, m5
855  punpcklbw            m0, m5
856%endif
857  punpckhbw            m3, m1, m5
858  punpcklbw            m1, m5
859  SUM_SSE              m0, m1, m2, m3, m6, m7
860
861  add                srcq, src_strideq
862  add                dstq, dst_strideq
863%else ; %1 < 16
864  movh                 m0, [srcq]
865  movh                 m1, [srcq+1]
866  movh                 m2, [srcq+src_strideq]
867  movh                 m4, [srcq+src_strideq+1]
868  movh                 m3, [dstq+dst_strideq]
869%if cpuflag(ssse3)
870  punpcklbw            m0, m1
871  movh                 m1, [dstq]
872  punpcklbw            m2, m4
873  pmaddubsw            m0, filter_x_a
874  pmaddubsw            m2, filter_x_a
875  punpcklbw            m3, m5
876  paddw                m0, filter_rnd
877  paddw                m2, filter_rnd
878%else
879  punpcklbw            m0, m5
880  punpcklbw            m1, m5
881  punpcklbw            m2, m5
882  punpcklbw            m4, m5
883  pmullw               m0, filter_x_a
884  pmullw               m1, filter_x_b
885  punpcklbw            m3, m5
886  paddw                m0, filter_rnd
887  pmullw               m2, filter_x_a
888  pmullw               m4, filter_x_b
889  paddw                m0, m1
890  paddw                m2, filter_rnd
891  movh                 m1, [dstq]
892  paddw                m2, m4
893%endif
894  psraw                m0, 4
895  psraw                m2, 4
896%if %2 == 1 ; avg
897  ; FIXME(rbultje) pipeline
898  packuswb             m0, m2
899  pavgb                m0, [secq]
900  punpckhbw            m2, m0, m5
901  punpcklbw            m0, m5
902%endif
903  punpcklbw            m1, m5
904  SUM_SSE              m0, m1, m2, m3, m6, m7
905
906  lea                srcq, [srcq+src_strideq*2]
907  lea                dstq, [dstq+dst_strideq*2]
908%endif
909%if %2 == 1 ; avg
910  add                secq, sec_str
911%endif
912  dec                   block_height
913  jg .x_other_y_zero_loop
914%undef filter_x_a
915%undef filter_x_b
916%undef filter_rnd
917  STORE_AND_RET
918
; ---------------------------------------------------------------------------
; .x_nonhalf_y_nonzero: x_offset selects a bilinear filter, y_offset != 0.
; A y_offset other than 8 is forwarded to .x_nonhalf_y_nonhalf. Otherwise each
; source row is filtered horizontally (tap products + filter_rnd, then >> 4)
; and vertically adjacent filtered rows are averaged with pavgb/pavgw.
; The result is compared against dst by SUM_SSE, which is invoked with m6 as
; its "sum" argument and m7 as its "sse" argument.
; NOTE(review): m5 is the partner register for every byte->word unpack here;
; it is presumably zero on entry (set outside this section) - confirm.
; ---------------------------------------------------------------------------
919.x_nonhalf_y_nonzero:
  ; Only y_offset == 8 stays on this path; anything else takes the path that
  ; filters in both directions.
920  cmp           y_offsetd, 8
921  jne .x_nonhalf_y_nonhalf
922
923  ; x_offset == bilin interpolation && y_offset == 0.5
924%ifdef PIC
925  lea        bilin_filter, [bilin_filter_m]
926%endif
  ; Scale the x filter index into an offset that is added to the filter table
  ; address below (filter_idx_shift is defined outside this section).
927  shl           x_offsetd, filter_idx_shift
928%if ARCH_X86_64 && mmsize == 16
  ; x86-64 with 16-byte registers: keep the x taps in m8 (and m9) and the
  ; rounding constant pw_8 in m10. The ssse3 code below only ever references
  ; filter_x_a, so m9 is loaded for the non-ssse3 build only.
929  mova                 m8, [bilin_filter+x_offsetq]
930%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
931  mova                 m9, [bilin_filter+x_offsetq+16]
932%endif
933  mova                m10, [pw_8]
934%define filter_x_a m8
935%define filter_x_b m9
936%define filter_rnd m10
937%else    ; x86-32
  ; No spare vector registers are assumed here: the taps and the rounding
  ; constant are used as memory operands addressed through x_offsetq / tempq.
938%if ARCH_X86=1 && CONFIG_PIC=1
939; y_offset == 0.5. We can reuse y_offset reg.
940%define tempq y_offsetq
941  add x_offsetq, g_bilin_filterm
942%define filter_x_a [x_offsetq]
943%define filter_x_b [x_offsetq+16]
944  mov tempq, g_pw_8m
945%define filter_rnd [tempq]
946%else
947  add           x_offsetq, bilin_filter
948%define filter_x_a [x_offsetq]
949%define filter_x_b [x_offsetq+16]
950%define filter_rnd [pw_8]
951%endif
952%endif
953
954%if %1 == 16
  ; ---- %1 == 16: one full-register row per loop iteration ----
  ; Prologue: horizontally filter the first source row.
  ; m0 = bytes at src, m1 = bytes at src+1 (unaligned loads).
955  movu                 m0, [srcq]
956  movu                 m1, [srcq+1]
957%if cpuflag(ssse3)
  ; Interleave src[i] with src[i+1]; pmaddubsw then multiplies each byte pair
  ; by the coefficient bytes in filter_x_a and adds the two products.
958  punpckhbw            m2, m0, m1
959  punpcklbw            m0, m1
960  pmaddubsw            m2, filter_x_a
961  pmaddubsw            m0, filter_x_a
962  paddw                m2, filter_rnd
963  paddw                m0, filter_rnd
964%else
  ; Widen both pixel vectors to words against m5, then form
  ; src[i]*filter_x_a + src[i+1]*filter_x_b + filter_rnd per word
  ; (m0 = low half, m2 = high half).
965  punpckhbw            m2, m0, m5
966  punpckhbw            m3, m1, m5
967  punpcklbw            m0, m5
968  punpcklbw            m1, m5
969  pmullw               m0, filter_x_a
970  pmullw               m1, filter_x_b
971  paddw                m0, filter_rnd
972  pmullw               m2, filter_x_a
973  pmullw               m3, filter_x_b
974  paddw                m2, filter_rnd
975  paddw                m0, m1
976  paddw                m2, m3
977%endif
  ; Finish the filter with >> 4 and repack to bytes so the whole row lives in
  ; m0 when the loop is entered.
978  psraw                m0, 4
979  psraw                m2, 4
980  add                srcq, src_strideq
981  packuswb             m0, m2
982.x_other_y_half_loop:
  ; On entry to each iteration: m0 = previous filtered row, packed as bytes.
983  movu                 m4, [srcq]
984  movu                 m3, [srcq+1]
985%if cpuflag(ssse3)
  ; Filter the current row into m4 (bytes), average it with the previous row
  ; in m0, and widen the dst row to words (m1 = low half, m3 = high half).
986  mova                 m1, [dstq]
987  punpckhbw            m2, m4, m3
988  punpcklbw            m4, m3
989  pmaddubsw            m2, filter_x_a
990  pmaddubsw            m4, filter_x_a
991  paddw                m2, filter_rnd
992  paddw                m4, filter_rnd
993  psraw                m2, 4
994  psraw                m4, 4
995  packuswb             m4, m2
996  pavgb                m0, m4
997  punpckhbw            m3, m1, m5
998  punpcklbw            m1, m5
999%else
  ; Same computation using word multiplies: m4/m2 = low/high halves of the
  ; filtered current row; m1/m3 end up holding the dst row as words.
1000  punpckhbw            m2, m4, m5
1001  punpckhbw            m1, m3, m5
1002  punpcklbw            m4, m5
1003  punpcklbw            m3, m5
1004  pmullw               m4, filter_x_a
1005  pmullw               m3, filter_x_b
1006  paddw                m4, filter_rnd
1007  pmullw               m2, filter_x_a
1008  pmullw               m1, filter_x_b
1009  paddw                m2, filter_rnd
1010  paddw                m4, m3
1011  paddw                m2, m1
1012  mova                 m1, [dstq]
1013  psraw                m4, 4
1014  psraw                m2, 4
1015  punpckhbw            m3, m1, m5
1016  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
1017  ; have a 1-register shortage to be able to store the backup of the bilin
1018  ; filtered second line as words as cache for the next line. Packing into
1019  ; a byte costs 1 pack and 2 unpacks, but saves a register.
1020  packuswb             m4, m2
1021  punpcklbw            m1, m5
1022  pavgb                m0, m4
1023%endif
1024%if %2 == 1 ; avg
1025  ; FIXME(rbultje) pipeline
  ; Variant built with %2 == 1: also average the result with the bytes at secq.
1026  pavgb                m0, [secq]
1027%endif
  ; Widen the averaged row to words (m0 = low, m2 = high), accumulate it
  ; against dst, then carry the current filtered row (m4) into m0 for the
  ; next iteration.
1028  punpckhbw            m2, m0, m5
1029  punpcklbw            m0, m5
1030  SUM_SSE              m0, m1, m2, m3, m6, m7
1031  mova                 m0, m4
1032
1033  add                srcq, src_strideq
1034  add                dstq, dst_strideq
1035%else ; %1 < 16
  ; ---- %1 < 16: rows are loaded with movh, kept as words, and two rows are
  ; consumed per loop iteration ----
  ; Prologue: horizontally filter the first source row into m0 (words).
1036  movh                 m0, [srcq]
1037  movh                 m1, [srcq+1]
1038%if cpuflag(ssse3)
1039  punpcklbw            m0, m1
1040  pmaddubsw            m0, filter_x_a
1041  paddw                m0, filter_rnd
1042%else
1043  punpcklbw            m0, m5
1044  punpcklbw            m1, m5
1045  pmullw               m0, filter_x_a
1046  pmullw               m1, filter_x_b
1047  paddw                m0, filter_rnd
1048  paddw                m0, m1
1049%endif
1050  add                srcq, src_strideq
1051  psraw                m0, 4
1052.x_other_y_half_loop:
  ; On entry to each iteration: m0 = previous filtered row, as words.
  ; m2/m1 = row at src (and src+1), m4/m3 = row at src+src_stride (and +1).
1053  movh                 m2, [srcq]
1054  movh                 m1, [srcq+1]
1055  movh                 m4, [srcq+src_strideq]
1056  movh                 m3, [srcq+src_strideq+1]
1057%if cpuflag(ssse3)
  ; Filter both rows with pmaddubsw; m1/m3 are then reloaded with the two
  ; dst rows.
1058  punpcklbw            m2, m1
1059  punpcklbw            m4, m3
1060  pmaddubsw            m2, filter_x_a
1061  pmaddubsw            m4, filter_x_a
1062  movh                 m1, [dstq]
1063  movh                 m3, [dstq+dst_strideq]
1064  paddw                m2, filter_rnd
1065  paddw                m4, filter_rnd
1066%else
  ; Word-multiply version of the same two-row filter; m1/m3 are reloaded
  ; with the two dst rows once their source data has been consumed.
1067  punpcklbw            m2, m5
1068  punpcklbw            m1, m5
1069  punpcklbw            m4, m5
1070  punpcklbw            m3, m5
1071  pmullw               m2, filter_x_a
1072  pmullw               m1, filter_x_b
1073  paddw                m2, filter_rnd
1074  pmullw               m4, filter_x_a
1075  pmullw               m3, filter_x_b
1076  paddw                m4, filter_rnd
1077  paddw                m2, m1
1078  movh                 m1, [dstq]
1079  paddw                m4, m3
1080  movh                 m3, [dstq+dst_strideq]
1081%endif
  ; m2 = filtered first row, m4 = filtered second row (words).
  ; Vertical averaging: m0 = avg(previous row, first row), then
  ; m2 = avg(first row, second row). The order matters because the second
  ; pavgw overwrites m2.
1082  psraw                m2, 4
1083  psraw                m4, 4
1084  pavgw                m0, m2
1085  pavgw                m2, m4
1086%if %2 == 1 ; avg
1087  ; FIXME(rbultje) pipeline - also consider going to bytes here
  ; Pack both output rows into one register as bytes, average with the bytes
  ; at secq, then split back into words (m0 = low half, m2 = high half).
1088  packuswb             m0, m2
1089  pavgb                m0, [secq]
1090  punpckhbw            m2, m0, m5
1091  punpcklbw            m0, m5
1092%endif
  ; Widen the two dst rows to words, accumulate, and carry the second
  ; filtered row (m4) into m0 for the next iteration.
1093  punpcklbw            m3, m5
1094  punpcklbw            m1, m5
1095  SUM_SSE              m0, m1, m2, m3, m6, m7
1096  mova                 m0, m4
1097
1098  lea                srcq, [srcq+src_strideq*2]
1099  lea                dstq, [dstq+dst_strideq*2]
1100%endif
1101%if %2 == 1 ; avg
1102  add                secq, sec_str
1103%endif
  ; dec sets the flags that jg tests: keep looping while the decremented
  ; counter is still greater than zero.
1104  dec                   block_height
1105  jg .x_other_y_half_loop
  ; Drop this section's filter aliases so they cannot leak into other sections.
1106%undef filter_x_a
1107%undef filter_x_b
1108%undef filter_rnd
  ; STORE_AND_RET is defined outside this section.
1109  STORE_AND_RET
1110
; ---------------------------------------------------------------------------
; .x_nonhalf_y_nonhalf: both x_offset and y_offset select a bilinear filter.
; Every source row is filtered horizontally with filter_x_*; each pair of
; vertically adjacent filtered rows is then combined with filter_y_*. Both
; passes add filter_rnd and shift right by 4. The result is compared against
; dst by SUM_SSE, invoked with m6 as its "sum" and m7 as its "sse" argument.
; NOTE(review): m5 is the partner register for every byte->word unpack here;
; it is presumably zero on entry (set outside this section) - confirm.
; NOTE(review): src is advanced with INC_SRC_BY_SRC_STRIDE (defined outside
; this section) rather than a plain add, presumably because the x86-32 PIC
; setup below repurposes the src_stride register - confirm.
; ---------------------------------------------------------------------------
1111.x_nonhalf_y_nonhalf:
1112%ifdef PIC
1113  lea        bilin_filter, [bilin_filter_m]
1114%endif
  ; Scale both filter indices into offsets that are added to the filter table
  ; address below (filter_idx_shift is defined outside this section).
1115  shl           x_offsetd, filter_idx_shift
1116  shl           y_offsetd, filter_idx_shift
1117%if ARCH_X86_64 && mmsize == 16
  ; x86-64 with 16-byte registers: m8/m9 = x taps, m10/m11 = y taps,
  ; m12 = rounding constant pw_8. The ssse3 code below only references the
  ; *_a operands, so m9 and m11 are loaded for the non-ssse3 build only.
1118  mova                 m8, [bilin_filter+x_offsetq]
1119%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
1120  mova                 m9, [bilin_filter+x_offsetq+16]
1121%endif
1122  mova                m10, [bilin_filter+y_offsetq]
1123%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
1124  mova                m11, [bilin_filter+y_offsetq+16]
1125%endif
1126  mova                m12, [pw_8]
1127%define filter_x_a m8
1128%define filter_x_b m9
1129%define filter_y_a m10
1130%define filter_y_b m11
1131%define filter_rnd m12
1132%else   ; x86-32
  ; Taps and rounding constant are used as memory operands addressed through
  ; x_offsetq / y_offsetq (and tempq in the PIC case).
1133%if ARCH_X86=1 && CONFIG_PIC=1
1134; In this case, there is NO unused register. Used src_stride register. Later,
1135; src_stride has to be loaded from stack when it is needed.
1136%define tempq src_strideq
1137  mov tempq, g_bilin_filterm
1138  add           x_offsetq, tempq
1139  add           y_offsetq, tempq
1140%define filter_x_a [x_offsetq]
1141%define filter_x_b [x_offsetq+16]
1142%define filter_y_a [y_offsetq]
1143%define filter_y_b [y_offsetq+16]
1144
1145  mov tempq, g_pw_8m
1146%define filter_rnd [tempq]
1147%else
1148  add           x_offsetq, bilin_filter
1149  add           y_offsetq, bilin_filter
1150%define filter_x_a [x_offsetq]
1151%define filter_x_b [x_offsetq+16]
1152%define filter_y_a [y_offsetq]
1153%define filter_y_b [y_offsetq+16]
1154%define filter_rnd [pw_8]
1155%endif
1156%endif
1157
1158  ; x_offset == bilin interpolation && y_offset == bilin interpolation
1159%if %1 == 16
  ; ---- %1 == 16: one full-register row per loop iteration ----
  ; Prologue: horizontally filter the first source row.
1160  movu                 m0, [srcq]
1161  movu                 m1, [srcq+1]
1162%if cpuflag(ssse3)
  ; Interleave src[i] with src[i+1]; pmaddubsw multiplies each byte pair by
  ; the coefficient bytes in filter_x_a and adds the two products.
1163  punpckhbw            m2, m0, m1
1164  punpcklbw            m0, m1
1165  pmaddubsw            m2, filter_x_a
1166  pmaddubsw            m0, filter_x_a
1167  paddw                m2, filter_rnd
1168  paddw                m0, filter_rnd
1169%else
  ; Widen to words against m5, then form
  ; src[i]*filter_x_a + src[i+1]*filter_x_b + filter_rnd per word.
1170  punpckhbw            m2, m0, m5
1171  punpckhbw            m3, m1, m5
1172  punpcklbw            m0, m5
1173  punpcklbw            m1, m5
1174  pmullw               m0, filter_x_a
1175  pmullw               m1, filter_x_b
1176  paddw                m0, filter_rnd
1177  pmullw               m2, filter_x_a
1178  pmullw               m3, filter_x_b
1179  paddw                m2, filter_rnd
1180  paddw                m0, m1
1181  paddw                m2, m3
1182%endif
1183  psraw                m0, 4
1184  psraw                m2, 4
1185
1186  INC_SRC_BY_SRC_STRIDE
1187
  ; Keep the first filtered row packed as bytes in m0.
1188  packuswb             m0, m2
1189.x_other_y_other_loop:
  ; On entry to each iteration: m0 = previous horizontally filtered row,
  ; packed as bytes.
1190%if cpuflag(ssse3)
  ; Horizontal pass on the current row -> m4 (bytes); m1/m3 receive the dst
  ; row as words (low/high).
1191  movu                 m4, [srcq]
1192  movu                 m3, [srcq+1]
1193  mova                 m1, [dstq]
1194  punpckhbw            m2, m4, m3
1195  punpcklbw            m4, m3
1196  pmaddubsw            m2, filter_x_a
1197  pmaddubsw            m4, filter_x_a
1198  punpckhbw            m3, m1, m5
1199  paddw                m2, filter_rnd
1200  paddw                m4, filter_rnd
1201  psraw                m2, 4
1202  psraw                m4, 4
1203  packuswb             m4, m2
  ; Vertical pass: interleave previous-row bytes (m0) with current-row bytes
  ; (m4) and apply filter_y_a with pmaddubsw; m0/m2 = low/high result words.
1204  punpckhbw            m2, m0, m4
1205  punpcklbw            m0, m4
1206  pmaddubsw            m2, filter_y_a
1207  pmaddubsw            m0, filter_y_a
1208  punpcklbw            m1, m5
1209  paddw                m2, filter_rnd
1210  paddw                m0, filter_rnd
1211  psraw                m2, 4
1212  psraw                m0, 4
1213%else
  ; Horizontal pass with word multiplies: m3/m1 = low/high halves of the
  ; filtered current row.
1214  movu                 m3, [srcq]
1215  movu                 m4, [srcq+1]
1216  punpckhbw            m1, m3, m5
1217  punpckhbw            m2, m4, m5
1218  punpcklbw            m3, m5
1219  punpcklbw            m4, m5
1220  pmullw               m3, filter_x_a
1221  pmullw               m4, filter_x_b
1222  paddw                m3, filter_rnd
1223  pmullw               m1, filter_x_a
1224  pmullw               m2, filter_x_b
1225  paddw                m1, filter_rnd
1226  paddw                m3, m4
1227  paddw                m1, m2
1228  psraw                m3, 4
1229  psraw                m1, 4
  ; m4 = byte-packed copy of the current row, saved for the next iteration;
  ; m3/m1 stay as words and feed the filter_y_b products below.
1230  packuswb             m4, m3, m1
  ; Vertical pass: previous row (m0, widened into m0/m2) * filter_y_a
  ; + current row (m3/m1) * filter_y_b + filter_rnd, then >> 4.
  ; The dst row is loaded into m1/m3 once their old contents are consumed.
1231  punpckhbw            m2, m0, m5
1232  punpcklbw            m0, m5
1233  pmullw               m2, filter_y_a
1234  pmullw               m1, filter_y_b
1235  paddw                m2, filter_rnd
1236  pmullw               m0, filter_y_a
1237  pmullw               m3, filter_y_b
1238  paddw                m2, m1
1239  mova                 m1, [dstq]
1240  paddw                m0, filter_rnd
1241  psraw                m2, 4
1242  paddw                m0, m3
1243  punpckhbw            m3, m1, m5
1244  psraw                m0, 4
1245  punpcklbw            m1, m5
1246%endif
1247%if %2 == 1 ; avg
1248  ; FIXME(rbultje) pipeline
  ; Variant built with %2 == 1: pack the result to bytes, average it with the
  ; bytes at secq, and widen back to words (m0 = low, m2 = high).
1249  packuswb             m0, m2
1250  pavgb                m0, [secq]
1251  punpckhbw            m2, m0, m5
1252  punpcklbw            m0, m5
1253%endif
  ; Accumulate against dst, then carry the current filtered row (m4, bytes)
  ; into m0 for the next iteration.
1254  SUM_SSE              m0, m1, m2, m3, m6, m7
1255  mova                 m0, m4
1256
1257  INC_SRC_BY_SRC_STRIDE
1258  add                dstq, dst_strideq
1259%else ; %1 < 16
  ; ---- %1 < 16: rows are loaded with movh; two rows are consumed per loop
  ; iteration ----
  ; Prologue: horizontally filter the first source row into m0.
1260  movh                 m0, [srcq]
1261  movh                 m1, [srcq+1]
1262%if cpuflag(ssse3)
1263  punpcklbw            m0, m1
1264  pmaddubsw            m0, filter_x_a
1265  paddw                m0, filter_rnd
1266%else
1267  punpcklbw            m0, m5
1268  punpcklbw            m1, m5
1269  pmullw               m0, filter_x_a
1270  pmullw               m1, filter_x_b
1271  paddw                m0, filter_rnd
1272  paddw                m0, m1
1273%endif
1274  psraw                m0, 4
1275%if cpuflag(ssse3)
  ; The ssse3 vertical pass consumes bytes (pmaddubsw), so the carried row is
  ; kept packed; the non-ssse3 build keeps it as words.
1276  packuswb             m0, m0
1277%endif
1278
1279  INC_SRC_BY_SRC_STRIDE
1280
1281.x_other_y_other_loop:
  ; On entry to each iteration: m0 = previous filtered row (bytes for ssse3,
  ; words otherwise). m2/m1 = first row of the pair, m4/m3 = second row,
  ; with src advanced between the two loads.
1282  movh                 m2, [srcq]
1283  movh                 m1, [srcq+1]
1284
1285  INC_SRC_BY_SRC_STRIDE
1286  movh                 m4, [srcq]
1287  movh                 m3, [srcq+1]
1288
1289%if cpuflag(ssse3)
  ; Horizontal pass on both rows, repacked to bytes (m2, m4); m1/m3 are
  ; reloaded with the two dst rows.
1290  punpcklbw            m2, m1
1291  punpcklbw            m4, m3
1292  pmaddubsw            m2, filter_x_a
1293  pmaddubsw            m4, filter_x_a
1294  movh                 m3, [dstq+dst_strideq]
1295  movh                 m1, [dstq]
1296  paddw                m2, filter_rnd
1297  paddw                m4, filter_rnd
1298  psraw                m2, 4
1299  psraw                m4, 4
1300  packuswb             m2, m2
1301  packuswb             m4, m4
  ; Vertical pass: m0 = (previous, first) interleaved, m2 = (first, second)
  ; interleaved, each run through pmaddubsw with filter_y_a. The first
  ; punpcklbw must come before the second because the second overwrites m2.
1302  punpcklbw            m0, m2
1303  punpcklbw            m2, m4
1304  pmaddubsw            m0, filter_y_a
1305  pmaddubsw            m2, filter_y_a
1306  punpcklbw            m3, m5
1307  paddw                m0, filter_rnd
1308  paddw                m2, filter_rnd
1309  psraw                m0, 4
1310  psraw                m2, 4
1311  punpcklbw            m1, m5
1312%else
  ; Horizontal pass with word multiplies: m2 = filtered first row,
  ; m4 = filtered second row (words).
1313  punpcklbw            m2, m5
1314  punpcklbw            m1, m5
1315  punpcklbw            m4, m5
1316  punpcklbw            m3, m5
1317  pmullw               m2, filter_x_a
1318  pmullw               m1, filter_x_b
1319  paddw                m2, filter_rnd
1320  pmullw               m4, filter_x_a
1321  pmullw               m3, filter_x_b
1322  paddw                m4, filter_rnd
1323  paddw                m2, m1
1324  paddw                m4, m3
1325  psraw                m2, 4
1326  psraw                m4, 4
  ; Vertical pass:
  ;   m0 = previous*filter_y_a + first*filter_y_b  + filter_rnd
  ;   m2 = first*filter_y_a    + second*filter_y_b + filter_rnd
  ; m3 takes first*filter_y_b before m2 is overwritten; m4 is left intact so
  ; it can be carried into the next iteration. m1/m3 are then reloaded with
  ; the two dst rows.
1327  pmullw               m0, filter_y_a
1328  pmullw               m3, m2, filter_y_b
1329  paddw                m0, filter_rnd
1330  pmullw               m2, filter_y_a
1331  pmullw               m1, m4, filter_y_b
1332  paddw                m2, filter_rnd
1333  paddw                m0, m3
1334  movh                 m3, [dstq+dst_strideq]
1335  paddw                m2, m1
1336  movh                 m1, [dstq]
1337  psraw                m0, 4
1338  psraw                m2, 4
1339  punpcklbw            m3, m5
1340  punpcklbw            m1, m5
1341%endif
1342%if %2 == 1 ; avg
1343  ; FIXME(rbultje) pipeline
  ; Pack both output rows into one register as bytes, average with the bytes
  ; at secq, then split back into words (m0 = low half, m2 = high half).
1344  packuswb             m0, m2
1345  pavgb                m0, [secq]
1346  punpckhbw            m2, m0, m5
1347  punpcklbw            m0, m5
1348%endif
  ; Accumulate both rows against dst, then carry the second filtered row (m4)
  ; into m0 for the next iteration.
1349  SUM_SSE              m0, m1, m2, m3, m6, m7
1350  mova                 m0, m4
1351
1352  INC_SRC_BY_SRC_STRIDE
1353  lea                dstq, [dstq+dst_strideq*2]
1354%endif
1355%if %2 == 1 ; avg
1356  add                secq, sec_str
1357%endif
  ; dec sets the flags that jg tests: keep looping while the decremented
  ; counter is still greater than zero.
1358  dec                   block_height
1359  jg .x_other_y_other_loop
  ; Drop this section's filter aliases so they cannot leak into other sections.
1360%undef filter_x_a
1361%undef filter_x_b
1362%undef filter_y_a
1363%undef filter_y_b
1364%undef filter_rnd
  ; STORE_AND_RET is defined outside this section.
1365  STORE_AND_RET
1366%endmacro
1367
1368; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
1369; between the ssse3 and non-ssse3 version. It may make sense to merge their
1370; code in the sense that the ssse3 version would jump to the appropriate
1371; location in the sse/2 version, rather than duplicating that code in the
1372; binary.
1373
; Instantiations of the SUBPEL_VARIANCE macro.
; INIT_MMX / INIT_XMM are not defined in this file; they presumably come from
; the x86inc.asm include at the top and select the register width (mmsize)
; and the cpuflag set that the macro body tests with "mmsize == 16" and
; cpuflag(ssse3) - confirm against x86inc. Each INIT_* line is followed by
; the invocations it is meant to govern, so the order of these lines must be
; kept.
; The first macro argument is the %1 value the body compares against 16
; (presumably the N of vpx_sub_pixel_varianceNxh - confirm against the macro
; header).

; Variants without a second argument: sse/sse2 builds.
1374INIT_MMX sse
1375SUBPEL_VARIANCE  4
1376INIT_XMM sse2
1377SUBPEL_VARIANCE  8
1378SUBPEL_VARIANCE 16
1379
; Variants without a second argument: ssse3 builds.
1380INIT_MMX ssse3
1381SUBPEL_VARIANCE  4
1382INIT_XMM ssse3
1383SUBPEL_VARIANCE  8
1384SUBPEL_VARIANCE 16
1385
; Variants with second argument 1, which enables the "%if %2 == 1 ; avg"
; blocks in the macro body (extra pavgb with the data at secq): sse/sse2.
1386INIT_MMX sse
1387SUBPEL_VARIANCE  4, 1
1388INIT_XMM sse2
1389SUBPEL_VARIANCE  8, 1
1390SUBPEL_VARIANCE 16, 1
1391
; Variants with second argument 1 ("avg" blocks enabled): ssse3 builds.
1392INIT_MMX ssse3
1393SUBPEL_VARIANCE  4, 1
1394INIT_XMM ssse3
1395SUBPEL_VARIANCE  8, 1
1396SUBPEL_VARIANCE 16, 1
1397