;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_8: times  8 dw  8
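; Bilinear-filter tap pairs {16-k, k} (each pair summing to 16) used by the
; sub-pel interpolation below. Every tap is replicated across 8 words, so one
; table entry occupies 2 x 16 = 32 bytes and is selected by shifting the
; sub-pel offset left by filter_idx_shift.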
bilin_filter_m_sse2: times  8 dw 16
                     times  8 dw  0
                     times  8 dw 14
                     times  8 dw  2
                     times  8 dw 12
                     times  8 dw  4
                     times  8 dw 10
                     times  8 dw  6
                     times 16 dw  8
                     times  8 dw  6
                     times  8 dw 10
                     times  8 dw  4
                     times  8 dw 12
                     times  8 dw  2
                     times  8 dw 14

SECTION .text

; int vpx_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride,
;                                      int x_offset, int y_offset,
;                                      const uint16_t *ref, ptrdiff_t ref_stride,
;                                      int height, unsigned int *sse);
;
; This function returns the sum of errors (SE) and stores the sum of squared
; errors (SSE) in the given pointer.
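;
; The caller is expected to combine the two outputs into the variance itself,
; roughly as in this hypothetical C sketch (names and wrapper details are
; illustrative, not part of this file):
;
;   int se = vpx_highbd_sub_pixel_variance16xh_sse2(src, src_stride,
;                                                   x_offset, y_offset,
;                                                   ref, ref_stride, h, &sse);
;   variance = sse - (uint32_t)(((int64_t)se * se) / (16 * h));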

%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse
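  ; Accumulates two 8-word rows of differences, (src1 - ref1) and
  ; (src2 - ref2): the squared differences are summed into the dword
  ; accumulator %6 (sse), while the signed word differences are folded,
  ; sign-extended to dwords and added into %5 (sum).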
  psubw                %3, %4
  psubw                %1, %2
  mova                 %4, %3       ; make copies to manipulate to calc sum
  mova                 %2, %1       ; use originals for calc sse
  pmaddwd              %3, %3
  paddw                %4, %2
  pmaddwd              %1, %1
  movhlps              %2, %4
  paddd                %6, %3
  paddw                %4, %2
  pxor                 %2, %2
  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
  punpcklwd            %4, %2       ; sign-extend word to dword
  paddd                %6, %1
  paddd                %5, %4

%endmacro

%macro STORE_AND_RET 0
%if mmsize == 16
  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
  ; We have to sign-extend it before adding the words within the register
  ; and outputting to a dword.
  movhlps              m3, m7
  movhlps              m4, m6
  paddd                m7, m3
  paddd                m6, m4
  pshufd               m3, m7, 0x1
  pshufd               m4, m6, 0x1
  paddd                m7, m3
  paddd                m6, m4
  mov                  r1, ssem         ; r1 = unsigned int *sse
  movd               [r1], m7           ; store sse
  movd                eax, m6           ; store sum as return value
%endif
  RET
%endmacro

%macro INC_SRC_BY_SRC_STRIDE  0
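  ; Advance srcq by one row of 16-bit pixels (src_stride is in pixels, so the
  ; byte step is src_stride*2). In the x86-32 PIC case the stride only lives
  ; in a memory operand, hence the two separate adds below.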
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
  add                srcq, src_stridemp
  add                srcq, src_stridemp
%else
  lea                srcq, [srcq + src_strideq*2]
%endif
%endmacro

%macro SUBPEL_VARIANCE 1-2 0 ; W, [do_avg]
%define bilin_filter_m bilin_filter_m_sse2
%define filter_idx_shift 5
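; Each bilin_filter_m entry is 2 x 8 words = 32 bytes, so the shl by
; filter_idx_shift below converts a sub-pel offset directly into a byte
; offset into the table.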


%if VPX_ARCH_X86_64
  %if %2 == 1 ; avg
    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
                                      x_offset, y_offset, \
                                      ref, ref_stride, \
                                      second_pred, second_stride, height, sse
    %define second_str second_strideq
  %else
    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
                                  x_offset, y_offset, \
                                  ref, ref_stride, height, sse
  %endif
  %define block_height heightd
  %define bilin_filter sseq
%else
  %if CONFIG_PIC=1
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        ref, ref_stride, \
                                        second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    ; reuse argument stack space
    %define g_bilin_filterm x_offsetm
    %define g_pw_8m y_offsetm

    ; Store the bilin_filter and pw_8 locations on the stack
    %if GET_GOT_DEFINED == 1
      GET_GOT eax
      add esp, 4                ; restore esp
    %endif

    lea ecx, [GLOBAL(bilin_filter_m)]
    mov g_bilin_filterm, ecx

    lea ecx, [GLOBAL(pw_8)]
    mov g_pw_8m, ecx

    LOAD_IF_USED 0, 1         ; load eax, ecx back
  %else
    %if %2 == 1 ; avg
      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
                                        x_offset, y_offset, \
                                        ref, ref_stride, \
                                        second_pred, second_stride, height, sse
      %define block_height dword heightm
      %define second_str second_stridemp
    %else
      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
                                    x_offset, y_offset, \
                                    ref, ref_stride, height, sse
      %define block_height heightd
    %endif

    %define bilin_filter bilin_filter_m
  %endif
%endif

  ASSERT               %1 <= 16         ; m6 overflows if w > 16
  pxor                 m6, m6           ; sum
  pxor                 m7, m7           ; sse

%if %1 < 16
  sar                   block_height, 1
%endif
%if %2 == 1 ; avg
  shl             second_str, 1
%endif

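  ; The code below dispatches on (x_offset, y_offset): each offset is either
  ; zero (plain copy), the half-sample case (handled with pavgw) or a general
  ; bilinear position (pmullw with the filter taps), giving the nine
  ; specialised loops that follow.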
  ; FIXME(rbultje) replace by jumptable?
  test          x_offsetd, x_offsetd
  jnz .x_nonzero
  ; x_offset == 0
  test          y_offsetd, y_offsetd
  jnz .x_zero_y_nonzero

  ; x_offset == 0 && y_offset == 0
.x_zero_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + 16]
  mova                 m1, [refq]
  mova                 m3, [refq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m2, [second_predq+16]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq + src_strideq*2]
  mova                 m1, [refq]
  mova                 m3, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m1, m2, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_zero_loop
  STORE_AND_RET

.x_zero_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_zero_y_nonhalf

  ; x_offset == 0 && y_offset == 0.5
.x_zero_y_half_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [refq]
  mova                 m3, [refq+16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m2, [refq]
  mova                 m3, [refq+ref_strideq*2]
  pavgw                m0, m1
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_half_loop
  STORE_AND_RET

.x_zero_y_nonhalf:
  ; x_offset == 0 && y_offset == bilin interpolation
%if VPX_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0, reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_zero_y_other_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*2+16]
  mova                 m2, [refq]
  mova                 m3, [refq+16]
  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
  ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
  ; slightly faster because of pmullw latency. It would also cut our rodata
  ; tables in half for this function, and save 1-2 registers on x86-64.
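  ; (The rewrite follows from (num-x)*in1 + x*in2 == num*in1 + x*(in2-in1);
  ;  with num = 16 the num*in1 term shifts out exactly, leaving
  ;  out = in1 + ((x*(in2-in1) + rnd) >> 4).)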
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m5, [srcq+src_strideq*4]
  mova                 m4, m1
  mova                 m2, [refq]
  mova                 m3, [refq+ref_strideq*2]
  pmullw               m1, filter_y_a
  pmullw               m5, filter_y_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_y_a
  pmullw               m4, filter_y_b
  paddw                m0, filter_rnd
  paddw                m1, m5
  paddw                m0, m4
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_zero_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonzero:
  cmp           x_offsetd, 8
  jne .x_nonhalf
  ; x_offset == 0.5
  test          y_offsetd, y_offsetd
  jnz .x_half_y_nonzero

  ; x_offset == 0.5 && y_offset == 0
.x_half_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  mova                 m2, [refq]
  mova                 m3, [refq + 16]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  mova                 m2, [refq]
  mova                 m3, [refq + ref_strideq*2]
  pavgw                m0, m4
  pavgw                m1, m5
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_zero_loop
  STORE_AND_RET

.x_half_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_half_y_nonhalf

  ; x_offset == 0.5 && y_offset == 0.5
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + 16]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + 18]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m1, m3
  mova                 m4, [refq]
  mova                 m5, [refq + 16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq + src_strideq*2]
  movu                 m4, [srcq + 2]
  movu                 m5, [srcq + src_strideq*2 + 2]
  pavgw                m2, m4
  pavgw                m3, m5
  pavgw                m0, m2
  pavgw                m2, m3
  mova                 m4, [refq]
  mova                 m5, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_half_loop
  STORE_AND_RET

.x_half_y_nonhalf:
  ; x_offset == 0.5 && y_offset == bilin interpolation
%if VPX_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+y_offsetq]
  mova                 m9, [bilin_filter+y_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
%else  ; x86_32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; x_offset == 0.5. We can reuse x_offset reg
%define tempq x_offsetq
  add y_offsetq, g_bilin_filterm
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
  pavgw                m1, m3
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m1, filter_rnd
  paddw                m1, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m1, 4
  paddw                m0, m2
  mova                 m2, [refq]
  psrlw                m0, 4
  mova                 m3, [refq+16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  lea                srcq, [srcq + src_strideq*2]
  lea                refq, [refq + ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  lea                srcq, [srcq + src_strideq*2]
  pavgw                m0, m2
.x_half_y_other_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pavgw                m2, m4
  pavgw                m3, m5
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m4, filter_rnd
  paddw                m4, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  psrlw                m4, 4
  paddw                m0, m2
  mova                 m2, [refq]
  psrlw                m0, 4
  mova                 m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m4, [second_predq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  lea                srcq, [srcq + src_strideq*4]
  lea                refq, [refq + ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_half_y_other_loop
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf:
  test          y_offsetd, y_offsetd
  jnz .x_nonhalf_y_nonzero

  ; x_offset == bilin interpolation && y_offset == 0
%if VPX_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

.x_other_y_zero_loop:
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  mova                 m4, [refq]
  mova                 m5, [refq+16]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*2]
  lea                refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+src_strideq*2]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+src_strideq*2+2]
  mova                 m4, [refq]
  mova                 m5, [refq+ref_strideq*2]
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m1, m3
  paddw                m0, m2
  psrlw                m1, 4
  psrlw                m0, 4
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m1, [second_predq]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7

  lea                srcq, [srcq+src_strideq*4]
  lea                refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_zero_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonzero:
  cmp           y_offsetd, 8
  jne .x_nonhalf_y_nonhalf

  ; x_offset == bilin interpolation && y_offset == 0.5
%if VPX_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
%else    ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; y_offset == 0.5. We can reuse y_offset reg.
%define tempq y_offsetq
  add x_offsetq, g_bilin_filterm
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif

%if %1 == 16
  movu                 m0, [srcq]
  movu                 m1, [srcq+16]
  movu                 m2, [srcq+2]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+16]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [refq]
  mova                 m5, [refq+16]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m1, m3
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m4, m1, m5, m6, m7
  mova                 m0, m2
  mova                 m1, m3

  lea                srcq, [srcq+src_strideq*2]
  lea                refq, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4
  lea                srcq, [srcq+src_strideq*2]
.x_other_y_half_loop:
  movu                 m2, [srcq]
  movu                 m3, [srcq+src_strideq*2]
  movu                 m4, [srcq+2]
  movu                 m5, [srcq+src_strideq*2+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  mova                 m4, [refq]
  mova                 m5, [refq+ref_strideq*2]
  psrlw                m2, 4
  psrlw                m3, 4
  pavgw                m0, m2
  pavgw                m2, m3
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m2, [second_predq]
%endif
  SUM_SSE              m0, m4, m2, m5, m6, m7
  mova                 m0, m3

  lea                srcq, [srcq+src_strideq*4]
  lea                refq, [refq+ref_strideq*4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_half_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_rnd
  STORE_AND_RET

.x_nonhalf_y_nonhalf:
; load the filters - this is the same as in the 8-bit-depth version
%if VPX_ARCH_X86_64
  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
%endif
  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
  shl           y_offsetd, filter_idx_shift
%if VPX_ARCH_X86_64 && mmsize == 16
  mova                 m8, [bilin_filter+x_offsetq]
  mova                 m9, [bilin_filter+x_offsetq+16]
  mova                m10, [bilin_filter+y_offsetq]
  mova                m11, [bilin_filter+y_offsetq+16]
  mova                m12, [GLOBAL(pw_8)]
%define filter_x_a m8
%define filter_x_b m9
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
%else   ; x86-32
%if VPX_ARCH_X86=1 && CONFIG_PIC=1
; In this case there is no unused register, so reuse the src_stride register;
; later, src_stride has to be reloaded from the stack when it is needed.
%define tempq src_strideq
  mov tempq, g_bilin_filterm
  add           x_offsetq, tempq
  add           y_offsetq, tempq
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]

  mov tempq, g_pw_8m
%define filter_rnd [tempq]
%else
  add           x_offsetq, bilin_filter
  add           y_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [GLOBAL(pw_8)]
%endif
%endif
; end of load filter

  ; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  movu                 m1, [srcq+16]
  movu                 m3, [srcq+18]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_x_a
  pmullw               m3, filter_x_b
  paddw                m1, filter_rnd
  paddw                m0, m2
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  movu                 m3, [srcq+16]
  movu                 m5, [srcq+18]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m1, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m1, filter_rnd
  mova                 m2, [refq]
  paddw                m1, m3
  psrlw                m0, 4
  psrlw                m1, 4
  mova                 m3, [refq+16]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  pavgw                m1, [second_predq+16]
%endif
  SUM_SSE              m0, m2, m1, m3, m6, m7
  mova                 m0, m4
  mova                 m1, m5

  INC_SRC_BY_SRC_STRIDE
  lea                refq, [refq + ref_strideq * 2]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%else ; %1 < 16
  movu                 m0, [srcq]
  movu                 m2, [srcq+2]
  pmullw               m0, filter_x_a
  pmullw               m2, filter_x_b
  paddw                m0, filter_rnd
  paddw                m0, m2
  psrlw                m0, 4

  INC_SRC_BY_SRC_STRIDE

.x_other_y_other_loop:
  movu                 m2, [srcq]
  movu                 m4, [srcq+2]
  INC_SRC_BY_SRC_STRIDE
  movu                 m3, [srcq]
  movu                 m5, [srcq+2]
  pmullw               m2, filter_x_a
  pmullw               m4, filter_x_b
  paddw                m2, filter_rnd
  pmullw               m3, filter_x_a
  pmullw               m5, filter_x_b
  paddw                m3, filter_rnd
  paddw                m2, m4
  paddw                m3, m5
  psrlw                m2, 4
  psrlw                m3, 4
  mova                 m4, m2
  mova                 m5, m3
  pmullw               m0, filter_y_a
  pmullw               m2, filter_y_b
  paddw                m0, filter_rnd
  pmullw               m4, filter_y_a
  pmullw               m3, filter_y_b
  paddw                m0, m2
  paddw                m4, filter_rnd
  mova                 m2, [refq]
  paddw                m4, m3
  psrlw                m0, 4
  psrlw                m4, 4
  mova                 m3, [refq+ref_strideq*2]
%if %2 == 1 ; avg
  pavgw                m0, [second_predq]
  add                second_predq, second_str
  pavgw                m4, [second_predq]
%endif
  SUM_SSE              m0, m2, m4, m3, m6, m7
  mova                 m0, m5

  INC_SRC_BY_SRC_STRIDE
  lea                refq, [refq + ref_strideq * 4]
%if %2 == 1 ; avg
  add                second_predq, second_str
%endif
%endif
  dec                   block_height
  jg .x_other_y_other_loop
%undef filter_x_a
%undef filter_x_b
%undef filter_y_a
%undef filter_y_b
%undef filter_rnd
  STORE_AND_RET
%endmacro

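; The instantiations below generate the width-8 and width-16 kernels and
; their compound-prediction (avg) counterparts. With x86inc's name mangling
; these are exported under names along the lines of
; vpx_highbd_sub_pixel_variance8xh_sse2 and
; vpx_highbd_sub_pixel_avg_variance16xh_sse2 (the exact prefix depends on the
; build configuration).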
INIT_XMM sse2
SUBPEL_VARIANCE  8
SUBPEL_VARIANCE 16

INIT_XMM sse2
SUBPEL_VARIANCE  8, 1
SUBPEL_VARIANCE 16, 1