• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%include "third_party/x86inc/x86inc.asm"
15
16SECTION .text
17
18; Macro Arguments
19; Arg 1: Width
20; Arg 2: Height
21; Arg 3: Number of general purpose registers: 5, or 7 when the kernel also
;        needs src_stride3/ref_stride3 (the macro only tests %3 == 5 and
;        treats every other value as the 7-register layout)
22; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
23; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
;        (optional; defaults to 7)
;
; Emits the prologue shared by every SAD kernel in this file:
;   - declares the function with cglobal under the name
;       highbd_sad%1x%2        (Arg 4 == 0)
;       highbd_sad%1x%2_avg    (Arg 4 == 1, takes a 5th argument second_pred)
;       highbd_sad_skip_%1x%2  (Arg 4 == 2)
;   - in skip mode, doubles both strides so every other row is visited
;   - when Arg 3 is not 5, computes src_stride3 / ref_stride3 (3 * stride)
;   - shifts src, ref (and second_pred for avg) left by one bit
; The caller of this macro is expected to initialise n_rowsd and supply the
; loop body and RET.
24%macro HIGH_SAD_FN 4-5 7
25%if %4 == 0
26%if %3 == 5
27cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
28%else ; %3 == 7
29cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
30                            src_stride3, ref_stride3, n_rows
31%endif ; %3 == 5/7
32%elif %4 == 1 ; avg
33%if %3 == 5
; one register more than %3 is requested so that n_rows still gets a register
; after the extra second_pred argument
34cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
35                                    second_pred, n_rows
36%else ; %3 == 7
37cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
38                                              ref, ref_stride, \
39                                              second_pred, \
40                                              src_stride3, ref_stride3
; n_rows is not one of the named registers in this variant, so the row counter
; is defined by hand: r7d on x86-64, a dword memory operand (r0m) on x86-32.
; NOTE(review): r0m is presumably x86inc's stack slot of argument 0 (src),
; reused as scratch once src is held in a register - confirm against x86inc.
41%if AOM_ARCH_X86_64
42%define n_rowsd r7d
43%else ; x86-32
44%define n_rowsd dword r0m
45%endif ; x86-32/64
46%endif ; %3 == 5/7
47%else  ; %4 == 2, skip rows
48%if %3 == 5
49cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
50%else ; %3 == 7
51cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
52                            src_stride3, ref_stride3, n_rows
53%endif ; %3 == 5/7
54%endif ; sad/avg/skip
55%if %4 == 2  ; double the stride if we are skipping rows
; the doubling is done on the 32-bit stride registers, before they are widened
56  lea          src_strided, [src_strided*2]
57  lea          ref_strided, [ref_strided*2]
58%endif
; x86inc helper; presumably sign-extends the 32-bit strides to full register
; width where the two register names differ - confirm against x86inc
59  movsxdifnidn src_strideq, src_strided
60  movsxdifnidn ref_strideq, ref_strided
61%if %3 == 7
62  lea         src_stride3q, [src_strideq*3] ; 3 * stride, used to reach the 4th row
63  lea         ref_stride3q, [ref_strideq*3]
64%endif ; %3 == 7
65; convert src, ref & second_pred to short ptrs (from byte ptrs)
66  shl                 srcq, 1
67  shl                 refq, 1
68%if %4 == 1
69  shl         second_predq, 1
70%endif
71%endmacro
72
73; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
74;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD64XN height [, type]
;   Arg 1: block height
;   Arg 2: 0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
; Each loop iteration handles one row of 64 16-bit samples (128 bytes), as two
; halves of four 16-byte vectors. m0 holds the running dword sums and m6 stays
; zero for widening words to dwords. The result is returned in eax.
; ref is read with movu; src is read with mova and as a direct memory operand,
; so src rows are presumably required to be 16-byte aligned - confirm against
; the callers.
; NOTE(review): up to four absolute differences are added in a 16-bit lane
; before widening, which assumes the sample bit depth is small enough for
; that sum not to wrap - confirm the supported bit-depth range.
75%macro HIGH_SAD64XN 1-2 0
76  HIGH_SAD_FN 64, %1, 5, %2
77%if %2 == 2  ; skip rows, so divide number of rows by 2
78  mov              n_rowsd, %1/2
79%else
80  mov              n_rowsd, %1
81%endif
82  pxor                  m0, m0 ; m0 = dword accumulators
83  pxor                  m6, m6 ; m6 = 0, used to zero-extend words

85.loop:
86  ; first half of each row
87  movu                  m1, [refq]
88  movu                  m2, [refq+16]
89  movu                  m3, [refq+32]
90  movu                  m4, [refq+48]
91%if %2 == 1
; avg: replace ref by the rounded average of ref and second_pred
92  pavgw                 m1, [second_predq+mmsize*0]
93  pavgw                 m2, [second_predq+mmsize*1]
94  pavgw                 m3, [second_predq+mmsize*2]
95  pavgw                 m4, [second_predq+mmsize*3]
96  lea         second_predq, [second_predq+mmsize*4]
97%endif
; |src - ref| per word: the two unsigned saturating differences are each
; either the true difference or 0, so OR-ing them gives the absolute value
98  mova                  m5, [srcq]
99  psubusw               m5, m1 ; m5 = max(src - ref, 0)
100  psubusw               m1, [srcq] ; m1 = max(ref - src, 0)
101  por                   m1, m5 ; m1 = |src - ref|
102  mova                  m5, [srcq+16]
103  psubusw               m5, m2
104  psubusw               m2, [srcq+16]
105  por                   m2, m5
106  mova                  m5, [srcq+32]
107  psubusw               m5, m3
108  psubusw               m3, [srcq+32]
109  por                   m3, m5
110  mova                  m5, [srcq+48]
111  psubusw               m5, m4
112  psubusw               m4, [srcq+48]
113  por                   m4, m5
; pairwise word adds, then fold the high 64 bits onto the low 64 bits so that
; only the low four words of m1 and m3 need widening
114  paddw                 m1, m2
115  paddw                 m3, m4
116  movhlps               m2, m1
117  movhlps               m4, m3
118  paddw                 m1, m2
119  paddw                 m3, m4
120  punpcklwd             m1, m6 ; low four words -> dwords
121  punpcklwd             m3, m6
122  paddd                 m0, m1
123  paddd                 m0, m3
124  ; second half of each row
125  movu                  m1, [refq+64]
126  movu                  m2, [refq+80]
127  movu                  m3, [refq+96]
128  movu                  m4, [refq+112]
129%if %2 == 1
; second_predq was already advanced after the first half, so offsets restart
; at 0 here
130  pavgw                 m1, [second_predq+mmsize*0]
131  pavgw                 m2, [second_predq+mmsize*1]
132  pavgw                 m3, [second_predq+mmsize*2]
133  pavgw                 m4, [second_predq+mmsize*3]
134  lea         second_predq, [second_predq+mmsize*4]
135%endif
136  mova                  m5, [srcq+64]
137  psubusw               m5, m1
138  psubusw               m1, [srcq+64]
139  por                   m1, m5
140  mova                  m5, [srcq+80]
141  psubusw               m5, m2
142  psubusw               m2, [srcq+80]
143  por                   m2, m5
144  mova                  m5, [srcq+96]
145  psubusw               m5, m3
146  psubusw               m3, [srcq+96]
147  por                   m3, m5
148  mova                  m5, [srcq+112]
149  psubusw               m5, m4
150  psubusw               m4, [srcq+112]
151  por                   m4, m5
152  paddw                 m1, m2
153  paddw                 m3, m4
154  movhlps               m2, m1
155  movhlps               m4, m3
156  paddw                 m1, m2
157  paddw                 m3, m4
158  punpcklwd             m1, m6
159  punpcklwd             m3, m6
; stride*2 bytes = one row of 16-bit samples (the stride was doubled in skip
; mode, so this then steps over two rows)
160  lea                 refq, [refq+ref_strideq*2]
161  paddd                 m0, m1
162  lea                 srcq, [srcq+src_strideq*2]
163  paddd                 m0, m3

165  dec              n_rowsd
166  jg .loop

; horizontal reduction of the four dword partial sums in m0 into lane 0
168  movhlps               m1, m0
169  paddd                 m0, m1 ; lanes 0,1 = (d0+d2), (d1+d3)
170  punpckldq             m0, m6 ; spread them to lanes 0 and 2
171  movhlps               m1, m0
172  paddd                 m0, m1 ; lane 0 = total
173%if %2 == 2  ; we skipped rows, so we need to double the sad
174  pslld                 m0, 1
175%endif
176  movd                 eax, m0
177  RET
178%endmacro
179
; Instantiate the 64-wide kernels: plain, avg (second argument 1) and
; skip-rows (second argument 2). The trailing comments give the name passed
; to cglobal by HIGH_SAD_FN plus the instruction-set suffix.
180INIT_XMM sse2
181HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
182HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
183HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
184HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
185HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
186HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
187HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
188HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
189HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
190
191; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
192;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD32XN height [, type]
;   Arg 1: block height
;   Arg 2: 0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
; Each loop iteration handles one row of 32 16-bit samples (64 bytes) as four
; 16-byte vectors. m0 holds the running dword sums and m6 stays zero for
; widening words to dwords. The result is returned in eax.
; ref is read with movu; src is read with mova and as a direct memory operand,
; so src rows are presumably required to be 16-byte aligned - confirm against
; the callers.
193%macro HIGH_SAD32XN 1-2 0
194  HIGH_SAD_FN 32, %1, 5, %2
195%if %2 == 2  ; skip rows, so divide number of rows by 2
196  mov              n_rowsd, %1/2
197%else
198  mov              n_rowsd, %1
199%endif
200  pxor                  m0, m0 ; m0 = dword accumulators
201  pxor                  m6, m6 ; m6 = 0, used to zero-extend words

203.loop:
204  movu                  m1, [refq]
205  movu                  m2, [refq+16]
206  movu                  m3, [refq+32]
207  movu                  m4, [refq+48]
208%if %2 == 1
; avg: replace ref by the rounded average of ref and second_pred
209  pavgw                 m1, [second_predq+mmsize*0]
210  pavgw                 m2, [second_predq+mmsize*1]
211  pavgw                 m3, [second_predq+mmsize*2]
212  pavgw                 m4, [second_predq+mmsize*3]
213  lea         second_predq, [second_predq+mmsize*4]
214%endif
; |src - ref| per word: OR of the two unsigned saturating differences
215  mova                  m5, [srcq]
216  psubusw               m5, m1 ; m5 = max(src - ref, 0)
217  psubusw               m1, [srcq] ; m1 = max(ref - src, 0)
218  por                   m1, m5 ; m1 = |src - ref|
219  mova                  m5, [srcq+16]
220  psubusw               m5, m2
221  psubusw               m2, [srcq+16]
222  por                   m2, m5
223  mova                  m5, [srcq+32]
224  psubusw               m5, m3
225  psubusw               m3, [srcq+32]
226  por                   m3, m5
227  mova                  m5, [srcq+48]
228  psubusw               m5, m4
229  psubusw               m4, [srcq+48]
230  por                   m4, m5
; pairwise word adds, fold high 64 bits onto low 64 bits, then widen the low
; four words to dwords and accumulate
231  paddw                 m1, m2
232  paddw                 m3, m4
233  movhlps               m2, m1
234  movhlps               m4, m3
235  paddw                 m1, m2
236  paddw                 m3, m4
237  punpcklwd             m1, m6
238  punpcklwd             m3, m6
; stride*2 bytes = one row of 16-bit samples (two rows in skip mode, where
; the stride was doubled)
239  lea                 refq, [refq+ref_strideq*2]
240  paddd                 m0, m1
241  lea                 srcq, [srcq+src_strideq*2]
242  paddd                 m0, m3
243  dec              n_rowsd
244  jg .loop

; horizontal reduction of the four dword partial sums in m0 into lane 0
246  movhlps               m1, m0
247  paddd                 m0, m1
248  punpckldq             m0, m6
249  movhlps               m1, m0
250  paddd                 m0, m1
251%if %2 == 2  ; we skipped rows, so we need to double the sad
252  pslld                 m0, 1
253%endif
254  movd                 eax, m0
255  RET
256%endmacro
257
; Instantiate the 32-wide kernels: plain, avg (second argument 1) and
; skip-rows (second argument 2).
258INIT_XMM sse2
259HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
260HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
261HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
262HIGH_SAD32XN  8 ; highbd_sad32x8_sse2
263HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
264HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
265HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
266HIGH_SAD32XN  8, 1 ; highbd_sad32x8_avg_sse2
267HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
268HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
269HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
270HIGH_SAD32XN  8, 2 ; highbd_sad_skip_32x8_sse2
271
272; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
273;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD16XN height [, type]
;   Arg 1: block height
;   Arg 2: 0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
; Each loop iteration handles two rows of 16 16-bit samples (32 bytes per
; row): m1/m2 hold the first row, m3/m4 the row one stride further on.
; m0 holds the running dword sums and m6 stays zero for widening. The result
; is returned in eax.
; ref is read with movu; src is read with mova and as a direct memory operand,
; so src rows are presumably required to be 16-byte aligned - confirm against
; the callers.
274%macro HIGH_SAD16XN 1-2 0
275  HIGH_SAD_FN 16, %1, 5, %2
276%if %2 == 2  ; skip rows: half the rows, two rows per iteration
277  mov              n_rowsd, %1/4
278%else
279  mov              n_rowsd, %1/2 ; two rows per iteration
280%endif
281  pxor                  m0, m0 ; m0 = dword accumulators
282  pxor                  m6, m6 ; m6 = 0, used to zero-extend words

284.loop:
285  movu                  m1, [refq]
286  movu                  m2, [refq+16]
287  movu                  m3, [refq+ref_strideq*2]
288  movu                  m4, [refq+ref_strideq*2+16]
289%if %2 == 1
; avg: second_pred is read as 64 consecutive bytes, 32 for each of the two rows
290  pavgw                 m1, [second_predq+mmsize*0]
291  pavgw                 m2, [second_predq+16]
292  pavgw                 m3, [second_predq+mmsize*2]
293  pavgw                 m4, [second_predq+mmsize*2+16]
294  lea         second_predq, [second_predq+mmsize*4]
295%endif
; |src - ref| per word: OR of the two unsigned saturating differences
296  mova                  m5, [srcq]
297  psubusw               m5, m1 ; m5 = max(src - ref, 0)
298  psubusw               m1, [srcq] ; m1 = max(ref - src, 0)
299  por                   m1, m5 ; m1 = |src - ref|
300  mova                  m5, [srcq+16]
301  psubusw               m5, m2
302  psubusw               m2, [srcq+16]
303  por                   m2, m5
304  mova                  m5, [srcq+src_strideq*2]
305  psubusw               m5, m3
306  psubusw               m3, [srcq+src_strideq*2]
307  por                   m3, m5
308  mova                  m5, [srcq+src_strideq*2+16]
309  psubusw               m5, m4
310  psubusw               m4, [srcq+src_strideq*2+16]
311  por                   m4, m5
; pairwise word adds, fold high 64 bits onto low 64 bits, then widen the low
; four words to dwords and accumulate
312  paddw                 m1, m2
313  paddw                 m3, m4
314  movhlps               m2, m1
315  movhlps               m4, m3
316  paddw                 m1, m2
317  paddw                 m3, m4
318  punpcklwd             m1, m6
319  punpcklwd             m3, m6
; stride*4 bytes = two strides of 16-bit samples
320  lea                 refq, [refq+ref_strideq*4]
321  paddd                 m0, m1
322  lea                 srcq, [srcq+src_strideq*4]
323  paddd                 m0, m3
324  dec              n_rowsd
325  jg .loop

; horizontal reduction of the four dword partial sums in m0 into lane 0
327  movhlps               m1, m0
328  paddd                 m0, m1
329  punpckldq             m0, m6
330  movhlps               m1, m0
331  paddd                 m0, m1
332%if %2 == 2  ; we skipped rows, so we need to double the sad
333  pslld                 m0, 1
334%endif
335  movd                 eax, m0
336  RET
337%endmacro
338
; Instantiate the 16-wide kernels: plain, avg (second argument 1) and
; skip-rows (second argument 2).
339INIT_XMM sse2
340HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
341HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
342HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
343HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
344HIGH_SAD16XN  4 ; highbd_sad16x4_sse2
345HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
346HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
347HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
348HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
349HIGH_SAD16XN  4, 1 ; highbd_sad16x4_avg_sse2
350HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
351HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
352HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
353HIGH_SAD16XN  8, 2 ; highbd_sad_skip_16x8_sse2
354; Current code fails when there are only 2 rows
; NOTE(review): HIGH_SAD16XN handles two rows per iteration and %1/4 is 1 for
; a height of 4, so the reason for the failure is not visible in this file -
; confirm before enabling the line below.
355; HIGH_SAD16XN  4, 2 ; highbd_sad_skip_16x4_sse2
356
357; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
358;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD8XN height [, type]
;   Arg 1: block height
;   Arg 2: 0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
; Each loop iteration handles four rows of 8 16-bit samples (one 16-byte
; vector per row), addressed with stride, 2*stride and the precomputed
; 3*stride registers. Both src and ref are loaded with movu. m7 is used as a
; scratch copy, which is why this macro requests 8 xmm registers.
; m0 holds the running dword sums and m6 stays zero for widening. The result
; is returned in eax.
359%macro HIGH_SAD8XN 1-2 0
360  HIGH_SAD_FN 8, %1, 7, %2, 8
361%if %2 == 2  ; skip rows: half the rows, four rows per iteration
362  mov              n_rowsd, %1/8
363%else
364  mov              n_rowsd, %1/4 ; four rows per iteration
365%endif
366  pxor                  m0, m0 ; m0 = dword accumulators
367  pxor                  m6, m6 ; m6 = 0, used to zero-extend words

369.loop:
370  movu                  m1, [refq]
371  movu                  m2, [refq+ref_strideq*2]
372  movu                  m3, [refq+ref_strideq*4]
373  movu                  m4, [refq+ref_stride3q*2]
374%if %2 == 1
; avg: replace ref by the rounded average of ref and second_pred
375  pavgw                 m1, [second_predq+mmsize*0]
376  pavgw                 m2, [second_predq+mmsize*1]
377  pavgw                 m3, [second_predq+mmsize*2]
378  pavgw                 m4, [second_predq+mmsize*3]
379  lea         second_predq, [second_predq+mmsize*4]
380%endif
; |src - ref| per word: m7 keeps a copy of ref so both saturating differences
; can be formed from registers, then they are OR-ed together
381  mova                  m7, m1
382  movu                  m5, [srcq]
383  psubusw               m1, m5 ; m1 = max(ref - src, 0)
384  psubusw               m5, m7 ; m5 = max(src - ref, 0)
385  por                   m1, m5 ; m1 = |src - ref|

387  mova                  m7, m2
388  movu                  m5, [srcq+src_strideq*2]
389  psubusw               m2, m5
390  psubusw               m5, m7
391  por                   m2, m5

393  mova                  m7, m3
394  movu                  m5, [srcq+src_strideq*4]
395  psubusw               m3, m5
396  psubusw               m5, m7
397  por                   m3, m5

399  mova                  m7, m4
400  movu                  m5, [srcq+src_stride3q*2]
401  psubusw               m4, m5
402  psubusw               m5, m7
403  por                   m4, m5

; pairwise word adds, fold high 64 bits onto low 64 bits, then widen the low
; four words to dwords and accumulate
405  paddw                 m1, m2
406  paddw                 m3, m4
407  movhlps               m2, m1
408  movhlps               m4, m3
409  paddw                 m1, m2
410  paddw                 m3, m4
411  punpcklwd             m1, m6
412  punpcklwd             m3, m6
; stride*8 bytes = four strides of 16-bit samples
413  lea                 refq, [refq+ref_strideq*8]
414  paddd                 m0, m1
415  lea                 srcq, [srcq+src_strideq*8]
416  paddd                 m0, m3
417  dec              n_rowsd
418  jg .loop

; horizontal reduction of the four dword partial sums in m0 into lane 0
420  movhlps               m1, m0
421  paddd                 m0, m1
422  punpckldq             m0, m6
423  movhlps               m1, m0
424  paddd                 m0, m1
425%if %2 == 2  ; we skipped rows, so we need to double the sad
426  pslld                 m0, 1
427%endif
428  movd                 eax, m0
429  RET
430%endmacro
431
; Instantiate the 8-wide kernels: plain, avg (second argument 1) and
; skip-rows (second argument 2).
432INIT_XMM sse2
433HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
434HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
435HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
436HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
437HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
438HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
439HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
440HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
441HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
442HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
443HIGH_SAD8XN  8, 2 ; highbd_sad_skip_8x8_sse2
444; Current code fails when there are only 2 rows (the HIGH_SAD8XN loop body
; always processes four rows per iteration)
445; HIGH_SAD8XN  4, 2 ; highbd_sad_skip_8x4_sse2
446
447; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
448;                                    uint8_t *ref, int ref_stride);
;
; HIGH_SAD4XN height [, type]
;   Arg 1: block height
;   Arg 2: 0 = plain sad (default), 1 = avg with second_pred, 2 = skip rows
; Each loop iteration handles four rows of 4 16-bit samples (8 bytes per row,
; loaded with movq). Rows 0 and 2 are interleaved into one vector and rows 1
; and 3 into another; src, ref and second_pred are interleaved the same way,
; so matching samples stay in matching lanes.
; m0 holds the running dword sums and m6 stays zero for widening. The result
; is returned in eax.
449%macro HIGH_SAD4XN 1-2 0
450  HIGH_SAD_FN 4, %1, 7, %2
451%if %2 == 2  ; skip rows: half the rows, four rows per iteration
452  mov              n_rowsd, %1/8
453%else
454  mov              n_rowsd, %1/4 ; four rows per iteration
455%endif
456  pxor                  m0, m0 ; m0 = dword accumulators
457  pxor                  m6, m6 ; m6 = 0, used to zero-extend words

459.loop:
460  movq                  m1, [refq]
461  movq                  m2, [refq+ref_strideq*2]
462  movq                  m3, [refq+ref_strideq*4]
463  movq                  m4, [refq+ref_stride3q*2]
464  punpcklwd             m1, m3 ; m1 = ref rows 0 and 2, word-interleaved
465  punpcklwd             m2, m4 ; m2 = ref rows 1 and 3, word-interleaved
466%if %2 == 1
; avg: second_pred is read as four consecutive 8-byte rows and interleaved
; like ref (rows 0 and 2, rows 1 and 3) before averaging
467  movq                  m3, [second_predq+8*0]
468  movq                  m5, [second_predq+8*2]
469  punpcklwd             m3, m5
470  movq                  m4, [second_predq+8*1]
471  movq                  m5, [second_predq+8*3]
472  punpcklwd             m4, m5
473  lea         second_predq, [second_predq+8*4]
474  pavgw                 m1, m3
475  pavgw                 m2, m4
476%endif
; src rows 0 and 2 against m1
477  movq                  m5, [srcq]
478  movq                  m3, [srcq+src_strideq*4]
479  punpcklwd             m5, m3
480  movdqa                m3, m1 ; keep a copy of ref for the second difference
481  psubusw               m1, m5 ; m1 = max(ref - src, 0)
482  psubusw               m5, m3 ; m5 = max(src - ref, 0)
483  por                   m1, m5 ; m1 = |src - ref|
; src rows 1 and 3 against m2
484  movq                  m5, [srcq+src_strideq*2]
485  movq                  m4, [srcq+src_stride3q*2]
486  punpcklwd             m5, m4
487  movdqa                m4, m2
488  psubusw               m2, m5
489  psubusw               m5, m4
490  por                   m2, m5
; add the two vectors as words, then widen both the low and the high four
; words to dwords and accumulate
491  paddw                 m1, m2
492  movdqa                m2, m1
493  punpcklwd             m1, m6
494  punpckhwd             m2, m6
; stride*8 bytes = four strides of 16-bit samples
495  lea                 refq, [refq+ref_strideq*8]
496  paddd                 m0, m1
497  lea                 srcq, [srcq+src_strideq*8]
498  paddd                 m0, m2
499  dec              n_rowsd
500  jg .loop

; horizontal reduction of the four dword partial sums in m0 into lane 0
502  movhlps               m1, m0
503  paddd                 m0, m1
504  punpckldq             m0, m6
505  movhlps               m1, m0
506  paddd                 m0, m1
507%if %2 == 2  ; we skipped rows, so we need to double the sad
508  pslld                 m0, 1
509%endif
510  movd                 eax, m0
511  RET
512%endmacro
513
; Instantiate the 4-wide kernels: plain, avg (second argument 1) and
; skip-rows (second argument 2).
514INIT_XMM sse2
515HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
516HIGH_SAD4XN  8 ; highbd_sad4x8_sse2
517HIGH_SAD4XN  4 ; highbd_sad4x4_sse2
518HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
519HIGH_SAD4XN  8, 1 ; highbd_sad4x8_avg_sse2
520HIGH_SAD4XN  4, 1 ; highbd_sad4x4_avg_sse2
521HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
522HIGH_SAD4XN  8, 2 ; highbd_sad_skip_4x8_sse2
523; Current code fails when there are only 2 rows (the HIGH_SAD4XN loop body
; always processes four rows per iteration)
524; HIGH_SAD4XN  4, 2 ; highbd_sad_skip_4x4_sse2
525