;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers used: 5 for the 32/64/128-wide
;        blocks, 7 for the 4/8/16-wide blocks (which also need stride*3)
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
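;
; For orientation, the plain (%4 == 0) variant computes the same value as this
; C sketch (illustrative only, not the library's actual C fallback):
;
;   unsigned int sad_ref(const uint8_t *src, int src_stride,
;                        const uint8_t *ref, int ref_stride, int w, int h) {
;     unsigned int sad = 0;
;     for (int y = 0; y < h; ++y)
;       for (int x = 0; x < w; ++x)
;         sad += abs(src[y * src_stride + x] - ref[y * ref_stride + x]);
;     return sad;
;   }
;
; The avg variant (%4 == 1) compares src against the rounded average of ref
; and a contiguous w*h second_pred block, and the skip variant (%4 == 2)
; visits only the even rows (stride doubled, row count halved) and doubles
; the accumulated total at the end.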
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2 ; skip rows so double the stride
  lea           src_strided, [src_strided*2]
  lea           ref_strided, [ref_strided*2]
%endif ; %4 skip
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
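  ; Note: cglobal exposes each named argument both as a full-width register
  ; (src_strideq) and as its 32-bit view (src_strided); movsxdifnidn (from
  ; x86inc.asm) sign-extends the int stride into the 64-bit register and is
  ; effectively a no-op when both names map to the same register (x86-32).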
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
%endif
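  ; psadbw against the src block leaves two partial sums per register (one
  ; 16-bit SAD per 8-byte half, zero-extended to 64 bits), so the dword adds
  ; below accumulate them without carrying between lanes.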
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*4]
  pavgb                 m2, [second_predq+mmsize*5]
  pavgb                 m3, [second_predq+mmsize*6]
  pavgb                 m4, [second_predq+mmsize*7]
  lea         second_predq, [second_predq+mmsize*8]
%endif
  psadbw                m1, [srcq+64]
  psadbw                m2, [srcq+80]
  psadbw                m3, [srcq+96]
  psadbw                m4, [srcq+112]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  sub              n_rowsd, 1
  jg .loop

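  ; Fold the high-qword partial sum into the low qword; the 32-bit total is
  ; then returned in eax.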
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad128x128_skip_sse2
SAD128XN 64      ; sad128x64_sse2
SAD128XN 64, 1   ; sad128x64_avg_sse2
SAD128XN 64, 2   ; sad128x64_skip_sse2
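
; A minimal C-side usage sketch for one of the functions generated above
; (signature as in the comment before SAD128XN; inside libaom these are
; normally reached through the runtime-dispatch wrappers rather than called
; directly):
;
;   unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;   unsigned int cost = aom_sad128x128_sse2(src, src_stride,
;                                           ref, ref_stride);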


; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN  16     ; sad64x16_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN 128, 2  ; sad64x128_skip_sse2
SAD64XN  64, 2  ; sad64x64_skip_sse2
SAD64XN  32, 2  ; sad64x32_skip_sse2
SAD64XN  16, 2  ; sad64x16_skip_sse2

; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4
%else
  mov              n_rowsd, %1/2
%endif
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64    ; sad32x64_sse2
SAD32XN 32    ; sad32x32_sse2
SAD32XN 16    ; sad32x16_sse2
SAD32XN  8    ; sad32x8_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
SAD32XN  8, 1 ; sad32x8_avg_sse2
SAD32XN 64, 2 ; sad32x64_skip_sse2
SAD32XN 32, 2 ; sad32x32_skip_sse2
SAD32XN 16, 2 ; sad32x16_skip_sse2
SAD32XN  8, 2 ; sad32x8_skip_sse2

; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 64    ; sad16x64_sse2
SAD16XN 32    ; sad16x32_sse2
SAD16XN 16    ; sad16x16_sse2
SAD16XN  8    ; sad16x8_sse2
SAD16XN  4    ; sad16x4_sse2
SAD16XN 64, 1 ; sad16x64_avg_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2
SAD16XN  4, 1 ; sad16x4_avg_sse2
SAD16XN 64, 2 ; sad16x64_skip_sse2
SAD16XN 32, 2 ; sad16x32_skip_sse2
SAD16XN 16, 2 ; sad16x16_skip_sse2
SAD16XN  8, 2 ; sad16x8_skip_sse2

; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
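  ; Pack two 8-pixel rows into each register: movh fills the low 8 bytes and
  ; movhps the high 8 bytes, so each psadbw below covers two rows at once.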
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 32    ; sad8x32_sse2
SAD8XN 16    ; sad8x16_sse2
SAD8XN  8    ; sad8x8_sse2
SAD8XN  4    ; sad8x4_sse2
SAD8XN 32, 1 ; sad8x32_avg_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2
SAD8XN 32, 2 ; sad8x32_skip_sse2
SAD8XN 16, 2 ; sad8x16_skip_sse2
SAD8XN  8, 2 ; sad8x8_skip_sse2

; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8
%else
  mov              n_rowsd, %1/4
%endif
  pxor                  m0, m0

.loop:
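  ; Gather four 4-pixel rows into one register: movd loads each row into the
  ; low dword, punpckldq pairs rows into qwords, and movlhps merges the two
  ; qwords, so a single psadbw covers four rows.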
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN 16 ; sad4x16_sse2
SAD4XN  8 ; sad4x8_sse2
SAD4XN  4 ; sad4x4_sse2
SAD4XN 16, 1 ; sad4x16_avg_sse2
SAD4XN  8, 1 ; sad4x8_avg_sse2
SAD4XN  4, 1 ; sad4x4_avg_sse2
SAD4XN 16, 2 ; sad4x16_skip_sse2
SAD4XN  8, 2 ; sad4x8_skip_sse2
