;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

; The ABI helper macros used below (sym, globalsym, arg, SHADOW_ARGS_TO_STACK,
; UNSHADOW_ARGS, SAVE_XMM, RESTORE_XMM) are not defined in this file; they are
; expected to come from this include.
%include "aom_ports/x86_abi_support.asm"

; TABULATE_SSIM - fold one batch of eight source/reference pixels into the
; running sums sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr.
;
; In:       xmm3 = eight source pixels as 16-bit words
;           xmm4 = eight reference pixels as 16-bit words
;           (the callers in this file produce them by unpacking bytes against
;           a zeroed xmm0, so every word is in 0..255)
; Updates:  xmm15 = sum_s     eight word lanes, saturating add
;           xmm14 = sum_r     eight word lanes, saturating add
;           xmm13 = sum_sq_s  four dword lanes
;           xmm12 = sum_sq_r  four dword lanes
;           xmm11 = sum_sxr   four dword lanes
; Clobbers: xmm1, xmm2, xmm3 (xmm4 is left unchanged)
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3     ; sum_s  += s
        paddusw         xmm14, xmm4     ; sum_r  += r
        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1      ; s*s, adjacent pairs added -> 4 dwords
        paddd           xmm13, xmm1     ; sum_sq_s
        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2      ; r*r, adjacent pairs added -> 4 dwords
        paddd           xmm12, xmm2     ; sum_sq_r
        pmaddwd         xmm3, xmm4      ; s*r, adjacent pairs added (destroys xmm3)
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontal sum of the four 32-bit lanes of register %1.
;
; The dwords are first zero-extended to qwords (interleaved with xmm0), so the
; additions are carried out at 64-bit width.
;
; In:       %1   = four dwords to add up
;           xmm0 = must be all zero
; Out:      %1   = total in the low qword, high qword zero
; Clobbers: xmm2 (so %1 must not be xmm0 or xmm2)
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckldq       %1, xmm0        ; %1   = { d0, d1 } widened to qwords
        punpckhdq       xmm2, xmm0      ; xmm2 = { d2, d3 } widened to qwords
        paddq           %1, xmm2        ; %1   = { d0+d2, d1+d3 }
        movdqa          xmm2, %1
        punpcklqdq      %1, xmm0        ; %1   = { d0+d2, 0 }
        punpckhqdq      xmm2, xmm0      ; xmm2 = { d1+d3, 0 }
        paddq           %1, xmm2        ; %1   = { d0+d1+d2+d3, 0 }
%endmacro

; SUM_ACROSS_W - horizontal sum of the eight 16-bit lanes of register %1.
;
; The words are zero-extended to dwords (interleaved with xmm0), added pairwise,
; and the remaining four dwords are reduced by SUM_ACROSS_Q.
;
; In:       %1   = eight unsigned words to add up
;           xmm0 = must be all zero
; Out:      %1   = total in the low qword, high qword zero
; Clobbers: xmm1, xmm2 (so %1 must not be xmm0, xmm1 or xmm2)
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpcklwd       %1, xmm0        ; %1   = words 0..3 widened to dwords
        punpckhwd       xmm1, xmm0      ; xmm1 = words 4..7 widened to dwords
        paddd           %1, xmm1        ; four partial sums
        SUM_ACROSS_Q    %1
%endmacro

SECTION .text

;void aom_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Walks 16 rows of 16 pixels from s (stride sp) and r (stride rp) and stores
; five 32-bit totals through the output pointers: sum of s, sum of r, sum of
; s*s, sum of r*r and sum of s*r.
;
; Register roles:
;   rsi / rdi    current row of s / r
;   rcx / rax    sp / rp, sign-extended to 64 bits
;   rdx          rows remaining
;   xmm0         constant zero (unpack partner for the widening steps)
;   xmm15..xmm11 accumulators, see TABULATE_SSIM
;   xmm1..xmm6   scratch
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
globalsym(aom_ssim_parms_16x16_sse2)
sym(aom_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; The strides are C ints: only the low 32 bits of their argument slots are
    ; meaningful, so sign-extend them instead of loading all 64 bits.
    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.NextRow:

    ;grab source and reference pixels (16 bytes each, no alignment assumed)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ; upper eight pixels of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    ; lower eight pixels of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; Each word lane of sum_s/sum_r received 32 values of at most 255, so the
    ; saturating adds never clipped. Reduce every accumulator to one total.
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15
    mov             rdi,arg(5)
    movd            [rdi], xmm14
    mov             rdi,arg(6)
    movd            [rdi], xmm13
    mov             rdi,arg(7)
    movd            [rdi], xmm12
    mov             rdi,arg(8)
    movd            [rdi], xmm11

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void aom_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Walks 8 rows of 8 pixels from s (stride sp) and r (stride rp) and stores
; five 32-bit totals through the output pointers: sum of s, sum of r, sum of
; s*s, sum of r*r and sum of s*r.
;
; Register roles:
;   rsi / rdi    current row of s / r
;   rcx / rax    sp / rp, sign-extended to 64 bits
;   rdx          rows remaining
;   xmm0         constant zero (unpack partner for the widening steps)
;   xmm15..xmm11 accumulators, see TABULATE_SSIM
;   xmm1..xmm4   scratch
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
globalsym(aom_ssim_parms_8x8_sse2)
sym(aom_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    ; The strides are C ints: only the low 32 bits of their argument slots are
    ; meaningful, so sign-extend them instead of loading all 64 bits.
    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp

    pxor            xmm0, xmm0
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8      ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each) and widen them to words
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; Each word lane of sum_s/sum_r received 8 values of at most 255, so the
    ; saturating adds never clipped. Reduce every accumulator to one total.
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15
    mov             rdi,arg(5)
    movd            [rdi], xmm14
    mov             rdi,arg(6)
    movd            [rdi], xmm13
    mov             rdi,arg(7)
    movd            [rdi], xmm12
    mov             rdi,arg(8)
    movd            [rdi], xmm11

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
