• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "vpx_ports/x86_abi_support.asm"
12
; TABULATE_SSIM - fold one batch of 16-bit lanes into the five running sums.
;   In:    xmm3 = source lanes (s), xmm4 = reference lanes (r)
;   Accum: xmm15 += s and xmm14 += r   (saturating unsigned word adds)
;          xmm13 += s*s, xmm12 += r*r, xmm11 += s*r
;          (pmaddwd sums adjacent word products into dword lanes)
;   Clobb: xmm1, xmm2, xmm3
%macro TABULATE_SSIM 0
        ; take working copies first; the squares are computed in place
        movdqa          xmm1, xmm3
        movdqa          xmm2, xmm4

        ; plain sums, while xmm3 still holds the untouched source lanes
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r

        ; products
        pmaddwd         xmm1, xmm1      ; s * s
        pmaddwd         xmm2, xmm2      ; r * r
        pmaddwd         xmm3, xmm4      ; s * r (overwrites xmm3)

        ; accumulate the dword partial sums
        paddd           xmm13, xmm1     ; sum_sq_s
        paddd           xmm12, xmm2     ; sum_sq_r
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro
26
; SUM_ACROSS_Q - horizontal sum of the four dword lanes of register %1.
; The dwords are first widened to qwords by interleaving with xmm0, so the
; adds themselves are 64-bit. The total ends up in the low qword of %1.
;   Expects: xmm0 == 0 (it is used as the zero source for the interleaves;
;            both callers in this file clear it before the row loop)
;   Clobb:   xmm2
%macro SUM_ACROSS_Q 1
        ; four dwords -> two qword partial sums
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0      ; d2, d3 zero-extended to qwords
        punpckldq       %1, xmm0        ; d0, d1 zero-extended to qwords
        paddq           %1, xmm2        ; (d0 + d2), (d1 + d3)

        ; two qwords -> one
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0      ; high qword moved down to lane 0
        punpcklqdq      %1, xmm0        ; keep low qword, clear the high one
        paddq           %1, xmm2        ; total in the low qword
%endmacro
38
; SUM_ACROSS_W - horizontal sum of the eight word lanes of register %1.
; The words are widened to dwords by interleaving with xmm0 and added
; pairwise; SUM_ACROSS_Q then reduces the resulting four dwords.
;   Expects: xmm0 == 0 (zero source for the interleaves)
;   Clobb:   xmm1, plus whatever SUM_ACROSS_Q clobbers
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0      ; w4..w7 zero-extended to dwords
        punpcklwd       %1, xmm0        ; w0..w3 zero-extended to dwords
        paddd           %1, xmm1        ; four dword partial sums
        SUM_ACROSS_Q    %1
%endmacro
47
48SECTION .text
49
;void vpx_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Walks 16 rows of 16 bytes from s and r (advancing by sp / rp bytes per
; row) and stores five 32-bit totals through the output pointers:
; sum of s, sum of r, sum of s*s, sum of r*r and sum of s*r.
;
; Register use inside the loop:
;   rsi = s row pointer      rcx = sp (sign-extended)
;   rdi = r row pointer      rax = rp (sign-extended)
;   rdx = rows remaining     xmm0 = zero
;   xmm11..xmm15 = accumulators (see TABULATE_SSIM)
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be a first pass for
; calculating all the parms needed for 16x16 ssim so we can play with dssim
; as distortion in mode selection code.
globalsym(vpx_ssim_parms_16x16_sse2)
sym(vpx_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; meaningful, so sign-extend them before using them in 64-bit pointer
    ; arithmetic (this also makes negative strides work).
    movsxd          rcx,        dword arg(1) ;sp
    mov             rdi,        arg(2) ;r
    movsxd          rax,        dword arg(3) ;rp

    pxor            xmm0, xmm0   ;zero, used for all byte/word/dword widening
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.NextRow:

    ;grab source and reference pixels (unaligned loads, 16 bytes each)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ;upper 8 bytes of each row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    ;lower 8 bytes of each row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ;reduce each accumulator to a single value in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15 ;*sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14 ;*sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13 ;*sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12 ;*sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11 ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
139
;void vpx_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    uint32_t *sum_s,
;    uint32_t *sum_r,
;    uint32_t *sum_sq_s,
;    uint32_t *sum_sq_r,
;    uint32_t *sum_sxr);
;
; Walks 8 rows of 8 bytes from s and r (advancing by sp / rp bytes per row)
; and stores five 32-bit totals through the output pointers: sum of s,
; sum of r, sum of s*s, sum of r*r and sum of s*r.
;
; Register use inside the loop:
;   rsi = s row pointer      rcx = sp (sign-extended)
;   rdi = r row pointer      rax = rp (sign-extended)
;   rdx = rows remaining     xmm0 = zero
;   xmm11..xmm15 = accumulators (see TABULATE_SSIM)
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be a first pass for
; calculating all the parms needed for 8x8 ssim so we can play with dssim
; as distortion in mode selection code.
globalsym(vpx_ssim_parms_8x8_sse2)
sym(vpx_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; meaningful, so sign-extend them before using them in 64-bit pointer
    ; arithmetic (this also makes negative strides work).
    movsxd          rcx,        dword arg(1) ;sp
    mov             rdi,        arg(2) ;r
    movsxd          rax,        dword arg(3) ;rp

    pxor            xmm0, xmm0   ;zero, used for all byte/word/dword widening
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8      ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each, upper half zeroed)
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ;reduce each accumulator to a single value in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;store the low 32 bits of each total through the output pointers
    mov             rdi,arg(4)
    movd            [rdi], xmm15 ;*sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14 ;*sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13 ;*sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12 ;*sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11 ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
220