1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include "memory.h"
13 #include "preproc.h"
14 #include "pragmas.h"
15
16 /****************************************************************************
17 * Macros
18 ****************************************************************************/
/* Number of history slots kept per pixel group by the temporal filters below. */
#define FRAMECOUNT 7

/*
 * Rounds X up to the next multiple of 32 (returns X unchanged when it is
 * already a multiple of 32). The argument is parenthesised so that compound
 * expressions are cast as a whole, and the mask is built at the width of
 * unsigned long so no high bits are dropped where unsigned long is wider
 * than 32 bits.
 */
#define ROUNDUP32(X) ( ( ( (unsigned long) (X) ) + 31 ) & ~31UL )
21
22 /****************************************************************************
23 * Imports
24 ****************************************************************************/
25 extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
26
27 /****************************************************************************
28 * Exported Global Variables
29 ****************************************************************************/
30 void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
31
32 /****************************************************************************
33 *
34 * ROUTINE : temp_filter_wmt
35 *
36 * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
37 * unsigned char *s : Pointer to source frame.
38 * unsigned char *d : Pointer to destination frame.
39 * int bytes : Number of bytes to filter.
40 * int strength : Strength of filter to apply.
41 *
42 * OUTPUTS : None.
43 *
44 * RETURNS : void
45 *
46 * FUNCTION : Performs a closesness adjusted temporarl blur
47 *
48 * SPECIAL NOTES : Destination frame can be same as source frame.
49 *
50 ****************************************************************************/
temp_filter_wmt(pre_proc_instance * ppi,unsigned char * s,unsigned char * d,int bytes,int strength)51 void temp_filter_wmt
52 (
53 pre_proc_instance *ppi,
54 unsigned char *s,
55 unsigned char *d,
56 int bytes,
57 int strength
58 )
59 {
60 int byte = 0;
61 unsigned char *frameptr = ppi->frame_buffer;
62
63 __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3};
64 __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
65
66 if (ppi->frame == 0)
67 {
68 do
69 {
70 int i;
71 int frame = 0;
72
73 do
74 {
75 for (i = 0; i < 8; i++)
76 {
77 *frameptr = s[byte+i];
78 ++frameptr;
79 }
80
81 ++frame;
82 }
83 while (frame < FRAMECOUNT);
84
85 for (i = 0; i < 8; i++)
86 d[byte+i] = s[byte+i];
87
88 byte += 8;
89
90 }
91 while (byte < bytes);
92 }
93 else
94 {
95 int i;
96 int offset2 = (ppi->frame % FRAMECOUNT);
97
98 do
99 {
100 __declspec(align(16)) unsigned short counts[8];
101 __declspec(align(16)) unsigned short sums[8];
102 __asm
103 {
104 mov eax, offset2
105 mov edi, s // source pixels
106 pxor xmm1, xmm1 // accumulator
107
108 pxor xmm7, xmm7
109
110 mov esi, frameptr // accumulator
111 pxor xmm2, xmm2 // count
112
113 movq xmm3, QWORD PTR [edi]
114
115 movq QWORD PTR [esi+8*eax], xmm3
116
117 punpcklbw xmm3, xmm2 // xmm3 source pixels
118 mov ecx, FRAMECOUNT
119
120 next_frame:
121 movq xmm4, QWORD PTR [esi] // get frame buffer values
122 punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels
123 movdqa xmm6, xmm4 // save the pixel values
124 psubsw xmm4, xmm3 // subtracted pixel values
125 pmullw xmm4, xmm4 // square xmm4
126 movd xmm5, strength
127 psrlw xmm4, xmm5 // should be strength
128 pmullw xmm4, threes // 3 * modifier
129 movdqa xmm5, sixteens // 16s
130 psubusw xmm5, xmm4 // 16 - modifiers
131 movdqa xmm4, xmm5 // save the modifiers
132 pmullw xmm4, xmm6 // multiplier values
133 paddusw xmm1, xmm4 // accumulator
134 paddusw xmm2, xmm5 // count
135 add esi, 8 // next frame
136 dec ecx // next set of eight pixels
137 jnz next_frame
138
139 movdqa counts, xmm2
140 psrlw xmm2, 1 // divide count by 2 for rounding
141 paddusw xmm1, xmm2 // rounding added in
142
143 mov frameptr, esi
144
145 movdqa sums, xmm1
146 }
147
148 for (i = 0; i < 8; i++)
149 {
150 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
151 blurvalue >>= 16;
152 d[i] = blurvalue;
153 }
154
155 s += 8;
156 d += 8;
157 byte += 8;
158 }
159 while (byte < bytes);
160 }
161
162 ++ppi->frame;
163 __asm emms
164 }
165
166 /****************************************************************************
167 *
168 * ROUTINE : temp_filter_mmx
169 *
170 * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance.
171 * unsigned char *s : Pointer to source frame.
172 * unsigned char *d : Pointer to destination frame.
173 * int bytes : Number of bytes to filter.
174 * int strength : Strength of filter to apply.
175 *
176 * OUTPUTS : None.
177 *
178 * RETURNS : void
179 *
180 * FUNCTION : Performs a closesness adjusted temporarl blur
181 *
182 * SPECIAL NOTES : Destination frame can be same as source frame.
183 *
184 ****************************************************************************/
temp_filter_mmx(pre_proc_instance * ppi,unsigned char * s,unsigned char * d,int bytes,int strength)185 void temp_filter_mmx
186 (
187 pre_proc_instance *ppi,
188 unsigned char *s,
189 unsigned char *d,
190 int bytes,
191 int strength
192 )
193 {
194 int byte = 0;
195 unsigned char *frameptr = ppi->frame_buffer;
196
197 __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3};
198 __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
199
200 if (ppi->frame == 0)
201 {
202 do
203 {
204 int i;
205 int frame = 0;
206
207 do
208 {
209 for (i = 0; i < 4; i++)
210 {
211 *frameptr = s[byte+i];
212 ++frameptr;
213 }
214
215 ++frame;
216 }
217 while (frame < FRAMECOUNT);
218
219 for (i = 0; i < 4; i++)
220 d[byte+i] = s[byte+i];
221
222 byte += 4;
223
224 }
225 while (byte < bytes);
226 }
227 else
228 {
229 int i;
230 int offset2 = (ppi->frame % FRAMECOUNT);
231
232 do
233 {
234 __declspec(align(16)) unsigned short counts[8];
235 __declspec(align(16)) unsigned short sums[8];
236 __asm
237 {
238
239 mov eax, offset2
240 mov edi, s // source pixels
241 pxor mm1, mm1 // accumulator
242 pxor mm7, mm7
243
244 mov esi, frameptr // accumulator
245 pxor mm2, mm2 // count
246
247 movd mm3, DWORD PTR [edi]
248 movd DWORD PTR [esi+4*eax], mm3
249
250 punpcklbw mm3, mm2 // mm3 source pixels
251 mov ecx, FRAMECOUNT
252
253 next_frame:
254 movd mm4, DWORD PTR [esi] // get frame buffer values
255 punpcklbw mm4, mm7 // mm4 frame buffer pixels
256 movq mm6, mm4 // save the pixel values
257 psubsw mm4, mm3 // subtracted pixel values
258 pmullw mm4, mm4 // square mm4
259 movd mm5, strength
260 psrlw mm4, mm5 // should be strength
261 pmullw mm4, threes // 3 * modifier
262 movq mm5, sixteens // 16s
263 psubusw mm5, mm4 // 16 - modifiers
264 movq mm4, mm5 // save the modifiers
265 pmullw mm4, mm6 // multiplier values
266 paddusw mm1, mm4 // accumulator
267 paddusw mm2, mm5 // count
268 add esi, 4 // next frame
269 dec ecx // next set of eight pixels
270 jnz next_frame
271
272 movq counts, mm2
273 psrlw mm2, 1 // divide count by 2 for rounding
274 paddusw mm1, mm2 // rounding added in
275
276 mov frameptr, esi
277
278 movq sums, mm1
279
280 }
281
282 for (i = 0; i < 4; i++)
283 {
284 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
285 blurvalue >>= 16;
286 d[i] = blurvalue;
287 }
288
289 s += 4;
290 d += 4;
291 byte += 4;
292 }
293 while (byte < bytes);
294 }
295
296 ++ppi->frame;
297 __asm emms
298 }
299