1 /* xmmx.c
2
3 eXtended MultiMedia eXtensions GCC interface library for IA32.
4
5 To use this library, simply include this header file
6 and compile with GCC. You MUST have inlining enabled
7 in order for xmmx_ok() to work; this can be done by
8 simply using -O on the GCC command line.
9
10 Compiling with -DXMMX_TRACE will cause detailed trace
11 output to be sent to stderr for each mmx operation.
12 This adds lots of code, and obviously slows execution to
13 a crawl, but can be very useful for debugging.
14
15 THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY
16 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
17 LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY
18 AND FITNESS FOR ANY PARTICULAR PURPOSE.
19
20 1999 by R. Fisher
21 Based on libmmx, 1997-99 by H. Dietz and R. Fisher
22
23 Notes:
24 It appears that the latest gas has the pand problem fixed, therefore
25 I'll undefine BROKEN_PAND by default.
26 */
27 #ifdef HAVE_CONFIG_H
28 #include "config.h"
29 #endif
30
31 #include "goom_config.h"
32
33 #ifdef HAVE_MMX
34
/* Define STRICT_COMPAT to get exactly the same result as the C function
 * (a tiny bit slower, although the difference is barely noticeable).
 */
38 // #define STRICT_COMPAT
39
/* NOTE(review): BUFFPOINTNB, BUFFPOINTMASK and BUFFINCR are not referenced
 * in this file; presumably they are shared with the C zoom filter -- confirm
 * before removing them. */
#define BUFFPOINTNB 16
#define BUFFPOINTMASK 0xffff
#define BUFFINCR 0xff

/* Sub-pixel resolution of the zoom coordinates: positions are stored
 * multiplied by sqrtperte, i.e. with PERTEDEC fractional bits. The three
 * values below must stay consistent with each other and with the literal
 * constants used in the inline assembly of zoom_filter_xmmx(). */
#define sqrtperte 16
/* a % sqrtperte  <=>  a & PERTEMASK */
#define PERTEMASK 0xf
/* a / sqrtperte  <=>  a >> PERTEDEC */
#define PERTEDEC 4
49
50
51 /*#define MMX_TRACE*/
52 #include "mmx.h"
53 /*#include "xmmx.h"*/
54 #include "goom_graphic.h"
55
/*
 * Tell whether the extended MMX routines of this file may be used on the
 * current CPU.
 *
 * Returns 1 when bit 3 (mask 0x8) of the value returned by mm_support() is
 * set, 0 otherwise.
 *
 * NOTE(review): bit 3 is presumably the "extended MMX" capability flag of
 * mmx.h -- confirm against that header.
 */
int
xmmx_supported (void)
{
  return (mm_support () & 0x8) != 0;
}
61
62 void
zoom_filter_xmmx(int prevX,int prevY,Pixel * expix1,Pixel * expix2,int * lbruS,int * lbruD,int buffratio,int precalCoef[16][16])63 zoom_filter_xmmx (int prevX, int prevY,
64 Pixel * expix1, Pixel * expix2,
65 int *lbruS, int *lbruD, int buffratio, int precalCoef[16][16])
66 {
67 int bufsize = prevX * prevY; /* taille du buffer */
68 volatile int loop; /* variable de boucle */
69
70 mmx_t *brutS = (mmx_t *) lbruS; /* buffer de transformation source */
71 mmx_t *brutD = (mmx_t *) lbruD; /* buffer de transformation dest */
72
73 volatile mmx_t prevXY;
74 volatile mmx_t ratiox;
75
76 /* volatile mmx_t interpix; */
77
78 expix1[0].val = expix1[prevX - 1].val = expix1[prevX * prevY - 1].val =
79 expix1[prevX * prevY - prevX].val = 0;
80
81 prevXY.ud[0] = (prevX - 1) << PERTEDEC;
82 prevXY.ud[1] = (prevY - 1) << PERTEDEC;
83
84 ratiox.d[0] = buffratio;
85 ratiox.d[1] = buffratio;
86
87 asm volatile ("\n\t movq %[ratio], %%mm6" "\n\t pslld $16, %%mm6" /* mm6 = [rat16=buffratio<<16 | rat16=buffratio<<16] */
88 "\n\t pxor %%mm7, %%mm7" /* mm7 = 0 */
89 ::[ratio] "m" (ratiox));
90
91 loop = 0;
92
93 /*
94 * NOTE : mm6 et mm7 ne sont pas modifies dans la boucle.
95 */
96 while (loop < bufsize) {
97 /* Thread #1
98 * pre : mm6 = [rat16|rat16]
99 * post : mm0 = S + ((D-S)*rat16 format [X|Y]
100 * modified = mm0,mm1,mm2
101 */
102
103 asm volatile ("#1 \n\t movq 0(%[brutS]), %%mm0" "#1 \n\t movq 0(%[brutD]), %%mm1" "#1 \n\t psubd %%mm0, %%mm1" /* mm1 = D - S */
104 "#1 \n\t movq %%mm1, %%mm2" /* mm2 = D - S */
105 "#1 \n\t pslld $16, %%mm1" "#1 \n\t pmullw %%mm6, %%mm2" "#1 \n\t pmulhuw %%mm6, %%mm1" "#1 \n\t pslld $16, %%mm0" "#1 \n\t paddd %%mm2, %%mm1" /* mm1 = (D - S) * buffratio >> 16 */
106 "#1 \n\t paddd %%mm1, %%mm0" /* mm0 = S + mm1 */
107 "#1 \n\t psrld $16, %%mm0"::[brutS] "r" (&brutS[loop]),
108 [brutD] "r" (&brutD[loop])
109 ); /* mm0 = S */
110
111 /*
112 * pre : mm0 : position vector on screen
113 * prevXY : coordinate of the lower-right point on screen
114 * post : clipped mm0
115 * modified : mm0,mm1,mm2
116 */
117 asm volatile
118 ("#1 \n\t movq %[prevXY], %%mm1" "#1 \n\t pcmpgtd %%mm0, %%mm1"
119 /* mm0 en X contient (idem pour Y) :
120 * 1111 si prevXY > px
121 * 0000 si prevXY <= px */
122 #ifdef STRICT_COMPAT
123 "#1 \n\t movq %%mm1, %%mm2"
124 "#1 \n\t punpckhdq %%mm2, %%mm2"
125 "#1 \n\t punpckldq %%mm1, %%mm1" "#1 \n\t pand %%mm2, %%mm0"
126 #endif
127 "#1 \n\t pand %%mm1, %%mm0" /* on met a zero la partie qui deborde */
128 ::[prevXY] "m" (prevXY));
129
130 /* Thread #2
131 * pre : mm0 : clipped position on screen
132 *
133 * post : mm3 : coefs for this position
134 * mm1 : X vector [0|X]
135 *
136 * modif : eax,esi
137 */
138 __asm__ __volatile__ ("#2 \n\t movd %%mm0,%%esi"
139 "#2 \n\t movq %%mm0,%%mm1"
140 "#2 \n\t andl $15,%%esi"
141 "#2 \n\t psrlq $32,%%mm1"
142 "#2 \n\t shll $6,%%esi"
143 "#2 \n\t movd %%mm1,%%eax"
144 "#2 \n\t addl %[precalCoef],%%esi"
145 "#2 \n\t andl $15,%%eax"
146 "#2 \n\t movd (%%esi,%%eax,4),%%mm3"::[precalCoef]
147 "g" (precalCoef):"eax", "esi");
148
149 /*
150 * extraction des coefficients... (Thread #3)
151 *
152 * pre : coef dans mm3
153 *
154 * post : coef extraits dans mm3 (c1 & c2)
155 * et mm4 (c3 & c4)
156 *
157 * modif : mm5
158 */
159
160 /* (Thread #4)
161 * pre : mm0 : Y pos [*|Y]
162 * mm1 : X pos [*|X]
163 *
164 * post : mm0 : expix1[position]
165 * mm2 : expix1[position+largeur]
166 *
167 * modif : eax, esi
168 */
169 __asm__ __volatile__ ("#2 \n\t psrld $4, %%mm0" "#2 \n\t psrld $4, %%mm1" /* PERTEDEC = $4 */
170 "#4 \n\t movd %%mm1,%%eax"
171 "#3 \n\t movq %%mm3,%%mm5"
172 "#4 \n\t mull %[prevX]"
173 "#4 \n\t movd %%mm0,%%esi"
174 "#3 \n\t punpcklbw %%mm5, %%mm3"
175 "#4 \n\t addl %%esi, %%eax"
176 "#3 \n\t movq %%mm3, %%mm4"
177 "#3 \n\t movq %%mm3, %%mm5"
178 "#4 \n\t movl %[expix1], %%esi"
179 "#3 \n\t punpcklbw %%mm5, %%mm3"
180 "#4 \n\t movq (%%esi,%%eax,4),%%mm0"
181 "#3 \n\t punpckhbw %%mm5, %%mm4"
182 "#4 \n\t addl %[prevX],%%eax"
183 "#4 \n\t movq (%%esi,%%eax,4),%%mm2"::[expix1] "g" (expix1)
184 ,[prevX] "g" (prevX)
185 :"eax", "esi", "edx");
186
187 /*
188 * pre : mm0 : expix1[position]
189 * mm2 : expix1[position+largeur]
190 * mm3 & mm4 : coefs
191 */
192
193 /* recopie des deux premiers pixels dans mm0 et mm1 */
194 movq_r2r (mm0, mm1); /* b1-v1-r1-a1-b2-v2-r2-a2 */
195
196 /* depackage du premier pixel */
197 punpcklbw_r2r (mm7, mm0); /* 00-b2-00-v2-00-r2-00-a2 */
198
199 /* extraction des coefficients... */
200
201 movq_r2r (mm3, mm5); /* c2-c2-c2-c2-c1-c1-c1-c1 */
202
203 /*^en parrallele^ *//* depackage du 2ieme pixel */
204 /*^ */ punpckhbw_r2r (mm7, mm1);
205 /* 00-b1-00-v1-00-r1-00-a1 */
206
207 punpcklbw_r2r (mm7, mm5); /* 00-c1-00-c1-00-c1-00-c1 */
208 punpckhbw_r2r (mm7, mm3); /* 00-c2-00-c2-00-c2-00-c2 */
209
210 /* multiplication des pixels par les coefficients */
211 pmullw_r2r (mm5, mm0); /* c1*b2-c1*v2-c1*r2-c1*a2 */
212 pmullw_r2r (mm3, mm1); /* c2*b1-c2*v1-c2*r1-c2*a1 */
213 paddw_r2r (mm1, mm0);
214
215 /* ...extraction des 2 derniers coefficients */
216 movq_r2r (mm4, mm5); /* c4-c4-c4-c4-c3-c3-c3-c3 */
217 punpcklbw_r2r (mm7, mm4); /* 00-c3-00-c3-00-c3-00-c3 */
218 punpckhbw_r2r (mm7, mm5); /* 00-c4-00-c4-00-c4-00-c4 */
219
220 /* recuperation des 2 derniers pixels */
221 movq_r2r (mm2, mm1);
222
223 /* depackage des pixels */
224 punpcklbw_r2r (mm7, mm1);
225 punpckhbw_r2r (mm7, mm2);
226
227 /* multiplication pas les coeffs */
228 pmullw_r2r (mm4, mm1);
229 pmullw_r2r (mm5, mm2);
230
231 /* ajout des valeurs obtenues � la valeur finale */
232 paddw_r2r (mm1, mm0);
233 paddw_r2r (mm2, mm0);
234
235 /* division par 256 = 16+16+16+16, puis repackage du pixel final */
236 psrlw_i2r (8, mm0);
237 packuswb_r2r (mm7, mm0);
238
239 movd_r2m (mm0, expix2[loop]);
240
241 ++loop;
242 }
243 /* this was femms, which is AMD 3dnow */
244 __asm__ __volatile__ ("emms\n");
245 }
246
/*
 * Additive blend of one pixel through the mmx.h wrappers:
 * load _backbuf into mm0, add _col to it with paddusb (unsigned saturating
 * byte addition), store mm0 into _out. mm0 is used as scratch.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement, including in front of an `else`.
 *
 * NOTE(review): the operand width read for _col depends on how mmx.h
 * defines paddusb_m2r -- confirm it is safe for a 32-bit `int` argument.
 */
#define DRAWMETHOD_PLUS_XMMX(_out,_backbuf,_col) \
  do { \
    movd_m2r (_backbuf, mm0); \
    paddusb_m2r (_col, mm0); \
    movd_r2m (mm0, _out); \
  } while (0)

/* Drawing primitive of draw_line_xmmx(): blends `col` into the pixel `*p`.
 * Relies on `p` and `col` being in scope at the point of use. */
#define DRAWMETHOD DRAWMETHOD_PLUS_XMMX(*p,*p,col)
255
256 void
draw_line_xmmx(Pixel * data,int x1,int y1,int x2,int y2,int col,int screenx,int screeny)257 draw_line_xmmx (Pixel * data, int x1, int y1, int x2, int y2, int col,
258 int screenx, int screeny)
259 {
260 int x, y, dx, dy, yy, xx;
261 Pixel *p;
262
263 if ((y1 < 0) || (y2 < 0) || (x1 < 0) || (x2 < 0) || (y1 >= screeny)
264 || (y2 >= screeny) || (x1 >= screenx) || (x2 >= screenx))
265 goto end_of_line;
266
267 dx = x2 - x1;
268 dy = y2 - y1;
269 if (x1 >= x2) {
270 int tmp;
271
272 tmp = x1;
273 x1 = x2;
274 x2 = tmp;
275 tmp = y1;
276 y1 = y2;
277 y2 = tmp;
278 dx = x2 - x1;
279 dy = y2 - y1;
280 }
281
282 /* vertical line */
283 if (dx == 0) {
284 if (y1 < y2) {
285 p = &(data[(screenx * y1) + x1]);
286 for (y = y1; y <= y2; y++) {
287 DRAWMETHOD;
288 p += screenx;
289 }
290 } else {
291 p = &(data[(screenx * y2) + x1]);
292 for (y = y2; y <= y1; y++) {
293 DRAWMETHOD;
294 p += screenx;
295 }
296 }
297 goto end_of_line;
298 }
299 /* horizontal line */
300 if (dy == 0) {
301 if (x1 < x2) {
302 p = &(data[(screenx * y1) + x1]);
303 for (x = x1; x <= x2; x++) {
304 DRAWMETHOD;
305 p++;
306 }
307 goto end_of_line;
308 } else {
309 p = &(data[(screenx * y1) + x2]);
310 for (x = x2; x <= x1; x++) {
311 DRAWMETHOD;
312 p++;
313 }
314 goto end_of_line;
315 }
316 }
317 /* 1 */
318 /* \ */
319 /* \ */
320 /* 2 */
321 if (y2 > y1) {
322 /* steep */
323 if (dy > dx) {
324 dx = ((dx << 16) / dy);
325 x = x1 << 16;
326 for (y = y1; y <= y2; y++) {
327 xx = x >> 16;
328 p = &(data[(screenx * y) + xx]);
329 DRAWMETHOD;
330 if (xx < (screenx - 1)) {
331 p++;
332 /* DRAWMETHOD; */
333 }
334 x += dx;
335 }
336 goto end_of_line;
337 }
338 /* shallow */
339 else {
340 dy = ((dy << 16) / dx);
341 y = y1 << 16;
342 for (x = x1; x <= x2; x++) {
343 yy = y >> 16;
344 p = &(data[(screenx * yy) + x]);
345 DRAWMETHOD;
346 if (yy < (screeny - 1)) {
347 p += screeny;
348 /* DRAWMETHOD; */
349 }
350 y += dy;
351 }
352 }
353 }
354 /* 2 */
355 /* / */
356 /* / */
357 /* 1 */
358 else {
359 /* steep */
360 if (-dy > dx) {
361 dx = ((dx << 16) / -dy);
362 x = (x1 + 1) << 16;
363 for (y = y1; y >= y2; y--) {
364 xx = x >> 16;
365 p = &(data[(screenx * y) + xx]);
366 DRAWMETHOD;
367 if (xx < (screenx - 1)) {
368 p--;
369 /* DRAWMETHOD; */
370 }
371 x += dx;
372 }
373 goto end_of_line;
374 }
375 /* shallow */
376 else {
377 dy = ((dy << 16) / dx);
378 y = y1 << 16;
379 for (x = x1; x <= x2; x++) {
380 yy = y >> 16;
381 p = &(data[(screenx * yy) + x]);
382 DRAWMETHOD;
383 if (yy < (screeny - 1)) {
384 p += screeny;
385 /* DRAWMETHOD; */
386 }
387 y += dy;
388 }
389 goto end_of_line;
390 }
391 }
392 end_of_line:
393 /* this was femms, which is AMD 3dnow */
394 __asm__ __volatile__ ("emms\n");
395 }
396 #else
/*
 * Fallback for builds without HAVE_MMX: none of the extended MMX routines
 * are compiled in, so always report them as unsupported (0).
 */
int
xmmx_supported (void)
{
  return 0;
}
402 #endif
403