;
/*
 * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
 */

/*
 * MMX blending kernels, built from preprocessor macros.
 *
 * Conventions used by the comments in this file:
 *
 *  - Two-operand instruction macros are written ( source, destination ):
 *    PSUBW ( MQ1, MP1 ) leaves p - q in MP1, MOVD ( MSS, REGIND(rgba) )
 *    stores MSS to memory.
 *  - p is the pixel data loaded through `rgba`, q is the pixel data loaded
 *    through `dest`, and s is the result, which is stored back through `rgba`.
 *  - Lane diagrams list the lanes of a register from most significant (left)
 *    to least significant (right); the suffix 1 is the low pixel, 2 the high
 *    pixel of a two-pixel register.
 *  - ONE(...) and TWO(...) are not defined in this file.
 *    NOTE(review): presumably mmx_blendtmp.h defines them so that ONE() code
 *    is emitted only for the single-pixel path and TWO() code only for the
 *    two-pixel path, and also resets TAG/LLTAG/INIT/MAIN after each
 *    inclusion -- confirm against that header.
 */

#ifdef USE_MMX_ASM
#include "assyntax.h"
#define MATH_ASM_PTR_SIZE 4
#include "math/m_vector_asm.h"

/* integer multiplication - alpha plus one
 *
 * makes the following approximation to the division (Sree)
 *
 *   rgb*a/255 ~= (rgb*(a + 1)) >> 8
 *
 * which is the fastest method that satisfies the following OpenGL criteria
 *
 *   0*0 = 0 and 255*255 = 255
 *
 * note that MX1 is a register with 0xffffffffffffffff constant which can be
 * easily obtained making
 *
 *   PCMPEQW    ( MX1, MX1 )
 *
 * (subtracting 0xffff, i.e. -1, from every word is what adds the one).
 *
 * MP1/MP2: unpacked colors (left untouched).
 * MA1/MA2: multipliers on entry, results on exit.
 */
#define GMB_MULT_AP1( MP1, MA1, MP2, MA2, MX1 ) \
    PSUBW      ( MX1, MA1 )              /* a1 + 1 | a1 + 1 | a1 + 1 | a1 + 1 */ ;\
    PMULLW     ( MP1, MA1 )              /* t1 = p1*(a1 + 1) */ ;\
                                         ;\
TWO(PSUBW      ( MX1, MA2 ))             /* a2 + 1 | a2 + 1 | a2 + 1 | a2 + 1 */ ;\
TWO(PMULLW     ( MP2, MA2 ))             /* t2 = p2*(a2 + 1) */ ;\
                                         ;\
    PSRLW      ( CONST(8), MA1 )         /* t1 >> 8 ~= t1/255 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* t2 >> 8 ~= t2/255 */


/* integer multiplication - geometric series
 *
 * takes the geometric series approximation to the division
 *
 *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 * in this case just the first two terms to fit in 16bit arithmetic
 *
 *   t/255 ~= (t + (t >> 8)) >> 8
 *
 * note that just by itself it doesn't satisfy the OpenGL criteria, as
 * 255*255 = 254, so the special case a = 255 must be accounted for or
 * roundoff must be used
 *
 * MP1/MP2: unpacked colors on entry; clobbered (they receive a copy of t).
 * MA1/MA2: multipliers on entry, results on exit.
 */
#define GMB_MULT_GS( MP1, MA1, MP2, MA2 ) \
    PMULLW     ( MP1, MA1 )              /* t1 = p1*a1 */ ;\
TWO(PMULLW     ( MP2, MA2 ))             /* t2 = p2*a2 */ ;\
                                         ;\
    MOVQ       ( MA1, MP1 )              /* keep a copy of t1 */ ;\
    PSRLW      ( CONST(8), MA1 )         /* t1 >> 8 */ ;\
                                         ;\
TWO(MOVQ       ( MA2, MP2 ))             /* keep a copy of t2 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* t2 >> 8 */ ;\
                                         ;\
    PADDW      ( MP1, MA1 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
    PSRLW      ( CONST(8), MA1 )         /* sa1 | sb1 | sg1 | sr1 */ ;\
                                         ;\
TWO(PADDW      ( MP2, MA2 ))             /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* sa2 | sb2 | sg2 | sr2 */


/* integer multiplication - geometric series plus rounding
 *
 * when using a geometric series division instead of truncating the result
 * use roundoff in the approximation (Jim Blinn)
 *
 *   t = rgb*a + 0x80
 *
 * achieving the exact results
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 *
 * MP1/MP2: unpacked colors on entry; clobbered (they receive a copy of t).
 * MA1/MA2: multipliers on entry, results on exit.
 */
#define GMB_MULT_GSR( MP1, MA1, MP2, MA2, M80 ) \
    PMULLW     ( MP1, MA1 )              /* t1 = p1*a1 */ ;\
    PADDW      ( M80, MA1 )              /* t1 += 0x80 */ ;\
                                         ;\
TWO(PMULLW     ( MP2, MA2 ))             /* t2 = p2*a2 */ ;\
TWO(PADDW      ( M80, MA2 ))             /* t2 += 0x80 */ ;\
                                         ;\
    MOVQ       ( MA1, MP1 )              /* keep a copy of t1 */ ;\
    PSRLW      ( CONST(8), MA1 )         /* t1 >> 8 */ ;\
                                         ;\
TWO(MOVQ       ( MA2, MP2 ))             /* keep a copy of t2 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* t2 >> 8 */ ;\
                                         ;\
    PADDW      ( MP1, MA1 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
    PSRLW      ( CONST(8), MA1 )         /* sa1 | sb1 | sg1 | sr1 */ ;\
                                         ;\
TWO(PADDW      ( MP2, MA2 ))             /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series
 *
 * computes s ~= q + (p - q)*a/255 per 16-bit lane
 *
 * MP1/MP2: p on entry; clobbered.
 * MQ1/MQ2: q on entry; clobbered (left holding q << 8).
 * MA1/MA2: a on entry, results on exit.
 */
#define GMB_LERP_GS( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW      ( MQ1, MP1 )              /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW      ( CONST(8), MQ1 )         /* q1 << 8 */ ;\
    PMULLW     ( MP1, MA1 )              /* t1 = (p1 - q1)*pa1 */ ;\
                                         ;\
TWO(PSUBW      ( MQ2, MP2 ))             /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW      ( CONST(8), MQ2 ))        /* q2 << 8 */ ;\
TWO(PMULLW     ( MP2, MA2 ))             /* t2 = (p2 - q2)*pa2 */ ;\
                                         ;\
    MOVQ       ( MA1, MP1 )              /* keep a copy of t1 */ ;\
    PSRLW      ( CONST(8), MA1 )         /* t1 >> 8 */ ;\
                                         ;\
TWO(MOVQ       ( MA2, MP2 ))             /* keep a copy of t2 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* t2 >> 8 */ ;\
                                         ;\
    PADDW      ( MP1, MA1 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
TWO(PADDW      ( MP2, MA2 ))             /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
                                         ;\
    PADDW      ( MQ1, MA1 )              /* (t1/255 + q1) << 8 */ ;\
TWO(PADDW      ( MQ2, MA2 ))             /* (t2/255 + q2) << 8 */ ;\
                                         ;\
    PSRLW      ( CONST(8), MA1 )         /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series with roundoff
 *
 * this is a generalization of Blinn's formula to signed arithmetic
 *
 * note that M80 is a register with the 0x0080008000800080 constant
 *
 * MP1/MP2: p on entry; clobbered.
 * MQ1/MQ2: q on entry; clobbered (left holding q << 8).
 * MA1/MA2: a on entry, results on exit.
 */
#define GMB_LERP_GSR( MP1, MQ1, MA1, MP2, MQ2, MA2, M80) \
    PSUBW      ( MQ1, MP1 )              /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW      ( CONST(8), MQ1 )         /* q1 << 8 */ ;\
    PMULLW     ( MP1, MA1 )              /* t1 = (p1 - q1)*pa1 */ ;\
                                         ;\
TWO(PSUBW      ( MQ2, MP2 ))             /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW      ( CONST(8), MQ2 ))        /* q2 << 8 */ ;\
TWO(PMULLW     ( MP2, MA2 ))             /* t2 = (p2 - q2)*pa2 */ ;\
                                         ;\
    PSRLW      ( CONST(15), MP1 )        /* sign bit of p1 - q1: q1 > p1 ? 1 : 0 */ ;\
TWO(PSRLW      ( CONST(15), MP2 ))       /* sign bit of p2 - q2: q2 > p2 ? 1 : 0 */ ;\
                                         ;\
    PSLLW      ( CONST(8), MP1 )         /* q1 > p1 ? 0x100 : 0 */ ;\
TWO(PSLLW      ( CONST(8), MP2 ))        /* q2 > p2 ? 0x100 : 0 */ ;\
                                         ;\
    PSUBW      ( MP1, MA1 )              /* t1 -= (q1 > p1 ? 0x100 : 0) */ ;\
TWO(PSUBW      ( MP2, MA2 ))             /* t2 -= (q2 > p2 ? 0x100 : 0) */ ;\
                                         ;\
    PADDW      ( M80, MA1 )              /* t1 += 0x80 */ ;\
TWO(PADDW      ( M80, MA2 ))             /* t2 += 0x80 */ ;\
                                         ;\
    MOVQ       ( MA1, MP1 )              /* keep a copy of t1 */ ;\
    PSRLW      ( CONST(8), MA1 )         /* t1 >> 8 */ ;\
                                         ;\
TWO(MOVQ       ( MA2, MP2 ))             /* keep a copy of t2 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* t2 >> 8 */ ;\
                                         ;\
    PADDW      ( MP1, MA1 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 */ ;\
TWO(PADDW      ( MP2, MA2 ))             /* t2 + (t2 >> 8) ~= (t2/255) << 8 */ ;\
                                         ;\
    PADDW      ( MQ1, MA1 )              /* (t1/255 + q1) << 8 */ ;\
TWO(PADDW      ( MQ2, MA2 ))             /* (t2/255 + q2) << 8 */ ;\
                                         ;\
    PSRLW      ( CONST(8), MA1 )         /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* sa2 | sb2 | sg2 | sr2 */


/* linear interpolation - geometric series with correction
 *
 * instead of the roundoff this adds a small correction to satisfy the OpenGL
 * criteria
 *
 *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
 *
 * note that although it is faster than rounding off it doesn't always give
 * the exact results
 *
 * MP1/MP2: p on entry; clobbered.
 * MQ1/MQ2: q on entry; clobbered (left holding q << 8).
 * MA1/MA2: a on entry, results on exit.
 */
#define GMB_LERP_GSC( MP1, MQ1, MA1, MP2, MQ2, MA2) \
    PSUBW      ( MQ1, MP1 )              /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */ ;\
    PSLLW      ( CONST(8), MQ1 )         /* q1 << 8 */ ;\
    PMULLW     ( MP1, MA1 )              /* t1 = (p1 - q1)*pa1 */ ;\
                                         ;\
TWO(PSUBW      ( MQ2, MP2 ))             /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */ ;\
TWO(PSLLW      ( CONST(8), MQ2 ))        /* q2 << 8 */ ;\
TWO(PMULLW     ( MP2, MA2 ))             /* t2 = (p2 - q2)*pa2 */ ;\
                                         ;\
    MOVQ       ( MA1, MP1 )              /* keep a copy of t1 */ ;\
    PSRLW      ( CONST(8), MA1 )         /* t1 >> 8 */ ;\
                                         ;\
TWO(MOVQ       ( MA2, MP2 ))             /* keep a copy of t2 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* t2 >> 8 */ ;\
                                         ;\
    PADDW      ( MA1, MP1 )              /* t1 + (t1 >> 8) ~= (t1/255) << 8 (sum kept in MP1) */ ;\
    PSRLW      ( CONST(7), MA1 )         /* (t1 >> 8) >> 7 = t1 >> 15 */ ;\
                                         ;\
TWO(PADDW      ( MA2, MP2 ))             /* t2 + (t2 >> 8) ~= (t2/255) << 8 (sum kept in MP2) */ ;\
TWO(PSRLW      ( CONST(7), MA2 ))        /* (t2 >> 8) >> 7 = t2 >> 15 */ ;\
                                         ;\
    PADDW      ( MP1, MA1 )              /* t1 + (t1 >> 8) + (t1 >> 15) ~= (t1/255) << 8 */ ;\
TWO(PADDW      ( MP2, MA2 ))             /* t2 + (t2 >> 8) + (t2 >> 15) ~= (t2/255) << 8 */ ;\
                                         ;\
    PADDW      ( MQ1, MA1 )              /* (t1/255 + q1) << 8 */ ;\
TWO(PADDW      ( MQ2, MA2 ))             /* (t2/255 + q2) << 8 */ ;\
                                         ;\
    PSRLW      ( CONST(8), MA1 )         /* sa1 | sb1 | sg1 | sr1 */ ;\
TWO(PSRLW      ( CONST(8), MA2 ))        /* sa2 | sb2 | sg2 | sr2 */


/* common blending setup code
 *
 * note that M00 is a register with 0x0000000000000000 constant which can be
 * easily obtained making
 *
 *   PXOR      ( M00, M00 )
 */

/* Load packed pixel data: MPP <- memory at rgba (p), MQQ <- memory at dest (q).
 * The ONE() variant loads 32 bits, the TWO() variant loads 64 bits.
 */
#define GMB_LOAD(rgba, dest, MPP, MQQ) \
ONE(MOVD       ( REGIND(rgba), MPP ))    /*     |     |     |     | pa1 | pb1 | pg1 | pr1 */ ;\
ONE(MOVD       ( REGIND(dest), MQQ ))    /*     |     |     |     | qa1 | qb1 | qg1 | qr1 */ ;\
                                         ;\
TWO(MOVQ       ( REGIND(rgba), MPP ))    /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ ;\
TWO(MOVQ       ( REGIND(dest), MQQ ))    /* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */

/* Widen packed bytes to 16-bit lanes by interleaving with the zero register
 * M00.  MP1/MQ1 keep the low pixel; in the TWO() variant MP2/MQ2 receive the
 * high pixel.
 */
#define GMB_UNPACK(MP1, MQ1, MP2, MQ2, M00) \
TWO(MOVQ       ( MP1, MP2 ))             /* copy before the low unpack overwrites MP1 */ ;\
TWO(MOVQ       ( MQ1, MQ2 ))             /* copy before the low unpack overwrites MQ1 */ ;\
                                         ;\
    PUNPCKLBW  ( M00, MQ1 )              /* qa1 | qb1 | qg1 | qr1 */ ;\
TWO(PUNPCKHBW  ( M00, MQ2 ))             /* qa2 | qb2 | qg2 | qr2 */ ;\
    PUNPCKLBW  ( M00, MP1 )              /* pa1 | pb1 | pg1 | pr1 */ ;\
TWO(PUNPCKHBW  ( M00, MP2 ))             /* pa2 | pb2 | pg2 | pr2 */

/* Broadcast the alpha lane (most significant word) of the unpacked pixel in
 * MP1/MP2 to all four lanes of MA1/MA2.  MP1/MP2 are left untouched.
 */
#define GMB_ALPHA(MP1, MA1, MP2, MA2) \
    MOVQ       ( MP1, MA1 )              ;\
TWO(MOVQ       ( MP2, MA2 ))             ;\
                                         ;\
    PUNPCKHWD  ( MA1, MA1 )              /* pa1 | pa1 |     |     */ ;\
TWO(PUNPCKHWD  ( MA2, MA2 ))             /* pa2 | pa2 |     |     */ ;\
    PUNPCKHDQ  ( MA1, MA1 )              /* pa1 | pa1 | pa1 | pa1 */ ;\
TWO(PUNPCKHDQ  ( MA2, MA2 ))             /* pa2 | pa2 | pa2 | pa2 */

/* Narrow the 16-bit lanes of MS1 (low pixel) and MS2 (high pixel) back to
 * bytes with unsigned saturation; the packed result is left in MS1.
 */
#define GMB_PACK( MS1, MS2 ) \
    PACKUSWB   ( MS2, MS1 )              /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;

/* Store the packed result MSS to memory at rgba: 32 bits in the ONE()
 * variant, 64 bits in the TWO() variant.
 */
#define GMB_STORE(rgba, MSS ) \
ONE(MOVD       ( MSS, REGIND(rgba) ))    /*     |     |     |     | sa1 | sb1 | sg1 | sr1 */ ;\
TWO(MOVQ       ( MSS, REGIND(rgba) ))    /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */

/* Kevin F. Quinn <kevquinn@gentoo.org> 2 July 2006
 * Replace data segment constants with text-segment
 * constants (via pushl/movq).  The data-segment version was:
    SEG_DATA

ALIGNDATA8
const_0080:
    D_LONG 0x00800080, 0x00800080

const_80:
    D_LONG 0x80808080, 0x80808080
 *
 * Each 64-bit constant is now described by its low (_l) and high (_h) dword.
 * The INIT macros below materialize one by pushing the high dword first and
 * the low dword second, so that the low dword sits at (ESP), then loading the
 * quadword from (ESP) and releasing the 8 bytes of stack again.
 */
#define const_0080_l 0x00800080
#define const_0080_h 0x00800080
#define const_80_l 0x80808080
#define const_80_h 0x80808080

    SEG_TEXT


/* Blend transparency function
 *
 * s = q + (p - q)*pa/255, using the geometric series with correction.
 * MM0 is the zero register needed by GMB_UNPACK.
 */

#define TAG(x) CONCAT(x,_transparency)
#define LLTAG(x) LLBL2(x,_transparency)

#define INIT \
    PXOR       ( MM0, MM0 )              /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )                 ;\
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )            ;\
    GMB_ALPHA( MM1, MM3, MM4, MM6 )                  ;\
    GMB_LERP_GSC( MM1, MM2, MM3, MM4, MM5, MM6 )     ;\
    GMB_PACK( MM3, MM6 )                             ;\
    GMB_STORE( rgba, MM3 )

#include "mmx_blendtmp.h"


/* Blend add function
 *
 * s = p + q per byte, with unsigned saturation.
 *
 * FIXME: Add some loop unrolling here...
 */

#define TAG(x) CONCAT(x,_add)
#define LLTAG(x) LLBL2(x,_add)

#define INIT

#define MAIN( rgba, dest ) \
ONE(MOVD       ( REGIND(rgba), MM1 ))    /*     |     |     |     | pa1 | pb1 | pg1 | pr1 */ ;\
ONE(MOVD       ( REGIND(dest), MM2 ))    /*     |     |     |     | qa1 | qb1 | qg1 | qr1 */ ;\
ONE(PADDUSB    ( MM2, MM1 ))             ;\
ONE(MOVD       ( MM1, REGIND(rgba) ))    /*     |     |     |     | sa1 | sb1 | sg1 | sr1 */ ;\
                                         ;\
TWO(MOVQ       ( REGIND(rgba), MM1 ))    /* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */ ;\
TWO(PADDUSB    ( REGIND(dest), MM1 ))    /* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */ ;\
TWO(MOVQ       ( MM1, REGIND(rgba) ))

#include "mmx_blendtmp.h"


/* Blend min function
 *
 * s = min(p, q) per byte.  PCMPGTB is a signed compare, so both operands are
 * first XORed with 0x80 (held in MM7) to map unsigned order onto signed
 * order.
 */

#define TAG(x) CONCAT(x,_min)
#define LLTAG(x) LLBL2(x,_min)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
 * (was: MOVQ ( CONTENT(const_80), MM7 ))
 */
#define INIT \
    PUSH_L     ( CONST(const_80_h) )     /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
    PUSH_L     ( CONST(const_80_l) )     ;\
    MOVQ       ( REGIND(ESP), MM7 )      ;\
    ADD_L      ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )     ;\
    MOVQ       ( MM1, MM3 )              ;\
    MOVQ       ( MM2, MM4 )              ;\
    PXOR       ( MM7, MM3 )              /* unsigned -> signed */ ;\
    PXOR       ( MM7, MM4 )              /* unsigned -> signed */ ;\
    PCMPGTB    ( MM3, MM4 )              /* q > p ? 0xff : 0x00 */ ;\
    PAND       ( MM4, MM1 )              /* q > p ? p : 0 */ ;\
    PANDN      ( MM2, MM4 )              /* q > p ? 0 : q */ ;\
    POR        ( MM1, MM4 )              /* q > p ? p : q */ ;\
    GMB_STORE( rgba, MM4 )

#include "mmx_blendtmp.h"


/* Blend max function
 *
 * s = max(p, q) per byte, using the same 0x80 bias trick as the min function.
 */

#define TAG(x) CONCAT(x,_max)
#define LLTAG(x) LLBL2(x,_max)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
 * (was: MOVQ ( CONTENT(const_80), MM7 ))
 */
#define INIT \
    PUSH_L     ( CONST(const_80_h) )     /* 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80| 0x80*/ ;\
    PUSH_L     ( CONST(const_80_l) )     ;\
    MOVQ       ( REGIND(ESP), MM7 )      ;\
    ADD_L      ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )     ;\
    MOVQ       ( MM1, MM3 )              ;\
    MOVQ       ( MM2, MM4 )              ;\
    PXOR       ( MM7, MM3 )              /* unsigned -> signed */ ;\
    PXOR       ( MM7, MM4 )              /* unsigned -> signed */ ;\
    PCMPGTB    ( MM3, MM4 )              /* q > p ? 0xff : 0x00 */ ;\
    PAND       ( MM4, MM2 )              /* q > p ? q : 0 */ ;\
    PANDN      ( MM1, MM4 )              /* q > p ? 0 : p */ ;\
    POR        ( MM2, MM4 )              /* q > p ? q : p */ ;\
    GMB_STORE( rgba, MM4 )

#include "mmx_blendtmp.h"


/* Blend modulate function
 *
 * s = p*q/255 per channel, using the geometric series with rounding.
 * MM0 is the zero register needed by GMB_UNPACK, MM7 holds the 0x0080
 * rounding constant in every 16-bit lane.
 */

#define TAG(x) CONCAT(x,_modulate)
#define LLTAG(x) LLBL2(x,_modulate)

/* Kevin F. Quinn 2nd July 2006
 * Replace data segment constants with text-segment instructions
 * (was: MOVQ ( CONTENT(const_0080), MM7 ))
 */
#define INIT \
    PXOR       ( MM0, MM0 )              /* 0x0000 | 0x0000 | 0x0000 | 0x0000 */ ;\
    PUSH_L     ( CONST(const_0080_h) )   /* 0x0080 | 0x0080 | 0x0080 | 0x0080 */ ;\
    PUSH_L     ( CONST(const_0080_l) )   ;\
    MOVQ       ( REGIND(ESP), MM7 )      ;\
    ADD_L      ( CONST(8), ESP)

#define MAIN( rgba, dest ) \
    GMB_LOAD( rgba, dest, MM1, MM2 )             ;\
    GMB_UNPACK( MM1, MM2, MM4, MM5, MM0 )        ;\
    GMB_MULT_GSR( MM1, MM2, MM4, MM5, MM7 )      ;\
    GMB_PACK( MM2, MM5 )                         ;\
    GMB_STORE( rgba, MM2 )

#include "mmx_blendtmp.h"

#endif

#if defined (__ELF__) && defined (__linux__)
    .section .note.GNU-stack,"",%progbits
#endif