• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "Renderer.hpp"
18 #include "QuadRasterizer.hpp"
19 #include "Surface.hpp"
20 #include "Primitive.hpp"
21 #include "SamplerCore.hpp"
22 #include "Constants.hpp"
23 #include "Debug.hpp"
24 
25 namespace sw
26 {
27 	extern bool complementaryDepthBuffer;
28 	extern bool postBlendSRGB;
29 	extern bool exactColorRounding;
30 	extern bool forceClearRegisters;
31 
PixelRoutine(const PixelProcessor::State & state,const PixelShader * shader)32 	PixelRoutine::PixelRoutine(const PixelProcessor::State &state, const PixelShader *shader) : QuadRasterizer(state, shader), v(shader && shader->dynamicallyIndexedInput)
33 	{
34 		if(!shader || shader->getVersion() < 0x0200 || forceClearRegisters)
35 		{
36 			for(int i = 0; i < MAX_FRAGMENT_INPUTS; i++)
37 			{
38 				v[i].x = Float4(0.0f);
39 				v[i].y = Float4(0.0f);
40 				v[i].z = Float4(0.0f);
41 				v[i].w = Float4(0.0f);
42 			}
43 		}
44 	}
45 
~PixelRoutine()46 	PixelRoutine::~PixelRoutine()
47 	{
48 		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
49 		{
50 			delete sampler[i];
51 		}
52 	}
53 
// Builds the processing sequence for one quad: stencil test, depth
// interpolation and (early or late) depth test, interpolation of the fragment
// inputs and fog, the shader itself, alpha test, depth write, raster operation
// and stencil write.
//
// cBuffer  - per-render-target colour buffer pointers, forwarded to rasterOperation().
// zBuffer  - depth buffer pointer, forwarded to depthTest()/writeDepth().
// sBuffer  - stencil buffer pointer, forwarded to stencilTest()/writeStencil().
// cMask    - per-sample coverage masks; the first state.multiSample entries are used.
// x, y     - quad position; combined with Primitive::xQuad / yQuad to form the
//            coordinates at which the plane equations are evaluated.
//
// NOTE(review): the capitalised If(...) / Bool(...) appear to be constructs of
// the code-generation layer (branches in the generated routine), while plain
// C++ `if` selects at generation time what gets emitted - confirm against the
// Reactor headers.
quad(Pointer<Byte> cBuffer[RENDERTARGETS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)54 	void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
55 	{
56 		#if PERF_PROFILE
57 			Long pipeTime = Ticks();
58 		#endif
59 
		// One sampler object per texture image unit; the destructor deletes them.
60 		for(int i = 0; i < TEXTURE_IMAGE_UNITS; i++)
61 		{
62 			sampler[i] = new SamplerCore(constants, state.sampler[i]);
63 		}
64 
		// The depth test is performed before shading only when depth is not
		// overridden and the alpha test is inactive; otherwise it is deferred
		// until after alphaTest() below.
65 		const bool earlyDepthTest = !state.depthOverride && !state.alphaTestActive();
66 
67 		Int zMask[4];   // Depth mask
68 		Int sMask[4];   // Stencil mask
69 
		// Both masks start out equal to the coverage mask of each sample.
70 		for(unsigned int q = 0; q < state.multiSample; q++)
71 		{
72 			zMask[q] = cMask[q];
73 			sMask[q] = cMask[q];
74 		}
75 
76 		for(unsigned int q = 0; q < state.multiSample; q++)
77 		{
78 			stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
79 		}
80 
81 		Float4 f;
82 		Float4 rhwCentroid;
83 
84 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive,xQuad), 16);
85 
86 		if(interpolateZ())
87 		{
88 			for(unsigned int q = 0; q < state.multiSample; q++)
89 			{
				// Note: this local Float4 shadows the Int &x parameter inside the loop body.
90 				Float4 x = xxxx;
91 
92 				if(state.multiSample > 1)
93 				{
					// Per-sample adjustment read from the Constants::X table, entry q.
94 					x -= *Pointer<Float4>(constants + OFFSET(Constants,X) + q * sizeof(float4));
95 				}
96 
97 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive,z), false, false);
98 			}
99 		}
100 
101 		Bool depthPass = false;
102 
103 		if(earlyDepthTest)
104 		{
			// depthPass becomes true if the test passes for any sample.
105 			for(unsigned int q = 0; q < state.multiSample; q++)
106 			{
107 				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
108 			}
109 		}
110 
		// Interpolation and shading are skipped only when the early depth test
		// ran and failed for every sample.
111 		If(depthPass || Bool(!earlyDepthTest))
112 		{
113 			#if PERF_PROFILE
114 				Long interpTime = Ticks();
115 			#endif
116 
117 			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive,yQuad), 16);
118 
119 			// Centroid locations
120 			Float4 XXXX = Float4(0.0f);
121 			Float4 YYYY = Float4(0.0f);
122 
123 			if(state.centroid)
124 			{
				// Accumulates table entries selected by each sample's coverage mask,
				// then divides the x/y sums by the summed weight.
				// NOTE(review): the 1.0e-9f start value presumably keeps the
				// reciprocal away from zero when nothing is accumulated - confirm.
125 				Float4 WWWW(1.0e-9f);
126 
127 				for(unsigned int q = 0; q < state.multiSample; q++)
128 				{
129 					XXXX += *Pointer<Float4>(constants + OFFSET(Constants,sampleX[q]) + 16 * cMask[q]);
130 					YYYY += *Pointer<Float4>(constants + OFFSET(Constants,sampleY[q]) + 16 * cMask[q]);
131 					WWWW += *Pointer<Float4>(constants + OFFSET(Constants,weight) + 16 * cMask[q]);
132 				}
133 
134 				WWWW = Rcp_pp(WWWW);
135 				XXXX *= WWWW;
136 				YYYY *= WWWW;
137 
138 				XXXX += xxxx;
139 				YYYY += yyyy;
140 			}
141 
142 			if(interpolateW())
143 			{
144 				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
145 				rhw = reciprocal(w, false, false, true);
146 
147 				if(state.centroid)
148 				{
149 					rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive,w), false, false));
150 				}
151 			}
152 
			// Interpolate every enabled component of every fragment input, either at
			// the quad coordinates or at the centroid, depending on the interpolant state.
153 			for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
154 			{
155 				for(int component = 0; component < 4; component++)
156 				{
157 					if(state.interpolant[interpolant].component & (1 << component))
158 					{
159 						if(!state.interpolant[interpolant].centroid)
160 						{
161 							v[interpolant][component] = interpolate(xxxx, Dv[interpolant][component], rhw, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
162 						}
163 						else
164 						{
165 							v[interpolant][component] = interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, V[interpolant][component]), (state.interpolant[interpolant].flat & (1 << component)) != 0, state.perspective);
166 						}
167 					}
168 				}
169 
170 				Float4 rcp;
171 
				// Projection: `project` selects the divisor component
				// (1: x/y, 2: xy/z, 3: xyz/w); 0 leaves the input unchanged.
172 				switch(state.interpolant[interpolant].project)
173 				{
174 				case 0:
175 					break;
176 				case 1:
177 					rcp = reciprocal(v[interpolant].y);
178 					v[interpolant].x = v[interpolant].x * rcp;
179 					break;
180 				case 2:
181 					rcp = reciprocal(v[interpolant].z);
182 					v[interpolant].x = v[interpolant].x * rcp;
183 					v[interpolant].y = v[interpolant].y * rcp;
184 					break;
185 				case 3:
186 					rcp = reciprocal(v[interpolant].w);
187 					v[interpolant].x = v[interpolant].x * rcp;
188 					v[interpolant].y = v[interpolant].y * rcp;
189 					v[interpolant].z = v[interpolant].z * rcp;
190 					break;
191 				}
192 			}
193 
194 			if(state.fog.component)
195 			{
196 				f = interpolate(xxxx, Df, rhw, primitive + OFFSET(Primitive,f), state.fog.flat & 0x01, state.perspective);
197 			}
198 
199 			setBuiltins(x, y, z, w);
200 
201 			#if PERF_PROFILE
202 				cycles[PERF_INTERP] += Ticks() - interpTime;
203 			#endif
204 
205 			Bool alphaPass = true;
206 
207 			if(colorUsed())
208 			{
209 				#if PERF_PROFILE
210 					Long shaderTime = Ticks();
211 				#endif
212 
213 				applyShader(cMask);
214 
215 				#if PERF_PROFILE
216 					cycles[PERF_SHADER] += Ticks() - shaderTime;
217 				#endif
218 
219 				alphaPass = alphaTest(cMask);
220 
				// applyShader() and alphaTest() both receive cMask; when the shader
				// contains a kill or the alpha test is active, the (possibly updated)
				// coverage is folded into the depth and stencil masks.
221 				if((shader && shader->containsKill()) || state.alphaTestActive())
222 				{
223 					for(unsigned int q = 0; q < state.multiSample; q++)
224 					{
225 						zMask[q] &= cMask[q];
226 						sMask[q] &= cMask[q];
227 					}
228 				}
229 			}
230 
231 			If(alphaPass)
232 			{
				// Late depth test, used when the early one could not be done.
233 				if(!earlyDepthTest)
234 				{
235 					for(unsigned int q = 0; q < state.multiSample; q++)
236 					{
237 						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
238 					}
239 				}
240 
241 				#if PERF_PROFILE
242 					Long ropTime = Ticks();
243 				#endif
244 
245 				If(depthPass || Bool(earlyDepthTest))
246 				{
					// Depth write and occlusion counting only for samples enabled
					// in state.multiSampleMask.
247 					for(unsigned int q = 0; q < state.multiSample; q++)
248 					{
249 						if(state.multiSampleMask & (1 << q))
250 						{
251 							writeDepth(zBuffer, q, x, z[q], zMask[q]);
252 
253 							if(state.occlusionEnabled)
254 							{
								// Adds the Constants::occlusionCount table entry
								// indexed by (zMask & sMask).
255 								occlusion += *Pointer<UInt>(constants + OFFSET(Constants,occlusionCount) + 4 * (zMask[q] & sMask[q]));
256 							}
257 						}
258 					}
259 
260 					if(colorUsed())
261 					{
262 						#if PERF_PROFILE
263 							AddAtomic(Pointer<Long>(&profiler.ropOperations), 4);
264 						#endif
265 
266 						rasterOperation(f, cBuffer, x, sMask, zMask, cMask);
267 					}
268 				}
269 
270 				#if PERF_PROFILE
271 					cycles[PERF_ROP] += Ticks() - ropTime;
272 				#endif
273 			}
274 		}
275 
		// The stencil write sits outside the depth/alpha branches above, so it is
		// emitted for every sample enabled in state.multiSampleMask regardless of
		// whether those branches are taken.
276 		for(unsigned int q = 0; q < state.multiSample; q++)
277 		{
278 			if(state.multiSampleMask & (1 << q))
279 			{
280 				writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
281 			}
282 		}
283 
284 		#if PERF_PROFILE
285 			cycles[PERF_PIPE] += Ticks() - pipeTime;
286 		#endif
287 	}
288 
interpolateCentroid(Float4 & x,Float4 & y,Float4 & rhw,Pointer<Byte> planeEquation,bool flat,bool perspective)289 	Float4 PixelRoutine::interpolateCentroid(Float4 &x, Float4 &y, Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
290 	{
291 		Float4 interpolant = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,C), 16);
292 
293 		if(!flat)
294 		{
295 			interpolant += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,A), 16) +
296 			               y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation,B), 16);
297 
298 			if(perspective)
299 			{
300 				interpolant *= rhw;
301 			}
302 		}
303 
304 		return interpolant;
305 	}
306 
stencilTest(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & cMask)307 	void PixelRoutine::stencilTest(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &cMask)
308 	{
309 		if(!state.stencilActive)
310 		{
311 			return;
312 		}
313 
314 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
315 
316 		Pointer<Byte> buffer = sBuffer + 2 * x;
317 
318 		if(q > 0)
319 		{
320 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
321 		}
322 
323 		Byte8 value = *Pointer<Byte8>(buffer);
324 		Byte8 valueCCW = value;
325 
326 		if(!state.noStencilMask)
327 		{
328 			value &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].testMaskQ));
329 		}
330 
331 		stencilTest(value, state.stencilCompareMode, false);
332 
333 		if(state.twoSidedStencil)
334 		{
335 			if(!state.noStencilMaskCCW)
336 			{
337 				valueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].testMaskQ));
338 			}
339 
340 			stencilTest(valueCCW, state.stencilCompareModeCCW, true);
341 
342 			value &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
343 			valueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
344 			value |= valueCCW;
345 		}
346 
347 		sMask = SignMask(value) & cMask;
348 	}
349 
stencilTest(Byte8 & value,StencilCompareMode stencilCompareMode,bool CCW)350 	void PixelRoutine::stencilTest(Byte8 &value, StencilCompareMode stencilCompareMode, bool CCW)
351 	{
352 		Byte8 equal;
353 
354 		switch(stencilCompareMode)
355 		{
356 		case STENCIL_ALWAYS:
357 			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
358 			break;
359 		case STENCIL_NEVER:
360 			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
361 			break;
362 		case STENCIL_LESS:			// a < b ~ b > a
363 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
364 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
365 			break;
366 		case STENCIL_EQUAL:
367 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
368 			break;
369 		case STENCIL_NOTEQUAL:		// a != b ~ !(a == b)
370 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
371 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
372 			break;
373 		case STENCIL_LESSEQUAL:	// a <= b ~ (b > a) || (a == b)
374 			equal = value;
375 			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedQ)));
376 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
377 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
378 			value |= equal;
379 			break;
380 		case STENCIL_GREATER:		// a > b
381 			equal = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ));
382 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
383 			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
384 			value = equal;
385 			break;
386 		case STENCIL_GREATEREQUAL:	// a >= b ~ !(a < b) ~ !(b > a)
387 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
388 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData,stencil[CCW].referenceMaskedSignedQ)));
389 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
390 			break;
391 		default:
392 			ASSERT(false);
393 		}
394 	}
395 
depthTest(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & sMask,Int & zMask,Int & cMask)396 	Bool PixelRoutine::depthTest(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &sMask, Int &zMask, Int &cMask)
397 	{
398 		if(!state.depthTestActive)
399 		{
400 			return true;
401 		}
402 
403 		Float4 Z = z;
404 
405 		if(shader && shader->depthOverride())
406 		{
407 			if(complementaryDepthBuffer)
408 			{
409 				Z = Float4(1.0f) - oDepth;
410 			}
411 			else
412 			{
413 				Z = oDepth;
414 			}
415 		}
416 
417 		Pointer<Byte> buffer;
418 		Int pitch;
419 
420 		if(!state.quadLayoutDepthBuffer)
421 		{
422 			buffer = zBuffer + 4 * x;
423 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
424 		}
425 		else
426 		{
427 			buffer = zBuffer + 8 * x;
428 		}
429 
430 		if(q > 0)
431 		{
432 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
433 		}
434 
435 		Float4 zValue;
436 
437 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
438 		{
439 			if(!state.quadLayoutDepthBuffer)
440 			{
441 				// FIXME: Properly optimizes?
442 				zValue.xy = *Pointer<Float4>(buffer);
443 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
444 			}
445 			else
446 			{
447 				zValue = *Pointer<Float4>(buffer, 16);
448 			}
449 		}
450 
451 		Int4 zTest;
452 
453 		switch(state.depthCompareMode)
454 		{
455 		case DEPTH_ALWAYS:
456 			// Optimized
457 			break;
458 		case DEPTH_NEVER:
459 			// Optimized
460 			break;
461 		case DEPTH_EQUAL:
462 			zTest = CmpEQ(zValue, Z);
463 			break;
464 		case DEPTH_NOTEQUAL:
465 			zTest = CmpNEQ(zValue, Z);
466 			break;
467 		case DEPTH_LESS:
468 			if(complementaryDepthBuffer)
469 			{
470 				zTest = CmpLT(zValue, Z);
471 			}
472 			else
473 			{
474 				zTest = CmpNLE(zValue, Z);
475 			}
476 			break;
477 		case DEPTH_GREATEREQUAL:
478 			if(complementaryDepthBuffer)
479 			{
480 				zTest = CmpNLT(zValue, Z);
481 			}
482 			else
483 			{
484 				zTest = CmpLE(zValue, Z);
485 			}
486 			break;
487 		case DEPTH_LESSEQUAL:
488 			if(complementaryDepthBuffer)
489 			{
490 				zTest = CmpLE(zValue, Z);
491 			}
492 			else
493 			{
494 				zTest = CmpNLT(zValue, Z);
495 			}
496 			break;
497 		case DEPTH_GREATER:
498 			if(complementaryDepthBuffer)
499 			{
500 				zTest = CmpNLE(zValue, Z);
501 			}
502 			else
503 			{
504 				zTest = CmpLT(zValue, Z);
505 			}
506 			break;
507 		default:
508 			ASSERT(false);
509 		}
510 
511 		switch(state.depthCompareMode)
512 		{
513 		case DEPTH_ALWAYS:
514 			zMask = cMask;
515 			break;
516 		case DEPTH_NEVER:
517 			zMask = 0x0;
518 			break;
519 		default:
520 			zMask = SignMask(zTest) & cMask;
521 			break;
522 		}
523 
524 		if(state.stencilActive)
525 		{
526 			zMask &= sMask;
527 		}
528 
529 		return zMask != 0;
530 	}
531 
alphaTest(Int & aMask,Short4 & alpha)532 	void PixelRoutine::alphaTest(Int &aMask, Short4 &alpha)
533 	{
534 		Short4 cmp;
535 		Short4 equal;
536 
537 		switch(state.alphaCompareMode)
538 		{
539 		case ALPHA_ALWAYS:
540 			aMask = 0xF;
541 			break;
542 		case ALPHA_NEVER:
543 			aMask = 0x0;
544 			break;
545 		case ALPHA_EQUAL:
546 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
547 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
548 			break;
549 		case ALPHA_NOTEQUAL:       // a != b ~ !(a == b)
550 			cmp = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
551 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
552 			break;
553 		case ALPHA_LESS:           // a < b ~ b > a
554 			cmp = CmpGT(*Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)), alpha);
555 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
556 			break;
557 		case ALPHA_GREATEREQUAL:   // a >= b ~ (a > b) || (a == b) ~ !(b > a)   // TODO: Approximate
558 			equal = CmpEQ(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
559 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
560 			cmp |= equal;
561 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
562 			break;
563 		case ALPHA_LESSEQUAL:      // a <= b ~ !(a > b)
564 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4))) ^ Short4(0xFFFFu);   // FIXME
565 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
566 			break;
567 		case ALPHA_GREATER:        // a > b
568 			cmp = CmpGT(alpha, *Pointer<Short4>(data + OFFSET(DrawData,factor.alphaReference4)));
569 			aMask = SignMask(Pack(cmp, Short4(0x0000)));
570 			break;
571 		default:
572 			ASSERT(false);
573 		}
574 	}
575 
alphaToCoverage(Int cMask[4],Float4 & alpha)576 	void PixelRoutine::alphaToCoverage(Int cMask[4], Float4 &alpha)
577 	{
578 		Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c0)));
579 		Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c1)));
580 		Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c2)));
581 		Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData,a2c3)));
582 
583 		Int aMask0 = SignMask(coverage0);
584 		Int aMask1 = SignMask(coverage1);
585 		Int aMask2 = SignMask(coverage2);
586 		Int aMask3 = SignMask(coverage3);
587 
588 		cMask[0] &= aMask0;
589 		cMask[1] &= aMask1;
590 		cMask[2] &= aMask2;
591 		cMask[3] &= aMask3;
592 	}
593 
fogBlend(Vector4f & c0,Float4 & fog)594 	void PixelRoutine::fogBlend(Vector4f &c0, Float4 &fog)
595 	{
596 		if(!state.fogActive)
597 		{
598 			return;
599 		}
600 
601 		if(state.pixelFogMode != FOG_NONE)
602 		{
603 			pixelFog(fog);
604 
605 			fog = Min(fog, Float4(1.0f));
606 			fog = Max(fog, Float4(0.0f));
607 		}
608 
609 		c0.x -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
610 		c0.y -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
611 		c0.z -= *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
612 
613 		c0.x *= fog;
614 		c0.y *= fog;
615 		c0.z *= fog;
616 
617 		c0.x += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[0]));
618 		c0.y += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[1]));
619 		c0.z += *Pointer<Float4>(data + OFFSET(DrawData,fog.colorF[2]));
620 	}
621 
pixelFog(Float4 & visibility)622 	void PixelRoutine::pixelFog(Float4 &visibility)
623 	{
624 		Float4 &zw = visibility;
625 
626 		if(state.pixelFogMode != FOG_NONE)
627 		{
628 			if(state.wBasedFog)
629 			{
630 				zw = rhw;
631 			}
632 			else
633 			{
634 				if(complementaryDepthBuffer)
635 				{
636 					zw = Float4(1.0f) - z[0];
637 				}
638 				else
639 				{
640 					zw = z[0];
641 				}
642 			}
643 		}
644 
645 		switch(state.pixelFogMode)
646 		{
647 		case FOG_NONE:
648 			break;
649 		case FOG_LINEAR:
650 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.scale));
651 			zw += *Pointer<Float4>(data + OFFSET(DrawData,fog.offset));
652 			break;
653 		case FOG_EXP:
654 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.densityE));
655 			zw = exponential2(zw, true);
656 			break;
657 		case FOG_EXP2:
658 			zw *= zw;
659 			zw *= *Pointer<Float4>(data + OFFSET(DrawData,fog.density2E));
660 			zw = exponential2(zw, true);
661 			break;
662 		default:
663 			ASSERT(false);
664 		}
665 	}
666 
writeDepth(Pointer<Byte> & zBuffer,int q,Int & x,Float4 & z,Int & zMask)667 	void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, Int &x, Float4 &z, Int &zMask)
668 	{
669 		if(!state.depthWriteEnable)
670 		{
671 			return;
672 		}
673 
674 		Float4 Z = z;
675 
676 		if(shader && shader->depthOverride())
677 		{
678 			if(complementaryDepthBuffer)
679 			{
680 				Z = Float4(1.0f) - oDepth;
681 			}
682 			else
683 			{
684 				Z = oDepth;
685 			}
686 		}
687 
688 		Pointer<Byte> buffer;
689 		Int pitch;
690 
691 		if(!state.quadLayoutDepthBuffer)
692 		{
693 			buffer = zBuffer + 4 * x;
694 			pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
695 		}
696 		else
697 		{
698 			buffer = zBuffer + 8 * x;
699 		}
700 
701 		if(q > 0)
702 		{
703 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,depthSliceB));
704 		}
705 
706 		Float4 zValue;
707 
708 		if(state.depthCompareMode != DEPTH_NEVER || (state.depthCompareMode != DEPTH_ALWAYS && !state.depthWriteEnable))
709 		{
710 			if(!state.quadLayoutDepthBuffer)
711 			{
712 				// FIXME: Properly optimizes?
713 				zValue.xy = *Pointer<Float4>(buffer);
714 				zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
715 			}
716 			else
717 			{
718 				zValue = *Pointer<Float4>(buffer, 16);
719 			}
720 		}
721 
722 		Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + zMask * 16, 16));
723 		zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + zMask * 16, 16));
724 		Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
725 
726 		if(!state.quadLayoutDepthBuffer)
727 		{
728 			// FIXME: Properly optimizes?
729 			*Pointer<Float2>(buffer) = Float2(Z.xy);
730 			*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
731 		}
732 		else
733 		{
734 			*Pointer<Float4>(buffer, 16) = Z;
735 		}
736 	}
737 
writeStencil(Pointer<Byte> & sBuffer,int q,Int & x,Int & sMask,Int & zMask,Int & cMask)738 	void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, Int &x, Int &sMask, Int &zMask, Int &cMask)
739 	{
740 		if(!state.stencilActive)
741 		{
742 			return;
743 		}
744 
745 		if(state.stencilPassOperation == OPERATION_KEEP && state.stencilZFailOperation == OPERATION_KEEP && state.stencilFailOperation == OPERATION_KEEP)
746 		{
747 			if(!state.twoSidedStencil || (state.stencilPassOperationCCW == OPERATION_KEEP && state.stencilZFailOperationCCW == OPERATION_KEEP && state.stencilFailOperationCCW == OPERATION_KEEP))
748 			{
749 				return;
750 			}
751 		}
752 
753 		if(state.stencilWriteMasked && (!state.twoSidedStencil || state.stencilWriteMaskedCCW))
754 		{
755 			return;
756 		}
757 
758 		Pointer<Byte> buffer = sBuffer + 2 * x;
759 
760 		if(q > 0)
761 		{
762 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData,stencilSliceB));
763 		}
764 
765 		Byte8 bufferValue = *Pointer<Byte8>(buffer);
766 
767 		Byte8 newValue;
768 		stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
769 
770 		if(!state.noStencilWriteMask)
771 		{
772 			Byte8 maskedValue = bufferValue;
773 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].writeMaskQ));
774 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[0].invWriteMaskQ));
775 			newValue |= maskedValue;
776 		}
777 
778 		if(state.twoSidedStencil)
779 		{
780 			Byte8 newValueCCW;
781 
782 			stencilOperation(newValueCCW, bufferValue, state.stencilPassOperationCCW, state.stencilZFailOperationCCW, state.stencilFailOperationCCW, true, zMask, sMask);
783 
784 			if(!state.noStencilWriteMaskCCW)
785 			{
786 				Byte8 maskedValue = bufferValue;
787 				newValueCCW &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].writeMaskQ));
788 				maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData,stencil[1].invWriteMaskQ));
789 				newValueCCW |= maskedValue;
790 			}
791 
792 			newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive,clockwiseMask));
793 			newValueCCW &= *Pointer<Byte8>(primitive + OFFSET(Primitive,invClockwiseMask));
794 			newValue |= newValueCCW;
795 		}
796 
797 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * cMask);
798 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * cMask);
799 		newValue |= bufferValue;
800 
801 		*Pointer<Byte4>(buffer) = Byte4(newValue);
802 	}
803 
stencilOperation(Byte8 & newValue,Byte8 & bufferValue,StencilOperation stencilPassOperation,StencilOperation stencilZFailOperation,StencilOperation stencilFailOperation,bool CCW,Int & zMask,Int & sMask)804 	void PixelRoutine::stencilOperation(Byte8 &newValue, Byte8 &bufferValue, StencilOperation stencilPassOperation, StencilOperation stencilZFailOperation, StencilOperation stencilFailOperation, bool CCW, Int &zMask, Int &sMask)
805 	{
806 		Byte8 &pass = newValue;
807 		Byte8 fail;
808 		Byte8 zFail;
809 
810 		stencilOperation(pass, bufferValue, stencilPassOperation, CCW);
811 
812 		if(stencilZFailOperation != stencilPassOperation)
813 		{
814 			stencilOperation(zFail, bufferValue, stencilZFailOperation, CCW);
815 		}
816 
817 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
818 		{
819 			stencilOperation(fail, bufferValue, stencilFailOperation, CCW);
820 		}
821 
822 		if(stencilFailOperation != stencilPassOperation || stencilFailOperation != stencilZFailOperation)
823 		{
824 			if(state.depthTestActive && stencilZFailOperation != stencilPassOperation)   // zMask valid and values not the same
825 			{
826 				pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * zMask);
827 				zFail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * zMask);
828 				pass |= zFail;
829 			}
830 
831 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants,maskB4Q) + 8 * sMask);
832 			fail &= *Pointer<Byte8>(constants + OFFSET(Constants,invMaskB4Q) + 8 * sMask);
833 			pass |= fail;
834 		}
835 	}
836 
stencilOperation(Byte8 & output,Byte8 & bufferValue,StencilOperation operation,bool CCW)837 	void PixelRoutine::stencilOperation(Byte8 &output, Byte8 &bufferValue, StencilOperation operation, bool CCW)
838 	{
839 		switch(operation)
840 		{
841 		case OPERATION_KEEP:
842 			output = bufferValue;
843 			break;
844 		case OPERATION_ZERO:
845 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
846 			break;
847 		case OPERATION_REPLACE:
848 			output = *Pointer<Byte8>(data + OFFSET(DrawData,stencil[CCW].referenceQ));
849 			break;
850 		case OPERATION_INCRSAT:
851 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
852 			break;
853 		case OPERATION_DECRSAT:
854 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
855 			break;
856 		case OPERATION_INVERT:
857 			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
858 			break;
859 		case OPERATION_INCR:
860 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
861 			break;
862 		case OPERATION_DECR:
863 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
864 			break;
865 		default:
866 			ASSERT(false);
867 		}
868 	}
869 
blendFactor(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorActive)870 	void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorActive)
871 	{
872 		switch(blendFactorActive)
873 		{
874 		case BLEND_ZERO:
875 			// Optimized
876 			break;
877 		case BLEND_ONE:
878 			// Optimized
879 			break;
880 		case BLEND_SOURCE:
881 			blendFactor.x = current.x;
882 			blendFactor.y = current.y;
883 			blendFactor.z = current.z;
884 			break;
885 		case BLEND_INVSOURCE:
886 			blendFactor.x = Short4(0xFFFFu) - current.x;
887 			blendFactor.y = Short4(0xFFFFu) - current.y;
888 			blendFactor.z = Short4(0xFFFFu) - current.z;
889 			break;
890 		case BLEND_DEST:
891 			blendFactor.x = pixel.x;
892 			blendFactor.y = pixel.y;
893 			blendFactor.z = pixel.z;
894 			break;
895 		case BLEND_INVDEST:
896 			blendFactor.x = Short4(0xFFFFu) - pixel.x;
897 			blendFactor.y = Short4(0xFFFFu) - pixel.y;
898 			blendFactor.z = Short4(0xFFFFu) - pixel.z;
899 			break;
900 		case BLEND_SOURCEALPHA:
901 			blendFactor.x = current.w;
902 			blendFactor.y = current.w;
903 			blendFactor.z = current.w;
904 			break;
905 		case BLEND_INVSOURCEALPHA:
906 			blendFactor.x = Short4(0xFFFFu) - current.w;
907 			blendFactor.y = Short4(0xFFFFu) - current.w;
908 			blendFactor.z = Short4(0xFFFFu) - current.w;
909 			break;
910 		case BLEND_DESTALPHA:
911 			blendFactor.x = pixel.w;
912 			blendFactor.y = pixel.w;
913 			blendFactor.z = pixel.w;
914 			break;
915 		case BLEND_INVDESTALPHA:
916 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
917 			blendFactor.y = Short4(0xFFFFu) - pixel.w;
918 			blendFactor.z = Short4(0xFFFFu) - pixel.w;
919 			break;
920 		case BLEND_SRCALPHASAT:
921 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
922 			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
923 			blendFactor.y = blendFactor.x;
924 			blendFactor.z = blendFactor.x;
925 			break;
926 		case BLEND_CONSTANT:
927 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[0]));
928 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[1]));
929 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[2]));
930 			break;
931 		case BLEND_INVCONSTANT:
932 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[0]));
933 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[1]));
934 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[2]));
935 			break;
936 		case BLEND_CONSTANTALPHA:
937 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
938 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
939 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
940 			break;
941 		case BLEND_INVCONSTANTALPHA:
942 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
943 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
944 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
945 			break;
946 		default:
947 			ASSERT(false);
948 		}
949 	}
950 
blendFactorAlpha(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,BlendFactor blendFactorAlphaActive)951 	void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
952 	{
953 		switch(blendFactorAlphaActive)
954 		{
955 		case BLEND_ZERO:
956 			// Optimized
957 			break;
958 		case BLEND_ONE:
959 			// Optimized
960 			break;
961 		case BLEND_SOURCE:
962 			blendFactor.w = current.w;
963 			break;
964 		case BLEND_INVSOURCE:
965 			blendFactor.w = Short4(0xFFFFu) - current.w;
966 			break;
967 		case BLEND_DEST:
968 			blendFactor.w = pixel.w;
969 			break;
970 		case BLEND_INVDEST:
971 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
972 			break;
973 		case BLEND_SOURCEALPHA:
974 			blendFactor.w = current.w;
975 			break;
976 		case BLEND_INVSOURCEALPHA:
977 			blendFactor.w = Short4(0xFFFFu) - current.w;
978 			break;
979 		case BLEND_DESTALPHA:
980 			blendFactor.w = pixel.w;
981 			break;
982 		case BLEND_INVDESTALPHA:
983 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
984 			break;
985 		case BLEND_SRCALPHASAT:
986 			blendFactor.w = Short4(0xFFFFu);
987 			break;
988 		case BLEND_CONSTANT:
989 		case BLEND_CONSTANTALPHA:
990 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.blendConstant4W[3]));
991 			break;
992 		case BLEND_INVCONSTANT:
993 		case BLEND_INVCONSTANTALPHA:
994 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData,factor.invBlendConstant4W[3]));
995 			break;
996 		default:
997 			ASSERT(false);
998 		}
999 	}
1000 
isSRGB(int index) const1001 	bool PixelRoutine::isSRGB(int index) const
1002 	{
1003 		return state.targetFormat[index] == FORMAT_SRGB8_A8 || state.targetFormat[index] == FORMAT_SRGB8_X8;
1004 	}
1005 
readPixel(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & pixel)1006 	void PixelRoutine::readPixel(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &pixel)
1007 	{
1008 		Short4 c01;
1009 		Short4 c23;
1010 		Pointer<Byte> buffer;
1011 		Pointer<Byte> buffer2;
1012 
1013 		switch(state.targetFormat[index])
1014 		{
1015 		case FORMAT_R5G6B5:
1016 			buffer = cBuffer + 2 * x;
1017 			buffer2 = buffer + *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1018 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1019 
1020 			pixel.x = c01 & Short4(0xF800u);
1021 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
1022 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
1023 			pixel.w = Short4(0xFFFFu);
1024 			break;
1025 		case FORMAT_A8R8G8B8:
1026 			buffer = cBuffer + 4 * x;
1027 			c01 = *Pointer<Short4>(buffer);
1028 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1029 			c23 = *Pointer<Short4>(buffer);
1030 			pixel.z = c01;
1031 			pixel.y = c01;
1032 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1033 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1034 			pixel.x = pixel.z;
1035 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1036 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1037 			pixel.y = pixel.z;
1038 			pixel.w = pixel.x;
1039 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1040 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1041 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1042 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1043 			break;
1044 		case FORMAT_A8B8G8R8:
1045 		case FORMAT_SRGB8_A8:
1046 			buffer = cBuffer + 4 * x;
1047 			c01 = *Pointer<Short4>(buffer);
1048 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1049 			c23 = *Pointer<Short4>(buffer);
1050 			pixel.z = c01;
1051 			pixel.y = c01;
1052 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1053 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1054 			pixel.x = pixel.z;
1055 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1056 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1057 			pixel.y = pixel.z;
1058 			pixel.w = pixel.x;
1059 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1060 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1061 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1062 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1063 			break;
1064 		case FORMAT_A8:
1065 			buffer = cBuffer + 1 * x;
1066 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 0);
1067 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1068 			pixel.w = Insert(pixel.w, *Pointer<Short>(buffer), 1);
1069 			pixel.w = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1070 			pixel.x = Short4(0x0000);
1071 			pixel.y = Short4(0x0000);
1072 			pixel.z = Short4(0x0000);
1073 			break;
1074 		case FORMAT_X8R8G8B8:
1075 			buffer = cBuffer + 4 * x;
1076 			c01 = *Pointer<Short4>(buffer);
1077 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1078 			c23 = *Pointer<Short4>(buffer);
1079 			pixel.z = c01;
1080 			pixel.y = c01;
1081 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1082 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1083 			pixel.x = pixel.z;
1084 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1085 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1086 			pixel.y = pixel.z;
1087 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1088 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1089 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1090 			pixel.w = Short4(0xFFFFu);
1091 			break;
1092 		case FORMAT_X8B8G8R8:
1093 		case FORMAT_SRGB8_X8:
1094 			buffer = cBuffer + 4 * x;
1095 			c01 = *Pointer<Short4>(buffer);
1096 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1097 			c23 = *Pointer<Short4>(buffer);
1098 			pixel.z = c01;
1099 			pixel.y = c01;
1100 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1101 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1102 			pixel.x = pixel.z;
1103 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1104 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1105 			pixel.y = pixel.z;
1106 			pixel.w = pixel.x;
1107 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1108 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1109 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1110 			pixel.w = Short4(0xFFFFu);
1111 			break;
1112 		case FORMAT_A8G8R8B8Q:
1113 			UNIMPLEMENTED();
1114 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1115 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1116 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1117 		//	pixel.w = UnpackHigh(As<Byte8>(pixel.w), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1118 			break;
1119 		case FORMAT_X8G8R8B8Q:
1120 			UNIMPLEMENTED();
1121 		//	pixel.z = UnpackLow(As<Byte8>(pixel.z), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1122 		//	pixel.x = UnpackHigh(As<Byte8>(pixel.x), *Pointer<Byte8>(cBuffer + 8 * x + 0));
1123 		//	pixel.y = UnpackLow(As<Byte8>(pixel.y), *Pointer<Byte8>(cBuffer + 8 * x + 8));
1124 		//	pixel.w = Short4(0xFFFFu);
1125 			break;
1126 		case FORMAT_A16B16G16R16:
1127 			buffer = cBuffer;
1128 			pixel.x = *Pointer<Short4>(buffer + 8 * x);
1129 			pixel.y = *Pointer<Short4>(buffer + 8 * x + 8);
1130 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1131 			pixel.z = *Pointer<Short4>(buffer + 8 * x);
1132 			pixel.w = *Pointer<Short4>(buffer + 8 * x + 8);
1133 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1134 			break;
1135 		case FORMAT_G16R16:
1136 			buffer = cBuffer;
1137 			pixel.x = *Pointer<Short4>(buffer + 4 * x);
1138 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1139 			pixel.y = *Pointer<Short4>(buffer + 4 * x);
1140 			pixel.z = pixel.x;
1141 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1142 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1143 			pixel.y = pixel.z;
1144 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1145 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1146 			pixel.z = Short4(0xFFFFu);
1147 			pixel.w = Short4(0xFFFFu);
1148 			break;
1149 		default:
1150 			ASSERT(false);
1151 		}
1152 
1153 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1154 		{
1155 			sRGBtoLinear16_12_16(pixel);
1156 		}
1157 	}
1158 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1159 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1160 	{
1161 		if(!state.alphaBlendActive)
1162 		{
1163 			return;
1164 		}
1165 
1166 		Vector4s pixel;
1167 		readPixel(index, cBuffer, x, pixel);
1168 
1169 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1170 		Vector4s sourceFactor;
1171 		Vector4s destFactor;
1172 
1173 		blendFactor(sourceFactor, current, pixel, state.sourceBlendFactor);
1174 		blendFactor(destFactor, current, pixel, state.destBlendFactor);
1175 
1176 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
1177 		{
1178 			current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1179 			current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1180 			current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1181 		}
1182 
1183 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
1184 		{
1185 			pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1186 			pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1187 			pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1188 		}
1189 
1190 		switch(state.blendOperation)
1191 		{
1192 		case BLENDOP_ADD:
1193 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1194 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1195 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1196 			break;
1197 		case BLENDOP_SUB:
1198 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1199 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1200 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1201 			break;
1202 		case BLENDOP_INVSUB:
1203 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1204 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1205 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1206 			break;
1207 		case BLENDOP_MIN:
1208 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1209 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1210 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1211 			break;
1212 		case BLENDOP_MAX:
1213 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1214 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1215 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1216 			break;
1217 		case BLENDOP_SOURCE:
1218 			// No operation
1219 			break;
1220 		case BLENDOP_DEST:
1221 			current.x = pixel.x;
1222 			current.y = pixel.y;
1223 			current.z = pixel.z;
1224 			break;
1225 		case BLENDOP_NULL:
1226 			current.x = Short4(0x0000);
1227 			current.y = Short4(0x0000);
1228 			current.z = Short4(0x0000);
1229 			break;
1230 		default:
1231 			ASSERT(false);
1232 		}
1233 
1234 		blendFactorAlpha(sourceFactor, current, pixel, state.sourceBlendFactorAlpha);
1235 		blendFactorAlpha(destFactor, current, pixel, state.destBlendFactorAlpha);
1236 
1237 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
1238 		{
1239 			current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1240 		}
1241 
1242 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
1243 		{
1244 			pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1245 		}
1246 
1247 		switch(state.blendOperationAlpha)
1248 		{
1249 		case BLENDOP_ADD:
1250 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1251 			break;
1252 		case BLENDOP_SUB:
1253 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1254 			break;
1255 		case BLENDOP_INVSUB:
1256 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1257 			break;
1258 		case BLENDOP_MIN:
1259 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1260 			break;
1261 		case BLENDOP_MAX:
1262 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1263 			break;
1264 		case BLENDOP_SOURCE:
1265 			// No operation
1266 			break;
1267 		case BLENDOP_DEST:
1268 			current.w = pixel.w;
1269 			break;
1270 		case BLENDOP_NULL:
1271 			current.w = Short4(0x0000);
1272 			break;
1273 		default:
1274 			ASSERT(false);
1275 		}
1276 	}
1277 
logicOperation(int index,Pointer<Byte> & cBuffer,Vector4s & current,Int & x)1278 	void PixelRoutine::logicOperation(int index, Pointer<Byte> &cBuffer, Vector4s &current, Int &x)
1279 	{
1280 		if(state.logicalOperation == LOGICALOP_COPY)
1281 		{
1282 			return;
1283 		}
1284 
1285 		Vector4s pixel;
1286 		readPixel(index, cBuffer, x, pixel);
1287 
1288 		switch(state.logicalOperation)
1289 		{
1290 		case LOGICALOP_CLEAR:
1291 			current.x = UShort4(0);
1292 			current.y = UShort4(0);
1293 			current.z = UShort4(0);
1294 			break;
1295 		case LOGICALOP_SET:
1296 			current.x = UShort4(0xFFFFu);
1297 			current.y = UShort4(0xFFFFu);
1298 			current.z = UShort4(0xFFFFu);
1299 			break;
1300 		case LOGICALOP_COPY:
1301 			ASSERT(false);   // Optimized out
1302 			break;
1303 		case LOGICALOP_COPY_INVERTED:
1304 			current.x = ~current.x;
1305 			current.y = ~current.y;
1306 			current.z = ~current.z;
1307 			break;
1308 		case LOGICALOP_NOOP:
1309 			current.x = pixel.x;
1310 			current.y = pixel.y;
1311 			current.z = pixel.z;
1312 			break;
1313 		case LOGICALOP_INVERT:
1314 			current.x = ~pixel.x;
1315 			current.y = ~pixel.y;
1316 			current.z = ~pixel.z;
1317 			break;
1318 		case LOGICALOP_AND:
1319 			current.x = pixel.x & current.x;
1320 			current.y = pixel.y & current.y;
1321 			current.z = pixel.z & current.z;
1322 			break;
1323 		case LOGICALOP_NAND:
1324 			current.x = ~(pixel.x & current.x);
1325 			current.y = ~(pixel.y & current.y);
1326 			current.z = ~(pixel.z & current.z);
1327 			break;
1328 		case LOGICALOP_OR:
1329 			current.x = pixel.x | current.x;
1330 			current.y = pixel.y | current.y;
1331 			current.z = pixel.z | current.z;
1332 			break;
1333 		case LOGICALOP_NOR:
1334 			current.x = ~(pixel.x | current.x);
1335 			current.y = ~(pixel.y | current.y);
1336 			current.z = ~(pixel.z | current.z);
1337 			break;
1338 		case LOGICALOP_XOR:
1339 			current.x = pixel.x ^ current.x;
1340 			current.y = pixel.y ^ current.y;
1341 			current.z = pixel.z ^ current.z;
1342 			break;
1343 		case LOGICALOP_EQUIV:
1344 			current.x = ~(pixel.x ^ current.x);
1345 			current.y = ~(pixel.y ^ current.y);
1346 			current.z = ~(pixel.z ^ current.z);
1347 			break;
1348 		case LOGICALOP_AND_REVERSE:
1349 			current.x = ~pixel.x & current.x;
1350 			current.y = ~pixel.y & current.y;
1351 			current.z = ~pixel.z & current.z;
1352 			break;
1353 		case LOGICALOP_AND_INVERTED:
1354 			current.x = pixel.x & ~current.x;
1355 			current.y = pixel.y & ~current.y;
1356 			current.z = pixel.z & ~current.z;
1357 			break;
1358 		case LOGICALOP_OR_REVERSE:
1359 			current.x = ~pixel.x | current.x;
1360 			current.y = ~pixel.y | current.y;
1361 			current.z = ~pixel.z | current.z;
1362 			break;
1363 		case LOGICALOP_OR_INVERTED:
1364 			current.x = pixel.x | ~current.x;
1365 			current.y = pixel.y | ~current.y;
1366 			current.z = pixel.z | ~current.z;
1367 			break;
1368 		default:
1369 			ASSERT(false);
1370 		}
1371 	}
1372 
writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4s & current,Int & sMask,Int & zMask,Int & cMask)1373 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4s &current, Int &sMask, Int &zMask, Int &cMask)
1374 	{
1375 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
1376 		{
1377 			linearToSRGB16_12_16(current);
1378 		}
1379 
1380 		if(exactColorRounding)
1381 		{
1382 			switch(state.targetFormat[index])
1383 			{
1384 			case FORMAT_R5G6B5:
1385 				current.x = AddSat(As<UShort4>(current.x), UShort4(0x0400));
1386 				current.y = AddSat(As<UShort4>(current.y), UShort4(0x0200));
1387 				current.z = AddSat(As<UShort4>(current.z), UShort4(0x0400));
1388 				break;
1389 			case FORMAT_X8G8R8B8Q:
1390 			case FORMAT_A8G8R8B8Q:
1391 			case FORMAT_X8R8G8B8:
1392 			case FORMAT_X8B8G8R8:
1393 			case FORMAT_A8R8G8B8:
1394 			case FORMAT_A8B8G8R8:
1395 			case FORMAT_SRGB8_X8:
1396 			case FORMAT_SRGB8_A8:
1397 			case FORMAT_G8R8:
1398 			case FORMAT_R8:
1399 				current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1400 				current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1401 				current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1402 				current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1403 				break;
1404 			default:
1405 				break;
1406 			}
1407 		}
1408 
1409 		int rgbaWriteMask = state.colorWriteActive(index);
1410 		int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1411 
1412 		switch(state.targetFormat[index])
1413 		{
1414 		case FORMAT_R5G6B5:
1415 			{
1416 				current.x = current.x & Short4(0xF800u);
1417 				current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1418 				current.z = As<UShort4>(current.z) >> 11;
1419 
1420 				current.x = current.x | current.y | current.z;
1421 			}
1422 			break;
1423 		case FORMAT_X8G8R8B8Q:
1424 			UNIMPLEMENTED();
1425 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1426 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1427 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1428 
1429 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1430 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1431 			break;
1432 		case FORMAT_A8G8R8B8Q:
1433 			UNIMPLEMENTED();
1434 		//	current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1435 		//	current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1436 		//	current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1437 		//	current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1438 
1439 		//	current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1440 		//	current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1441 			break;
1442 		case FORMAT_X8R8G8B8:
1443 		case FORMAT_A8R8G8B8:
1444 			if(state.targetFormat[index] == FORMAT_X8R8G8B8 || rgbaWriteMask == 0x7)
1445 			{
1446 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449 
1450 				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1451 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1452 
1453 				current.x = current.z;
1454 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1455 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1456 				current.y = current.z;
1457 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1458 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1459 			}
1460 			else
1461 			{
1462 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1463 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1464 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1465 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1466 
1467 				current.z = As<Short4>(Pack(As<UShort4>(current.z), As<UShort4>(current.x)));
1468 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1469 
1470 				current.x = current.z;
1471 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1472 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1473 				current.y = current.z;
1474 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1475 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1476 			}
1477 			break;
1478 		case FORMAT_X8B8G8R8:
1479 		case FORMAT_A8B8G8R8:
1480 		case FORMAT_SRGB8_X8:
1481 		case FORMAT_SRGB8_A8:
1482 			if(state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8 || rgbaWriteMask == 0x7)
1483 			{
1484 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1485 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1486 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1487 
1488 				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1489 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1490 
1491 				current.x = current.z;
1492 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1493 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1494 				current.y = current.z;
1495 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1496 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1497 			}
1498 			else
1499 			{
1500 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1501 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1502 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1503 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1504 
1505 				current.z = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.z)));
1506 				current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.w)));
1507 
1508 				current.x = current.z;
1509 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1510 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1511 				current.y = current.z;
1512 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1513 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1514 			}
1515 			break;
1516 		case FORMAT_G8R8:
1517 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1518 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1519 			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1520 			current.y = As<Short4>(Pack(As<UShort4>(current.y), As<UShort4>(current.y)));
1521 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1522 			break;
1523 		case FORMAT_R8:
1524 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1525 			current.x = As<Short4>(Pack(As<UShort4>(current.x), As<UShort4>(current.x)));
1526 			break;
1527 		case FORMAT_A8:
1528 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1529 			current.w = As<Short4>(Pack(As<UShort4>(current.w), As<UShort4>(current.w)));
1530 			break;
1531 		case FORMAT_G16R16:
1532 			current.z = current.x;
1533 			current.x = As<Short4>(UnpackLow(current.x, current.y));
1534 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
1535 			current.y = current.z;
1536 			break;
1537 		case FORMAT_A16B16G16R16:
1538 			transpose4x4(current.x, current.y, current.z, current.w);
1539 			break;
1540 		default:
1541 			ASSERT(false);
1542 		}
1543 
1544 		Short4 c01 = current.z;
1545 		Short4 c23 = current.y;
1546 
1547 		Int xMask;   // Combination of all masks
1548 
1549 		if(state.depthTestActive)
1550 		{
1551 			xMask = zMask;
1552 		}
1553 		else
1554 		{
1555 			xMask = cMask;
1556 		}
1557 
1558 		if(state.stencilActive)
1559 		{
1560 			xMask &= sMask;
1561 		}
1562 
1563 		switch(state.targetFormat[index])
1564 		{
1565 		case FORMAT_R5G6B5:
1566 			{
1567 				Pointer<Byte> buffer = cBuffer + 2 * x;
1568 				Int value = *Pointer<Int>(buffer);
1569 
1570 				Int c01 = Extract(As<Int2>(current.x), 0);
1571 
1572 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1573 				{
1574 					Int masked = value;
1575 					c01 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1576 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1577 					c01 |= masked;
1578 				}
1579 
1580 				c01 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][0]) + xMask * 8);
1581 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][0]) + xMask * 8);
1582 				c01 |= value;
1583 				*Pointer<Int>(buffer) = c01;
1584 
1585 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1586 				value = *Pointer<Int>(buffer);
1587 
1588 				Int c23 = Extract(As<Int2>(current.x), 1);
1589 
1590 				if((bgraWriteMask & 0x00000007) != 0x00000007)
1591 				{
1592 					Int masked = value;
1593 					c23 &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[bgraWriteMask & 0x7][0]));
1594 					masked &= *Pointer<Int>(constants + OFFSET(Constants,mask565Q[~bgraWriteMask & 0x7][0]));
1595 					c23 |= masked;
1596 				}
1597 
1598 				c23 &= *Pointer<Int>(constants + OFFSET(Constants,maskW4Q[0][2]) + xMask * 8);
1599 				value &= *Pointer<Int>(constants + OFFSET(Constants,invMaskW4Q[0][2]) + xMask * 8);
1600 				c23 |= value;
1601 				*Pointer<Int>(buffer) = c23;
1602 			}
1603 			break;
1604 		case FORMAT_A8G8R8B8Q:
1605 		case FORMAT_X8G8R8B8Q:   // FIXME: Don't touch alpha?
1606 			UNIMPLEMENTED();
1607 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 0);
1608 
1609 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1610 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1611 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1612 		//	{
1613 		//		Short4 masked = value;
1614 		//		c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1615 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1616 		//		c01 |= masked;
1617 		//	}
1618 
1619 		//	c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1620 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1621 		//	c01 |= value;
1622 		//	*Pointer<Short4>(cBuffer + 8 * x + 0) = c01;
1623 
1624 		//	value = *Pointer<Short4>(cBuffer + 8 * x + 8);
1625 
1626 		//	if((state.targetFormat[index] == FORMAT_A8G8R8B8Q && bgraWriteMask != 0x0000000F) ||
1627 		//	   ((state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x00000007) &&
1628 		//	    (state.targetFormat[index] == FORMAT_X8G8R8B8Q && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1629 		//	{
1630 		//		Short4 masked = value;
1631 		//		c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1632 		//		masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1633 		//		c23 |= masked;
1634 		//	}
1635 
1636 		//	c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1637 		//	value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1638 		//	c23 |= value;
1639 		//	*Pointer<Short4>(cBuffer + 8 * x + 8) = c23;
1640 			break;
1641 		case FORMAT_A8R8G8B8:
1642 		case FORMAT_X8R8G8B8:   // FIXME: Don't touch alpha?
1643 			{
1644 				Pointer<Byte> buffer = cBuffer + x * 4;
1645 				Short4 value = *Pointer<Short4>(buffer);
1646 
1647 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1648 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1649 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1650 				{
1651 					Short4 masked = value;
1652 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1653 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1654 					c01 |= masked;
1655 				}
1656 
1657 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1658 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1659 				c01 |= value;
1660 				*Pointer<Short4>(buffer) = c01;
1661 
1662 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1663 				value = *Pointer<Short4>(buffer);
1664 
1665 				if((state.targetFormat[index] == FORMAT_A8R8G8B8 && bgraWriteMask != 0x0000000F) ||
1666 				   ((state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x00000007) &&
1667 					(state.targetFormat[index] == FORMAT_X8R8G8B8 && bgraWriteMask != 0x0000000F)))   // FIXME: Need for masking when XRGB && Fh?
1668 				{
1669 					Short4 masked = value;
1670 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[bgraWriteMask][0]));
1671 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[bgraWriteMask][0]));
1672 					c23 |= masked;
1673 				}
1674 
1675 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1676 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1677 				c23 |= value;
1678 				*Pointer<Short4>(buffer) = c23;
1679 			}
1680 			break;
1681 		case FORMAT_A8B8G8R8:
1682 		case FORMAT_X8B8G8R8:   // FIXME: Don't touch alpha?
1683 		case FORMAT_SRGB8_X8:
1684 		case FORMAT_SRGB8_A8:
1685 			{
1686 				Pointer<Byte> buffer = cBuffer + x * 4;
1687 				Short4 value = *Pointer<Short4>(buffer);
1688 
1689 				bool masked = (((state.targetFormat[index] == FORMAT_A8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_A8) && rgbaWriteMask != 0x0000000F) ||
1690 				              (((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x00000007) &&
1691 				               ((state.targetFormat[index] == FORMAT_X8B8G8R8 || state.targetFormat[index] == FORMAT_SRGB8_X8) && rgbaWriteMask != 0x0000000F))); // FIXME: Need for masking when XBGR && Fh?
1692 
1693 				if(masked)
1694 				{
1695 					Short4 masked = value;
1696 					c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1697 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1698 					c01 |= masked;
1699 				}
1700 
1701 				c01 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1702 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1703 				c01 |= value;
1704 				*Pointer<Short4>(buffer) = c01;
1705 
1706 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1707 				value = *Pointer<Short4>(buffer);
1708 
1709 				if(masked)
1710 				{
1711 					Short4 masked = value;
1712 					c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q[rgbaWriteMask][0]));
1713 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q[rgbaWriteMask][0]));
1714 					c23 |= masked;
1715 				}
1716 
1717 				c23 &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1718 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1719 				c23 |= value;
1720 				*Pointer<Short4>(buffer) = c23;
1721 			}
1722 			break;
1723 		case FORMAT_G8R8:
1724 			if((rgbaWriteMask & 0x00000003) != 0x0)
1725 			{
1726 				Pointer<Byte> buffer = cBuffer + 2 * x;
1727 				Int2 value;
1728 				value = Insert(value, *Pointer<Int>(buffer), 0);
1729 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1730 				value = Insert(value, *Pointer<Int>(buffer + pitch), 1);
1731 
1732 				Int2 packedCol = As<Int2>(current.x);
1733 
1734 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1735 				if((rgbaWriteMask & 0x3) != 0x3)
1736 				{
1737 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1738 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1739 					mergedMask &= rgbaMask;
1740 				}
1741 
1742 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1743 
1744 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1745 				*Pointer<UInt>(buffer + pitch) = As<UInt>(Extract(packedCol, 1));
1746 			}
1747 			break;
1748 		case FORMAT_R8:
1749 			if(rgbaWriteMask & 0x00000001)
1750 			{
1751 				Pointer<Byte> buffer = cBuffer + 1 * x;
1752 				Short4 value;
1753 				value = Insert(value, *Pointer<Short>(buffer), 0);
1754 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1755 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1756 				value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1757 
1758 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1759 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1760 				current.x |= value;
1761 
1762 				*Pointer<Short>(buffer) = Extract(current.x, 0);
1763 				*Pointer<Short>(buffer + pitch) = Extract(current.x, 1);
1764 			}
1765 			break;
1766 		case FORMAT_A8:
1767 			if(rgbaWriteMask & 0x00000008)
1768 			{
1769 				Pointer<Byte> buffer = cBuffer + 1 * x;
1770 				Short4 value;
1771 				value = Insert(value, *Pointer<Short>(buffer), 0);
1772 				Int pitch = *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1773 				value = Insert(value, *Pointer<Short>(buffer + pitch), 1);
1774 				value = UnpackLow(As<Byte8>(value), As<Byte8>(value));
1775 
1776 				current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskB4Q) + 8 * xMask);
1777 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskB4Q) + 8 * xMask);
1778 				current.w |= value;
1779 
1780 				*Pointer<Short>(buffer) = Extract(current.w, 0);
1781 				*Pointer<Short>(buffer + pitch) = Extract(current.w, 1);
1782 			}
1783 			break;
1784 		case FORMAT_G16R16:
1785 			{
1786 				Pointer<Byte> buffer = cBuffer + 4 * x;
1787 
1788 				Short4 value = *Pointer<Short4>(buffer);
1789 
1790 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1791 				{
1792 					Short4 masked = value;
1793 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1794 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1795 					current.x |= masked;
1796 				}
1797 
1798 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskD01Q) + xMask * 8);
1799 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD01Q) + xMask * 8);
1800 				current.x |= value;
1801 				*Pointer<Short4>(buffer) = current.x;
1802 
1803 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1804 
1805 				value = *Pointer<Short4>(buffer);
1806 
1807 				if((rgbaWriteMask & 0x00000003) != 0x00000003)
1808 				{
1809 					Short4 masked = value;
1810 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[rgbaWriteMask & 0x3][0]));
1811 					masked &= *Pointer<Short4>(constants + OFFSET(Constants,maskW01Q[~rgbaWriteMask & 0x3][0]));
1812 					current.y |= masked;
1813 				}
1814 
1815 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskD23Q) + xMask * 8);
1816 				value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskD23Q) + xMask * 8);
1817 				current.y |= value;
1818 				*Pointer<Short4>(buffer) = current.y;
1819 			}
1820 			break;
1821 		case FORMAT_A16B16G16R16:
1822 			{
1823 				Pointer<Byte> buffer = cBuffer + 8 * x;
1824 
1825 				{
1826 					Short4 value = *Pointer<Short4>(buffer);
1827 
1828 					if(rgbaWriteMask != 0x0000000F)
1829 					{
1830 						Short4 masked = value;
1831 						current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1832 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1833 						current.x |= masked;
1834 					}
1835 
1836 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ0Q) + xMask * 8);
1837 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ0Q) + xMask * 8);
1838 					current.x |= value;
1839 					*Pointer<Short4>(buffer) = current.x;
1840 				}
1841 
1842 				{
1843 					Short4 value = *Pointer<Short4>(buffer + 8);
1844 
1845 					if(rgbaWriteMask != 0x0000000F)
1846 					{
1847 						Short4 masked = value;
1848 						current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1849 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1850 						current.y |= masked;
1851 					}
1852 
1853 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ1Q) + xMask * 8);
1854 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ1Q) + xMask * 8);
1855 					current.y |= value;
1856 					*Pointer<Short4>(buffer + 8) = current.y;
1857 				}
1858 
1859 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
1860 
1861 				{
1862 					Short4 value = *Pointer<Short4>(buffer);
1863 
1864 					if(rgbaWriteMask != 0x0000000F)
1865 					{
1866 						Short4 masked = value;
1867 						current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1868 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1869 						current.z |= masked;
1870 					}
1871 
1872 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ2Q) + xMask * 8);
1873 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ2Q) + xMask * 8);
1874 					current.z |= value;
1875 					*Pointer<Short4>(buffer) = current.z;
1876 				}
1877 
1878 				{
1879 					Short4 value = *Pointer<Short4>(buffer + 8);
1880 
1881 					if(rgbaWriteMask != 0x0000000F)
1882 					{
1883 						Short4 masked = value;
1884 						current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskW4Q[rgbaWriteMask][0]));
1885 						masked &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskW4Q[rgbaWriteMask][0]));
1886 						current.w |= masked;
1887 					}
1888 
1889 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants,maskQ3Q) + xMask * 8);
1890 					value &= *Pointer<Short4>(constants + OFFSET(Constants,invMaskQ3Q) + xMask * 8);
1891 					current.w |= value;
1892 					*Pointer<Short4>(buffer + 8) = current.w;
1893 				}
1894 			}
1895 			break;
1896 		default:
1897 			ASSERT(false);
1898 		}
1899 	}
1900 
blendFactor(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorActive)1901 	void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
1902 	{
1903 		switch(blendFactorActive)
1904 		{
1905 		case BLEND_ZERO:
1906 			// Optimized
1907 			break;
1908 		case BLEND_ONE:
1909 			// Optimized
1910 			break;
1911 		case BLEND_SOURCE:
1912 			blendFactor.x = oC.x;
1913 			blendFactor.y = oC.y;
1914 			blendFactor.z = oC.z;
1915 			break;
1916 		case BLEND_INVSOURCE:
1917 			blendFactor.x = Float4(1.0f) - oC.x;
1918 			blendFactor.y = Float4(1.0f) - oC.y;
1919 			blendFactor.z = Float4(1.0f) - oC.z;
1920 			break;
1921 		case BLEND_DEST:
1922 			blendFactor.x = pixel.x;
1923 			blendFactor.y = pixel.y;
1924 			blendFactor.z = pixel.z;
1925 			break;
1926 		case BLEND_INVDEST:
1927 			blendFactor.x = Float4(1.0f) - pixel.x;
1928 			blendFactor.y = Float4(1.0f) - pixel.y;
1929 			blendFactor.z = Float4(1.0f) - pixel.z;
1930 			break;
1931 		case BLEND_SOURCEALPHA:
1932 			blendFactor.x = oC.w;
1933 			blendFactor.y = oC.w;
1934 			blendFactor.z = oC.w;
1935 			break;
1936 		case BLEND_INVSOURCEALPHA:
1937 			blendFactor.x = Float4(1.0f) - oC.w;
1938 			blendFactor.y = Float4(1.0f) - oC.w;
1939 			blendFactor.z = Float4(1.0f) - oC.w;
1940 			break;
1941 		case BLEND_DESTALPHA:
1942 			blendFactor.x = pixel.w;
1943 			blendFactor.y = pixel.w;
1944 			blendFactor.z = pixel.w;
1945 			break;
1946 		case BLEND_INVDESTALPHA:
1947 			blendFactor.x = Float4(1.0f) - pixel.w;
1948 			blendFactor.y = Float4(1.0f) - pixel.w;
1949 			blendFactor.z = Float4(1.0f) - pixel.w;
1950 			break;
1951 		case BLEND_SRCALPHASAT:
1952 			blendFactor.x = Float4(1.0f) - pixel.w;
1953 			blendFactor.x = Min(blendFactor.x, oC.w);
1954 			blendFactor.y = blendFactor.x;
1955 			blendFactor.z = blendFactor.x;
1956 			break;
1957 		case BLEND_CONSTANT:
1958 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[0]));
1959 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[1]));
1960 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[2]));
1961 			break;
1962 		case BLEND_INVCONSTANT:
1963 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[0]));
1964 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[1]));
1965 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[2]));
1966 			break;
1967 		default:
1968 			ASSERT(false);
1969 		}
1970 	}
1971 
blendFactorAlpha(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,BlendFactor blendFactorAlphaActive)1972 	void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
1973 	{
1974 		switch(blendFactorAlphaActive)
1975 		{
1976 		case BLEND_ZERO:
1977 			// Optimized
1978 			break;
1979 		case BLEND_ONE:
1980 			// Optimized
1981 			break;
1982 		case BLEND_SOURCE:
1983 			blendFactor.w = oC.w;
1984 			break;
1985 		case BLEND_INVSOURCE:
1986 			blendFactor.w = Float4(1.0f) - oC.w;
1987 			break;
1988 		case BLEND_DEST:
1989 			blendFactor.w = pixel.w;
1990 			break;
1991 		case BLEND_INVDEST:
1992 			blendFactor.w = Float4(1.0f) - pixel.w;
1993 			break;
1994 		case BLEND_SOURCEALPHA:
1995 			blendFactor.w = oC.w;
1996 			break;
1997 		case BLEND_INVSOURCEALPHA:
1998 			blendFactor.w = Float4(1.0f) - oC.w;
1999 			break;
2000 		case BLEND_DESTALPHA:
2001 			blendFactor.w = pixel.w;
2002 			break;
2003 		case BLEND_INVDESTALPHA:
2004 			blendFactor.w = Float4(1.0f) - pixel.w;
2005 			break;
2006 		case BLEND_SRCALPHASAT:
2007 			blendFactor.w = Float4(1.0f);
2008 			break;
2009 		case BLEND_CONSTANT:
2010 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.blendConstant4F[3]));
2011 			break;
2012 		case BLEND_INVCONSTANT:
2013 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData,factor.invBlendConstant4F[3]));
2014 			break;
2015 		default:
2016 			ASSERT(false);
2017 		}
2018 	}
2019 
alphaBlend(int index,Pointer<Byte> & cBuffer,Vector4f & oC,Int & x)2020 	void PixelRoutine::alphaBlend(int index, Pointer<Byte> &cBuffer, Vector4f &oC, Int &x)
2021 	{
2022 		if(!state.alphaBlendActive)
2023 		{
2024 			return;
2025 		}
2026 
2027 		Pointer<Byte> buffer;
2028 		Vector4f pixel;
2029 
2030 		Vector4s color;
2031 		Short4 c01;
2032 		Short4 c23;
2033 
2034 		Float4 one;
2035 		if(Surface::isFloatFormat(state.targetFormat[index]))
2036 		{
2037 			one = Float4(1.0f);
2038 		}
2039 		else if(Surface::isNonNormalizedInteger(state.targetFormat[index]))
2040 		{
2041 			one = As<Float4>(Surface::isUnsignedComponent(state.targetFormat[index], 0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
2042 		}
2043 
2044 		switch(state.targetFormat[index])
2045 		{
2046 		case FORMAT_R32I:
2047 		case FORMAT_R32UI:
2048 		case FORMAT_R32F:
2049 			buffer = cBuffer;
2050 			// FIXME: movlps
2051 			pixel.x.x = *Pointer<Float>(buffer + 4 * x + 0);
2052 			pixel.x.y = *Pointer<Float>(buffer + 4 * x + 4);
2053 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2054 			// FIXME: movhps
2055 			pixel.x.z = *Pointer<Float>(buffer + 4 * x + 0);
2056 			pixel.x.w = *Pointer<Float>(buffer + 4 * x + 4);
2057 			pixel.y = pixel.z = pixel.w = one;
2058 			break;
2059 		case FORMAT_G32R32I:
2060 		case FORMAT_G32R32UI:
2061 		case FORMAT_G32R32F:
2062 			buffer = cBuffer;
2063 			pixel.x = *Pointer<Float4>(buffer + 8 * x, 16);
2064 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2065 			pixel.y = *Pointer<Float4>(buffer + 8 * x, 16);
2066 			pixel.z = pixel.x;
2067 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x88);
2068 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0xDD);
2069 			pixel.y = pixel.z;
2070 			pixel.z = pixel.w = one;
2071 			break;
2072 		case FORMAT_X32B32G32R32F:
2073 		case FORMAT_A32B32G32R32F:
2074 		case FORMAT_A32B32G32R32I:
2075 		case FORMAT_A32B32G32R32UI:
2076 			buffer = cBuffer;
2077 			pixel.x = *Pointer<Float4>(buffer + 16 * x, 16);
2078 			pixel.y = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2079 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2080 			pixel.z = *Pointer<Float4>(buffer + 16 * x, 16);
2081 			pixel.w = *Pointer<Float4>(buffer + 16 * x + 16, 16);
2082 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
2083 			if(state.targetFormat[index] == FORMAT_X32B32G32R32F)
2084 			{
2085 				pixel.w = Float4(1.0f);
2086 			}
2087 			break;
2088 		default:
2089 			ASSERT(false);
2090 		}
2091 
2092 		if((postBlendSRGB && state.writeSRGB) || isSRGB(index))
2093 		{
2094 			sRGBtoLinear(pixel.x);
2095 			sRGBtoLinear(pixel.y);
2096 			sRGBtoLinear(pixel.z);
2097 		}
2098 
2099 		// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
2100 		Vector4f sourceFactor;
2101 		Vector4f destFactor;
2102 
2103 		blendFactor(sourceFactor, oC, pixel, state.sourceBlendFactor);
2104 		blendFactor(destFactor, oC, pixel, state.destBlendFactor);
2105 
2106 		if(state.sourceBlendFactor != BLEND_ONE && state.sourceBlendFactor != BLEND_ZERO)
2107 		{
2108 			oC.x *= sourceFactor.x;
2109 			oC.y *= sourceFactor.y;
2110 			oC.z *= sourceFactor.z;
2111 		}
2112 
2113 		if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
2114 		{
2115 			pixel.x *= destFactor.x;
2116 			pixel.y *= destFactor.y;
2117 			pixel.z *= destFactor.z;
2118 		}
2119 
2120 		switch(state.blendOperation)
2121 		{
2122 		case BLENDOP_ADD:
2123 			oC.x += pixel.x;
2124 			oC.y += pixel.y;
2125 			oC.z += pixel.z;
2126 			break;
2127 		case BLENDOP_SUB:
2128 			oC.x -= pixel.x;
2129 			oC.y -= pixel.y;
2130 			oC.z -= pixel.z;
2131 			break;
2132 		case BLENDOP_INVSUB:
2133 			oC.x = pixel.x - oC.x;
2134 			oC.y = pixel.y - oC.y;
2135 			oC.z = pixel.z - oC.z;
2136 			break;
2137 		case BLENDOP_MIN:
2138 			oC.x = Min(oC.x, pixel.x);
2139 			oC.y = Min(oC.y, pixel.y);
2140 			oC.z = Min(oC.z, pixel.z);
2141 			break;
2142 		case BLENDOP_MAX:
2143 			oC.x = Max(oC.x, pixel.x);
2144 			oC.y = Max(oC.y, pixel.y);
2145 			oC.z = Max(oC.z, pixel.z);
2146 			break;
2147 		case BLENDOP_SOURCE:
2148 			// No operation
2149 			break;
2150 		case BLENDOP_DEST:
2151 			oC.x = pixel.x;
2152 			oC.y = pixel.y;
2153 			oC.z = pixel.z;
2154 			break;
2155 		case BLENDOP_NULL:
2156 			oC.x = Float4(0.0f);
2157 			oC.y = Float4(0.0f);
2158 			oC.z = Float4(0.0f);
2159 			break;
2160 		default:
2161 			ASSERT(false);
2162 		}
2163 
2164 		blendFactorAlpha(sourceFactor, oC, pixel, state.sourceBlendFactorAlpha);
2165 		blendFactorAlpha(destFactor, oC, pixel, state.destBlendFactorAlpha);
2166 
2167 		if(state.sourceBlendFactorAlpha != BLEND_ONE && state.sourceBlendFactorAlpha != BLEND_ZERO)
2168 		{
2169 			oC.w *= sourceFactor.w;
2170 		}
2171 
2172 		if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
2173 		{
2174 			pixel.w *= destFactor.w;
2175 		}
2176 
2177 		switch(state.blendOperationAlpha)
2178 		{
2179 		case BLENDOP_ADD:
2180 			oC.w += pixel.w;
2181 			break;
2182 		case BLENDOP_SUB:
2183 			oC.w -= pixel.w;
2184 			break;
2185 		case BLENDOP_INVSUB:
2186 			pixel.w -= oC.w;
2187 			oC.w = pixel.w;
2188 			break;
2189 		case BLENDOP_MIN:
2190 			oC.w = Min(oC.w, pixel.w);
2191 			break;
2192 		case BLENDOP_MAX:
2193 			oC.w = Max(oC.w, pixel.w);
2194 			break;
2195 		case BLENDOP_SOURCE:
2196 			// No operation
2197 			break;
2198 		case BLENDOP_DEST:
2199 			oC.w = pixel.w;
2200 			break;
2201 		case BLENDOP_NULL:
2202 			oC.w = Float4(0.0f);
2203 			break;
2204 		default:
2205 			ASSERT(false);
2206 		}
2207 	}
2208 
writeColor(int index,Pointer<Byte> & cBuffer,Int & x,Vector4f & oC,Int & sMask,Int & zMask,Int & cMask)2209 	void PixelRoutine::writeColor(int index, Pointer<Byte> &cBuffer, Int &x, Vector4f &oC, Int &sMask, Int &zMask, Int &cMask)
2210 	{
2211 		switch(state.targetFormat[index])
2212 		{
2213 		case FORMAT_R32F:
2214 		case FORMAT_R32I:
2215 		case FORMAT_R32UI:
2216 		case FORMAT_R16I:
2217 		case FORMAT_R16UI:
2218 		case FORMAT_R8I:
2219 		case FORMAT_R8UI:
2220 			break;
2221 		case FORMAT_G32R32F:
2222 		case FORMAT_G32R32I:
2223 		case FORMAT_G32R32UI:
2224 		case FORMAT_G16R16I:
2225 		case FORMAT_G16R16UI:
2226 		case FORMAT_G8R8I:
2227 		case FORMAT_G8R8UI:
2228 			oC.z = oC.x;
2229 			oC.x = UnpackLow(oC.x, oC.y);
2230 			oC.z = UnpackHigh(oC.z, oC.y);
2231 			oC.y = oC.z;
2232 			break;
2233 		case FORMAT_X32B32G32R32F:
2234 		case FORMAT_A32B32G32R32F:
2235 		case FORMAT_A32B32G32R32I:
2236 		case FORMAT_A32B32G32R32UI:
2237 		case FORMAT_A16B16G16R16I:
2238 		case FORMAT_A16B16G16R16UI:
2239 		case FORMAT_A8B8G8R8I:
2240 		case FORMAT_A8B8G8R8UI:
2241 			transpose4x4(oC.x, oC.y, oC.z, oC.w);
2242 			break;
2243 		default:
2244 			ASSERT(false);
2245 		}
2246 
2247 		int rgbaWriteMask = state.colorWriteActive(index);
2248 
2249 		Int xMask;   // Combination of all masks
2250 
2251 		if(state.depthTestActive)
2252 		{
2253 			xMask = zMask;
2254 		}
2255 		else
2256 		{
2257 			xMask = cMask;
2258 		}
2259 
2260 		if(state.stencilActive)
2261 		{
2262 			xMask &= sMask;
2263 		}
2264 
2265 		Pointer<Byte> buffer;
2266 		Float4 value;
2267 
2268 		switch(state.targetFormat[index])
2269 		{
2270 		case FORMAT_R32F:
2271 		case FORMAT_R32I:
2272 		case FORMAT_R32UI:
2273 			if(rgbaWriteMask & 0x00000001)
2274 			{
2275 				buffer = cBuffer + 4 * x;
2276 
2277 				// FIXME: movlps
2278 				value.x = *Pointer<Float>(buffer + 0);
2279 				value.y = *Pointer<Float>(buffer + 4);
2280 
2281 				buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2282 
2283 				// FIXME: movhps
2284 				value.z = *Pointer<Float>(buffer + 0);
2285 				value.w = *Pointer<Float>(buffer + 4);
2286 
2287 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X) + xMask * 16, 16));
2288 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X) + xMask * 16, 16));
2289 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2290 
2291 				// FIXME: movhps
2292 				*Pointer<Float>(buffer + 0) = oC.x.z;
2293 				*Pointer<Float>(buffer + 4) = oC.x.w;
2294 
2295 				buffer -= *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2296 
2297 				// FIXME: movlps
2298 				*Pointer<Float>(buffer + 0) = oC.x.x;
2299 				*Pointer<Float>(buffer + 4) = oC.x.y;
2300 			}
2301 			break;
2302 		case FORMAT_R16I:
2303 		case FORMAT_R16UI:
2304 			if(rgbaWriteMask & 0x00000001)
2305 			{
2306 				buffer = cBuffer + 2 * x;
2307 
2308 				UShort4 xyzw;
2309 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2310 
2311 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2312 
2313 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2314 				value = As<Float4>(Int4(xyzw));
2315 
2316 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2317 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2318 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2319 
2320 				if(state.targetFormat[index] == FORMAT_R16I)
2321 				{
2322 					Float component = oC.x.z;
2323 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2324 					component = oC.x.w;
2325 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2326 
2327 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2328 
2329 					component = oC.x.x;
2330 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2331 					component = oC.x.y;
2332 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2333 				}
2334 				else // FORMAT_R16UI
2335 				{
2336 					Float component = oC.x.z;
2337 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2338 					component = oC.x.w;
2339 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2340 
2341 					buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2342 
2343 					component = oC.x.x;
2344 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2345 					component = oC.x.y;
2346 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2347 				}
2348 			}
2349 			break;
2350 		case FORMAT_R8I:
2351 		case FORMAT_R8UI:
2352 			if(rgbaWriteMask & 0x00000001)
2353 			{
2354 				buffer = cBuffer + x;
2355 
2356 				UInt xyzw, packedCol;
2357 
2358 				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2359 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2360 				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2361 
2362 				Short4 tmpCol = Short4(As<Int4>(oC.x));
2363 				if(state.targetFormat[index] == FORMAT_R8I)
2364 				{
2365 					tmpCol = As<Short4>(Pack(tmpCol, tmpCol));
2366 				}
2367 				else
2368 				{
2369 					tmpCol = As<Short4>(Pack(As<UShort4>(tmpCol), As<UShort4>(tmpCol)));
2370 				}
2371 				packedCol = Extract(As<Int2>(tmpCol), 0);
2372 
2373 				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2374 				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2375 
2376 				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2377 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2378 				*Pointer<UShort>(buffer) = UShort(packedCol);
2379 			}
2380 			break;
2381 		case FORMAT_G32R32F:
2382 		case FORMAT_G32R32I:
2383 		case FORMAT_G32R32UI:
2384 			buffer = cBuffer + 8 * x;
2385 
2386 			value = *Pointer<Float4>(buffer);
2387 
2388 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2389 			{
2390 				Float4 masked = value;
2391 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2392 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2393 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2394 			}
2395 
2396 			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ01X) + xMask * 16, 16));
2397 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ01X) + xMask * 16, 16));
2398 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2399 			*Pointer<Float4>(buffer) = oC.x;
2400 
2401 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2402 
2403 			value = *Pointer<Float4>(buffer);
2404 
2405 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2406 			{
2407 				Float4 masked;
2408 
2409 				masked = value;
2410 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[rgbaWriteMask & 0x3][0])));
2411 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,maskD01X[~rgbaWriteMask & 0x3][0])));
2412 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2413 			}
2414 
2415 			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskQ23X) + xMask * 16, 16));
2416 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskQ23X) + xMask * 16, 16));
2417 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2418 			*Pointer<Float4>(buffer) = oC.y;
2419 			break;
2420 		case FORMAT_G16R16I:
2421 		case FORMAT_G16R16UI:
2422 			if((rgbaWriteMask & 0x00000003) != 0x0)
2423 			{
2424 				buffer = cBuffer + 4 * x;
2425 
2426 				UInt2 rgbaMask;
2427 				UShort4 packedCol = UShort4(As<Int4>(oC.x));
2428 				UShort4 value = *Pointer<UShort4>(buffer);
2429 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2430 				if((rgbaWriteMask & 0x3) != 0x3)
2431 				{
2432 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2433 					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2434 					mergedMask &= rgbaMask;
2435 				}
2436 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2437 
2438 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2439 
2440 				packedCol = UShort4(As<Int4>(oC.y));
2441 				value = *Pointer<UShort4>(buffer);
2442 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2443 				if((rgbaWriteMask & 0x3) != 0x3)
2444 				{
2445 					mergedMask &= rgbaMask;
2446 				}
2447 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2448 			}
2449 			break;
2450 		case FORMAT_G8R8I:
2451 		case FORMAT_G8R8UI:
2452 			if((rgbaWriteMask & 0x00000003) != 0x0)
2453 			{
2454 				buffer = cBuffer + 2 * x;
2455 
2456 				Int2 xyzw, packedCol;
2457 
2458 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2459 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2460 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2461 
2462 				if(state.targetFormat[index] == FORMAT_G8R8I)
2463 				{
2464 					packedCol = As<Int2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2465 				}
2466 				else
2467 				{
2468 					packedCol = As<Int2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2469 				}
2470 
2471 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2472 				if((rgbaWriteMask & 0x3) != 0x3)
2473 				{
2474 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2475 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2476 					mergedMask &= rgbaMask;
2477 				}
2478 
2479 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2480 
2481 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2482 				buffer -= *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2483 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2484 			}
2485 			break;
2486 		case FORMAT_X32B32G32R32F:
2487 		case FORMAT_A32B32G32R32F:
2488 		case FORMAT_A32B32G32R32I:
2489 		case FORMAT_A32B32G32R32UI:
2490 			buffer = cBuffer + 16 * x;
2491 
2492 			{
2493 				value = *Pointer<Float4>(buffer, 16);
2494 
2495 				if(rgbaWriteMask != 0x0000000F)
2496 				{
2497 					Float4 masked = value;
2498 					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2499 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2500 					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2501 				}
2502 
2503 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
2504 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
2505 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2506 				*Pointer<Float4>(buffer, 16) = oC.x;
2507 			}
2508 
2509 			{
2510 				value = *Pointer<Float4>(buffer + 16, 16);
2511 
2512 				if(rgbaWriteMask != 0x0000000F)
2513 				{
2514 					Float4 masked = value;
2515 					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2516 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2517 					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2518 				}
2519 
2520 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskX1X) + xMask * 16, 16));
2521 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX1X) + xMask * 16, 16));
2522 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2523 				*Pointer<Float4>(buffer + 16, 16) = oC.y;
2524 			}
2525 
2526 			buffer += *Pointer<Int>(data + OFFSET(DrawData,colorPitchB[index]));
2527 
2528 			{
2529 				value = *Pointer<Float4>(buffer, 16);
2530 
2531 				if(rgbaWriteMask != 0x0000000F)
2532 				{
2533 					Float4 masked = value;
2534 					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2535 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2536 					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2537 				}
2538 
2539 				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants,maskX2X) + xMask * 16, 16));
2540 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX2X) + xMask * 16, 16));
2541 				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2542 				*Pointer<Float4>(buffer, 16) = oC.z;
2543 			}
2544 
2545 			{
2546 				value = *Pointer<Float4>(buffer + 16, 16);
2547 
2548 				if(rgbaWriteMask != 0x0000000F)
2549 				{
2550 					Float4 masked = value;
2551 					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
2552 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
2553 					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2554 				}
2555 
2556 				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants,maskX3X) + xMask * 16, 16));
2557 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX3X) + xMask * 16, 16));
2558 				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2559 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
2560 			}
2561 			break;
2562 		case FORMAT_A16B16G16R16I:
2563 		case FORMAT_A16B16G16R16UI:
2564 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2565 			{
2566 				buffer = cBuffer + 8 * x;
2567 
2568 				UInt4 rgbaMask;
2569 				UShort8 value = *Pointer<UShort8>(buffer);
2570 				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2571 				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2572 				if((rgbaWriteMask & 0xF) != 0xF)
2573 				{
2574 					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2575 					rgbaMask = UInt4(tmpMask, tmpMask);
2576 					mergedMask &= rgbaMask;
2577 				}
2578 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2579 
2580 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2581 
2582 				value = *Pointer<UShort8>(buffer);
2583 				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2584 				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2585 				if((rgbaWriteMask & 0xF) != 0xF)
2586 				{
2587 					mergedMask &= rgbaMask;
2588 				}
2589 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2590 			}
2591 			break;
2592 		case FORMAT_A8B8G8R8I:
2593 		case FORMAT_A8B8G8R8UI:
2594 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2595 			{
2596 				UInt2 value, packedCol, mergedMask;
2597 
2598 				buffer = cBuffer + 4 * x;
2599 
2600 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2601 				{
2602 					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2603 				}
2604 				else
2605 				{
2606 					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y))));
2607 				}
2608 				value = *Pointer<UInt2>(buffer, 16);
2609 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2610 				if(rgbaWriteMask != 0xF)
2611 				{
2612 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2613 				}
2614 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2615 
2616 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2617 
2618 				if(state.targetFormat[index] == FORMAT_A8B8G8R8I)
2619 				{
2620 					packedCol = As<UInt2>(Pack(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2621 				}
2622 				else
2623 				{
2624 					packedCol = As<UInt2>(Pack(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w))));
2625 				}
2626 				value = *Pointer<UInt2>(buffer, 16);
2627 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2628 				if(rgbaWriteMask != 0xF)
2629 				{
2630 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2631 				}
2632 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2633 			}
2634 			break;
2635 		default:
2636 			ASSERT(false);
2637 		}
2638 	}
2639 
convertFixed16(Float4 & cf,bool saturate)2640 	UShort4 PixelRoutine::convertFixed16(Float4 &cf, bool saturate)
2641 	{
2642 		return UShort4(cf * Float4(0xFFFF), saturate);
2643 	}
2644 
sRGBtoLinear16_12_16(Vector4s & c)2645 	void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2646 	{
2647 		c.x = As<UShort4>(c.x) >> 4;
2648 		c.y = As<UShort4>(c.y) >> 4;
2649 		c.z = As<UShort4>(c.z) >> 4;
2650 
2651 		sRGBtoLinear12_16(c);
2652 	}
2653 
sRGBtoLinear12_16(Vector4s & c)2654 	void PixelRoutine::sRGBtoLinear12_16(Vector4s &c)
2655 	{
2656 		Pointer<Byte> LUT = constants + OFFSET(Constants,sRGBtoLinear12_16);
2657 
2658 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2659 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2660 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2661 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2662 
2663 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2664 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2665 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2666 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2667 
2668 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2669 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2670 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2671 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2672 	}
2673 
linearToSRGB16_12_16(Vector4s & c)2674 	void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2675 	{
2676 		c.x = As<UShort4>(c.x) >> 4;
2677 		c.y = As<UShort4>(c.y) >> 4;
2678 		c.z = As<UShort4>(c.z) >> 4;
2679 
2680 		linearToSRGB12_16(c);
2681 	}
2682 
linearToSRGB12_16(Vector4s & c)2683 	void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2684 	{
2685 		Pointer<Byte> LUT = constants + OFFSET(Constants,linearToSRGB12_16);
2686 
2687 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2688 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2689 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2690 		c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2691 
2692 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2693 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2694 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2695 		c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2696 
2697 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2698 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2699 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2700 		c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2701 	}
2702 
sRGBtoLinear(const Float4 & x)2703 	Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)   // Approximates x^2.2
2704 	{
2705 		Float4 linear = x * x;
2706 		linear = linear * Float4(0.73f) + linear * x * Float4(0.27f);
2707 
2708 		return Min(Max(linear, Float4(0.0f)), Float4(1.0f));
2709 	}
2710 
colorUsed()2711 	bool PixelRoutine::colorUsed()
2712 	{
2713 		return state.colorWriteMask || state.alphaTestActive() || state.shaderContainsKill;
2714 	}
2715 }
2716