• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "Vulkan/VkPipelineLayout.hpp"
24 
25 namespace sw {
26 
PixelRoutine(const PixelProcessor::State & state,vk::PipelineLayout const * pipelineLayout,SpirvShader const * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)27 PixelRoutine::PixelRoutine(
28     const PixelProcessor::State &state,
29     vk::PipelineLayout const *pipelineLayout,
30     SpirvShader const *spirvShader,
31     const vk::DescriptorSet::Bindings &descriptorSets)
32     : QuadRasterizer(state, spirvShader)
33     , routine(pipelineLayout)
34     , descriptorSets(descriptorSets)
35 {
36 	if(spirvShader)
37 	{
38 		spirvShader->emitProlog(&routine);
39 
40 		// Clearing inputs to 0 is not demanded by the spec,
41 		// but it makes the undefined behavior deterministic.
42 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
43 		{
44 			routine.inputs[i] = Float4(0.0f);
45 		}
46 	}
47 }
48 
~PixelRoutine()49 PixelRoutine::~PixelRoutine()
50 {
51 }
52 
quad(Pointer<Byte> cBuffer[RENDERTARGETS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)53 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
54 {
55 	// TODO: consider shader which modifies sample mask in general
56 	const bool earlyDepthTest = !spirvShader || (spirvShader->getModes().EarlyFragmentTests && !spirvShader->getModes().DepthReplacing && !state.alphaToCoverage);
57 
58 	Int zMask[4];  // Depth mask
59 	Int sMask[4];  // Stencil mask
60 
61 	for(unsigned int q = 0; q < state.multiSampleCount; q++)
62 	{
63 		zMask[q] = cMask[q];
64 		sMask[q] = cMask[q];
65 	}
66 
67 	for(unsigned int q = 0; q < state.multiSampleCount; q++)
68 	{
69 		stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
70 	}
71 
72 	Float4 f;
73 	Float4 rhwCentroid;
74 
75 	Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
76 
77 	if(interpolateZ())
78 	{
79 		for(unsigned int q = 0; q < state.multiSampleCount; q++)
80 		{
81 			Float4 x = xxxx;
82 
83 			if(state.enableMultiSampling)
84 			{
85 				x -= *Pointer<Float4>(constants + OFFSET(Constants, X) + q * sizeof(float4));
86 			}
87 
88 			z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false, state.depthClamp);
89 		}
90 	}
91 
92 	Bool depthPass = false;
93 
94 	if(earlyDepthTest)
95 	{
96 		for(unsigned int q = 0; q < state.multiSampleCount; q++)
97 		{
98 			depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
99 		}
100 	}
101 
102 	If(depthPass || Bool(!earlyDepthTest))
103 	{
104 		Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
105 
106 		// Centroid locations
107 		Float4 XXXX = Float4(0.0f);
108 		Float4 YYYY = Float4(0.0f);
109 
110 		if(state.centroid)
111 		{
112 			Float4 WWWW(1.0e-9f);
113 
114 			for(unsigned int q = 0; q < state.multiSampleCount; q++)
115 			{
116 				XXXX += *Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]);
117 				YYYY += *Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]);
118 				WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
119 			}
120 
121 			WWWW = Rcp_pp(WWWW);
122 			XXXX *= WWWW;
123 			YYYY *= WWWW;
124 
125 			XXXX += xxxx;
126 			YYYY += yyyy;
127 		}
128 
129 		if(interpolateW())
130 		{
131 			w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive, w), false, false, false);
132 			rhw = reciprocal(w, false, false, true);
133 
134 			if(state.centroid)
135 			{
136 				rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, w), false, false));
137 			}
138 		}
139 
140 		if(spirvShader)
141 		{
142 			for(int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
143 			{
144 				auto const &input = spirvShader->inputs[interpolant];
145 				if(input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
146 				{
147 					if(input.Centroid && state.enableMultiSampling)
148 					{
149 						routine.inputs[interpolant] =
150 						    interpolateCentroid(XXXX, YYYY, rhwCentroid,
151 						                        primitive + OFFSET(Primitive, V[interpolant]),
152 						                        input.Flat, !input.NoPerspective);
153 					}
154 					else
155 					{
156 						routine.inputs[interpolant] =
157 						    interpolate(xxxx, Dv[interpolant], rhw,
158 						                primitive + OFFSET(Primitive, V[interpolant]),
159 						                input.Flat, !input.NoPerspective, false);
160 					}
161 				}
162 			}
163 
164 			setBuiltins(x, y, z, w, cMask);
165 
166 			for(uint32_t i = 0; i < state.numClipDistances; i++)
167 			{
168 				auto distance = interpolate(xxxx, DclipDistance[i], rhw,
169 				                            primitive + OFFSET(Primitive, clipDistance[i]),
170 				                            false, true, false);
171 
172 				auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
173 				for(auto ms = 0u; ms < state.multiSampleCount; ms++)
174 				{
175 					// FIXME(b/148105887): Fragments discarded by clipping do not exist at
176 					// all -- they should not be counted in queries or have their Z/S effects
177 					// performed when early fragment tests are enabled.
178 					cMask[ms] &= clipMask;
179 				}
180 
181 				if(spirvShader->getUsedCapabilities().ClipDistance)
182 				{
183 					auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
184 					if(it != spirvShader->inputBuiltins.end())
185 					{
186 						if(i < it->second.SizeInComponents)
187 						{
188 							routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
189 						}
190 					}
191 				}
192 			}
193 
194 			if(spirvShader->getUsedCapabilities().CullDistance)
195 			{
196 				auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
197 				if(it != spirvShader->inputBuiltins.end())
198 				{
199 					for(uint32_t i = 0; i < state.numCullDistances; i++)
200 					{
201 						if(i < it->second.SizeInComponents)
202 						{
203 							routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
204 							    interpolate(xxxx, DcullDistance[i], rhw,
205 							                primitive + OFFSET(Primitive, cullDistance[i]),
206 							                false, true, false);
207 						}
208 					}
209 				}
210 			}
211 		}
212 
213 		Bool alphaPass = true;
214 
215 		if(spirvShader)
216 		{
217 			bool earlyFragTests = (spirvShader && spirvShader->getModes().EarlyFragmentTests);
218 			applyShader(cMask, earlyFragTests ? sMask : cMask, earlyDepthTest ? zMask : cMask);
219 		}
220 
221 		alphaPass = alphaTest(cMask);
222 
223 		if((spirvShader && spirvShader->getModes().ContainsKill) || state.alphaToCoverage)
224 		{
225 			for(unsigned int q = 0; q < state.multiSampleCount; q++)
226 			{
227 				zMask[q] &= cMask[q];
228 				sMask[q] &= cMask[q];
229 			}
230 		}
231 
232 		If(alphaPass)
233 		{
234 			if(!earlyDepthTest)
235 			{
236 				for(unsigned int q = 0; q < state.multiSampleCount; q++)
237 				{
238 					depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
239 				}
240 			}
241 
242 			If(depthPass || Bool(earlyDepthTest))
243 			{
244 				for(unsigned int q = 0; q < state.multiSampleCount; q++)
245 				{
246 					if(state.multiSampleMask & (1 << q))
247 					{
248 						writeDepth(zBuffer, q, x, z[q], zMask[q]);
249 
250 						if(state.occlusionEnabled)
251 						{
252 							occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
253 						}
254 					}
255 				}
256 
257 				rasterOperation(cBuffer, x, sMask, zMask, cMask);
258 			}
259 		}
260 	}
261 
262 	for(unsigned int q = 0; q < state.multiSampleCount; q++)
263 	{
264 		if(state.multiSampleMask & (1 << q))
265 		{
266 			writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
267 		}
268 	}
269 }
270 
// Evaluates a plane equation (A, B, C) at the given x/y locations.
// Flat interpolants return the constant term C only. Otherwise the result is
// C + x * A + y * B, additionally multiplied by rhw when 'perspective' is set.
Float4 PixelRoutine::interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
{
	Float4 result = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, C), 16);

	if(flat)
	{
		return result;
	}

	Float4 gradientX = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16);
	Float4 gradientY = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, B), 16);

	result += x * gradientX + y * gradientY;

	if(perspective)
	{
		result *= rhw;
	}

	return result;
}
288 
stencilTest(const Pointer<Byte> & sBuffer,int q,const Int & x,Int & sMask,const Int & cMask)289 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask)
290 {
291 	if(!state.stencilActive)
292 	{
293 		return;
294 	}
295 
296 	// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
297 
298 	Pointer<Byte> buffer = sBuffer + x;
299 
300 	if(q > 0)
301 	{
302 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
303 	}
304 
305 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
306 	Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
307 	value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
308 	Byte8 valueBack = value;
309 
310 	if(state.frontStencil.compareMask != 0xff)
311 	{
312 		value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
313 	}
314 
315 	stencilTest(value, state.frontStencil.compareOp, false);
316 
317 	if(state.backStencil.compareMask != 0xff)
318 	{
319 		valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
320 	}
321 
322 	stencilTest(valueBack, state.backStencil.compareOp, true);
323 
324 	value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
325 	valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
326 	value |= valueBack;
327 
328 	sMask = SignMask(value) & cMask;
329 }
330 
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)331 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
332 {
333 	Byte8 equal;
334 
335 	switch(stencilCompareMode)
336 	{
337 		case VK_COMPARE_OP_ALWAYS:
338 			value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
339 			break;
340 		case VK_COMPARE_OP_NEVER:
341 			value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
342 			break;
343 		case VK_COMPARE_OP_LESS:  // a < b ~ b > a
344 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
345 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
346 			break;
347 		case VK_COMPARE_OP_EQUAL:
348 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
349 			break;
350 		case VK_COMPARE_OP_NOT_EQUAL:  // a != b ~ !(a == b)
351 			value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
352 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
353 			break;
354 		case VK_COMPARE_OP_LESS_OR_EQUAL:  // a <= b ~ (b > a) || (a == b)
355 			equal = value;
356 			equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
357 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
358 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
359 			value |= equal;
360 			break;
361 		case VK_COMPARE_OP_GREATER:  // a > b
362 			equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
363 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
364 			equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
365 			value = equal;
366 			break;
367 		case VK_COMPARE_OP_GREATER_OR_EQUAL:  // a >= b ~ !(a < b) ~ !(b > a)
368 			value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
369 			value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
370 			value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
371 			break;
372 		default:
373 			UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
374 	}
375 }
376 
depthTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)377 Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
378 {
379 	Float4 Z = z;
380 
381 	if(spirvShader && spirvShader->getModes().DepthReplacing)
382 	{
383 		Z = oDepth;
384 	}
385 
386 	Pointer<Byte> buffer = zBuffer + 4 * x;
387 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
388 
389 	if(q > 0)
390 	{
391 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
392 	}
393 
394 	Float4 zValue;
395 
396 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
397 	{
398 		// FIXME: Properly optimizes?
399 		zValue.xy = *Pointer<Float4>(buffer);
400 		zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
401 	}
402 
403 	Int4 zTest;
404 
405 	switch(state.depthCompareMode)
406 	{
407 		case VK_COMPARE_OP_ALWAYS:
408 			// Optimized
409 			break;
410 		case VK_COMPARE_OP_NEVER:
411 			// Optimized
412 			break;
413 		case VK_COMPARE_OP_EQUAL:
414 			zTest = CmpEQ(zValue, Z);
415 			break;
416 		case VK_COMPARE_OP_NOT_EQUAL:
417 			zTest = CmpNEQ(zValue, Z);
418 			break;
419 		case VK_COMPARE_OP_LESS:
420 			zTest = CmpNLE(zValue, Z);
421 			break;
422 		case VK_COMPARE_OP_GREATER_OR_EQUAL:
423 			zTest = CmpLE(zValue, Z);
424 			break;
425 		case VK_COMPARE_OP_LESS_OR_EQUAL:
426 			zTest = CmpNLT(zValue, Z);
427 			break;
428 		case VK_COMPARE_OP_GREATER:
429 			zTest = CmpLT(zValue, Z);
430 			break;
431 		default:
432 			UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
433 	}
434 
435 	switch(state.depthCompareMode)
436 	{
437 		case VK_COMPARE_OP_ALWAYS:
438 			zMask = cMask;
439 			break;
440 		case VK_COMPARE_OP_NEVER:
441 			zMask = 0x0;
442 			break;
443 		default:
444 			zMask = SignMask(zTest) & cMask;
445 			break;
446 	}
447 
448 	if(state.stencilActive)
449 	{
450 		zMask &= sMask;
451 	}
452 
453 	return zMask != 0;
454 }
455 
depthTest16(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)456 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
457 {
458 	Short4 Z = convertFixed16(z, true);
459 
460 	if(spirvShader && spirvShader->getModes().DepthReplacing)
461 	{
462 		Z = convertFixed16(oDepth, true);
463 	}
464 
465 	Pointer<Byte> buffer = zBuffer + 2 * x;
466 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
467 
468 	if(q > 0)
469 	{
470 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
471 	}
472 
473 	Short4 zValue;
474 
475 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
476 	{
477 		// FIXME: Properly optimizes?
478 		zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
479 		zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
480 	}
481 
482 	Int4 zTest;
483 
484 	// Bias values to make unsigned compares out of Reactor's (due SSE's) signed compares only
485 	zValue = zValue - Short4(0x8000u);
486 	Z = Z - Short4(0x8000u);
487 
488 	switch(state.depthCompareMode)
489 	{
490 		case VK_COMPARE_OP_ALWAYS:
491 			// Optimized
492 			break;
493 		case VK_COMPARE_OP_NEVER:
494 			// Optimized
495 			break;
496 		case VK_COMPARE_OP_EQUAL:
497 			zTest = Int4(CmpEQ(zValue, Z));
498 			break;
499 		case VK_COMPARE_OP_NOT_EQUAL:
500 			zTest = ~Int4(CmpEQ(zValue, Z));
501 			break;
502 		case VK_COMPARE_OP_LESS:
503 			zTest = Int4(CmpGT(zValue, Z));
504 			break;
505 		case VK_COMPARE_OP_GREATER_OR_EQUAL:
506 			zTest = ~Int4(CmpGT(zValue, Z));
507 			break;
508 		case VK_COMPARE_OP_LESS_OR_EQUAL:
509 			zTest = ~Int4(CmpGT(Z, zValue));
510 			break;
511 		case VK_COMPARE_OP_GREATER:
512 			zTest = Int4(CmpGT(Z, zValue));
513 			break;
514 		default:
515 			UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
516 	}
517 
518 	switch(state.depthCompareMode)
519 	{
520 		case VK_COMPARE_OP_ALWAYS:
521 			zMask = cMask;
522 			break;
523 		case VK_COMPARE_OP_NEVER:
524 			zMask = 0x0;
525 			break;
526 		default:
527 			zMask = SignMask(zTest) & cMask;
528 			break;
529 	}
530 
531 	if(state.stencilActive)
532 	{
533 		zMask &= sMask;
534 	}
535 
536 	return zMask != 0;
537 }
538 
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)539 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
540 {
541 	if(!state.depthTestActive)
542 	{
543 		return true;
544 	}
545 
546 	if(state.depthFormat == VK_FORMAT_D16_UNORM)
547 		return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
548 	else
549 		return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
550 }
551 
alphaToCoverage(Int cMask[4],const Float4 & alpha)552 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha)
553 {
554 	Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c0)));
555 	Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c1)));
556 	Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c2)));
557 	Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c3)));
558 
559 	Int aMask0 = SignMask(coverage0);
560 	Int aMask1 = SignMask(coverage1);
561 	Int aMask2 = SignMask(coverage2);
562 	Int aMask3 = SignMask(coverage3);
563 
564 	cMask[0] &= aMask0;
565 	cMask[1] &= aMask1;
566 	cMask[2] &= aMask2;
567 	cMask[3] &= aMask3;
568 }
569 
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)570 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
571 {
572 	Float4 Z = z;
573 
574 	if(spirvShader && spirvShader->getModes().DepthReplacing)
575 	{
576 		Z = oDepth;
577 	}
578 
579 	Pointer<Byte> buffer = zBuffer + 4 * x;
580 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
581 
582 	if(q > 0)
583 	{
584 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
585 	}
586 
587 	Float4 zValue;
588 
589 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
590 	{
591 		// FIXME: Properly optimizes?
592 		zValue.xy = *Pointer<Float4>(buffer);
593 		zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
594 	}
595 
596 	Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
597 	zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
598 	Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
599 
600 	// FIXME: Properly optimizes?
601 	*Pointer<Float2>(buffer) = Float2(Z.xy);
602 	*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
603 }
604 
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)605 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
606 {
607 	Short4 Z = As<Short4>(convertFixed16(z, true));
608 
609 	if(spirvShader && spirvShader->getModes().DepthReplacing)
610 	{
611 		Z = As<Short4>(convertFixed16(oDepth, true));
612 	}
613 
614 	Pointer<Byte> buffer = zBuffer + 2 * x;
615 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
616 
617 	if(q > 0)
618 	{
619 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
620 	}
621 
622 	Short4 zValue;
623 
624 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
625 	{
626 		// FIXME: Properly optimizes?
627 		zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
628 		zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
629 	}
630 
631 	Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
632 	zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
633 	Z = Z | zValue;
634 
635 	// FIXME: Properly optimizes?
636 	*Pointer<Short>(buffer) = Extract(Z, 0);
637 	*Pointer<Short>(buffer + 2) = Extract(Z, 1);
638 	*Pointer<Short>(buffer + pitch) = Extract(Z, 2);
639 	*Pointer<Short>(buffer + pitch + 2) = Extract(Z, 3);
640 }
641 
writeDepth(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)642 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
643 {
644 	if(!state.depthWriteEnable)
645 	{
646 		return;
647 	}
648 
649 	if(state.depthFormat == VK_FORMAT_D16_UNORM)
650 		writeDepth16(zBuffer, q, x, z, zMask);
651 	else
652 		writeDepth32F(zBuffer, q, x, z, zMask);
653 }
654 
writeStencil(Pointer<Byte> & sBuffer,int q,const Int & x,const Int & sMask,const Int & zMask,const Int & cMask)655 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask)
656 {
657 	if(!state.stencilActive)
658 	{
659 		return;
660 	}
661 
662 	if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
663 	{
664 		if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
665 		{
666 			return;
667 		}
668 	}
669 
670 	if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
671 	{
672 		return;
673 	}
674 
675 	Pointer<Byte> buffer = sBuffer + x;
676 
677 	if(q > 0)
678 	{
679 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
680 	}
681 
682 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
683 	Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
684 	bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
685 	Byte8 newValue;
686 	stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask, sMask);
687 
688 	if((state.frontStencil.writeMask & 0xFF) != 0xFF)  // Assume 8-bit stencil buffer
689 	{
690 		Byte8 maskedValue = bufferValue;
691 		newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
692 		maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
693 		newValue |= maskedValue;
694 	}
695 
696 	Byte8 newValueBack;
697 
698 	stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask, sMask);
699 
700 	if((state.backStencil.writeMask & 0xFF) != 0xFF)  // Assume 8-bit stencil buffer
701 	{
702 		Byte8 maskedValue = bufferValue;
703 		newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
704 		maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
705 		newValueBack |= maskedValue;
706 	}
707 
708 	newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
709 	newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
710 	newValue |= newValueBack;
711 
712 	newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask);
713 	bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask);
714 	newValue |= bufferValue;
715 
716 	*Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
717 	*Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
718 }
719 
stencilOperation(Byte8 & newValue,const Byte8 & bufferValue,const PixelProcessor::States::StencilOpState & ops,bool isBack,const Int & zMask,const Int & sMask)720 void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
721 {
722 	Byte8 &pass = newValue;
723 	Byte8 fail;
724 	Byte8 zFail;
725 
726 	stencilOperation(pass, bufferValue, ops.passOp, isBack);
727 
728 	if(ops.depthFailOp != ops.passOp)
729 	{
730 		stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
731 	}
732 
733 	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
734 	{
735 		stencilOperation(fail, bufferValue, ops.failOp, isBack);
736 	}
737 
738 	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
739 	{
740 		if(state.depthTestActive && ops.depthFailOp != ops.passOp)  // zMask valid and values not the same
741 		{
742 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
743 			zFail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
744 			pass |= zFail;
745 		}
746 
747 		pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
748 		fail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
749 		pass |= fail;
750 	}
751 }
752 
stencilReplaceRef(bool isBack)753 Byte8 PixelRoutine::stencilReplaceRef(bool isBack)
754 {
755 	if(spirvShader)
756 	{
757 		auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
758 		if(it != spirvShader->outputBuiltins.end())
759 		{
760 			UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
761 			// TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
762 			//                     following line by either adding a rr::Shuffle() variant to do
763 			//                     it explicitly or adding a Byte4(Int4) constructor would work.
764 			sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
765 
766 			UInt2 sRefDuplicated;
767 			sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
768 			sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
769 			return As<Byte8>(sRefDuplicated);
770 		}
771 	}
772 
773 	return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
774 }
775 
stencilOperation(Byte8 & output,const Byte8 & bufferValue,VkStencilOp operation,bool isBack)776 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
777 {
778 	switch(operation)
779 	{
780 		case VK_STENCIL_OP_KEEP:
781 			output = bufferValue;
782 			break;
783 		case VK_STENCIL_OP_ZERO:
784 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
785 			break;
786 		case VK_STENCIL_OP_REPLACE:
787 			output = stencilReplaceRef(isBack);
788 			break;
789 		case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
790 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
791 			break;
792 		case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
793 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
794 			break;
795 		case VK_STENCIL_OP_INVERT:
796 			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
797 			break;
798 		case VK_STENCIL_OP_INCREMENT_AND_WRAP:
799 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
800 			break;
801 		case VK_STENCIL_OP_DECREMENT_AND_WRAP:
802 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
803 			break;
804 		default:
805 			UNSUPPORTED("VkStencilOp: %d", int(operation));
806 	}
807 }
808 
blendFactor(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,VkBlendFactor blendFactorActive)809 void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorActive)
810 {
811 	switch(blendFactorActive)
812 	{
813 		case VK_BLEND_FACTOR_ZERO:
814 			// Optimized
815 			break;
816 		case VK_BLEND_FACTOR_ONE:
817 			// Optimized
818 			break;
819 		case VK_BLEND_FACTOR_SRC_COLOR:
820 			blendFactor.x = current.x;
821 			blendFactor.y = current.y;
822 			blendFactor.z = current.z;
823 			break;
824 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
825 			blendFactor.x = Short4(0xFFFFu) - current.x;
826 			blendFactor.y = Short4(0xFFFFu) - current.y;
827 			blendFactor.z = Short4(0xFFFFu) - current.z;
828 			break;
829 		case VK_BLEND_FACTOR_DST_COLOR:
830 			blendFactor.x = pixel.x;
831 			blendFactor.y = pixel.y;
832 			blendFactor.z = pixel.z;
833 			break;
834 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
835 			blendFactor.x = Short4(0xFFFFu) - pixel.x;
836 			blendFactor.y = Short4(0xFFFFu) - pixel.y;
837 			blendFactor.z = Short4(0xFFFFu) - pixel.z;
838 			break;
839 		case VK_BLEND_FACTOR_SRC_ALPHA:
840 			blendFactor.x = current.w;
841 			blendFactor.y = current.w;
842 			blendFactor.z = current.w;
843 			break;
844 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
845 			blendFactor.x = Short4(0xFFFFu) - current.w;
846 			blendFactor.y = Short4(0xFFFFu) - current.w;
847 			blendFactor.z = Short4(0xFFFFu) - current.w;
848 			break;
849 		case VK_BLEND_FACTOR_DST_ALPHA:
850 			blendFactor.x = pixel.w;
851 			blendFactor.y = pixel.w;
852 			blendFactor.z = pixel.w;
853 			break;
854 		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
855 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
856 			blendFactor.y = Short4(0xFFFFu) - pixel.w;
857 			blendFactor.z = Short4(0xFFFFu) - pixel.w;
858 			break;
859 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
860 			blendFactor.x = Short4(0xFFFFu) - pixel.w;
861 			blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
862 			blendFactor.y = blendFactor.x;
863 			blendFactor.z = blendFactor.x;
864 			break;
865 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
866 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[0]));
867 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[1]));
868 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[2]));
869 			break;
870 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
871 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[0]));
872 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[1]));
873 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[2]));
874 			break;
875 		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
876 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
877 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
878 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
879 			break;
880 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
881 			blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
882 			blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
883 			blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
884 			break;
885 		default:
886 			UNSUPPORTED("VkBlendFactor: %d", int(blendFactorActive));
887 	}
888 }
889 
blendFactorAlpha(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,VkBlendFactor blendFactorAlphaActive)890 void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive)
891 {
892 	switch(blendFactorAlphaActive)
893 	{
894 		case VK_BLEND_FACTOR_ZERO:
895 			// Optimized
896 			break;
897 		case VK_BLEND_FACTOR_ONE:
898 			// Optimized
899 			break;
900 		case VK_BLEND_FACTOR_SRC_COLOR:
901 			blendFactor.w = current.w;
902 			break;
903 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
904 			blendFactor.w = Short4(0xFFFFu) - current.w;
905 			break;
906 		case VK_BLEND_FACTOR_DST_COLOR:
907 			blendFactor.w = pixel.w;
908 			break;
909 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
910 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
911 			break;
912 		case VK_BLEND_FACTOR_SRC_ALPHA:
913 			blendFactor.w = current.w;
914 			break;
915 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
916 			blendFactor.w = Short4(0xFFFFu) - current.w;
917 			break;
918 		case VK_BLEND_FACTOR_DST_ALPHA:
919 			blendFactor.w = pixel.w;
920 			break;
921 		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
922 			blendFactor.w = Short4(0xFFFFu) - pixel.w;
923 			break;
924 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
925 			blendFactor.w = Short4(0xFFFFu);
926 			break;
927 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
928 		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
929 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
930 			break;
931 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
932 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
933 			blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
934 			break;
935 		default:
936 			UNSUPPORTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
937 	}
938 }
939 
isSRGB(int index) const940 bool PixelRoutine::isSRGB(int index) const
941 {
942 	return vk::Format(state.targetFormat[index]).isSRGBformat();
943 }
944 
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)945 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
946 {
947 	Short4 c01;
948 	Short4 c23;
949 	Pointer<Byte> buffer = cBuffer;
950 	Pointer<Byte> buffer2;
951 
952 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
953 
954 	switch(state.targetFormat[index])
955 	{
956 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
957 			buffer += 2 * x;
958 			buffer2 = buffer + pitchB;
959 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
960 
961 			pixel.x = (c01 & Short4(0x7C00u)) << 1;
962 			pixel.y = (c01 & Short4(0x03E0u)) << 6;
963 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
964 			pixel.w = (c01 & Short4(0x8000u)) >> 15;
965 
966 			// Expand to 16 bit range
967 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
968 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
969 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
970 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
971 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
972 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
973 			break;
974 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
975 			buffer += 2 * x;
976 			buffer2 = buffer + pitchB;
977 			c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
978 
979 			pixel.x = c01 & Short4(0xF800u);
980 			pixel.y = (c01 & Short4(0x07E0u)) << 5;
981 			pixel.z = (c01 & Short4(0x001Fu)) << 11;
982 			pixel.w = Short4(0xFFFFu);
983 
984 			// Expand to 16 bit range
985 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
986 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
987 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
988 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
989 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
990 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
991 			break;
992 		case VK_FORMAT_B8G8R8A8_UNORM:
993 		case VK_FORMAT_B8G8R8A8_SRGB:
994 			buffer += 4 * x;
995 			c01 = *Pointer<Short4>(buffer);
996 			buffer += pitchB;
997 			c23 = *Pointer<Short4>(buffer);
998 			pixel.z = c01;
999 			pixel.y = c01;
1000 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1001 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1002 			pixel.x = pixel.z;
1003 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1004 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1005 			pixel.y = pixel.z;
1006 			pixel.w = pixel.x;
1007 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1008 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1009 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1010 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1011 			break;
1012 		case VK_FORMAT_R8G8B8A8_UNORM:
1013 		case VK_FORMAT_R8G8B8A8_SRGB:
1014 			buffer += 4 * x;
1015 			c01 = *Pointer<Short4>(buffer);
1016 			buffer += pitchB;
1017 			c23 = *Pointer<Short4>(buffer);
1018 			pixel.z = c01;
1019 			pixel.y = c01;
1020 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1021 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1022 			pixel.x = pixel.z;
1023 			pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1024 			pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1025 			pixel.y = pixel.z;
1026 			pixel.w = pixel.x;
1027 			pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1028 			pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1029 			pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1030 			pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1031 			break;
1032 		case VK_FORMAT_R8_UNORM:
1033 			buffer += 1 * x;
1034 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1035 			buffer += pitchB;
1036 			pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1037 			pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1038 			pixel.y = Short4(0x0000);
1039 			pixel.z = Short4(0x0000);
1040 			pixel.w = Short4(0xFFFFu);
1041 			break;
1042 		case VK_FORMAT_R8G8_UNORM:
1043 			buffer += 2 * x;
1044 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1045 			buffer += pitchB;
1046 			c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1047 			pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1048 			pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1049 			pixel.z = Short4(0x0000u);
1050 			pixel.w = Short4(0xFFFFu);
1051 			break;
1052 		case VK_FORMAT_R16G16B16A16_UNORM:
1053 			buffer += 8 * x;
1054 			pixel.x = *Pointer<Short4>(buffer + 0);
1055 			pixel.y = *Pointer<Short4>(buffer + 8);
1056 			buffer += pitchB;
1057 			pixel.z = *Pointer<Short4>(buffer + 0);
1058 			pixel.w = *Pointer<Short4>(buffer + 8);
1059 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1060 			break;
1061 		case VK_FORMAT_R16G16_UNORM:
1062 			buffer += 4 * x;
1063 			pixel.x = *Pointer<Short4>(buffer);
1064 			buffer += pitchB;
1065 			pixel.y = *Pointer<Short4>(buffer);
1066 			pixel.z = pixel.x;
1067 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1068 			pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1069 			pixel.y = pixel.z;
1070 			pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1071 			pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1072 			pixel.z = Short4(0xFFFFu);
1073 			pixel.w = Short4(0xFFFFu);
1074 			break;
1075 		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1076 		{
1077 			Int4 v = Int4(0);
1078 			buffer += 4 * x;
1079 			v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1080 			v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1081 			buffer += pitchB;
1082 			v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1083 			v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1084 
1085 			pixel = a2b10g10r10Unpack(v);
1086 		}
1087 		break;
1088 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1089 		{
1090 			Int4 v = Int4(0);
1091 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1092 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1093 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1094 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1095 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1096 
1097 			pixel = a2r10g10b10Unpack(v);
1098 		}
1099 		break;
1100 		default:
1101 			UNSUPPORTED("VkFormat %d", state.targetFormat[index]);
1102 	}
1103 
1104 	if(isSRGB(index))
1105 	{
1106 		sRGBtoLinear16_12_16(pixel);
1107 	}
1108 }
1109 
alphaBlend(int index,const Pointer<Byte> & cBuffer,Vector4s & current,const Int & x)1110 void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4s &current, const Int &x)
1111 {
1112 	if(!state.blendState[index].alphaBlendEnable)
1113 	{
1114 		return;
1115 	}
1116 
1117 	Vector4s pixel;
1118 	readPixel(index, cBuffer, x, pixel);
1119 
1120 	// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1121 	Vector4s sourceFactor;
1122 	Vector4s destFactor;
1123 
1124 	blendFactor(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactor);
1125 	blendFactor(destFactor, current, pixel, state.blendState[index].destBlendFactor);
1126 
1127 	if(state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
1128 	{
1129 		current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1130 		current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1131 		current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1132 	}
1133 
1134 	if(state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ZERO)
1135 	{
1136 		pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1137 		pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1138 		pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1139 	}
1140 
1141 	switch(state.blendState[index].blendOperation)
1142 	{
1143 		case VK_BLEND_OP_ADD:
1144 			current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1145 			current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1146 			current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1147 			break;
1148 		case VK_BLEND_OP_SUBTRACT:
1149 			current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1150 			current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1151 			current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1152 			break;
1153 		case VK_BLEND_OP_REVERSE_SUBTRACT:
1154 			current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1155 			current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1156 			current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1157 			break;
1158 		case VK_BLEND_OP_MIN:
1159 			current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1160 			current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1161 			current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1162 			break;
1163 		case VK_BLEND_OP_MAX:
1164 			current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1165 			current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1166 			current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1167 			break;
1168 		case VK_BLEND_OP_SRC_EXT:
1169 			// No operation
1170 			break;
1171 		case VK_BLEND_OP_DST_EXT:
1172 			current.x = pixel.x;
1173 			current.y = pixel.y;
1174 			current.z = pixel.z;
1175 			break;
1176 		case VK_BLEND_OP_ZERO_EXT:
1177 			current.x = Short4(0x0000);
1178 			current.y = Short4(0x0000);
1179 			current.z = Short4(0x0000);
1180 			break;
1181 		default:
1182 			UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
1183 	}
1184 
1185 	blendFactorAlpha(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactorAlpha);
1186 	blendFactorAlpha(destFactor, current, pixel, state.blendState[index].destBlendFactorAlpha);
1187 
1188 	if(state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1189 	{
1190 		current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1191 	}
1192 
1193 	if(state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1194 	{
1195 		pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1196 	}
1197 
1198 	switch(state.blendState[index].blendOperationAlpha)
1199 	{
1200 		case VK_BLEND_OP_ADD:
1201 			current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1202 			break;
1203 		case VK_BLEND_OP_SUBTRACT:
1204 			current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1205 			break;
1206 		case VK_BLEND_OP_REVERSE_SUBTRACT:
1207 			current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1208 			break;
1209 		case VK_BLEND_OP_MIN:
1210 			current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1211 			break;
1212 		case VK_BLEND_OP_MAX:
1213 			current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1214 			break;
1215 		case VK_BLEND_OP_SRC_EXT:
1216 			// No operation
1217 			break;
1218 		case VK_BLEND_OP_DST_EXT:
1219 			current.w = pixel.w;
1220 			break;
1221 		case VK_BLEND_OP_ZERO_EXT:
1222 			current.w = Short4(0x0000);
1223 			break;
1224 		default:
1225 			UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
1226 	}
1227 }
1228 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & current,const Int & sMask,const Int & zMask,const Int & cMask)1229 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)
1230 {
1231 	if(isSRGB(index))
1232 	{
1233 		linearToSRGB16_12_16(current);
1234 	}
1235 
1236 	switch(state.targetFormat[index])
1237 	{
1238 		case VK_FORMAT_B8G8R8A8_UNORM:
1239 		case VK_FORMAT_B8G8R8A8_SRGB:
1240 		case VK_FORMAT_R8G8B8A8_UNORM:
1241 		case VK_FORMAT_R8G8B8A8_SRGB:
1242 		case VK_FORMAT_R8G8_UNORM:
1243 		case VK_FORMAT_R8_UNORM:
1244 		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1245 		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1246 			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1247 			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1248 			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1249 			current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1250 			break;
1251 		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1252 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1253 			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
1254 			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
1255 			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
1256 			current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);
1257 			break;
1258 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1259 			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1260 			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
1261 			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1262 			current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);
1263 			break;
1264 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
1265 			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1266 			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);
1267 			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1268 			break;
1269 		default:
1270 			break;
1271 	}
1272 
1273 	int rgbaWriteMask = state.colorWriteActive(index);
1274 	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1275 
1276 	switch(state.targetFormat[index])
1277 	{
1278 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1279 		{
1280 			current.w = current.w & Short4(0x8000u);
1281 			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
1282 			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
1283 			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
1284 
1285 			current.x = current.x | current.y | current.z | current.w;
1286 		}
1287 		break;
1288 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
1289 		{
1290 			current.x = current.x & Short4(0xF800u);
1291 			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1292 			current.z = As<UShort4>(current.z) >> 11;
1293 
1294 			current.x = current.x | current.y | current.z;
1295 		}
1296 		break;
1297 		case VK_FORMAT_B8G8R8A8_UNORM:
1298 		case VK_FORMAT_B8G8R8A8_SRGB:
1299 			if(rgbaWriteMask == 0x7)
1300 			{
1301 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1302 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1303 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1304 
1305 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
1306 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
1307 
1308 				current.x = current.z;
1309 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1310 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1311 				current.y = current.z;
1312 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1313 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1314 			}
1315 			else
1316 			{
1317 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1318 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1319 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1320 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1321 
1322 				current.z = As<Short4>(PackUnsigned(current.z, current.x));
1323 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
1324 
1325 				current.x = current.z;
1326 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1327 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1328 				current.y = current.z;
1329 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1330 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1331 			}
1332 			break;
1333 		case VK_FORMAT_R8G8B8A8_UNORM:
1334 		case VK_FORMAT_R8G8B8A8_SRGB:
1335 		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1336 		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1337 			if(rgbaWriteMask == 0x7)
1338 			{
1339 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1340 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1341 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1342 
1343 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
1344 				current.y = As<Short4>(PackUnsigned(current.y, current.y));
1345 
1346 				current.x = current.z;
1347 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1348 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1349 				current.y = current.z;
1350 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1351 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1352 			}
1353 			else
1354 			{
1355 				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1356 				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1357 				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1358 				current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1359 
1360 				current.z = As<Short4>(PackUnsigned(current.x, current.z));
1361 				current.y = As<Short4>(PackUnsigned(current.y, current.w));
1362 
1363 				current.x = current.z;
1364 				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1365 				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1366 				current.y = current.z;
1367 				current.z = As<Short4>(UnpackLow(current.z, current.x));
1368 				current.y = As<Short4>(UnpackHigh(current.y, current.x));
1369 			}
1370 			break;
1371 		case VK_FORMAT_R8G8_UNORM:
1372 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1373 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1374 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
1375 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
1376 			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1377 			break;
1378 		case VK_FORMAT_R8_UNORM:
1379 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1380 			current.x = As<Short4>(PackUnsigned(current.x, current.x));
1381 			break;
1382 		case VK_FORMAT_R16G16_UNORM:
1383 			current.z = current.x;
1384 			current.x = As<Short4>(UnpackLow(current.x, current.y));
1385 			current.z = As<Short4>(UnpackHigh(current.z, current.y));
1386 			current.y = current.z;
1387 			break;
1388 		case VK_FORMAT_R16G16B16A16_UNORM:
1389 			transpose4x4(current.x, current.y, current.z, current.w);
1390 			break;
1391 		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1392 		{
1393 			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1394 			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1395 			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1396 			auto a = (Int4(current.w) >> 14) & Int4(0x3);
1397 			Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
1398 			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
1399 			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
1400 			current.x = UnpackLow(c02, c13);
1401 			current.y = UnpackHigh(c02, c13);
1402 			break;
1403 		}
1404 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1405 		{
1406 			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1407 			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1408 			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1409 			auto a = (Int4(current.w) >> 14) & Int4(0x3);
1410 			Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;
1411 			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
1412 			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
1413 			current.x = UnpackLow(c02, c13);
1414 			current.y = UnpackHigh(c02, c13);
1415 			break;
1416 		}
1417 		default:
1418 			UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
1419 	}
1420 
1421 	Short4 c01 = current.z;
1422 	Short4 c23 = current.y;
1423 
1424 	Int xMask;  // Combination of all masks
1425 
1426 	if(state.depthTestActive)
1427 	{
1428 		xMask = zMask;
1429 	}
1430 	else
1431 	{
1432 		xMask = cMask;
1433 	}
1434 
1435 	if(state.stencilActive)
1436 	{
1437 		xMask &= sMask;
1438 	}
1439 
1440 	Pointer<Byte> buffer = cBuffer;
1441 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1442 
1443 	switch(state.targetFormat[index])
1444 	{
1445 		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1446 		{
1447 			buffer += 2 * x;
1448 			Int value = *Pointer<Int>(buffer);
1449 
1450 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[bgraWriteMask & 0xF][0]));
1451 
1452 			Int c01 = Extract(As<Int2>(current.x), 0);
1453 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1454 			if(bgraWriteMask != 0x0000000F)
1455 			{
1456 				mask01 &= channelMask;
1457 			}
1458 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1459 
1460 			buffer += pitchB;
1461 			value = *Pointer<Int>(buffer);
1462 
1463 			Int c23 = Extract(As<Int2>(current.x), 1);
1464 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1465 			if(bgraWriteMask != 0x0000000F)
1466 			{
1467 				mask23 &= channelMask;
1468 			}
1469 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1470 		}
1471 		break;
1472 		case VK_FORMAT_R5G6B5_UNORM_PACK16:
1473 		{
1474 			buffer += 2 * x;
1475 			Int value = *Pointer<Int>(buffer);
1476 
1477 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[bgraWriteMask & 0x7][0]));
1478 
1479 			Int c01 = Extract(As<Int2>(current.x), 0);
1480 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1481 			if((bgraWriteMask & 0x00000007) != 0x00000007)
1482 			{
1483 				mask01 &= channelMask;
1484 			}
1485 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1486 
1487 			buffer += pitchB;
1488 			value = *Pointer<Int>(buffer);
1489 
1490 			Int c23 = Extract(As<Int2>(current.x), 1);
1491 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1492 			if((bgraWriteMask & 0x00000007) != 0x00000007)
1493 			{
1494 				mask23 &= channelMask;
1495 			}
1496 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1497 		}
1498 		break;
1499 		case VK_FORMAT_B8G8R8A8_UNORM:
1500 		case VK_FORMAT_B8G8R8A8_SRGB:
1501 		{
1502 			buffer += x * 4;
1503 			Short4 value = *Pointer<Short4>(buffer);
1504 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[bgraWriteMask][0]));
1505 
1506 			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1507 			if(bgraWriteMask != 0x0000000F)
1508 			{
1509 				mask01 &= channelMask;
1510 			}
1511 			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1512 
1513 			buffer += pitchB;
1514 			value = *Pointer<Short4>(buffer);
1515 
1516 			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1517 			if(bgraWriteMask != 0x0000000F)
1518 			{
1519 				mask23 &= channelMask;
1520 			}
1521 			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1522 		}
1523 		break;
1524 		case VK_FORMAT_R8G8B8A8_UNORM:
1525 		case VK_FORMAT_R8G8B8A8_SRGB:
1526 		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1527 		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1528 		{
1529 			buffer += x * 4;
1530 			Short4 value = *Pointer<Short4>(buffer);
1531 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
1532 
1533 			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1534 			if(rgbaWriteMask != 0x0000000F)
1535 			{
1536 				mask01 &= channelMask;
1537 			}
1538 			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1539 
1540 			buffer += pitchB;
1541 			value = *Pointer<Short4>(buffer);
1542 
1543 			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1544 			if(rgbaWriteMask != 0x0000000F)
1545 			{
1546 				mask23 &= channelMask;
1547 			}
1548 			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1549 		}
1550 		break;
1551 		case VK_FORMAT_R8G8_UNORM:
1552 			if((rgbaWriteMask & 0x00000003) != 0x0)
1553 			{
1554 				buffer += 2 * x;
1555 				Int2 value;
1556 				value = Insert(value, *Pointer<Int>(buffer), 0);
1557 				value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);
1558 
1559 				Int2 packedCol = As<Int2>(current.x);
1560 
1561 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1562 				if((rgbaWriteMask & 0x3) != 0x3)
1563 				{
1564 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1565 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1566 					mergedMask &= rgbaMask;
1567 				}
1568 
1569 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1570 
1571 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1572 				*Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
1573 			}
1574 			break;
1575 		case VK_FORMAT_R8_UNORM:
1576 			if(rgbaWriteMask & 0x00000001)
1577 			{
1578 				buffer += 1 * x;
1579 				Short4 value;
1580 				value = Insert(value, *Pointer<Short>(buffer), 0);
1581 				value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);
1582 
1583 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1584 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1585 				current.x |= value;
1586 
1587 				*Pointer<Short>(buffer) = Extract(current.x, 0);
1588 				*Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
1589 			}
1590 			break;
1591 		case VK_FORMAT_R16G16_UNORM:
1592 		{
1593 			buffer += 4 * x;
1594 
1595 			Short4 value = *Pointer<Short4>(buffer);
1596 
1597 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
1598 			{
1599 				Short4 masked = value;
1600 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[rgbaWriteMask & 0x3][0]));
1601 				masked &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[~rgbaWriteMask & 0x3][0]));
1602 				current.x |= masked;
1603 			}
1604 
1605 			current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1606 			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskD01Q) + xMask * 8);
1607 			current.x |= value;
1608 			*Pointer<Short4>(buffer) = current.x;
1609 
1610 			buffer += pitchB;
1611 
1612 			value = *Pointer<Short4>(buffer);
1613 
1614 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
1615 			{
1616 				Short4 masked = value;
1617 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[rgbaWriteMask & 0x3][0]));
1618 				masked &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[~rgbaWriteMask & 0x3][0]));
1619 				current.y |= masked;
1620 			}
1621 
1622 			current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1623 			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskD23Q) + xMask * 8);
1624 			current.y |= value;
1625 			*Pointer<Short4>(buffer) = current.y;
1626 		}
1627 		break;
1628 		case VK_FORMAT_R16G16B16A16_UNORM:
1629 		{
1630 			buffer += 8 * x;
1631 
1632 			{
1633 				Short4 value = *Pointer<Short4>(buffer);
1634 
1635 				if(rgbaWriteMask != 0x0000000F)
1636 				{
1637 					Short4 masked = value;
1638 					current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
1639 					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
1640 					current.x |= masked;
1641 				}
1642 
1643 				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ0Q) + xMask * 8);
1644 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ0Q) + xMask * 8);
1645 				current.x |= value;
1646 				*Pointer<Short4>(buffer) = current.x;
1647 			}
1648 
1649 			{
1650 				Short4 value = *Pointer<Short4>(buffer + 8);
1651 
1652 				if(rgbaWriteMask != 0x0000000F)
1653 				{
1654 					Short4 masked = value;
1655 					current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
1656 					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
1657 					current.y |= masked;
1658 				}
1659 
1660 				current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ1Q) + xMask * 8);
1661 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ1Q) + xMask * 8);
1662 				current.y |= value;
1663 				*Pointer<Short4>(buffer + 8) = current.y;
1664 			}
1665 
1666 			buffer += pitchB;
1667 
1668 			{
1669 				Short4 value = *Pointer<Short4>(buffer);
1670 
1671 				if(rgbaWriteMask != 0x0000000F)
1672 				{
1673 					Short4 masked = value;
1674 					current.z &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
1675 					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
1676 					current.z |= masked;
1677 				}
1678 
1679 				current.z &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ2Q) + xMask * 8);
1680 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ2Q) + xMask * 8);
1681 				current.z |= value;
1682 				*Pointer<Short4>(buffer) = current.z;
1683 			}
1684 
1685 			{
1686 				Short4 value = *Pointer<Short4>(buffer + 8);
1687 
1688 				if(rgbaWriteMask != 0x0000000F)
1689 				{
1690 					Short4 masked = value;
1691 					current.w &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
1692 					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
1693 					current.w |= masked;
1694 				}
1695 
1696 				current.w &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ3Q) + xMask * 8);
1697 				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ3Q) + xMask * 8);
1698 				current.w |= value;
1699 				*Pointer<Short4>(buffer + 8) = current.w;
1700 			}
1701 		}
1702 		break;
1703 		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1704 			rgbaWriteMask = bgraWriteMask;
1705 			// [[fallthrough]]
1706 		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1707 		{
1708 			buffer += 4 * x;
1709 
1710 			Int2 value = *Pointer<Int2>(buffer, 16);
1711 			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1712 			if(rgbaWriteMask != 0xF)
1713 			{
1714 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1715 			}
1716 			*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
1717 
1718 			buffer += pitchB;
1719 
1720 			value = *Pointer<Int2>(buffer, 16);
1721 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1722 			if(rgbaWriteMask != 0xF)
1723 			{
1724 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1725 			}
1726 			*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
1727 		}
1728 		break;
1729 		default:
1730 			UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
1731 	}
1732 }
1733 
blendFactor(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,VkBlendFactor blendFactorActive)1734 void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive)
1735 {
1736 	switch(blendFactorActive)
1737 	{
1738 		case VK_BLEND_FACTOR_ZERO:
1739 			blendFactor.x = Float4(0);
1740 			blendFactor.y = Float4(0);
1741 			blendFactor.z = Float4(0);
1742 			break;
1743 		case VK_BLEND_FACTOR_ONE:
1744 			blendFactor.x = Float4(1);
1745 			blendFactor.y = Float4(1);
1746 			blendFactor.z = Float4(1);
1747 			break;
1748 		case VK_BLEND_FACTOR_SRC_COLOR:
1749 			blendFactor.x = oC.x;
1750 			blendFactor.y = oC.y;
1751 			blendFactor.z = oC.z;
1752 			break;
1753 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1754 			blendFactor.x = Float4(1.0f) - oC.x;
1755 			blendFactor.y = Float4(1.0f) - oC.y;
1756 			blendFactor.z = Float4(1.0f) - oC.z;
1757 			break;
1758 		case VK_BLEND_FACTOR_DST_COLOR:
1759 			blendFactor.x = pixel.x;
1760 			blendFactor.y = pixel.y;
1761 			blendFactor.z = pixel.z;
1762 			break;
1763 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1764 			blendFactor.x = Float4(1.0f) - pixel.x;
1765 			blendFactor.y = Float4(1.0f) - pixel.y;
1766 			blendFactor.z = Float4(1.0f) - pixel.z;
1767 			break;
1768 		case VK_BLEND_FACTOR_SRC_ALPHA:
1769 			blendFactor.x = oC.w;
1770 			blendFactor.y = oC.w;
1771 			blendFactor.z = oC.w;
1772 			break;
1773 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1774 			blendFactor.x = Float4(1.0f) - oC.w;
1775 			blendFactor.y = Float4(1.0f) - oC.w;
1776 			blendFactor.z = Float4(1.0f) - oC.w;
1777 			break;
1778 		case VK_BLEND_FACTOR_DST_ALPHA:
1779 			blendFactor.x = pixel.w;
1780 			blendFactor.y = pixel.w;
1781 			blendFactor.z = pixel.w;
1782 			break;
1783 		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1784 			blendFactor.x = Float4(1.0f) - pixel.w;
1785 			blendFactor.y = Float4(1.0f) - pixel.w;
1786 			blendFactor.z = Float4(1.0f) - pixel.w;
1787 			break;
1788 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1789 			blendFactor.x = Float4(1.0f) - pixel.w;
1790 			blendFactor.x = Min(blendFactor.x, oC.w);
1791 			blendFactor.y = blendFactor.x;
1792 			blendFactor.z = blendFactor.x;
1793 			break;
1794 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
1795 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[0]));
1796 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[1]));
1797 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[2]));
1798 			break;
1799 		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1800 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1801 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1802 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1803 			break;
1804 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1805 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[0]));
1806 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[1]));
1807 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[2]));
1808 			break;
1809 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1810 			blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1811 			blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1812 			blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1813 			break;
1814 
1815 		default:
1816 			UNSUPPORTED("VkBlendFactor: %d", int(blendFactorActive));
1817 	}
1818 }
1819 
blendFactorAlpha(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,VkBlendFactor blendFactorAlphaActive)1820 void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive)
1821 {
1822 	switch(blendFactorAlphaActive)
1823 	{
1824 		case VK_BLEND_FACTOR_ZERO:
1825 			blendFactor.w = Float4(0);
1826 			break;
1827 		case VK_BLEND_FACTOR_ONE:
1828 			blendFactor.w = Float4(1);
1829 			break;
1830 		case VK_BLEND_FACTOR_SRC_COLOR:
1831 			blendFactor.w = oC.w;
1832 			break;
1833 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1834 			blendFactor.w = Float4(1.0f) - oC.w;
1835 			break;
1836 		case VK_BLEND_FACTOR_DST_COLOR:
1837 			blendFactor.w = pixel.w;
1838 			break;
1839 		case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1840 			blendFactor.w = Float4(1.0f) - pixel.w;
1841 			break;
1842 		case VK_BLEND_FACTOR_SRC_ALPHA:
1843 			blendFactor.w = oC.w;
1844 			break;
1845 		case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1846 			blendFactor.w = Float4(1.0f) - oC.w;
1847 			break;
1848 		case VK_BLEND_FACTOR_DST_ALPHA:
1849 			blendFactor.w = pixel.w;
1850 			break;
1851 		case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1852 			blendFactor.w = Float4(1.0f) - pixel.w;
1853 			break;
1854 		case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1855 			blendFactor.w = Float4(1.0f);
1856 			break;
1857 		case VK_BLEND_FACTOR_CONSTANT_COLOR:
1858 		case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1859 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1860 			break;
1861 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1862 		case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1863 			blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1864 			break;
1865 		default:
1866 			UNSUPPORTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
1867 	}
1868 }
1869 
alphaBlend(int index,const Pointer<Byte> & cBuffer,Vector4f & oC,const Int & x)1870 void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4f &oC, const Int &x)
1871 {
1872 	if(!state.blendState[index].alphaBlendEnable)
1873 	{
1874 		return;
1875 	}
1876 
1877 	Pointer<Byte> buffer = cBuffer;
1878 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1879 
1880 	// pixel holds four texel color values.
1881 	// Note: Despite the type being Vector4f, the colors may be stored as
1882 	// integers. Half-floats are stored as full 32-bit floats.
1883 	// Non-float and non-fixed point formats are not alpha blended.
1884 	Vector4f pixel;
1885 
1886 	Vector4s color;
1887 	Short4 c01;
1888 	Short4 c23;
1889 
1890 	Float4 one;
1891 	vk::Format format(state.targetFormat[index]);
1892 	if(format.isFloatFormat())
1893 	{
1894 		one = Float4(1.0f);
1895 	}
1896 	else if(format.isUnnormalizedInteger())
1897 	{
1898 		one = As<Float4>(format.isUnsignedComponent(0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
1899 	}
1900 
1901 	switch(state.targetFormat[index])
1902 	{
1903 		case VK_FORMAT_R32_SINT:
1904 		case VK_FORMAT_R32_UINT:
1905 		case VK_FORMAT_R32_SFLOAT:
1906 			// FIXME: movlps
1907 			buffer += 4 * x;
1908 			pixel.x.x = *Pointer<Float>(buffer + 0);
1909 			pixel.x.y = *Pointer<Float>(buffer + 4);
1910 			buffer += pitchB;
1911 			// FIXME: movhps
1912 			pixel.x.z = *Pointer<Float>(buffer + 0);
1913 			pixel.x.w = *Pointer<Float>(buffer + 4);
1914 			pixel.y = pixel.z = pixel.w = one;
1915 			break;
1916 		case VK_FORMAT_R32G32_SINT:
1917 		case VK_FORMAT_R32G32_UINT:
1918 		case VK_FORMAT_R32G32_SFLOAT:
1919 			buffer += 8 * x;
1920 			pixel.x = *Pointer<Float4>(buffer, 16);
1921 			buffer += pitchB;
1922 			pixel.y = *Pointer<Float4>(buffer, 16);
1923 			pixel.z = pixel.x;
1924 			pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x0202);
1925 			pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0x1313);
1926 			pixel.y = pixel.z;
1927 			pixel.z = pixel.w = one;
1928 			break;
1929 		case VK_FORMAT_R32G32B32A32_SFLOAT:
1930 		case VK_FORMAT_R32G32B32A32_SINT:
1931 		case VK_FORMAT_R32G32B32A32_UINT:
1932 			buffer += 16 * x;
1933 			pixel.x = *Pointer<Float4>(buffer + 0, 16);
1934 			pixel.y = *Pointer<Float4>(buffer + 16, 16);
1935 			buffer += pitchB;
1936 			pixel.z = *Pointer<Float4>(buffer + 0, 16);
1937 			pixel.w = *Pointer<Float4>(buffer + 16, 16);
1938 			transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1939 			break;
1940 		case VK_FORMAT_R16_SFLOAT:
1941 			buffer += 2 * x;
1942 			pixel.x.x = Float(*Pointer<Half>(buffer + 0));
1943 			pixel.x.y = Float(*Pointer<Half>(buffer + 2));
1944 			buffer += pitchB;
1945 			pixel.x.z = Float(*Pointer<Half>(buffer + 0));
1946 			pixel.x.w = Float(*Pointer<Half>(buffer + 2));
1947 			pixel.y = pixel.z = pixel.w = one;
1948 			break;
1949 		case VK_FORMAT_R16G16_SFLOAT:
1950 			buffer += 4 * x;
1951 			pixel.x.x = Float(*Pointer<Half>(buffer + 0));
1952 			pixel.y.x = Float(*Pointer<Half>(buffer + 2));
1953 			pixel.x.y = Float(*Pointer<Half>(buffer + 4));
1954 			pixel.y.y = Float(*Pointer<Half>(buffer + 6));
1955 			buffer += pitchB;
1956 			pixel.x.z = Float(*Pointer<Half>(buffer + 0));
1957 			pixel.y.z = Float(*Pointer<Half>(buffer + 2));
1958 			pixel.x.w = Float(*Pointer<Half>(buffer + 4));
1959 			pixel.y.w = Float(*Pointer<Half>(buffer + 6));
1960 			pixel.z = pixel.w = one;
1961 			break;
1962 		case VK_FORMAT_R16G16B16A16_SFLOAT:
1963 			buffer += 8 * x;
1964 			pixel.x.x = Float(*Pointer<Half>(buffer + 0x0));
1965 			pixel.y.x = Float(*Pointer<Half>(buffer + 0x2));
1966 			pixel.z.x = Float(*Pointer<Half>(buffer + 0x4));
1967 			pixel.w.x = Float(*Pointer<Half>(buffer + 0x6));
1968 			pixel.x.y = Float(*Pointer<Half>(buffer + 0x8));
1969 			pixel.y.y = Float(*Pointer<Half>(buffer + 0xa));
1970 			pixel.z.y = Float(*Pointer<Half>(buffer + 0xc));
1971 			pixel.w.y = Float(*Pointer<Half>(buffer + 0xe));
1972 			buffer += pitchB;
1973 			pixel.x.z = Float(*Pointer<Half>(buffer + 0x0));
1974 			pixel.y.z = Float(*Pointer<Half>(buffer + 0x2));
1975 			pixel.z.z = Float(*Pointer<Half>(buffer + 0x4));
1976 			pixel.w.z = Float(*Pointer<Half>(buffer + 0x6));
1977 			pixel.x.w = Float(*Pointer<Half>(buffer + 0x8));
1978 			pixel.y.w = Float(*Pointer<Half>(buffer + 0xa));
1979 			pixel.z.w = Float(*Pointer<Half>(buffer + 0xc));
1980 			pixel.w.w = Float(*Pointer<Half>(buffer + 0xe));
1981 			break;
1982 		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
1983 			buffer += 4 * x;
1984 			pixel.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1985 			pixel.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1986 			buffer += pitchB;
1987 			pixel.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1988 			pixel.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1989 			transpose4x3(pixel.x, pixel.y, pixel.z, pixel.w);
1990 			pixel.w = one;
1991 			break;
1992 		default:
1993 			UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
1994 	}
1995 
1996 	// Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1997 	Vector4f sourceFactor;
1998 	Vector4f destFactor;
1999 
2000 	blendFactor(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactor);
2001 	blendFactor(destFactor, oC, pixel, state.blendState[index].destBlendFactor);
2002 
2003 	oC.x *= sourceFactor.x;
2004 	oC.y *= sourceFactor.y;
2005 	oC.z *= sourceFactor.z;
2006 
2007 	pixel.x *= destFactor.x;
2008 	pixel.y *= destFactor.y;
2009 	pixel.z *= destFactor.z;
2010 
2011 	switch(state.blendState[index].blendOperation)
2012 	{
2013 		case VK_BLEND_OP_ADD:
2014 			oC.x += pixel.x;
2015 			oC.y += pixel.y;
2016 			oC.z += pixel.z;
2017 			break;
2018 		case VK_BLEND_OP_SUBTRACT:
2019 			oC.x -= pixel.x;
2020 			oC.y -= pixel.y;
2021 			oC.z -= pixel.z;
2022 			break;
2023 		case VK_BLEND_OP_REVERSE_SUBTRACT:
2024 			oC.x = pixel.x - oC.x;
2025 			oC.y = pixel.y - oC.y;
2026 			oC.z = pixel.z - oC.z;
2027 			break;
2028 		case VK_BLEND_OP_MIN:
2029 			oC.x = Min(oC.x, pixel.x);
2030 			oC.y = Min(oC.y, pixel.y);
2031 			oC.z = Min(oC.z, pixel.z);
2032 			break;
2033 		case VK_BLEND_OP_MAX:
2034 			oC.x = Max(oC.x, pixel.x);
2035 			oC.y = Max(oC.y, pixel.y);
2036 			oC.z = Max(oC.z, pixel.z);
2037 			break;
2038 		case VK_BLEND_OP_SRC_EXT:
2039 			// No operation
2040 			break;
2041 		case VK_BLEND_OP_DST_EXT:
2042 			oC.x = pixel.x;
2043 			oC.y = pixel.y;
2044 			oC.z = pixel.z;
2045 			break;
2046 		case VK_BLEND_OP_ZERO_EXT:
2047 			oC.x = Float4(0.0f);
2048 			oC.y = Float4(0.0f);
2049 			oC.z = Float4(0.0f);
2050 			break;
2051 		default:
2052 			UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
2053 	}
2054 
2055 	blendFactorAlpha(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactorAlpha);
2056 	blendFactorAlpha(destFactor, oC, pixel, state.blendState[index].destBlendFactorAlpha);
2057 
2058 	oC.w *= sourceFactor.w;
2059 	pixel.w *= destFactor.w;
2060 
2061 	switch(state.blendState[index].blendOperationAlpha)
2062 	{
2063 		case VK_BLEND_OP_ADD:
2064 			oC.w += pixel.w;
2065 			break;
2066 		case VK_BLEND_OP_SUBTRACT:
2067 			oC.w -= pixel.w;
2068 			break;
2069 		case VK_BLEND_OP_REVERSE_SUBTRACT:
2070 			pixel.w -= oC.w;
2071 			oC.w = pixel.w;
2072 			break;
2073 		case VK_BLEND_OP_MIN:
2074 			oC.w = Min(oC.w, pixel.w);
2075 			break;
2076 		case VK_BLEND_OP_MAX:
2077 			oC.w = Max(oC.w, pixel.w);
2078 			break;
2079 		case VK_BLEND_OP_SRC_EXT:
2080 			// No operation
2081 			break;
2082 		case VK_BLEND_OP_DST_EXT:
2083 			oC.w = pixel.w;
2084 			break;
2085 		case VK_BLEND_OP_ZERO_EXT:
2086 			oC.w = Float4(0.0f);
2087 			break;
2088 		default:
2089 			UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
2090 	}
2091 
2092 	if(format.isUnsignedComponent(0)) { oC.x = Max(oC.x, Float4(0.0f)); }
2093 	if(format.isUnsignedComponent(1)) { oC.y = Max(oC.y, Float4(0.0f)); }
2094 	if(format.isUnsignedComponent(2)) { oC.z = Max(oC.z, Float4(0.0f)); }
2095 	if(format.isUnsignedComponent(3)) { oC.w = Max(oC.w, Float4(0.0f)); }
2096 }
2097 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & oC,const Int & sMask,const Int & zMask,const Int & cMask)2098 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &oC, const Int &sMask, const Int &zMask, const Int &cMask)
2099 {
2100 	switch(state.targetFormat[index])
2101 	{
2102 		case VK_FORMAT_R16_SFLOAT:
2103 		case VK_FORMAT_R32_SFLOAT:
2104 		case VK_FORMAT_R32_SINT:
2105 		case VK_FORMAT_R32_UINT:
2106 		case VK_FORMAT_R16_SINT:
2107 		case VK_FORMAT_R16_UINT:
2108 		case VK_FORMAT_R8_SINT:
2109 		case VK_FORMAT_R8_UINT:
2110 		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2111 		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2112 			break;
2113 		case VK_FORMAT_R16G16_SFLOAT:
2114 		case VK_FORMAT_R32G32_SFLOAT:
2115 		case VK_FORMAT_R32G32_SINT:
2116 		case VK_FORMAT_R32G32_UINT:
2117 		case VK_FORMAT_R16G16_SINT:
2118 		case VK_FORMAT_R16G16_UINT:
2119 		case VK_FORMAT_R8G8_SINT:
2120 		case VK_FORMAT_R8G8_UINT:
2121 			oC.z = oC.x;
2122 			oC.x = UnpackLow(oC.x, oC.y);
2123 			oC.z = UnpackHigh(oC.z, oC.y);
2124 			oC.y = oC.z;
2125 			break;
2126 		case VK_FORMAT_R16G16B16A16_SFLOAT:
2127 		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2128 		case VK_FORMAT_R32G32B32A32_SFLOAT:
2129 		case VK_FORMAT_R32G32B32A32_SINT:
2130 		case VK_FORMAT_R32G32B32A32_UINT:
2131 		case VK_FORMAT_R16G16B16A16_SINT:
2132 		case VK_FORMAT_R16G16B16A16_UINT:
2133 		case VK_FORMAT_R8G8B8A8_SINT:
2134 		case VK_FORMAT_R8G8B8A8_UINT:
2135 		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2136 		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2137 			transpose4x4(oC.x, oC.y, oC.z, oC.w);
2138 			break;
2139 		default:
2140 			UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
2141 	}
2142 
2143 	int rgbaWriteMask = state.colorWriteActive(index);
2144 	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
2145 
2146 	Int xMask;  // Combination of all masks
2147 
2148 	if(state.depthTestActive)
2149 	{
2150 		xMask = zMask;
2151 	}
2152 	else
2153 	{
2154 		xMask = cMask;
2155 	}
2156 
2157 	if(state.stencilActive)
2158 	{
2159 		xMask &= sMask;
2160 	}
2161 
2162 	auto targetFormat = state.targetFormat[index];
2163 
2164 	Pointer<Byte> buffer = cBuffer;
2165 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2166 	Float4 value;
2167 
2168 	switch(targetFormat)
2169 	{
2170 		case VK_FORMAT_R32_SFLOAT:
2171 		case VK_FORMAT_R32_SINT:
2172 		case VK_FORMAT_R32_UINT:
2173 			if(rgbaWriteMask & 0x00000001)
2174 			{
2175 				buffer += 4 * x;
2176 
2177 				// FIXME: movlps
2178 				value.x = *Pointer<Float>(buffer + 0);
2179 				value.y = *Pointer<Float>(buffer + 4);
2180 
2181 				buffer += pitchB;
2182 
2183 				// FIXME: movhps
2184 				value.z = *Pointer<Float>(buffer + 0);
2185 				value.w = *Pointer<Float>(buffer + 4);
2186 
2187 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2188 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2189 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2190 
2191 				// FIXME: movhps
2192 				*Pointer<Float>(buffer + 0) = oC.x.z;
2193 				*Pointer<Float>(buffer + 4) = oC.x.w;
2194 
2195 				buffer -= pitchB;
2196 
2197 				// FIXME: movlps
2198 				*Pointer<Float>(buffer + 0) = oC.x.x;
2199 				*Pointer<Float>(buffer + 4) = oC.x.y;
2200 			}
2201 			break;
2202 		case VK_FORMAT_R16_SFLOAT:
2203 			if(rgbaWriteMask & 0x00000001)
2204 			{
2205 				buffer += 2 * x;
2206 
2207 				value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2208 				value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2209 
2210 				buffer += pitchB;
2211 
2212 				value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2213 				value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2214 
2215 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2216 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2217 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2218 
2219 				*Pointer<Half>(buffer + 0) = Half(oC.x.z);
2220 				*Pointer<Half>(buffer + 2) = Half(oC.x.w);
2221 
2222 				buffer -= pitchB;
2223 
2224 				*Pointer<Half>(buffer + 0) = Half(oC.x.x);
2225 				*Pointer<Half>(buffer + 2) = Half(oC.x.y);
2226 			}
2227 			break;
2228 		case VK_FORMAT_R16_SINT:
2229 		case VK_FORMAT_R16_UINT:
2230 			if(rgbaWriteMask & 0x00000001)
2231 			{
2232 				buffer += 2 * x;
2233 
2234 				UShort4 xyzw;
2235 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2236 
2237 				buffer += pitchB;
2238 
2239 				xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2240 				value = As<Float4>(Int4(xyzw));
2241 
2242 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2243 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2244 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2245 
2246 				if(targetFormat == VK_FORMAT_R16_SINT)
2247 				{
2248 					Float component = oC.x.z;
2249 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2250 					component = oC.x.w;
2251 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2252 
2253 					buffer -= pitchB;
2254 
2255 					component = oC.x.x;
2256 					*Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2257 					component = oC.x.y;
2258 					*Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2259 				}
2260 				else  // VK_FORMAT_R16_UINT
2261 				{
2262 					Float component = oC.x.z;
2263 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2264 					component = oC.x.w;
2265 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2266 
2267 					buffer -= pitchB;
2268 
2269 					component = oC.x.x;
2270 					*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2271 					component = oC.x.y;
2272 					*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2273 				}
2274 			}
2275 			break;
2276 		case VK_FORMAT_R8_SINT:
2277 		case VK_FORMAT_R8_UINT:
2278 			if(rgbaWriteMask & 0x00000001)
2279 			{
2280 				buffer += x;
2281 
2282 				UInt xyzw, packedCol;
2283 
2284 				xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2285 				buffer += pitchB;
2286 				xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2287 
2288 				Short4 tmpCol = Short4(As<Int4>(oC.x));
2289 				if(targetFormat == VK_FORMAT_R8_SINT)
2290 				{
2291 					tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2292 				}
2293 				else
2294 				{
2295 					tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2296 				}
2297 				packedCol = Extract(As<Int2>(tmpCol), 0);
2298 
2299 				packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2300 				            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2301 
2302 				*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2303 				buffer -= pitchB;
2304 				*Pointer<UShort>(buffer) = UShort(packedCol);
2305 			}
2306 			break;
2307 		case VK_FORMAT_R32G32_SFLOAT:
2308 		case VK_FORMAT_R32G32_SINT:
2309 		case VK_FORMAT_R32G32_UINT:
2310 			buffer += 8 * x;
2311 
2312 			value = *Pointer<Float4>(buffer);
2313 
2314 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2315 			{
2316 				Float4 masked = value;
2317 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2318 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2319 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2320 			}
2321 
2322 			oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2323 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2324 			oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2325 			*Pointer<Float4>(buffer) = oC.x;
2326 
2327 			buffer += pitchB;
2328 
2329 			value = *Pointer<Float4>(buffer);
2330 
2331 			if((rgbaWriteMask & 0x00000003) != 0x00000003)
2332 			{
2333 				Float4 masked;
2334 
2335 				masked = value;
2336 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2337 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2338 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2339 			}
2340 
2341 			oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2342 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2343 			oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2344 			*Pointer<Float4>(buffer) = oC.y;
2345 			break;
2346 		case VK_FORMAT_R16G16_SFLOAT:
2347 			if((rgbaWriteMask & 0x00000003) != 0x0)
2348 			{
2349 				buffer += 4 * x;
2350 
2351 				UInt2 rgbaMask;
2352 				UInt2 packedCol;
2353 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
2354 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
2355 
2356 				UShort4 value = *Pointer<UShort4>(buffer);
2357 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2358 				if((rgbaWriteMask & 0x3) != 0x3)
2359 				{
2360 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2361 					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2362 					mergedMask &= rgbaMask;
2363 				}
2364 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2365 
2366 				buffer += pitchB;
2367 
2368 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 0);
2369 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 1);
2370 				value = *Pointer<UShort4>(buffer);
2371 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2372 				if((rgbaWriteMask & 0x3) != 0x3)
2373 				{
2374 					mergedMask &= rgbaMask;
2375 				}
2376 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2377 			}
2378 			break;
2379 		case VK_FORMAT_R16G16_SINT:
2380 		case VK_FORMAT_R16G16_UINT:
2381 			if((rgbaWriteMask & 0x00000003) != 0x0)
2382 			{
2383 				buffer += 4 * x;
2384 
2385 				UInt2 rgbaMask;
2386 				UShort4 packedCol = UShort4(As<Int4>(oC.x));
2387 				UShort4 value = *Pointer<UShort4>(buffer);
2388 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2389 				if((rgbaWriteMask & 0x3) != 0x3)
2390 				{
2391 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2392 					rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2393 					mergedMask &= rgbaMask;
2394 				}
2395 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2396 
2397 				buffer += pitchB;
2398 
2399 				packedCol = UShort4(As<Int4>(oC.y));
2400 				value = *Pointer<UShort4>(buffer);
2401 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2402 				if((rgbaWriteMask & 0x3) != 0x3)
2403 				{
2404 					mergedMask &= rgbaMask;
2405 				}
2406 				*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2407 			}
2408 			break;
2409 		case VK_FORMAT_R8G8_SINT:
2410 		case VK_FORMAT_R8G8_UINT:
2411 			if((rgbaWriteMask & 0x00000003) != 0x0)
2412 			{
2413 				buffer += 2 * x;
2414 
2415 				Int2 xyzw, packedCol;
2416 
2417 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2418 				buffer += pitchB;
2419 				xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2420 
2421 				if(targetFormat == VK_FORMAT_R8G8_SINT)
2422 				{
2423 					packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2424 				}
2425 				else
2426 				{
2427 					packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2428 				}
2429 
2430 				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2431 				if((rgbaWriteMask & 0x3) != 0x3)
2432 				{
2433 					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2434 					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2435 					mergedMask &= rgbaMask;
2436 				}
2437 
2438 				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2439 
2440 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2441 				buffer -= pitchB;
2442 				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2443 			}
2444 			break;
2445 		case VK_FORMAT_R32G32B32A32_SFLOAT:
2446 		case VK_FORMAT_R32G32B32A32_SINT:
2447 		case VK_FORMAT_R32G32B32A32_UINT:
2448 			buffer += 16 * x;
2449 
2450 			{
2451 				value = *Pointer<Float4>(buffer, 16);
2452 
2453 				if(rgbaWriteMask != 0x0000000F)
2454 				{
2455 					Float4 masked = value;
2456 					oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2457 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2458 					oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2459 				}
2460 
2461 				oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2462 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2463 				oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2464 				*Pointer<Float4>(buffer, 16) = oC.x;
2465 			}
2466 
2467 			{
2468 				value = *Pointer<Float4>(buffer + 16, 16);
2469 
2470 				if(rgbaWriteMask != 0x0000000F)
2471 				{
2472 					Float4 masked = value;
2473 					oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2474 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2475 					oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2476 				}
2477 
2478 				oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2479 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2480 				oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2481 				*Pointer<Float4>(buffer + 16, 16) = oC.y;
2482 			}
2483 
2484 			buffer += pitchB;
2485 
2486 			{
2487 				value = *Pointer<Float4>(buffer, 16);
2488 
2489 				if(rgbaWriteMask != 0x0000000F)
2490 				{
2491 					Float4 masked = value;
2492 					oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2493 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2494 					oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2495 				}
2496 
2497 				oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2498 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2499 				oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2500 				*Pointer<Float4>(buffer, 16) = oC.z;
2501 			}
2502 
2503 			{
2504 				value = *Pointer<Float4>(buffer + 16, 16);
2505 
2506 				if(rgbaWriteMask != 0x0000000F)
2507 				{
2508 					Float4 masked = value;
2509 					oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2510 					masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2511 					oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2512 				}
2513 
2514 				oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2515 				value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2516 				oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2517 				*Pointer<Float4>(buffer + 16, 16) = oC.w;
2518 			}
2519 			break;
2520 		case VK_FORMAT_R16G16B16A16_SFLOAT:
2521 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2522 			{
2523 				buffer += 8 * x;
2524 
2525 				UInt4 rgbaMask;
2526 				UInt4 value = *Pointer<UInt4>(buffer);
2527 				UInt4 packedCol;
2528 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
2529 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
2530 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 2);
2531 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 3);
2532 				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2533 				if((rgbaWriteMask & 0xF) != 0xF)
2534 				{
2535 					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2536 					rgbaMask = UInt4(tmpMask, tmpMask);
2537 					mergedMask &= rgbaMask;
2538 				}
2539 				*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2540 
2541 				buffer += pitchB;
2542 
2543 				value = *Pointer<UInt4>(buffer);
2544 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.y))) << 16) | UInt(As<UShort>(Half(oC.z.x))), 0);
2545 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.w))) << 16) | UInt(As<UShort>(Half(oC.z.z))), 1);
2546 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.y))) << 16) | UInt(As<UShort>(Half(oC.w.x))), 2);
2547 				packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.w))) << 16) | UInt(As<UShort>(Half(oC.w.z))), 3);
2548 				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2549 				if((rgbaWriteMask & 0xF) != 0xF)
2550 				{
2551 					mergedMask &= rgbaMask;
2552 				}
2553 				*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2554 			}
2555 			break;
2556 		case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2557 			if((rgbaWriteMask & 0x7) != 0x0)
2558 			{
2559 				buffer += 4 * x;
2560 
2561 				UInt4 packedCol;
2562 				packedCol = Insert(packedCol, r11g11b10Pack(oC.x), 0);
2563 				packedCol = Insert(packedCol, r11g11b10Pack(oC.y), 1);
2564 				packedCol = Insert(packedCol, r11g11b10Pack(oC.z), 2);
2565 				packedCol = Insert(packedCol, r11g11b10Pack(oC.w), 3);
2566 
2567 				UInt4 value;
2568 				value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
2569 				value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
2570 				buffer += pitchB;
2571 				value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
2572 				value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
2573 
2574 				UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0][0]) + xMask * 16, 16);
2575 				if((rgbaWriteMask & 0x7) != 0x7)
2576 				{
2577 					mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[rgbaWriteMask & 0x7][0]), 16);
2578 				}
2579 				value = (packedCol & mask) | (value & ~mask);
2580 
2581 				*Pointer<UInt>(buffer + 0) = value.z;
2582 				*Pointer<UInt>(buffer + 4) = value.w;
2583 				buffer -= pitchB;
2584 				*Pointer<UInt>(buffer + 0) = value.x;
2585 				*Pointer<UInt>(buffer + 4) = value.y;
2586 			}
2587 			break;
2588 		case VK_FORMAT_R16G16B16A16_SINT:
2589 		case VK_FORMAT_R16G16B16A16_UINT:
2590 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2591 			{
2592 				buffer += 8 * x;
2593 
2594 				UInt4 rgbaMask;
2595 				UShort8 value = *Pointer<UShort8>(buffer);
2596 				UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2597 				UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2598 				if((rgbaWriteMask & 0xF) != 0xF)
2599 				{
2600 					UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2601 					rgbaMask = UInt4(tmpMask, tmpMask);
2602 					mergedMask &= rgbaMask;
2603 				}
2604 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2605 
2606 				buffer += pitchB;
2607 
2608 				value = *Pointer<UShort8>(buffer);
2609 				packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2610 				mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2611 				if((rgbaWriteMask & 0xF) != 0xF)
2612 				{
2613 					mergedMask &= rgbaMask;
2614 				}
2615 				*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2616 			}
2617 			break;
2618 		case VK_FORMAT_R8G8B8A8_SINT:
2619 		case VK_FORMAT_R8G8B8A8_UINT:
2620 		case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2621 		case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2622 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2623 			{
2624 				UInt2 value, packedCol, mergedMask;
2625 
2626 				buffer += 4 * x;
2627 
2628 				bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;
2629 
2630 				if(isSigned)
2631 				{
2632 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2633 				}
2634 				else
2635 				{
2636 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2637 				}
2638 				value = *Pointer<UInt2>(buffer, 16);
2639 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2640 				if(rgbaWriteMask != 0xF)
2641 				{
2642 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2643 				}
2644 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2645 
2646 				buffer += pitchB;
2647 
2648 				if(isSigned)
2649 				{
2650 					packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2651 				}
2652 				else
2653 				{
2654 					packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2655 				}
2656 				value = *Pointer<UInt2>(buffer, 16);
2657 				mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2658 				if(rgbaWriteMask != 0xF)
2659 				{
2660 					mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2661 				}
2662 				*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2663 			}
2664 			break;
2665 		case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2666 			if((rgbaWriteMask & 0x0000000F) != 0x0)
2667 			{
2668 				Int2 mergedMask, packedCol, value;
2669 				Int4 packed = ((As<Int4>(oC.w) & Int4(0x3)) << 30) |
2670 				              ((As<Int4>(oC.z) & Int4(0x3ff)) << 20) |
2671 				              ((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
2672 				              ((As<Int4>(oC.x) & Int4(0x3ff)));
2673 
2674 				buffer += 4 * x;
2675 				value = *Pointer<Int2>(buffer, 16);
2676 				mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2677 				if(rgbaWriteMask != 0xF)
2678 				{
2679 					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
2680 				}
2681 				*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2682 
2683 				buffer += pitchB;
2684 
2685 				value = *Pointer<Int2>(buffer, 16);
2686 				mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2687 				if(rgbaWriteMask != 0xF)
2688 				{
2689 					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
2690 				}
2691 				*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2692 			}
2693 			break;
2694 		case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2695 			if((bgraWriteMask & 0x0000000F) != 0x0)
2696 			{
2697 				Int2 mergedMask, packedCol, value;
2698 				Int4 packed = ((As<Int4>(oC.w) & Int4(0x3)) << 30) |
2699 				              ((As<Int4>(oC.x) & Int4(0x3ff)) << 20) |
2700 				              ((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
2701 				              ((As<Int4>(oC.z) & Int4(0x3ff)));
2702 
2703 				buffer += 4 * x;
2704 				value = *Pointer<Int2>(buffer, 16);
2705 				mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2706 				if(bgraWriteMask != 0xF)
2707 				{
2708 					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
2709 				}
2710 				*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2711 
2712 				buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2713 
2714 				value = *Pointer<Int2>(buffer, 16);
2715 				mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2716 				if(bgraWriteMask != 0xF)
2717 				{
2718 					mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
2719 				}
2720 				*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2721 			}
2722 			break;
2723 		default:
2724 			UNSUPPORTED("VkFormat: %d", int(targetFormat));
2725 	}
2726 }
2727 
convertFixed16(const Float4 & cf,bool saturate)2728 UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
2729 {
2730 	return UShort4(cf * Float4(0xFFFF), saturate);
2731 }
2732 
sRGBtoLinear16_12_16(Vector4s & c)2733 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2734 {
2735 	Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);
2736 
2737 	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
2738 	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
2739 	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
2740 
2741 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2742 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2743 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2744 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2745 
2746 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2747 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2748 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2749 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2750 
2751 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2752 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2753 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2754 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2755 }
2756 
linearToSRGB16_12_16(Vector4s & c)2757 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2758 {
2759 	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
2760 	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
2761 	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
2762 
2763 	linearToSRGB12_16(c);
2764 }
2765 
linearToSRGB12_16(Vector4s & c)2766 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2767 {
2768 	Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);
2769 
2770 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2771 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2772 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2773 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2774 
2775 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2776 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2777 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2778 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2779 
2780 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2781 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2782 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2783 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2784 }
2785 
// Approximates x^2.2 with the polynomial 0.73 * x^2 + 0.27 * x^3 and
// clamps the result to the range [0, 1].
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	Float4 squared = x * x;
	Float4 cubed = squared * x;
	Float4 blended = squared * Float4(0.73f) + cubed * Float4(0.27f);

	Float4 lowerBounded = Max(blended, Float4(0.0f));

	return Min(lowerBounded, Float4(1.0f));
}
2793 
2794 }  // namespace sw
2795