• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "Vulkan/VkPipelineLayout.hpp"
24 #include "Vulkan/VkStringify.hpp"
25 
26 namespace sw {
27 
PixelRoutine(const PixelProcessor::State & state,vk::PipelineLayout const * pipelineLayout,SpirvShader const * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)28 PixelRoutine::PixelRoutine(
29     const PixelProcessor::State &state,
30     vk::PipelineLayout const *pipelineLayout,
31     SpirvShader const *spirvShader,
32     const vk::DescriptorSet::Bindings &descriptorSets)
33     : QuadRasterizer(state, spirvShader)
34     , routine(pipelineLayout)
35     , descriptorSets(descriptorSets)
36     , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
37     , shaderContainsSampleQualifier(spirvShader && spirvShader->getAnalysis().ContainsSampleQualifier)
38     , perSampleShading((state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f)) ||
39                        shaderContainsSampleQualifier || shaderContainsInterpolation)  // TODO(b/194714095)
40     , invocationCount(perSampleShading ? state.multiSampleCount : 1)
41 {
42 	if(spirvShader)
43 	{
44 		spirvShader->emitProlog(&routine);
45 	}
46 }
47 
~PixelRoutine()48 PixelRoutine::~PixelRoutine()
49 {
50 }
51 
getSampleSet(int invocation) const52 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
53 {
54 	unsigned int sampleBegin = perSampleShading ? invocation : 0;
55 	unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
56 
57 	SampleSet samples;
58 
59 	for(unsigned int q = sampleBegin; q < sampleEnd; q++)
60 	{
61 		if(state.multiSampleMask & (1 << q))
62 		{
63 			samples.push_back(q);
64 		}
65 	}
66 
67 	return samples;
68 }
69 
quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)70 void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
71 {
72 	const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;
73 
74 	Int zMask[4];  // Depth mask
75 	Int sMask[4];  // Stencil mask
76 	Float4 unclampedZ[4];
77 
78 	for(int invocation = 0; invocation < invocationCount; invocation++)
79 	{
80 		SampleSet samples = getSampleSet(invocation);
81 
82 		if(samples.empty())
83 		{
84 			continue;
85 		}
86 
87 		for(unsigned int q : samples)
88 		{
89 			zMask[q] = cMask[q];
90 			sMask[q] = cMask[q];
91 		}
92 
93 		stencilTest(sBuffer, x, sMask, samples);
94 
95 		Float4 f;
96 		Float4 rhwCentroid;
97 
98 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
99 
100 		if(interpolateZ())
101 		{
102 			for(unsigned int q : samples)
103 			{
104 				Float4 x = xxxx;
105 
106 				if(state.enableMultiSampling)
107 				{
108 					x -= *Pointer<Float4>(constants + OFFSET(Constants, X) + q * sizeof(float4));
109 				}
110 
111 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
112 
113 				if(state.depthBias)
114 				{
115 					z[q] += *Pointer<Float4>(primitive + OFFSET(Primitive, zBias), 16);
116 				}
117 
118 				unclampedZ[q] = z[q];
119 			}
120 		}
121 
122 		Bool depthPass = false;
123 
124 		if(earlyFragmentTests)
125 		{
126 			for(unsigned int q : samples)
127 			{
128 				z[q] = clampDepth(z[q]);
129 				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
130 				depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
131 			}
132 		}
133 
134 		If(depthPass || !earlyFragmentTests)
135 		{
136 			if(earlyFragmentTests)
137 			{
138 				writeDepth(zBuffer, x, zMask, samples);
139 			}
140 
141 			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
142 
143 			// Centroid locations
144 			Float4 XXXX = 0.0f;
145 			Float4 YYYY = 0.0f;
146 
147 			if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
148 			{
149 				Float4 WWWW(1.0e-9f);
150 
151 				for(unsigned int q : samples)
152 				{
153 					XXXX += *Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]);
154 					YYYY += *Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]);
155 					WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
156 				}
157 
158 				WWWW = Rcp(WWWW, true /* relaxedPrecision */);
159 				XXXX *= WWWW;
160 				YYYY *= WWWW;
161 
162 				XXXX += xxxx;
163 				YYYY += yyyy;
164 			}
165 
166 			if(interpolateW())
167 			{
168 				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
169 				rhw = reciprocal(w, false, true);
170 
171 				if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
172 				{
173 					rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, w), false, false));
174 				}
175 			}
176 
177 			if(spirvShader)
178 			{
179 				if(shaderContainsInterpolation)  // TODO(b/194714095)
180 				{
181 					routine.interpolationData.primitive = primitive;
182 
183 					routine.interpolationData.x = xxxx;
184 					routine.interpolationData.y = yyyy;
185 					routine.interpolationData.rhw = rhw;
186 
187 					routine.interpolationData.xCentroid = XXXX;
188 					routine.interpolationData.yCentroid = YYYY;
189 					routine.interpolationData.rhwCentroid = rhwCentroid;
190 				}
191 
192 				if(perSampleShading && (state.multiSampleCount > 1))
193 				{
194 					xxxx += Constants::SampleLocationsX[samples[0]];
195 					yyyy += Constants::SampleLocationsY[samples[0]];
196 				}
197 
198 				int packedInterpolant = 0;
199 				for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
200 				{
201 					auto const &input = spirvShader->inputs[interfaceInterpolant];
202 					if(input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
203 					{
204 						if(input.Centroid && state.enableMultiSampling)
205 						{
206 							routine.inputs[interfaceInterpolant] =
207 							    SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid,
208 							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
209 							                                  input.Flat, !input.NoPerspective);
210 						}
211 						else if(perSampleShading)
212 						{
213 							routine.inputs[interfaceInterpolant] =
214 							    SpirvRoutine::interpolateAtXY(xxxx, yyyy, rhw,
215 							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
216 							                                  input.Flat, !input.NoPerspective);
217 						}
218 						else
219 						{
220 							routine.inputs[interfaceInterpolant] =
221 							    interpolate(xxxx, Dv[interfaceInterpolant], rhw,
222 							                primitive + OFFSET(Primitive, V[packedInterpolant]),
223 							                input.Flat, !input.NoPerspective);
224 						}
225 						packedInterpolant++;
226 					}
227 				}
228 
229 				setBuiltins(x, y, unclampedZ, w, cMask, samples);
230 
231 				for(uint32_t i = 0; i < state.numClipDistances; i++)
232 				{
233 					auto distance = interpolate(xxxx, DclipDistance[i], rhw,
234 					                            primitive + OFFSET(Primitive, clipDistance[i]),
235 					                            false, true);
236 
237 					auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
238 					for(unsigned int q : samples)
239 					{
240 						// FIXME(b/148105887): Fragments discarded by clipping do not exist at
241 						// all -- they should not be counted in queries or have their Z/S effects
242 						// performed when early fragment tests are enabled.
243 						cMask[q] &= clipMask;
244 					}
245 
246 					if(spirvShader->getUsedCapabilities().ClipDistance)
247 					{
248 						auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
249 						if(it != spirvShader->inputBuiltins.end())
250 						{
251 							if(i < it->second.SizeInComponents)
252 							{
253 								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
254 							}
255 						}
256 					}
257 				}
258 
259 				if(spirvShader->getUsedCapabilities().CullDistance)
260 				{
261 					auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
262 					if(it != spirvShader->inputBuiltins.end())
263 					{
264 						for(uint32_t i = 0; i < state.numCullDistances; i++)
265 						{
266 							if(i < it->second.SizeInComponents)
267 							{
268 								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
269 								    interpolate(xxxx, DcullDistance[i], rhw,
270 								                primitive + OFFSET(Primitive, cullDistance[i]),
271 								                false, true);
272 							}
273 						}
274 					}
275 				}
276 			}
277 
278 			if(spirvShader)
279 			{
280 				executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
281 			}
282 
283 			Bool alphaPass = alphaTest(cMask, samples);
284 
285 			if((spirvShader && spirvShader->getAnalysis().ContainsDiscard) || state.alphaToCoverage)
286 			{
287 				for(unsigned int q : samples)
288 				{
289 					zMask[q] &= cMask[q];
290 					sMask[q] &= cMask[q];
291 				}
292 			}
293 
294 			If(alphaPass)
295 			{
296 				if(!earlyFragmentTests)
297 				{
298 					for(unsigned int q : samples)
299 					{
300 						z[q] = clampDepth(z[q]);
301 						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
302 						depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
303 					}
304 				}
305 
306 				If(depthPass)
307 				{
308 					if(!earlyFragmentTests)
309 					{
310 						writeDepth(zBuffer, x, zMask, samples);
311 					}
312 
313 					blendColor(cBuffer, x, sMask, zMask, cMask, samples);
314 
315 					occlusionSampleCount(zMask, sMask, samples);
316 				}
317 			}
318 		}
319 
320 		writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
321 	}
322 }
323 
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)324 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
325 {
326 	if(!state.stencilActive)
327 	{
328 		return;
329 	}
330 
331 	for(unsigned int q : samples)
332 	{
333 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
334 
335 		Pointer<Byte> buffer = sBuffer + x;
336 
337 		if(q > 0)
338 		{
339 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
340 		}
341 
342 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
343 		Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
344 		value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
345 		Byte8 valueBack = value;
346 
347 		if(state.frontStencil.compareMask != 0xff)
348 		{
349 			value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
350 		}
351 
352 		stencilTest(value, state.frontStencil.compareOp, false);
353 
354 		if(state.backStencil.compareMask != 0xff)
355 		{
356 			valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
357 		}
358 
359 		stencilTest(valueBack, state.backStencil.compareOp, true);
360 
361 		value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
362 		valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
363 		value |= valueBack;
364 
365 		sMask[q] &= SignMask(value);
366 	}
367 }
368 
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)369 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
370 {
371 	Byte8 equal;
372 
373 	switch(stencilCompareMode)
374 	{
375 	case VK_COMPARE_OP_ALWAYS:
376 		value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
377 		break;
378 	case VK_COMPARE_OP_NEVER:
379 		value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
380 		break;
381 	case VK_COMPARE_OP_LESS:  // a < b ~ b > a
382 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
383 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
384 		break;
385 	case VK_COMPARE_OP_EQUAL:
386 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
387 		break;
388 	case VK_COMPARE_OP_NOT_EQUAL:  // a != b ~ !(a == b)
389 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
390 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
391 		break;
392 	case VK_COMPARE_OP_LESS_OR_EQUAL:  // a <= b ~ (b > a) || (a == b)
393 		equal = value;
394 		equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
395 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
396 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
397 		value |= equal;
398 		break;
399 	case VK_COMPARE_OP_GREATER:  // a > b
400 		equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
401 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
402 		equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
403 		value = equal;
404 		break;
405 	case VK_COMPARE_OP_GREATER_OR_EQUAL:  // a >= b ~ !(a < b) ~ !(b > a)
406 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
407 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
408 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
409 		break;
410 	default:
411 		UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
412 	}
413 }
414 
depthTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)415 Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
416 {
417 	Float4 Z = z;
418 
419 	Pointer<Byte> buffer = zBuffer + 4 * x;
420 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
421 
422 	if(q > 0)
423 	{
424 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
425 	}
426 
427 	Float4 zValue;
428 
429 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
430 	{
431 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
432 	}
433 
434 	Int4 zTest;
435 
436 	switch(state.depthCompareMode)
437 	{
438 	case VK_COMPARE_OP_ALWAYS:
439 		// Optimized
440 		break;
441 	case VK_COMPARE_OP_NEVER:
442 		// Optimized
443 		break;
444 	case VK_COMPARE_OP_EQUAL:
445 		zTest = CmpEQ(zValue, Z);
446 		break;
447 	case VK_COMPARE_OP_NOT_EQUAL:
448 		zTest = CmpNEQ(zValue, Z);
449 		break;
450 	case VK_COMPARE_OP_LESS:
451 		zTest = CmpNLE(zValue, Z);
452 		break;
453 	case VK_COMPARE_OP_GREATER_OR_EQUAL:
454 		zTest = CmpLE(zValue, Z);
455 		break;
456 	case VK_COMPARE_OP_LESS_OR_EQUAL:
457 		zTest = CmpNLT(zValue, Z);
458 		break;
459 	case VK_COMPARE_OP_GREATER:
460 		zTest = CmpLT(zValue, Z);
461 		break;
462 	default:
463 		UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
464 	}
465 
466 	switch(state.depthCompareMode)
467 	{
468 	case VK_COMPARE_OP_ALWAYS:
469 		zMask = cMask;
470 		break;
471 	case VK_COMPARE_OP_NEVER:
472 		zMask = 0x0;
473 		break;
474 	default:
475 		zMask = SignMask(zTest) & cMask;
476 		break;
477 	}
478 
479 	if(state.stencilActive)
480 	{
481 		zMask &= sMask;
482 	}
483 
484 	return zMask != 0;
485 }
486 
depthTest16(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)487 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
488 {
489 	Short4 Z = convertFixed16(z, true);
490 
491 	Pointer<Byte> buffer = zBuffer + 2 * x;
492 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
493 
494 	if(q > 0)
495 	{
496 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
497 	}
498 
499 	Short4 zValue;
500 
501 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
502 	{
503 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
504 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
505 	}
506 
507 	Int4 zTest;
508 
509 	// Bias values to make unsigned compares out of Reactor's (due SSE's) signed compares only
510 	zValue = zValue - Short4(0x8000u);
511 	Z = Z - Short4(0x8000u);
512 
513 	switch(state.depthCompareMode)
514 	{
515 	case VK_COMPARE_OP_ALWAYS:
516 		// Optimized
517 		break;
518 	case VK_COMPARE_OP_NEVER:
519 		// Optimized
520 		break;
521 	case VK_COMPARE_OP_EQUAL:
522 		zTest = Int4(CmpEQ(zValue, Z));
523 		break;
524 	case VK_COMPARE_OP_NOT_EQUAL:
525 		zTest = ~Int4(CmpEQ(zValue, Z));
526 		break;
527 	case VK_COMPARE_OP_LESS:
528 		zTest = Int4(CmpGT(zValue, Z));
529 		break;
530 	case VK_COMPARE_OP_GREATER_OR_EQUAL:
531 		zTest = ~Int4(CmpGT(zValue, Z));
532 		break;
533 	case VK_COMPARE_OP_LESS_OR_EQUAL:
534 		zTest = ~Int4(CmpGT(Z, zValue));
535 		break;
536 	case VK_COMPARE_OP_GREATER:
537 		zTest = Int4(CmpGT(Z, zValue));
538 		break;
539 	default:
540 		UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
541 	}
542 
543 	switch(state.depthCompareMode)
544 	{
545 	case VK_COMPARE_OP_ALWAYS:
546 		zMask = cMask;
547 		break;
548 	case VK_COMPARE_OP_NEVER:
549 		zMask = 0x0;
550 		break;
551 	default:
552 		zMask = SignMask(zTest) & cMask;
553 		break;
554 	}
555 
556 	if(state.stencilActive)
557 	{
558 		zMask &= sMask;
559 	}
560 
561 	return zMask != 0;
562 }
563 
// Returns z limited to [state.minDepthClamp, state.maxDepthClamp] when depth
// clamping is enabled, and z unchanged otherwise.
Float4 PixelRoutine::clampDepth(const Float4 &z)
{
	if(state.depthClamp)
	{
		return Min(Max(z, state.minDepthClamp), state.maxDepthClamp);
	}

	return z;
}
573 
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)574 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
575 {
576 	if(!state.depthTestActive)
577 	{
578 		return true;
579 	}
580 
581 	switch(state.depthFormat)
582 	{
583 	case VK_FORMAT_D16_UNORM:
584 		return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
585 	case VK_FORMAT_D32_SFLOAT:
586 	case VK_FORMAT_D32_SFLOAT_S8_UINT:
587 		return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
588 	default:
589 		UNSUPPORTED("Depth format: %d", int(state.depthFormat));
590 		return false;
591 	}
592 }
593 
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)594 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
595 {
596 	Pointer<Byte> buffer = zBuffer + 2 * x;
597 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
598 
599 	if(q > 0)
600 	{
601 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
602 	}
603 
604 	Float4 minDepthBound(state.minDepthBounds);
605 	Float4 maxDepthBound(state.maxDepthBounds);
606 
607 	Int2 z;
608 	z = Insert(z, *Pointer<Int>(buffer), 0);
609 	z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
610 
611 	Float4 zValue = convertFloat32(As<UShort4>(z));
612 	return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
613 }
614 
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)615 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
616 {
617 	Pointer<Byte> buffer = zBuffer + 4 * x;
618 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
619 
620 	if(q > 0)
621 	{
622 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
623 	}
624 
625 	Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
626 	return Int4(CmpLE(state.minDepthBounds, zValue) & CmpLE(zValue, state.maxDepthBounds));
627 }
628 
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)629 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
630 {
631 	if(!state.depthBoundsTestActive)
632 	{
633 		return;
634 	}
635 
636 	Int4 zTest;
637 	switch(state.depthFormat)
638 	{
639 	case VK_FORMAT_D16_UNORM:
640 		zTest = depthBoundsTest16(zBuffer, q, x);
641 		break;
642 	case VK_FORMAT_D32_SFLOAT:
643 	case VK_FORMAT_D32_SFLOAT_S8_UINT:
644 		zTest = depthBoundsTest32F(zBuffer, q, x);
645 		break;
646 	default:
647 		UNSUPPORTED("Depth format: %d", int(state.depthFormat));
648 		break;
649 	}
650 
651 	if(!state.depthTestActive)
652 	{
653 		cMask &= zMask & SignMask(zTest);
654 	}
655 	else
656 	{
657 		zMask &= cMask & SignMask(zTest);
658 	}
659 }
660 
alphaToCoverage(Int cMask[4],const Float4 & alpha,const SampleSet & samples)661 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha, const SampleSet &samples)
662 {
663 	static const int a2c[4] = {
664 		OFFSET(DrawData, a2c0),
665 		OFFSET(DrawData, a2c1),
666 		OFFSET(DrawData, a2c2),
667 		OFFSET(DrawData, a2c3),
668 	};
669 
670 	for(unsigned int q : samples)
671 	{
672 		Int4 coverage = CmpNLT(alpha, *Pointer<Float4>(data + a2c[q]));
673 		Int aMask = SignMask(coverage);
674 		cMask[q] &= aMask;
675 	}
676 }
677 
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)678 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
679 {
680 	Float4 Z = z;
681 
682 	Pointer<Byte> buffer = zBuffer + 4 * x;
683 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
684 
685 	if(q > 0)
686 	{
687 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
688 	}
689 
690 	Float4 zValue;
691 
692 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
693 	{
694 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
695 	}
696 
697 	Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
698 	zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
699 	Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
700 
701 	*Pointer<Float2>(buffer) = Float2(Z.xy);
702 	*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
703 }
704 
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)705 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
706 {
707 	Short4 Z = As<Short4>(convertFixed16(z, true));
708 
709 	Pointer<Byte> buffer = zBuffer + 2 * x;
710 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
711 
712 	if(q > 0)
713 	{
714 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
715 	}
716 
717 	Short4 zValue;
718 
719 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
720 	{
721 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
722 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
723 	}
724 
725 	Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
726 	zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
727 	Z = Z | zValue;
728 
729 	*Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
730 	*Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
731 }
732 
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)733 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
734 {
735 	if(!state.depthWriteEnable)
736 	{
737 		return;
738 	}
739 
740 	for(unsigned int q : samples)
741 	{
742 		switch(state.depthFormat)
743 		{
744 		case VK_FORMAT_D16_UNORM:
745 			writeDepth16(zBuffer, q, x, z[q], zMask[q]);
746 			break;
747 		case VK_FORMAT_D32_SFLOAT:
748 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
749 			writeDepth32F(zBuffer, q, x, z[q], zMask[q]);
750 			break;
751 		default:
752 			UNSUPPORTED("Depth format: %d", int(state.depthFormat));
753 			break;
754 		}
755 	}
756 }
757 
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)758 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
759 {
760 	if(!state.occlusionEnabled)
761 	{
762 		return;
763 	}
764 
765 	for(unsigned int q : samples)
766 	{
767 		occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
768 	}
769 }
770 
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)771 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
772 {
773 	if(!state.stencilActive)
774 	{
775 		return;
776 	}
777 
778 	if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
779 	{
780 		if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
781 		{
782 			return;
783 		}
784 	}
785 
786 	if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
787 	{
788 		return;
789 	}
790 
791 	for(unsigned int q : samples)
792 	{
793 		Pointer<Byte> buffer = sBuffer + x;
794 
795 		if(q > 0)
796 		{
797 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
798 		}
799 
800 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
801 		Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
802 		bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
803 		Byte8 newValue;
804 		stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
805 
806 		if((state.frontStencil.writeMask & 0xFF) != 0xFF)  // Assume 8-bit stencil buffer
807 		{
808 			Byte8 maskedValue = bufferValue;
809 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
810 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
811 			newValue |= maskedValue;
812 		}
813 
814 		Byte8 newValueBack;
815 
816 		stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask[q], sMask[q]);
817 
818 		if((state.backStencil.writeMask & 0xFF) != 0xFF)  // Assume 8-bit stencil buffer
819 		{
820 			Byte8 maskedValue = bufferValue;
821 			newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
822 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
823 			newValueBack |= maskedValue;
824 		}
825 
826 		newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
827 		newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
828 		newValue |= newValueBack;
829 
830 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
831 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
832 		newValue |= bufferValue;
833 
834 		*Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
835 		*Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
836 	}
837 }
838 
stencilOperation(Byte8 & newValue,const Byte8 & bufferValue,const PixelProcessor::States::StencilOpState & ops,bool isBack,const Int & zMask,const Int & sMask)839 void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
840 {
841 	Byte8 &pass = newValue;
842 	Byte8 fail;
843 	Byte8 zFail;
844 
845 	stencilOperation(pass, bufferValue, ops.passOp, isBack);
846 
847 	if(ops.depthFailOp != ops.passOp)
848 	{
849 		stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
850 	}
851 
852 	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
853 	{
854 		stencilOperation(fail, bufferValue, ops.failOp, isBack);
855 	}
856 
857 	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
858 	{
859 		if(state.depthTestActive && ops.depthFailOp != ops.passOp)  // zMask valid and values not the same
860 		{
861 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
862 			zFail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
863 			pass |= zFail;
864 		}
865 
866 		pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
867 		fail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
868 		pass |= fail;
869 	}
870 }
871 
hasStencilReplaceRef() const872 bool PixelRoutine::hasStencilReplaceRef() const
873 {
874 	return spirvShader &&
875 	       (spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT) !=
876 	        spirvShader->outputBuiltins.end());
877 }
878 
stencilReplaceRef()879 Byte8 PixelRoutine::stencilReplaceRef()
880 {
881 	ASSERT(spirvShader);
882 
883 	auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
884 	ASSERT(it != spirvShader->outputBuiltins.end());
885 
886 	UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
887 	// TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
888 	//                     following line by either adding a rr::Shuffle() variant to do
889 	//                     it explicitly or adding a Byte4(Int4) constructor would work.
890 	sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
891 
892 	UInt2 sRefDuplicated;
893 	sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
894 	sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
895 	return As<Byte8>(sRefDuplicated);
896 }
897 
stencilOperation(Byte8 & output,const Byte8 & bufferValue,VkStencilOp operation,bool isBack)898 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
899 {
900 	if(hasStencilReplaceRef())
901 	{
902 		output = stencilReplaceRef();
903 	}
904 	else
905 	{
906 		switch(operation)
907 		{
908 		case VK_STENCIL_OP_KEEP:
909 			output = bufferValue;
910 			break;
911 		case VK_STENCIL_OP_ZERO:
912 			output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
913 			break;
914 		case VK_STENCIL_OP_REPLACE:
915 			output = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
916 			break;
917 		case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
918 			output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
919 			break;
920 		case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
921 			output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
922 			break;
923 		case VK_STENCIL_OP_INVERT:
924 			output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
925 			break;
926 		case VK_STENCIL_OP_INCREMENT_AND_WRAP:
927 			output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
928 			break;
929 		case VK_STENCIL_OP_DECREMENT_AND_WRAP:
930 			output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
931 			break;
932 		default:
933 			UNSUPPORTED("VkStencilOp: %d", int(operation));
934 		}
935 	}
936 }
937 
isSRGB(int index) const938 bool PixelRoutine::isSRGB(int index) const
939 {
940 	return vk::Format(state.colorFormat[index]).isSRGBformat();
941 }
942 
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)943 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
944 {
945 	Short4 c01;
946 	Short4 c23;
947 	Pointer<Byte> buffer = cBuffer;
948 	Pointer<Byte> buffer2;
949 
950 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
951 
952 	switch(state.colorFormat[index])
953 	{
954 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
955 		buffer += 2 * x;
956 		buffer2 = buffer + pitchB;
957 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
958 
959 		pixel.x = (c01 & Short4(0xF000u));
960 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
961 		pixel.z = (c01 & Short4(0x00F0u)) << 8;
962 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
963 
964 		// Expand to 16 bit range
965 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
966 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
967 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
968 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
969 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
970 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
971 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
972 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
973 		break;
974 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
975 		buffer += 2 * x;
976 		buffer2 = buffer + pitchB;
977 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
978 
979 		pixel.z = (c01 & Short4(0xF000u));
980 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
981 		pixel.x = (c01 & Short4(0x00F0u)) << 8;
982 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
983 
984 		// Expand to 16 bit range
985 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
986 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
987 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
988 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
989 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
990 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
991 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
992 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
993 		break;
994 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
995 		buffer += 2 * x;
996 		buffer2 = buffer + pitchB;
997 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
998 
999 		pixel.w = (c01 & Short4(0xF000u));
1000 		pixel.z = (c01 & Short4(0x0F00u)) << 4;
1001 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
1002 		pixel.x = (c01 & Short4(0x000Fu)) << 12;
1003 
1004 		// Expand to 16 bit range
1005 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1006 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1007 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1008 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1009 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1010 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1011 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1012 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1013 		break;
1014 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1015 		buffer += 2 * x;
1016 		buffer2 = buffer + pitchB;
1017 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1018 
1019 		pixel.w = (c01 & Short4(0xF000u));
1020 		pixel.x = (c01 & Short4(0x0F00u)) << 4;
1021 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
1022 		pixel.z = (c01 & Short4(0x000Fu)) << 12;
1023 
1024 		// Expand to 16 bit range
1025 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1026 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1027 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1028 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1029 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1030 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1031 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1032 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1033 		break;
1034 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1035 		buffer += 2 * x;
1036 		buffer2 = buffer + pitchB;
1037 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1038 
1039 		pixel.x = (c01 & Short4(0xF800u));
1040 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
1041 		pixel.z = (c01 & Short4(0x003Eu)) << 10;
1042 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1043 
1044 		// Expand to 16 bit range
1045 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1046 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1047 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1048 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1049 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1050 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1051 		break;
1052 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1053 		buffer += 2 * x;
1054 		buffer2 = buffer + pitchB;
1055 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1056 
1057 		pixel.z = (c01 & Short4(0xF800u));
1058 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
1059 		pixel.x = (c01 & Short4(0x003Eu)) << 10;
1060 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1061 
1062 		// Expand to 16 bit range
1063 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1064 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1065 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1066 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1067 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1068 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1069 		break;
1070 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1071 		buffer += 2 * x;
1072 		buffer2 = buffer + pitchB;
1073 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1074 
1075 		pixel.x = (c01 & Short4(0x7C00u)) << 1;
1076 		pixel.y = (c01 & Short4(0x03E0u)) << 6;
1077 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1078 		pixel.w = (c01 & Short4(0x8000u)) >> 15;
1079 
1080 		// Expand to 16 bit range
1081 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1082 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1083 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1084 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1085 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1086 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1087 		break;
1088 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1089 		buffer += 2 * x;
1090 		buffer2 = buffer + pitchB;
1091 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1092 
1093 		pixel.x = c01 & Short4(0xF800u);
1094 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1095 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1096 		pixel.w = Short4(0xFFFFu);
1097 
1098 		// Expand to 16 bit range
1099 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1100 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1101 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1102 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1103 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1104 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1105 		break;
1106 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1107 		buffer += 2 * x;
1108 		buffer2 = buffer + pitchB;
1109 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1110 
1111 		pixel.z = c01 & Short4(0xF800u);
1112 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1113 		pixel.x = (c01 & Short4(0x001Fu)) << 11;
1114 		pixel.w = Short4(0xFFFFu);
1115 
1116 		// Expand to 16 bit range
1117 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1118 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1119 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1120 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1121 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1122 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1123 		break;
1124 	case VK_FORMAT_B8G8R8A8_UNORM:
1125 	case VK_FORMAT_B8G8R8A8_SRGB:
1126 		buffer += 4 * x;
1127 		c01 = *Pointer<Short4>(buffer);
1128 		buffer += pitchB;
1129 		c23 = *Pointer<Short4>(buffer);
1130 		pixel.z = c01;
1131 		pixel.y = c01;
1132 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1133 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1134 		pixel.x = pixel.z;
1135 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1136 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1137 		pixel.y = pixel.z;
1138 		pixel.w = pixel.x;
1139 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1140 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1141 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1142 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1143 		break;
1144 	case VK_FORMAT_R8G8B8A8_UNORM:
1145 	case VK_FORMAT_R8G8B8A8_SRGB:
1146 		buffer += 4 * x;
1147 		c01 = *Pointer<Short4>(buffer);
1148 		buffer += pitchB;
1149 		c23 = *Pointer<Short4>(buffer);
1150 		pixel.z = c01;
1151 		pixel.y = c01;
1152 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1153 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1154 		pixel.x = pixel.z;
1155 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1156 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1157 		pixel.y = pixel.z;
1158 		pixel.w = pixel.x;
1159 		pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1160 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1161 		pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1162 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1163 		break;
1164 	case VK_FORMAT_R8_UNORM:
1165 		buffer += 1 * x;
1166 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1167 		buffer += pitchB;
1168 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1169 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1170 		pixel.y = Short4(0x0000);
1171 		pixel.z = Short4(0x0000);
1172 		pixel.w = Short4(0xFFFFu);
1173 		break;
1174 	case VK_FORMAT_R8G8_UNORM:
1175 		buffer += 2 * x;
1176 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1177 		buffer += pitchB;
1178 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1179 		pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1180 		pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1181 		pixel.z = Short4(0x0000u);
1182 		pixel.w = Short4(0xFFFFu);
1183 		break;
1184 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1185 		{
1186 			Int4 v = Int4(0);
1187 			buffer += 4 * x;
1188 			v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1189 			v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1190 			buffer += pitchB;
1191 			v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1192 			v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1193 
1194 			pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1195 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1196 			pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1197 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1198 
1199 			// Expand to 16 bit range
1200 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1201 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1202 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1203 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1204 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1205 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1206 		}
1207 		break;
1208 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1209 		{
1210 			Int4 v = Int4(0);
1211 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1212 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1213 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1214 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1215 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1216 
1217 			pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1218 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1219 			pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1220 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1221 
1222 			// Expand to 16 bit range
1223 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1224 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1225 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1226 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1227 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1228 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1229 		}
1230 		break;
1231 	default:
1232 		UNSUPPORTED("VkFormat %d", int(state.colorFormat[index]));
1233 	}
1234 
1235 	if(isSRGB(index))
1236 	{
1237 		sRGBtoLinear16_12_16(pixel);
1238 	}
1239 }
1240 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & current,const Int & sMask,const Int & zMask,const Int & cMask)1241 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)
1242 {
1243 	if(isSRGB(index))
1244 	{
1245 		linearToSRGB16_12_16(current);
1246 	}
1247 
1248 	switch(state.colorFormat[index])
1249 	{
1250 	case VK_FORMAT_B8G8R8A8_UNORM:
1251 	case VK_FORMAT_B8G8R8A8_SRGB:
1252 	case VK_FORMAT_R8G8B8A8_UNORM:
1253 	case VK_FORMAT_R8G8B8A8_SRGB:
1254 	case VK_FORMAT_R8G8_UNORM:
1255 	case VK_FORMAT_R8_UNORM:
1256 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1257 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1258 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1259 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1260 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1261 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1262 		break;
1263 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1264 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1265 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
1266 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
1267 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
1268 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);
1269 		break;
1270 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1271 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1272 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1273 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1274 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
1275 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
1276 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
1277 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 4) + Short4(0x0800);
1278 		break;
1279 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1280 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1281 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1282 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1283 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
1284 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1285 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);
1286 		break;
1287 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1288 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1289 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1290 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);
1291 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1292 		break;
1293 	default:
1294 		break;
1295 	}
1296 
1297 	int rgbaWriteMask = state.colorWriteActive(index);
1298 	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1299 
1300 	switch(state.colorFormat[index])
1301 	{
1302 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1303 		{
1304 			current.x = As<UShort4>(current.x & Short4(0xF000));
1305 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1306 			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 8;
1307 			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1308 
1309 			current.x = current.x | current.y | current.z | current.w;
1310 		}
1311 		break;
1312 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1313 		{
1314 			current.z = As<UShort4>(current.z & Short4(0xF000));
1315 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1316 			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 8;
1317 			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1318 
1319 			current.x = current.x | current.y | current.z | current.w;
1320 		}
1321 		break;
1322 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1323 		{
1324 			current.w = As<UShort4>(current.w & Short4(0xF000));
1325 			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 4;
1326 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1327 			current.z = As<UShort4>(current.z & Short4(0xF000u)) >> 12;
1328 
1329 			current.x = current.x | current.y | current.z | current.w;
1330 		}
1331 		break;
1332 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1333 		{
1334 			current.w = As<UShort4>(current.w & Short4(0xF000));
1335 			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 4;
1336 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1337 			current.x = As<UShort4>(current.x & Short4(0xF000u)) >> 12;
1338 
1339 			current.x = current.x | current.y | current.z | current.w;
1340 		}
1341 		break;
1342 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1343 		{
1344 			current.x = As<UShort4>(current.x & Short4(0xF800));
1345 			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1346 			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 10;
1347 			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1348 
1349 			current.x = current.x | current.y | current.z | current.w;
1350 		}
1351 		break;
1352 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1353 		{
1354 			current.z = As<UShort4>(current.z & Short4(0xF800));
1355 			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1356 			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 10;
1357 			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1358 
1359 			current.x = current.x | current.y | current.z | current.w;
1360 		}
1361 		break;
1362 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1363 		{
1364 			current.w = current.w & Short4(0x8000u);
1365 			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
1366 			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
1367 			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
1368 
1369 			current.x = current.x | current.y | current.z | current.w;
1370 		}
1371 		break;
1372 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1373 		{
1374 			current.x = current.x & Short4(0xF800u);
1375 			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1376 			current.z = As<UShort4>(current.z) >> 11;
1377 
1378 			current.x = current.x | current.y | current.z;
1379 		}
1380 		break;
1381 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1382 		{
1383 			current.z = current.z & Short4(0xF800u);
1384 			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1385 			current.x = As<UShort4>(current.x) >> 11;
1386 
1387 			current.x = current.x | current.y | current.z;
1388 		}
1389 		break;
1390 	case VK_FORMAT_B8G8R8A8_UNORM:
1391 	case VK_FORMAT_B8G8R8A8_SRGB:
1392 		if(rgbaWriteMask == 0x7)
1393 		{
1394 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1395 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1396 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1397 
1398 			current.z = As<Short4>(PackUnsigned(current.z, current.x));
1399 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
1400 
1401 			current.x = current.z;
1402 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1403 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1404 			current.y = current.z;
1405 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1406 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1407 		}
1408 		else
1409 		{
1410 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1411 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1412 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1413 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1414 
1415 			current.z = As<Short4>(PackUnsigned(current.z, current.x));
1416 			current.y = As<Short4>(PackUnsigned(current.y, current.w));
1417 
1418 			current.x = current.z;
1419 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1420 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1421 			current.y = current.z;
1422 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1423 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1424 		}
1425 		break;
1426 	case VK_FORMAT_R8G8B8A8_UNORM:
1427 	case VK_FORMAT_R8G8B8A8_SRGB:
1428 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1429 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1430 		if(rgbaWriteMask == 0x7)
1431 		{
1432 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1433 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1434 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1435 
1436 			current.z = As<Short4>(PackUnsigned(current.x, current.z));
1437 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
1438 
1439 			current.x = current.z;
1440 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1441 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1442 			current.y = current.z;
1443 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1444 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1445 		}
1446 		else
1447 		{
1448 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1449 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1450 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1451 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1452 
1453 			current.z = As<Short4>(PackUnsigned(current.x, current.z));
1454 			current.y = As<Short4>(PackUnsigned(current.y, current.w));
1455 
1456 			current.x = current.z;
1457 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1458 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1459 			current.y = current.z;
1460 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1461 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1462 		}
1463 		break;
1464 	case VK_FORMAT_R8G8_UNORM:
1465 		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1466 		current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1467 		current.x = As<Short4>(PackUnsigned(current.x, current.x));
1468 		current.y = As<Short4>(PackUnsigned(current.y, current.y));
1469 		current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1470 		break;
1471 	case VK_FORMAT_R8_UNORM:
1472 		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1473 		current.x = As<Short4>(PackUnsigned(current.x, current.x));
1474 		break;
1475 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1476 		{
1477 			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1478 			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1479 			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1480 			auto a = (Int4(current.w) >> 14) & Int4(0x3);
1481 			Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
1482 			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
1483 			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
1484 			current.x = UnpackLow(c02, c13);
1485 			current.y = UnpackHigh(c02, c13);
1486 		}
1487 		break;
1488 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1489 		{
1490 			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1491 			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1492 			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1493 			auto a = (Int4(current.w) >> 14) & Int4(0x3);
1494 			Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;
1495 			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
1496 			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
1497 			current.x = UnpackLow(c02, c13);
1498 			current.y = UnpackHigh(c02, c13);
1499 		}
1500 		break;
1501 	default:
1502 		UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1503 	}
1504 
1505 	Short4 c01 = current.z;
1506 	Short4 c23 = current.y;
1507 
1508 	Int xMask;  // Combination of all masks
1509 
1510 	if(state.depthTestActive)
1511 	{
1512 		xMask = zMask;
1513 	}
1514 	else
1515 	{
1516 		xMask = cMask;
1517 	}
1518 
1519 	if(state.stencilActive)
1520 	{
1521 		xMask &= sMask;
1522 	}
1523 
1524 	Pointer<Byte> buffer = cBuffer;
1525 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1526 
1527 	switch(state.colorFormat[index])
1528 	{
1529 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1530 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1531 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1532 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1533 		{
1534 			buffer += 2 * x;
1535 			Int value = *Pointer<Int>(buffer);
1536 
1537 			Int channelMask;
1538 			switch(state.colorFormat[index])
1539 			{
1540 			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1541 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[bgraWriteMask & 0xF][0]));
1542 				break;
1543 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1544 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4bgraQ[bgraWriteMask & 0xF][0]));
1545 				break;
1546 			case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1547 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[bgraWriteMask & 0xF][0]));
1548 				break;
1549 			case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1550 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4abgrQ[bgraWriteMask & 0xF][0]));
1551 				break;
1552 			default:
1553 				UNREACHABLE("Format: %s", vk::Stringify(state.colorFormat[index]).c_str());
1554 			}
1555 
1556 			Int c01 = Extract(As<Int2>(current.x), 0);
1557 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1558 			if(bgraWriteMask != 0x0000000F)
1559 			{
1560 				mask01 &= channelMask;
1561 			}
1562 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1563 
1564 			buffer += pitchB;
1565 			value = *Pointer<Int>(buffer);
1566 
1567 			Int c23 = Extract(As<Int2>(current.x), 1);
1568 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1569 			if(bgraWriteMask != 0x0000000F)
1570 			{
1571 				mask23 &= channelMask;
1572 			}
1573 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1574 		}
1575 		break;
1576 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1577 		{
1578 			buffer += 2 * x;
1579 			Int value = *Pointer<Int>(buffer);
1580 
1581 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[bgraWriteMask & 0xF][0]));
1582 
1583 			Int c01 = Extract(As<Int2>(current.x), 0);
1584 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1585 			if(bgraWriteMask != 0x0000000F)
1586 			{
1587 				mask01 &= channelMask;
1588 			}
1589 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1590 
1591 			buffer += pitchB;
1592 			value = *Pointer<Int>(buffer);
1593 
1594 			Int c23 = Extract(As<Int2>(current.x), 1);
1595 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1596 			if(bgraWriteMask != 0x0000000F)
1597 			{
1598 				mask23 &= channelMask;
1599 			}
1600 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1601 		}
1602 		break;
1603 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1604 		{
1605 			buffer += 2 * x;
1606 			Int value = *Pointer<Int>(buffer);
1607 
1608 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[bgraWriteMask & 0xF][0]));
1609 
1610 			Int c01 = Extract(As<Int2>(current.x), 0);
1611 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1612 			if(bgraWriteMask != 0x0000000F)
1613 			{
1614 				mask01 &= channelMask;
1615 			}
1616 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1617 
1618 			buffer += pitchB;
1619 			value = *Pointer<Int>(buffer);
1620 
1621 			Int c23 = Extract(As<Int2>(current.x), 1);
1622 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1623 			if(bgraWriteMask != 0x0000000F)
1624 			{
1625 				mask23 &= channelMask;
1626 			}
1627 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1628 		}
1629 		break;
1630 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1631 		{
1632 			buffer += 2 * x;
1633 			Int value = *Pointer<Int>(buffer);
1634 
1635 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[bgraWriteMask & 0xF][0]));
1636 
1637 			Int c01 = Extract(As<Int2>(current.x), 0);
1638 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1639 			if(bgraWriteMask != 0x0000000F)
1640 			{
1641 				mask01 &= channelMask;
1642 			}
1643 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1644 
1645 			buffer += pitchB;
1646 			value = *Pointer<Int>(buffer);
1647 
1648 			Int c23 = Extract(As<Int2>(current.x), 1);
1649 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1650 			if(bgraWriteMask != 0x0000000F)
1651 			{
1652 				mask23 &= channelMask;
1653 			}
1654 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1655 		}
1656 		break;
1657 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1658 		{
1659 			buffer += 2 * x;
1660 			Int value = *Pointer<Int>(buffer);
1661 
1662 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[bgraWriteMask & 0x7][0]));
1663 
1664 			Int c01 = Extract(As<Int2>(current.x), 0);
1665 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1666 			if((bgraWriteMask & 0x00000007) != 0x00000007)
1667 			{
1668 				mask01 &= channelMask;
1669 			}
1670 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1671 
1672 			buffer += pitchB;
1673 			value = *Pointer<Int>(buffer);
1674 
1675 			Int c23 = Extract(As<Int2>(current.x), 1);
1676 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1677 			if((bgraWriteMask & 0x00000007) != 0x00000007)
1678 			{
1679 				mask23 &= channelMask;
1680 			}
1681 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1682 		}
1683 		break;
1684 	case VK_FORMAT_B8G8R8A8_UNORM:
1685 	case VK_FORMAT_B8G8R8A8_SRGB:
1686 		{
1687 			buffer += x * 4;
1688 			Short4 value = *Pointer<Short4>(buffer);
1689 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[bgraWriteMask][0]));
1690 
1691 			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1692 			if(bgraWriteMask != 0x0000000F)
1693 			{
1694 				mask01 &= channelMask;
1695 			}
1696 			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1697 
1698 			buffer += pitchB;
1699 			value = *Pointer<Short4>(buffer);
1700 
1701 			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1702 			if(bgraWriteMask != 0x0000000F)
1703 			{
1704 				mask23 &= channelMask;
1705 			}
1706 			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1707 		}
1708 		break;
1709 	case VK_FORMAT_R8G8B8A8_UNORM:
1710 	case VK_FORMAT_R8G8B8A8_SRGB:
1711 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1712 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1713 		{
1714 			buffer += x * 4;
1715 			Short4 value = *Pointer<Short4>(buffer);
1716 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
1717 
1718 			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1719 			if(rgbaWriteMask != 0x0000000F)
1720 			{
1721 				mask01 &= channelMask;
1722 			}
1723 			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1724 
1725 			buffer += pitchB;
1726 			value = *Pointer<Short4>(buffer);
1727 
1728 			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1729 			if(rgbaWriteMask != 0x0000000F)
1730 			{
1731 				mask23 &= channelMask;
1732 			}
1733 			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1734 		}
1735 		break;
1736 	case VK_FORMAT_R8G8_UNORM:
1737 		if((rgbaWriteMask & 0x00000003) != 0x0)
1738 		{
1739 			buffer += 2 * x;
1740 			Int2 value;
1741 			value = Insert(value, *Pointer<Int>(buffer), 0);
1742 			value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);
1743 
1744 			Int2 packedCol = As<Int2>(current.x);
1745 
1746 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1747 			if((rgbaWriteMask & 0x3) != 0x3)
1748 			{
1749 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1750 				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1751 				mergedMask &= rgbaMask;
1752 			}
1753 
1754 			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1755 
1756 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1757 			*Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
1758 		}
1759 		break;
1760 	case VK_FORMAT_R8_UNORM:
1761 		if(rgbaWriteMask & 0x00000001)
1762 		{
1763 			buffer += 1 * x;
1764 			Short4 value;
1765 			value = Insert(value, *Pointer<Short>(buffer), 0);
1766 			value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);
1767 
1768 			current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1769 			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1770 			current.x |= value;
1771 
1772 			*Pointer<Short>(buffer) = Extract(current.x, 0);
1773 			*Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
1774 		}
1775 		break;
1776 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1777 		rgbaWriteMask = bgraWriteMask;
1778 		// [[fallthrough]]
1779 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1780 		{
1781 			buffer += 4 * x;
1782 
1783 			Int2 value = *Pointer<Int2>(buffer, 16);
1784 			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1785 			if(rgbaWriteMask != 0xF)
1786 			{
1787 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1788 			}
1789 			*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
1790 
1791 			buffer += pitchB;
1792 
1793 			value = *Pointer<Int2>(buffer, 16);
1794 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1795 			if(rgbaWriteMask != 0xF)
1796 			{
1797 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1798 			}
1799 			*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
1800 		}
1801 		break;
1802 	default:
1803 		UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1804 	}
1805 }
1806 
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1807 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1808 {
1809 	bool inverse = (modifier == OneMinus);
1810 
1811 	if(format.isUnsignedNormalized())
1812 	{
1813 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU[component]))
1814 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU[component]));
1815 	}
1816 	else if(format.isSignedNormalized())
1817 	{
1818 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS[component]))
1819 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS[component]));
1820 	}
1821 	else  // Floating-point format
1822 	{
1823 		ASSERT(format.isFloatFormat());
1824 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF[component]))
1825 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF[component]));
1826 	}
1827 }
1828 
blendFactorRGB(Vector4f & blendFactor,const Vector4f & sourceColor,const Vector4f & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1829 void PixelRoutine::blendFactorRGB(Vector4f &blendFactor, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1830 {
1831 	switch(colorBlendFactor)
1832 	{
1833 	case VK_BLEND_FACTOR_ZERO:
1834 		blendFactor.x = 0.0f;
1835 		blendFactor.y = 0.0f;
1836 		blendFactor.z = 0.0f;
1837 		break;
1838 	case VK_BLEND_FACTOR_ONE:
1839 		blendFactor.x = 1.0f;
1840 		blendFactor.y = 1.0f;
1841 		blendFactor.z = 1.0f;
1842 		break;
1843 	case VK_BLEND_FACTOR_SRC_COLOR:
1844 		blendFactor.x = sourceColor.x;
1845 		blendFactor.y = sourceColor.y;
1846 		blendFactor.z = sourceColor.z;
1847 		break;
1848 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1849 		blendFactor.x = 1.0f - sourceColor.x;
1850 		blendFactor.y = 1.0f - sourceColor.y;
1851 		blendFactor.z = 1.0f - sourceColor.z;
1852 		break;
1853 	case VK_BLEND_FACTOR_DST_COLOR:
1854 		blendFactor.x = destColor.x;
1855 		blendFactor.y = destColor.y;
1856 		blendFactor.z = destColor.z;
1857 		break;
1858 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1859 		blendFactor.x = 1.0f - destColor.x;
1860 		blendFactor.y = 1.0f - destColor.y;
1861 		blendFactor.z = 1.0f - destColor.z;
1862 		break;
1863 	case VK_BLEND_FACTOR_SRC_ALPHA:
1864 		blendFactor.x = sourceColor.w;
1865 		blendFactor.y = sourceColor.w;
1866 		blendFactor.z = sourceColor.w;
1867 		break;
1868 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1869 		blendFactor.x = 1.0f - sourceColor.w;
1870 		blendFactor.y = 1.0f - sourceColor.w;
1871 		blendFactor.z = 1.0f - sourceColor.w;
1872 		break;
1873 	case VK_BLEND_FACTOR_DST_ALPHA:
1874 		blendFactor.x = destColor.w;
1875 		blendFactor.y = destColor.w;
1876 		blendFactor.z = destColor.w;
1877 		break;
1878 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1879 		blendFactor.x = 1.0f - destColor.w;
1880 		blendFactor.y = 1.0f - destColor.w;
1881 		blendFactor.z = 1.0f - destColor.w;
1882 		break;
1883 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1884 		blendFactor.x = 1.0f - destColor.w;
1885 		blendFactor.x = Min(blendFactor.x, sourceColor.w);
1886 		blendFactor.y = blendFactor.x;
1887 		blendFactor.z = blendFactor.x;
1888 		break;
1889 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1890 		blendFactor.x = blendConstant(format, 0);
1891 		blendFactor.y = blendConstant(format, 1);
1892 		blendFactor.z = blendConstant(format, 2);
1893 		break;
1894 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1895 		blendFactor.x = blendConstant(format, 3);
1896 		blendFactor.y = blendConstant(format, 3);
1897 		blendFactor.z = blendConstant(format, 3);
1898 		break;
1899 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1900 		blendFactor.x = blendConstant(format, 0, OneMinus);
1901 		blendFactor.y = blendConstant(format, 1, OneMinus);
1902 		blendFactor.z = blendConstant(format, 2, OneMinus);
1903 		break;
1904 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1905 		blendFactor.x = blendConstant(format, 3, OneMinus);
1906 		blendFactor.y = blendConstant(format, 3, OneMinus);
1907 		blendFactor.z = blendConstant(format, 3, OneMinus);
1908 		break;
1909 
1910 	default:
1911 		UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1912 	}
1913 
1914 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1915 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1916 	//  operations. If the color attachment is floating-point, no clamping occurs."
1917 	if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1918 	{
1919 		if(format.isUnsignedNormalized())
1920 		{
1921 			blendFactor.x = Min(Max(blendFactor.x, 0.0f), 1.0f);
1922 			blendFactor.y = Min(Max(blendFactor.y, 0.0f), 1.0f);
1923 			blendFactor.z = Min(Max(blendFactor.z, 0.0f), 1.0f);
1924 		}
1925 		else if(format.isSignedNormalized())
1926 		{
1927 			blendFactor.x = Min(Max(blendFactor.x, -1.0f), 1.0f);
1928 			blendFactor.y = Min(Max(blendFactor.y, -1.0f), 1.0f);
1929 			blendFactor.z = Min(Max(blendFactor.z, -1.0f), 1.0f);
1930 		}
1931 	}
1932 }
1933 
blendFactorAlpha(Float4 & blendFactorAlpha,const Float4 & sourceAlpha,const Float4 & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1934 void PixelRoutine::blendFactorAlpha(Float4 &blendFactorAlpha, const Float4 &sourceAlpha, const Float4 &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1935 {
1936 	switch(alphaBlendFactor)
1937 	{
1938 	case VK_BLEND_FACTOR_ZERO:
1939 		blendFactorAlpha = 0.0f;
1940 		break;
1941 	case VK_BLEND_FACTOR_ONE:
1942 		blendFactorAlpha = 1.0f;
1943 		break;
1944 	case VK_BLEND_FACTOR_SRC_COLOR:
1945 		blendFactorAlpha = sourceAlpha;
1946 		break;
1947 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1948 		blendFactorAlpha = 1.0f - sourceAlpha;
1949 		break;
1950 	case VK_BLEND_FACTOR_DST_COLOR:
1951 		blendFactorAlpha = destAlpha;
1952 		break;
1953 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1954 		blendFactorAlpha = 1.0f - destAlpha;
1955 		break;
1956 	case VK_BLEND_FACTOR_SRC_ALPHA:
1957 		blendFactorAlpha = sourceAlpha;
1958 		break;
1959 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1960 		blendFactorAlpha = 1.0f - sourceAlpha;
1961 		break;
1962 	case VK_BLEND_FACTOR_DST_ALPHA:
1963 		blendFactorAlpha = destAlpha;
1964 		break;
1965 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1966 		blendFactorAlpha = 1.0f - destAlpha;
1967 		break;
1968 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1969 		blendFactorAlpha = 1.0f;
1970 		break;
1971 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1972 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1973 		blendFactorAlpha = blendConstant(format, 3);
1974 		break;
1975 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1976 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1977 		blendFactorAlpha = blendConstant(format, 3, OneMinus);
1978 		break;
1979 	default:
1980 		UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1981 	}
1982 
1983 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1984 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1985 	//  operations. If the color attachment is floating-point, no clamping occurs."
1986 	if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1987 	{
1988 		if(format.isUnsignedNormalized())
1989 		{
1990 			blendFactorAlpha = Min(Max(blendFactorAlpha, 0.0f), 1.0f);
1991 		}
1992 		else if(format.isSignedNormalized())
1993 		{
1994 			blendFactorAlpha = Min(Max(blendFactorAlpha, -1.0f), 1.0f);
1995 		}
1996 	}
1997 }
1998 
blendOpOverlay(Float4 & src,Float4 & dst)1999 Float4 PixelRoutine::blendOpOverlay(Float4 &src, Float4 &dst)
2000 {
2001 	Int4 largeDst = CmpGT(dst, 0.5f);
2002 	return As<Float4>(
2003 	    (~largeDst & As<Int4>(2.0f * src * dst)) |
2004 	    (largeDst & As<Int4>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
2005 }
2006 
blendOpColorDodge(Float4 & src,Float4 & dst)2007 Float4 PixelRoutine::blendOpColorDodge(Float4 &src, Float4 &dst)
2008 {
2009 	Int4 srcBelowOne = CmpLT(src, 1.0f);
2010 	Int4 positiveDst = CmpGT(dst, 0.0f);
2011 	return As<Float4>(positiveDst & ((~srcBelowOne & As<Int4>(Float4(1.0f))) |
2012 	                                 (srcBelowOne & As<Int4>(Min(1.0f, (dst / (1.0f - src)))))));
2013 }
2014 
blendOpColorBurn(Float4 & src,Float4 & dst)2015 Float4 PixelRoutine::blendOpColorBurn(Float4 &src, Float4 &dst)
2016 {
2017 	Int4 dstBelowOne = CmpLT(dst, 1.0f);
2018 	Int4 positiveSrc = CmpGT(src, 0.0f);
2019 	return As<Float4>(
2020 	    (~dstBelowOne & As<Int4>(Float4(1.0f))) |
2021 	    (dstBelowOne & positiveSrc & As<Int4>(1.0f - Min(1.0f, (1.0f - dst) / src))));
2022 }
2023 
blendOpHardlight(Float4 & src,Float4 & dst)2024 Float4 PixelRoutine::blendOpHardlight(Float4 &src, Float4 &dst)
2025 {
2026 	Int4 largeSrc = CmpGT(src, 0.5f);
2027 	return As<Float4>(
2028 	    (~largeSrc & As<Int4>(2.0f * src * dst)) |
2029 	    (largeSrc & As<Int4>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
2030 }
2031 
blendOpSoftlight(Float4 & src,Float4 & dst)2032 Float4 PixelRoutine::blendOpSoftlight(Float4 &src, Float4 &dst)
2033 {
2034 	Int4 largeSrc = CmpGT(src, 0.5f);
2035 	Int4 largeDst = CmpGT(dst, 0.25f);
2036 
2037 	return As<Float4>(
2038 	    (~largeSrc & As<Int4>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
2039 	    (largeSrc & ((~largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
2040 	                 (largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
2041 }
2042 
maxRGB(Vector4f & c)2043 Float4 PixelRoutine::maxRGB(Vector4f &c)
2044 {
2045 	return Max(Max(c.x, c.y), c.z);
2046 }
2047 
minRGB(Vector4f & c)2048 Float4 PixelRoutine::minRGB(Vector4f &c)
2049 {
2050 	return Min(Min(c.x, c.y), c.z);
2051 }
2052 
setLumSat(Vector4f & cbase,Vector4f & csat,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2053 void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2054 {
2055 	Float4 minbase = minRGB(cbase);
2056 	Float4 sbase = maxRGB(cbase) - minbase;
2057 	Float4 ssat = maxRGB(csat) - minRGB(csat);
2058 	Int4 isNonZero = CmpGT(sbase, 0.0f);
2059 	Vector4f color;
2060 	color.x = As<Float4>(isNonZero & As<Int4>((cbase.x - minbase) * ssat / sbase));
2061 	color.y = As<Float4>(isNonZero & As<Int4>((cbase.y - minbase) * ssat / sbase));
2062 	color.z = As<Float4>(isNonZero & As<Int4>((cbase.z - minbase) * ssat / sbase));
2063 	setLum(color, clum, x, y, z);
2064 }
2065 
lumRGB(Vector4f & c)2066 Float4 PixelRoutine::lumRGB(Vector4f &c)
2067 {
2068 	return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
2069 }
2070 
computeLum(Float4 & color,Float4 & lum,Float4 & mincol,Float4 & maxcol,Int4 & negative,Int4 & aboveOne)2071 Float4 PixelRoutine::computeLum(Float4 &color, Float4 &lum, Float4 &mincol, Float4 &maxcol, Int4 &negative, Int4 &aboveOne)
2072 {
2073 	return As<Float4>(
2074 	    (negative & As<Int4>(lum + ((color - lum) * lum) / (lum - mincol))) |
2075 	    (~negative & ((aboveOne & As<Int4>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
2076 	                  (~aboveOne & As<Int4>(color)))));
2077 }
2078 
setLum(Vector4f & cbase,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2079 void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2080 {
2081 	Float4 lbase = lumRGB(cbase);
2082 	Float4 llum = lumRGB(clum);
2083 	Float4 ldiff = llum - lbase;
2084 
2085 	Vector4f color;
2086 	color.x = cbase.x + ldiff;
2087 	color.y = cbase.y + ldiff;
2088 	color.z = cbase.z + ldiff;
2089 
2090 	Float4 lum = lumRGB(color);
2091 	Float4 mincol = minRGB(color);
2092 	Float4 maxcol = maxRGB(color);
2093 
2094 	Int4 negative = CmpLT(mincol, 0.0f);
2095 	Int4 aboveOne = CmpGT(maxcol, 1.0f);
2096 
2097 	x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
2098 	y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
2099 	z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
2100 }
2101 
premultiply(Vector4f & c)2102 void PixelRoutine::premultiply(Vector4f &c)
2103 {
2104 	Int4 nonZeroAlpha = CmpNEQ(c.w, 0.0f);
2105 	c.x = As<Float4>(nonZeroAlpha & As<Int4>(c.x / c.w));
2106 	c.y = As<Float4>(nonZeroAlpha & As<Int4>(c.y / c.w));
2107 	c.z = As<Float4>(nonZeroAlpha & As<Int4>(c.z / c.w));
2108 }
2109 
computeAdvancedBlendMode(int index,const Vector4f & src,const Vector4f & dst,const Vector4f & srcFactor,const Vector4f & dstFactor)2110 Vector4f PixelRoutine::computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor)
2111 {
2112 	Vector4f srcColor = src;
2113 	srcColor.x *= srcFactor.x;
2114 	srcColor.y *= srcFactor.y;
2115 	srcColor.z *= srcFactor.z;
2116 	srcColor.w *= srcFactor.w;
2117 
2118 	Vector4f dstColor = dst;
2119 	dstColor.x *= dstFactor.x;
2120 	dstColor.y *= dstFactor.y;
2121 	dstColor.z *= dstFactor.z;
2122 	dstColor.w *= dstFactor.w;
2123 
2124 	premultiply(srcColor);
2125 	premultiply(dstColor);
2126 
2127 	Vector4f blendedColor;
2128 
2129 	switch(state.blendState[index].blendOperation)
2130 	{
2131 	case VK_BLEND_OP_MULTIPLY_EXT:
2132 		blendedColor.x = (srcColor.x * dstColor.x);
2133 		blendedColor.y = (srcColor.y * dstColor.y);
2134 		blendedColor.z = (srcColor.z * dstColor.z);
2135 		break;
2136 	case VK_BLEND_OP_SCREEN_EXT:
2137 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x);
2138 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y);
2139 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z);
2140 		break;
2141 	case VK_BLEND_OP_OVERLAY_EXT:
2142 		blendedColor.x = blendOpOverlay(srcColor.x, dstColor.x);
2143 		blendedColor.y = blendOpOverlay(srcColor.y, dstColor.y);
2144 		blendedColor.z = blendOpOverlay(srcColor.z, dstColor.z);
2145 		break;
2146 	case VK_BLEND_OP_DARKEN_EXT:
2147 		blendedColor.x = Min(srcColor.x, dstColor.x);
2148 		blendedColor.y = Min(srcColor.y, dstColor.y);
2149 		blendedColor.z = Min(srcColor.z, dstColor.z);
2150 		break;
2151 	case VK_BLEND_OP_LIGHTEN_EXT:
2152 		blendedColor.x = Max(srcColor.x, dstColor.x);
2153 		blendedColor.y = Max(srcColor.y, dstColor.y);
2154 		blendedColor.z = Max(srcColor.z, dstColor.z);
2155 		break;
2156 	case VK_BLEND_OP_COLORDODGE_EXT:
2157 		blendedColor.x = blendOpColorDodge(srcColor.x, dstColor.x);
2158 		blendedColor.y = blendOpColorDodge(srcColor.y, dstColor.y);
2159 		blendedColor.z = blendOpColorDodge(srcColor.z, dstColor.z);
2160 		break;
2161 	case VK_BLEND_OP_COLORBURN_EXT:
2162 		blendedColor.x = blendOpColorBurn(srcColor.x, dstColor.x);
2163 		blendedColor.y = blendOpColorBurn(srcColor.y, dstColor.y);
2164 		blendedColor.z = blendOpColorBurn(srcColor.z, dstColor.z);
2165 		break;
2166 	case VK_BLEND_OP_HARDLIGHT_EXT:
2167 		blendedColor.x = blendOpHardlight(srcColor.x, dstColor.x);
2168 		blendedColor.y = blendOpHardlight(srcColor.y, dstColor.y);
2169 		blendedColor.z = blendOpHardlight(srcColor.z, dstColor.z);
2170 		break;
2171 	case VK_BLEND_OP_SOFTLIGHT_EXT:
2172 		blendedColor.x = blendOpSoftlight(srcColor.x, dstColor.x);
2173 		blendedColor.y = blendOpSoftlight(srcColor.y, dstColor.y);
2174 		blendedColor.z = blendOpSoftlight(srcColor.z, dstColor.z);
2175 		break;
2176 	case VK_BLEND_OP_DIFFERENCE_EXT:
2177 		blendedColor.x = Abs(srcColor.x - dstColor.x);
2178 		blendedColor.y = Abs(srcColor.y - dstColor.y);
2179 		blendedColor.z = Abs(srcColor.z - dstColor.z);
2180 		break;
2181 	case VK_BLEND_OP_EXCLUSION_EXT:
2182 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x * 2.0f);
2183 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y * 2.0f);
2184 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z * 2.0f);
2185 		break;
2186 	case VK_BLEND_OP_HSL_HUE_EXT:
2187 		setLumSat(srcColor, dstColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
2188 		break;
2189 	case VK_BLEND_OP_HSL_SATURATION_EXT:
2190 		setLumSat(dstColor, srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
2191 		break;
2192 	case VK_BLEND_OP_HSL_COLOR_EXT:
2193 		setLum(srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
2194 		break;
2195 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2196 		setLum(dstColor, srcColor, blendedColor.x, blendedColor.y, blendedColor.z);
2197 		break;
2198 	default:
2199 		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
2200 		break;
2201 	}
2202 
2203 	Float4 p = srcColor.w * dstColor.w;
2204 	blendedColor.x *= p;
2205 	blendedColor.y *= p;
2206 	blendedColor.z *= p;
2207 
2208 	p = srcColor.w * (1.0f - dstColor.w);
2209 	blendedColor.x += srcColor.x * p;
2210 	blendedColor.y += srcColor.y * p;
2211 	blendedColor.z += srcColor.z * p;
2212 
2213 	p = dstColor.w * (1.0f - srcColor.w);
2214 	blendedColor.x += dstColor.x * p;
2215 	blendedColor.y += dstColor.y * p;
2216 	blendedColor.z += dstColor.z * p;
2217 
2218 	return blendedColor;
2219 }
2220 
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)2221 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
2222 {
2223 	switch(blendFactor)
2224 	{
2225 	case VK_BLEND_FACTOR_ZERO:
2226 	case VK_BLEND_FACTOR_ONE:
2227 		return false;
2228 	case VK_BLEND_FACTOR_SRC_COLOR:
2229 	case VK_BLEND_FACTOR_SRC_ALPHA:
2230 		// Source values have been clamped after fragment shader execution if the attachment format is normalized.
2231 		return false;
2232 	case VK_BLEND_FACTOR_DST_COLOR:
2233 	case VK_BLEND_FACTOR_DST_ALPHA:
2234 		// Dest values have a valid range due to being read from the attachment.
2235 		return false;
2236 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
2237 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
2238 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
2239 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
2240 		// For signed formats, negative values cause the result to exceed 1.0.
2241 		return format.isSignedNormalized();
2242 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
2243 		// min(As, 1 - Ad)
2244 		return false;
2245 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
2246 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
2247 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
2248 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
2249 		return false;
2250 
2251 	default:
2252 		UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
2253 		return false;
2254 	}
2255 }
2256 
alphaBlend(int index,const Pointer<Byte> & cBuffer,const Vector4f & sourceColor,const Int & x)2257 Vector4f PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x)
2258 {
2259 	if(!state.blendState[index].alphaBlendEnable)
2260 	{
2261 		return sourceColor;
2262 	}
2263 
2264 	vk::Format format = state.colorFormat[index];
2265 	ASSERT(format.supportsColorAttachmentBlend());
2266 
2267 	Pointer<Byte> buffer = cBuffer;
2268 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2269 
2270 	// destColor holds four texel color values.
2271 	// Note: Despite the type being Vector4f, the colors may be stored as
2272 	// integers. Half-floats are stored as full 32-bit floats.
2273 	// Non-float and non-fixed point formats are not alpha blended.
2274 	Vector4f destColor;
2275 
2276 	switch(format)
2277 	{
2278 	case VK_FORMAT_R32_SINT:
2279 	case VK_FORMAT_R32_UINT:
2280 	case VK_FORMAT_R32_SFLOAT:
2281 		// FIXME: movlps
2282 		buffer += 4 * x;
2283 		destColor.x.x = *Pointer<Float>(buffer + 0);
2284 		destColor.x.y = *Pointer<Float>(buffer + 4);
2285 		buffer += pitchB;
2286 		// FIXME: movhps
2287 		destColor.x.z = *Pointer<Float>(buffer + 0);
2288 		destColor.x.w = *Pointer<Float>(buffer + 4);
2289 		destColor.y = destColor.z = destColor.w = 1.0f;
2290 		break;
2291 	case VK_FORMAT_R32G32_SINT:
2292 	case VK_FORMAT_R32G32_UINT:
2293 	case VK_FORMAT_R32G32_SFLOAT:
2294 		buffer += 8 * x;
2295 		destColor.x = *Pointer<Float4>(buffer, 16);
2296 		buffer += pitchB;
2297 		destColor.y = *Pointer<Float4>(buffer, 16);
2298 		destColor.z = destColor.x;
2299 		destColor.x = ShuffleLowHigh(destColor.x, destColor.y, 0x0202);
2300 		destColor.z = ShuffleLowHigh(destColor.z, destColor.y, 0x1313);
2301 		destColor.y = destColor.z;
2302 		destColor.z = destColor.w = 1.0f;
2303 		break;
2304 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2305 	case VK_FORMAT_R32G32B32A32_SINT:
2306 	case VK_FORMAT_R32G32B32A32_UINT:
2307 		buffer += 16 * x;
2308 		destColor.x = *Pointer<Float4>(buffer + 0, 16);
2309 		destColor.y = *Pointer<Float4>(buffer + 16, 16);
2310 		buffer += pitchB;
2311 		destColor.z = *Pointer<Float4>(buffer + 0, 16);
2312 		destColor.w = *Pointer<Float4>(buffer + 16, 16);
2313 		transpose4x4(destColor.x, destColor.y, destColor.z, destColor.w);
2314 		break;
2315 	case VK_FORMAT_R16_UNORM:
2316 		buffer += 2 * x;
2317 		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2318 		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
2319 		buffer += pitchB;
2320 		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2321 		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
2322 		destColor.x *= (1.0f / 0xFFFF);
2323 		destColor.y = destColor.z = destColor.w = 1.0f;
2324 		break;
2325 	case VK_FORMAT_R16_SFLOAT:
2326 		buffer += 2 * x;
2327 		destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2328 		destColor.x.y = Float(*Pointer<Half>(buffer + 2));
2329 		buffer += pitchB;
2330 		destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2331 		destColor.x.w = Float(*Pointer<Half>(buffer + 2));
2332 		destColor.y = destColor.z = destColor.w = 1.0f;
2333 		break;
2334 	case VK_FORMAT_R16G16_UNORM:
2335 		buffer += 4 * x;
2336 		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2337 		destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
2338 		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
2339 		destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
2340 		buffer += pitchB;
2341 		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2342 		destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
2343 		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
2344 		destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
2345 		destColor.x *= (1.0f / 0xFFFF);
2346 		destColor.y *= (1.0f / 0xFFFF);
2347 		destColor.z = destColor.w = 1.0f;
2348 		break;
2349 	case VK_FORMAT_R16G16_SFLOAT:
2350 		buffer += 4 * x;
2351 		destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2352 		destColor.y.x = Float(*Pointer<Half>(buffer + 2));
2353 		destColor.x.y = Float(*Pointer<Half>(buffer + 4));
2354 		destColor.y.y = Float(*Pointer<Half>(buffer + 6));
2355 		buffer += pitchB;
2356 		destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2357 		destColor.y.z = Float(*Pointer<Half>(buffer + 2));
2358 		destColor.x.w = Float(*Pointer<Half>(buffer + 4));
2359 		destColor.y.w = Float(*Pointer<Half>(buffer + 6));
2360 		destColor.z = destColor.w = 1.0f;
2361 		break;
2362 	case VK_FORMAT_R16G16B16A16_UNORM:
2363 		buffer += 8 * x;
2364 		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2365 		destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2366 		destColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2367 		destColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2368 		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2369 		destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2370 		destColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2371 		destColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2372 		buffer += pitchB;
2373 		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2374 		destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2375 		destColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2376 		destColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2377 		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2378 		destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2379 		destColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2380 		destColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2381 		destColor.x *= (1.0f / 0xFFFF);
2382 		destColor.y *= (1.0f / 0xFFFF);
2383 		destColor.z *= (1.0f / 0xFFFF);
2384 		destColor.w *= (1.0f / 0xFFFF);
2385 		break;
2386 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2387 		buffer += 8 * x;
2388 		destColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
2389 		destColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
2390 		destColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
2391 		destColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
2392 		destColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
2393 		destColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
2394 		destColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
2395 		destColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
2396 		buffer += pitchB;
2397 		destColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
2398 		destColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
2399 		destColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
2400 		destColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
2401 		destColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
2402 		destColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
2403 		destColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
2404 		destColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
2405 		break;
2406 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2407 		buffer += 4 * x;
2408 		destColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2409 		destColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2410 		buffer += pitchB;
2411 		destColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2412 		destColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2413 		transpose4x3(destColor.x, destColor.y, destColor.z, destColor.w);
2414 		destColor.w = 1.0f;
2415 		break;
2416 	default:
2417 		{
2418 			// Attempt to read an integer based format and convert it to float
2419 			Vector4s color;
2420 			readPixel(index, cBuffer, x, color);
2421 			destColor.x = convertFloat32(As<UShort4>(color.x));
2422 			destColor.y = convertFloat32(As<UShort4>(color.y));
2423 			destColor.z = convertFloat32(As<UShort4>(color.z));
2424 			destColor.w = convertFloat32(As<UShort4>(color.w));
2425 		}
2426 		break;
2427 	}
2428 
2429 	Vector4f sourceFactor;
2430 	Vector4f destFactor;
2431 
2432 	blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
2433 	blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
2434 	blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
2435 	blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
2436 
2437 	Vector4f blendedColor;
2438 
2439 	switch(state.blendState[index].blendOperation)
2440 	{
2441 	case VK_BLEND_OP_ADD:
2442 		blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
2443 		blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
2444 		blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
2445 		break;
2446 	case VK_BLEND_OP_SUBTRACT:
2447 		blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
2448 		blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
2449 		blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
2450 		break;
2451 	case VK_BLEND_OP_REVERSE_SUBTRACT:
2452 		blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
2453 		blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
2454 		blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
2455 		break;
2456 	case VK_BLEND_OP_MIN:
2457 		blendedColor.x = Min(sourceColor.x, destColor.x);
2458 		blendedColor.y = Min(sourceColor.y, destColor.y);
2459 		blendedColor.z = Min(sourceColor.z, destColor.z);
2460 		break;
2461 	case VK_BLEND_OP_MAX:
2462 		blendedColor.x = Max(sourceColor.x, destColor.x);
2463 		blendedColor.y = Max(sourceColor.y, destColor.y);
2464 		blendedColor.z = Max(sourceColor.z, destColor.z);
2465 		break;
2466 	case VK_BLEND_OP_SRC_EXT:
2467 		blendedColor.x = sourceColor.x;
2468 		blendedColor.y = sourceColor.y;
2469 		blendedColor.z = sourceColor.z;
2470 		break;
2471 	case VK_BLEND_OP_DST_EXT:
2472 		blendedColor.x = destColor.x;
2473 		blendedColor.y = destColor.y;
2474 		blendedColor.z = destColor.z;
2475 		break;
2476 	case VK_BLEND_OP_ZERO_EXT:
2477 		blendedColor.x = 0.0f;
2478 		blendedColor.y = 0.0f;
2479 		blendedColor.z = 0.0f;
2480 		break;
2481 	case VK_BLEND_OP_MULTIPLY_EXT:
2482 	case VK_BLEND_OP_SCREEN_EXT:
2483 	case VK_BLEND_OP_OVERLAY_EXT:
2484 	case VK_BLEND_OP_DARKEN_EXT:
2485 	case VK_BLEND_OP_LIGHTEN_EXT:
2486 	case VK_BLEND_OP_COLORDODGE_EXT:
2487 	case VK_BLEND_OP_COLORBURN_EXT:
2488 	case VK_BLEND_OP_HARDLIGHT_EXT:
2489 	case VK_BLEND_OP_SOFTLIGHT_EXT:
2490 	case VK_BLEND_OP_DIFFERENCE_EXT:
2491 	case VK_BLEND_OP_EXCLUSION_EXT:
2492 	case VK_BLEND_OP_HSL_HUE_EXT:
2493 	case VK_BLEND_OP_HSL_SATURATION_EXT:
2494 	case VK_BLEND_OP_HSL_COLOR_EXT:
2495 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2496 		blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
2497 		break;
2498 	default:
2499 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
2500 	}
2501 
2502 	switch(state.blendState[index].blendOperationAlpha)
2503 	{
2504 	case VK_BLEND_OP_ADD:
2505 		blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
2506 		break;
2507 	case VK_BLEND_OP_SUBTRACT:
2508 		blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
2509 		break;
2510 	case VK_BLEND_OP_REVERSE_SUBTRACT:
2511 		blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
2512 		break;
2513 	case VK_BLEND_OP_MIN:
2514 		blendedColor.w = Min(sourceColor.w, destColor.w);
2515 		break;
2516 	case VK_BLEND_OP_MAX:
2517 		blendedColor.w = Max(sourceColor.w, destColor.w);
2518 		break;
2519 	case VK_BLEND_OP_SRC_EXT:
2520 		blendedColor.w = sourceColor.w;
2521 		break;
2522 	case VK_BLEND_OP_DST_EXT:
2523 		blendedColor.w = destColor.w;
2524 		break;
2525 	case VK_BLEND_OP_ZERO_EXT:
2526 		blendedColor.w = 0.0f;
2527 		break;
2528 	case VK_BLEND_OP_MULTIPLY_EXT:
2529 	case VK_BLEND_OP_SCREEN_EXT:
2530 	case VK_BLEND_OP_OVERLAY_EXT:
2531 	case VK_BLEND_OP_DARKEN_EXT:
2532 	case VK_BLEND_OP_LIGHTEN_EXT:
2533 	case VK_BLEND_OP_COLORDODGE_EXT:
2534 	case VK_BLEND_OP_COLORBURN_EXT:
2535 	case VK_BLEND_OP_HARDLIGHT_EXT:
2536 	case VK_BLEND_OP_SOFTLIGHT_EXT:
2537 	case VK_BLEND_OP_DIFFERENCE_EXT:
2538 	case VK_BLEND_OP_EXCLUSION_EXT:
2539 	case VK_BLEND_OP_HSL_HUE_EXT:
2540 	case VK_BLEND_OP_HSL_SATURATION_EXT:
2541 	case VK_BLEND_OP_HSL_COLOR_EXT:
2542 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2543 		// All of the currently supported 'advanced blend modes' compute the alpha the same way.
2544 		blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
2545 		break;
2546 	default:
2547 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
2548 	}
2549 
2550 	return blendedColor;
2551 }
2552 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)2553 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
2554 {
2555 	vk::Format format = state.colorFormat[index];
2556 	switch(format)
2557 	{
2558 	case VK_FORMAT_R16G16B16A16_UNORM:
2559 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2560 		color.w = As<Float4>(RoundInt(color.w * 0xFFFF));
2561 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2562 		color.z = As<Float4>(RoundInt(color.z * 0xFFFF));
2563 		// [[fallthrough]]
2564 	case VK_FORMAT_R16G16_UNORM:
2565 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2566 		color.y = As<Float4>(RoundInt(color.y * 0xFFFF));
2567 		//[[fallthrough]]
2568 	case VK_FORMAT_R16_UNORM:
2569 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2570 		color.x = As<Float4>(RoundInt(color.x * 0xFFFF));
2571 		break;
2572 	default:
2573 		// TODO(b/204560089): Omit clamp if redundant
2574 		if(format.isUnsignedNormalized())
2575 		{
2576 			color.x = Min(Max(color.x, 0.0f), 1.0f);
2577 			color.y = Min(Max(color.y, 0.0f), 1.0f);
2578 			color.z = Min(Max(color.z, 0.0f), 1.0f);
2579 			color.w = Min(Max(color.w, 0.0f), 1.0f);
2580 		}
2581 		else if(format.isSignedNormalized())
2582 		{
2583 			color.x = Min(Max(color.x, -1.0f), 1.0f);
2584 			color.y = Min(Max(color.y, -1.0f), 1.0f);
2585 			color.z = Min(Max(color.z, -1.0f), 1.0f);
2586 			color.w = Min(Max(color.w, -1.0f), 1.0f);
2587 		}
2588 	}
2589 
2590 	switch(format)
2591 	{
2592 	case VK_FORMAT_R16_SFLOAT:
2593 	case VK_FORMAT_R32_SFLOAT:
2594 	case VK_FORMAT_R32_SINT:
2595 	case VK_FORMAT_R32_UINT:
2596 	case VK_FORMAT_R16_UNORM:
2597 	case VK_FORMAT_R16_SINT:
2598 	case VK_FORMAT_R16_UINT:
2599 	case VK_FORMAT_R8_SINT:
2600 	case VK_FORMAT_R8_UINT:
2601 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2602 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2603 		break;
2604 	case VK_FORMAT_R16G16_SFLOAT:
2605 	case VK_FORMAT_R32G32_SFLOAT:
2606 	case VK_FORMAT_R32G32_SINT:
2607 	case VK_FORMAT_R32G32_UINT:
2608 	case VK_FORMAT_R16G16_UNORM:
2609 	case VK_FORMAT_R16G16_SINT:
2610 	case VK_FORMAT_R16G16_UINT:
2611 	case VK_FORMAT_R8G8_SINT:
2612 	case VK_FORMAT_R8G8_UINT:
2613 		color.z = color.x;
2614 		color.x = UnpackLow(color.x, color.y);
2615 		color.z = UnpackHigh(color.z, color.y);
2616 		color.y = color.z;
2617 		break;
2618 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2619 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2620 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2621 	case VK_FORMAT_R32G32B32A32_SINT:
2622 	case VK_FORMAT_R32G32B32A32_UINT:
2623 	case VK_FORMAT_R16G16B16A16_UNORM:
2624 	case VK_FORMAT_R16G16B16A16_SINT:
2625 	case VK_FORMAT_R16G16B16A16_UINT:
2626 	case VK_FORMAT_R8G8B8A8_SINT:
2627 	case VK_FORMAT_R8G8B8A8_UINT:
2628 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2629 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2630 		transpose4x4(color.x, color.y, color.z, color.w);
2631 		break;
2632 	default:
2633 		UNSUPPORTED("VkFormat: %d", int(format));
2634 	}
2635 
2636 	int rgbaWriteMask = state.colorWriteActive(index);
2637 	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
2638 
2639 	Int xMask;  // Combination of all masks
2640 
2641 	if(state.depthTestActive)
2642 	{
2643 		xMask = zMask;
2644 	}
2645 	else
2646 	{
2647 		xMask = cMask;
2648 	}
2649 
2650 	if(state.stencilActive)
2651 	{
2652 		xMask &= sMask;
2653 	}
2654 
2655 	Pointer<Byte> buffer = cBuffer;
2656 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2657 	Float4 value;
2658 
2659 	switch(format)
2660 	{
2661 	case VK_FORMAT_R32_SFLOAT:
2662 	case VK_FORMAT_R32_SINT:
2663 	case VK_FORMAT_R32_UINT:
2664 		if(rgbaWriteMask & 0x00000001)
2665 		{
2666 			buffer += 4 * x;
2667 
2668 			// FIXME: movlps
2669 			value.x = *Pointer<Float>(buffer + 0);
2670 			value.y = *Pointer<Float>(buffer + 4);
2671 
2672 			buffer += pitchB;
2673 
2674 			// FIXME: movhps
2675 			value.z = *Pointer<Float>(buffer + 0);
2676 			value.w = *Pointer<Float>(buffer + 4);
2677 
2678 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2679 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2680 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2681 
2682 			// FIXME: movhps
2683 			*Pointer<Float>(buffer + 0) = color.x.z;
2684 			*Pointer<Float>(buffer + 4) = color.x.w;
2685 
2686 			buffer -= pitchB;
2687 
2688 			// FIXME: movlps
2689 			*Pointer<Float>(buffer + 0) = color.x.x;
2690 			*Pointer<Float>(buffer + 4) = color.x.y;
2691 		}
2692 		break;
2693 	case VK_FORMAT_R16_SFLOAT:
2694 		if(rgbaWriteMask & 0x00000001)
2695 		{
2696 			buffer += 2 * x;
2697 
2698 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2699 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2700 
2701 			buffer += pitchB;
2702 
2703 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2704 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2705 
2706 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2707 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2708 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2709 
2710 			*Pointer<Half>(buffer + 0) = Half(color.x.z);
2711 			*Pointer<Half>(buffer + 2) = Half(color.x.w);
2712 
2713 			buffer -= pitchB;
2714 
2715 			*Pointer<Half>(buffer + 0) = Half(color.x.x);
2716 			*Pointer<Half>(buffer + 2) = Half(color.x.y);
2717 		}
2718 		break;
2719 	case VK_FORMAT_R16_UNORM:
2720 	case VK_FORMAT_R16_SINT:
2721 	case VK_FORMAT_R16_UINT:
2722 		if(rgbaWriteMask & 0x00000001)
2723 		{
2724 			buffer += 2 * x;
2725 
2726 			UShort4 xyzw;
2727 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2728 
2729 			buffer += pitchB;
2730 
2731 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2732 			value = As<Float4>(Int4(xyzw));
2733 
2734 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2735 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2736 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2737 
2738 			Float component = color.x.z;
2739 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2740 			component = color.x.w;
2741 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2742 
2743 			buffer -= pitchB;
2744 
2745 			component = color.x.x;
2746 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2747 			component = color.x.y;
2748 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2749 		}
2750 		break;
2751 	case VK_FORMAT_R8_SINT:
2752 	case VK_FORMAT_R8_UINT:
2753 		if(rgbaWriteMask & 0x00000001)
2754 		{
2755 			buffer += x;
2756 
2757 			UInt xyzw, packedCol;
2758 
2759 			xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2760 			buffer += pitchB;
2761 			xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2762 
2763 			Short4 tmpCol = Short4(As<Int4>(color.x));
2764 			if(format == VK_FORMAT_R8_SINT)
2765 			{
2766 				tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2767 			}
2768 			else
2769 			{
2770 				tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2771 			}
2772 			packedCol = Extract(As<Int2>(tmpCol), 0);
2773 
2774 			packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2775 			            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2776 
2777 			*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2778 			buffer -= pitchB;
2779 			*Pointer<UShort>(buffer) = UShort(packedCol);
2780 		}
2781 		break;
2782 	case VK_FORMAT_R32G32_SFLOAT:
2783 	case VK_FORMAT_R32G32_SINT:
2784 	case VK_FORMAT_R32G32_UINT:
2785 		buffer += 8 * x;
2786 
2787 		value = *Pointer<Float4>(buffer);
2788 
2789 		if((rgbaWriteMask & 0x00000003) != 0x00000003)
2790 		{
2791 			Float4 masked = value;
2792 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2793 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2794 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2795 		}
2796 
2797 		color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2798 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2799 		color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2800 		*Pointer<Float4>(buffer) = color.x;
2801 
2802 		buffer += pitchB;
2803 
2804 		value = *Pointer<Float4>(buffer);
2805 
2806 		if((rgbaWriteMask & 0x00000003) != 0x00000003)
2807 		{
2808 			Float4 masked;
2809 
2810 			masked = value;
2811 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2812 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2813 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2814 		}
2815 
2816 		color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2817 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2818 		color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2819 		*Pointer<Float4>(buffer) = color.y;
2820 		break;
2821 	case VK_FORMAT_R16G16_SFLOAT:
2822 		if((rgbaWriteMask & 0x00000003) != 0x0)
2823 		{
2824 			buffer += 4 * x;
2825 
2826 			UInt2 rgbaMask;
2827 			UInt2 packedCol;
2828 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2829 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2830 
2831 			UShort4 value = *Pointer<UShort4>(buffer);
2832 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2833 			if((rgbaWriteMask & 0x3) != 0x3)
2834 			{
2835 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2836 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2837 				mergedMask &= rgbaMask;
2838 			}
2839 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2840 
2841 			buffer += pitchB;
2842 
2843 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2844 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2845 			value = *Pointer<UShort4>(buffer);
2846 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2847 			if((rgbaWriteMask & 0x3) != 0x3)
2848 			{
2849 				mergedMask &= rgbaMask;
2850 			}
2851 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2852 		}
2853 		break;
2854 	case VK_FORMAT_R16G16_UNORM:
2855 	case VK_FORMAT_R16G16_SINT:
2856 	case VK_FORMAT_R16G16_UINT:
2857 		if((rgbaWriteMask & 0x00000003) != 0x0)
2858 		{
2859 			buffer += 4 * x;
2860 
2861 			UInt2 rgbaMask;
2862 			UShort4 packedCol = UShort4(As<Int4>(color.x));
2863 			UShort4 value = *Pointer<UShort4>(buffer);
2864 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2865 			if((rgbaWriteMask & 0x3) != 0x3)
2866 			{
2867 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2868 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2869 				mergedMask &= rgbaMask;
2870 			}
2871 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2872 
2873 			buffer += pitchB;
2874 
2875 			packedCol = UShort4(As<Int4>(color.y));
2876 			value = *Pointer<UShort4>(buffer);
2877 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2878 			if((rgbaWriteMask & 0x3) != 0x3)
2879 			{
2880 				mergedMask &= rgbaMask;
2881 			}
2882 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2883 		}
2884 		break;
2885 	case VK_FORMAT_R8G8_SINT:
2886 	case VK_FORMAT_R8G8_UINT:
2887 		if((rgbaWriteMask & 0x00000003) != 0x0)
2888 		{
2889 			buffer += 2 * x;
2890 
2891 			Int2 xyzw, packedCol;
2892 
2893 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2894 			buffer += pitchB;
2895 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2896 
2897 			if(format == VK_FORMAT_R8G8_SINT)
2898 			{
2899 				packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2900 			}
2901 			else
2902 			{
2903 				packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2904 			}
2905 
2906 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2907 			if((rgbaWriteMask & 0x3) != 0x3)
2908 			{
2909 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2910 				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2911 				mergedMask &= rgbaMask;
2912 			}
2913 
2914 			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2915 
2916 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2917 			buffer -= pitchB;
2918 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2919 		}
2920 		break;
2921 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2922 	case VK_FORMAT_R32G32B32A32_SINT:
2923 	case VK_FORMAT_R32G32B32A32_UINT:
2924 		buffer += 16 * x;
2925 
2926 		{
2927 			value = *Pointer<Float4>(buffer, 16);
2928 
2929 			if(rgbaWriteMask != 0x0000000F)
2930 			{
2931 				Float4 masked = value;
2932 				color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2933 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2934 				color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2935 			}
2936 
2937 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2938 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2939 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2940 			*Pointer<Float4>(buffer, 16) = color.x;
2941 		}
2942 
2943 		{
2944 			value = *Pointer<Float4>(buffer + 16, 16);
2945 
2946 			if(rgbaWriteMask != 0x0000000F)
2947 			{
2948 				Float4 masked = value;
2949 				color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2950 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2951 				color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2952 			}
2953 
2954 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2955 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2956 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2957 			*Pointer<Float4>(buffer + 16, 16) = color.y;
2958 		}
2959 
2960 		buffer += pitchB;
2961 
2962 		{
2963 			value = *Pointer<Float4>(buffer, 16);
2964 
2965 			if(rgbaWriteMask != 0x0000000F)
2966 			{
2967 				Float4 masked = value;
2968 				color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2969 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2970 				color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2971 			}
2972 
2973 			color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2974 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2975 			color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2976 			*Pointer<Float4>(buffer, 16) = color.z;
2977 		}
2978 
2979 		{
2980 			value = *Pointer<Float4>(buffer + 16, 16);
2981 
2982 			if(rgbaWriteMask != 0x0000000F)
2983 			{
2984 				Float4 masked = value;
2985 				color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2986 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2987 				color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2988 			}
2989 
2990 			color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2991 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2992 			color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2993 			*Pointer<Float4>(buffer + 16, 16) = color.w;
2994 		}
2995 		break;
2996 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2997 		if((rgbaWriteMask & 0x0000000F) != 0x0)
2998 		{
2999 			buffer += 8 * x;
3000 
3001 			UInt4 rgbaMask;
3002 			UInt4 value = *Pointer<UInt4>(buffer);
3003 			UInt4 packedCol;
3004 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
3005 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
3006 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
3007 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
3008 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3009 			if((rgbaWriteMask & 0xF) != 0xF)
3010 			{
3011 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3012 				rgbaMask = UInt4(tmpMask, tmpMask);
3013 				mergedMask &= rgbaMask;
3014 			}
3015 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3016 
3017 			buffer += pitchB;
3018 
3019 			value = *Pointer<UInt4>(buffer);
3020 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
3021 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
3022 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
3023 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
3024 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3025 			if((rgbaWriteMask & 0xF) != 0xF)
3026 			{
3027 				mergedMask &= rgbaMask;
3028 			}
3029 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3030 		}
3031 		break;
3032 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
3033 		if((rgbaWriteMask & 0x7) != 0x0)
3034 		{
3035 			buffer += 4 * x;
3036 
3037 			UInt4 packedCol;
3038 			packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
3039 			packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
3040 			packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
3041 			packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
3042 
3043 			UInt4 value;
3044 			value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
3045 			value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
3046 			buffer += pitchB;
3047 			value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
3048 			value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
3049 
3050 			UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0][0]) + xMask * 16, 16);
3051 			if((rgbaWriteMask & 0x7) != 0x7)
3052 			{
3053 				mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[rgbaWriteMask & 0x7][0]), 16);
3054 			}
3055 			value = (packedCol & mask) | (value & ~mask);
3056 
3057 			*Pointer<UInt>(buffer + 0) = value.z;
3058 			*Pointer<UInt>(buffer + 4) = value.w;
3059 			buffer -= pitchB;
3060 			*Pointer<UInt>(buffer + 0) = value.x;
3061 			*Pointer<UInt>(buffer + 4) = value.y;
3062 		}
3063 		break;
3064 	case VK_FORMAT_R16G16B16A16_UNORM:
3065 	case VK_FORMAT_R16G16B16A16_SINT:
3066 	case VK_FORMAT_R16G16B16A16_UINT:
3067 		if((rgbaWriteMask & 0x0000000F) != 0x0)
3068 		{
3069 			buffer += 8 * x;
3070 
3071 			UInt4 rgbaMask;
3072 			UShort8 value = *Pointer<UShort8>(buffer);
3073 			UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
3074 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3075 			if((rgbaWriteMask & 0xF) != 0xF)
3076 			{
3077 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3078 				rgbaMask = UInt4(tmpMask, tmpMask);
3079 				mergedMask &= rgbaMask;
3080 			}
3081 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3082 
3083 			buffer += pitchB;
3084 
3085 			value = *Pointer<UShort8>(buffer);
3086 			packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
3087 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3088 			if((rgbaWriteMask & 0xF) != 0xF)
3089 			{
3090 				mergedMask &= rgbaMask;
3091 			}
3092 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3093 		}
3094 		break;
3095 	case VK_FORMAT_R8G8B8A8_SINT:
3096 	case VK_FORMAT_R8G8B8A8_UINT:
3097 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
3098 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
3099 		if((rgbaWriteMask & 0x0000000F) != 0x0)
3100 		{
3101 			UInt2 value, packedCol, mergedMask;
3102 
3103 			buffer += 4 * x;
3104 
3105 			bool isSigned = (format == VK_FORMAT_R8G8B8A8_SINT) || (format == VK_FORMAT_A8B8G8R8_SINT_PACK32);
3106 
3107 			if(isSigned)
3108 			{
3109 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3110 			}
3111 			else
3112 			{
3113 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3114 			}
3115 			value = *Pointer<UInt2>(buffer, 16);
3116 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3117 			if(rgbaWriteMask != 0xF)
3118 			{
3119 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3120 			}
3121 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3122 
3123 			buffer += pitchB;
3124 
3125 			if(isSigned)
3126 			{
3127 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3128 			}
3129 			else
3130 			{
3131 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3132 			}
3133 			value = *Pointer<UInt2>(buffer, 16);
3134 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3135 			if(rgbaWriteMask != 0xF)
3136 			{
3137 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3138 			}
3139 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3140 		}
3141 		break;
3142 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3143 		if((rgbaWriteMask & 0x0000000F) != 0x0)
3144 		{
3145 			Int2 mergedMask, packedCol, value;
3146 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3147 			              ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
3148 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3149 			              ((As<Int4>(color.x) & Int4(0x3ff)));
3150 
3151 			buffer += 4 * x;
3152 			value = *Pointer<Int2>(buffer, 16);
3153 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3154 			if(rgbaWriteMask != 0xF)
3155 			{
3156 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3157 			}
3158 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3159 
3160 			buffer += pitchB;
3161 
3162 			value = *Pointer<Int2>(buffer, 16);
3163 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3164 			if(rgbaWriteMask != 0xF)
3165 			{
3166 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3167 			}
3168 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3169 		}
3170 		break;
3171 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3172 		if((bgraWriteMask & 0x0000000F) != 0x0)
3173 		{
3174 			Int2 mergedMask, packedCol, value;
3175 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3176 			              ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
3177 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3178 			              ((As<Int4>(color.z) & Int4(0x3ff)));
3179 
3180 			buffer += 4 * x;
3181 			value = *Pointer<Int2>(buffer, 16);
3182 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3183 			if(bgraWriteMask != 0xF)
3184 			{
3185 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3186 			}
3187 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3188 
3189 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
3190 
3191 			value = *Pointer<Int2>(buffer, 16);
3192 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3193 			if(bgraWriteMask != 0xF)
3194 			{
3195 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3196 			}
3197 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3198 		}
3199 		break;
3200 	default:
3201 		UNSUPPORTED("VkFormat: %d", int(format));
3202 	}
3203 }
3204 
convertFixed16(const Float4 & cf,bool saturate)3205 UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
3206 {
3207 	return UShort4(cf * 0xFFFF, saturate);
3208 }
3209 
convertFloat32(const UShort4 & cf)3210 Float4 PixelRoutine::convertFloat32(const UShort4 &cf)
3211 {
3212 	return Float4(cf) * (1.0f / 65535.0f);
3213 }
3214 
sRGBtoLinear16_12_16(Vector4s & c)3215 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
3216 {
3217 	Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);
3218 
3219 	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3220 	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3221 	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3222 
3223 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3224 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3225 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3226 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3227 
3228 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3229 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3230 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3231 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3232 
3233 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3234 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3235 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3236 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3237 }
3238 
linearToSRGB16_12_16(Vector4s & c)3239 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
3240 {
3241 	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3242 	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3243 	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3244 
3245 	linearToSRGB12_16(c);
3246 }
3247 
linearToSRGB12_16(Vector4s & c)3248 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
3249 {
3250 	Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);
3251 
3252 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3253 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3254 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3255 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3256 
3257 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3258 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3259 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3260 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3261 
3262 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3263 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3264 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3265 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3266 }
3267 
// Approximates x^2.2 as a blend of x^2 and x^3 (0.73 * x^2 + 0.27 * x^3),
// with the result clamped to [0, 1].
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	Float4 squared = x * x;
	Float4 cubed = squared * x;

	Float4 blended = squared * 0.73f + cubed * 0.27f;

	Float4 clamped = Min(Max(blended, 0.0f), 1.0f);
	return clamped;
}
3275 
3276 }  // namespace sw
3277