• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "Vulkan/VkPipelineLayout.hpp"
24 #include "Vulkan/VkStringify.hpp"
25 
26 namespace sw {
27 
PixelRoutine(const PixelProcessor::State & state,vk::PipelineLayout const * pipelineLayout,SpirvShader const * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)28 PixelRoutine::PixelRoutine(
29     const PixelProcessor::State &state,
30     vk::PipelineLayout const *pipelineLayout,
31     SpirvShader const *spirvShader,
32     const vk::DescriptorSet::Bindings &descriptorSets)
33     : QuadRasterizer(state, spirvShader)
34     , routine(pipelineLayout)
35     , descriptorSets(descriptorSets)
36     , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
37     , shaderContainsSampleQualifier(spirvShader && spirvShader->getAnalysis().ContainsSampleQualifier)
38     , perSampleShading((state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f)) ||
39                        shaderContainsSampleQualifier || shaderContainsInterpolation)  // TODO(b/194714095)
40     , invocationCount(perSampleShading ? state.multiSampleCount : 1)
41 {
42 	if(spirvShader)
43 	{
44 		spirvShader->emitProlog(&routine);
45 
46 		// Clearing inputs to 0 is not demanded by the spec,
47 		// but it makes the undefined behavior deterministic.
48 		// TODO(b/155148722): Remove to detect UB.
49 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
50 		{
51 			routine.inputs[i] = Float4(0.0f);
52 		}
53 	}
54 }
55 
~PixelRoutine()56 PixelRoutine::~PixelRoutine()
57 {
58 }
59 
getSampleSet(int invocation) const60 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
61 {
62 	unsigned int sampleBegin = perSampleShading ? invocation : 0;
63 	unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
64 
65 	SampleSet samples;
66 
67 	for(unsigned int q = sampleBegin; q < sampleEnd; q++)
68 	{
69 		if(state.multiSampleMask & (1 << q))
70 		{
71 			samples.push_back(q);
72 		}
73 	}
74 
75 	return samples;
76 }
77 
quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)78 void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
79 {
80 	const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;
81 
82 	Int zMask[4];  // Depth mask
83 	Int sMask[4];  // Stencil mask
84 	Float4 unclampedZ[4];
85 
86 	for(int invocation = 0; invocation < invocationCount; invocation++)
87 	{
88 		SampleSet samples = getSampleSet(invocation);
89 
90 		if(samples.empty())
91 		{
92 			continue;
93 		}
94 
95 		for(unsigned int q : samples)
96 		{
97 			zMask[q] = cMask[q];
98 			sMask[q] = cMask[q];
99 		}
100 
101 		stencilTest(sBuffer, x, sMask, samples);
102 
103 		Float4 f;
104 		Float4 rhwCentroid;
105 
106 		Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
107 
108 		if(interpolateZ())
109 		{
110 			for(unsigned int q : samples)
111 			{
112 				Float4 x = xxxx;
113 
114 				if(state.enableMultiSampling)
115 				{
116 					x -= *Pointer<Float4>(constants + OFFSET(Constants, X) + q * sizeof(float4));
117 				}
118 
119 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
120 
121 				if(state.depthBias)
122 				{
123 					z[q] += *Pointer<Float4>(primitive + OFFSET(Primitive, zBias), 16);
124 				}
125 
126 				unclampedZ[q] = z[q];
127 			}
128 		}
129 
130 		Bool depthPass = false;
131 
132 		if(earlyFragmentTests)
133 		{
134 			for(unsigned int q : samples)
135 			{
136 				z[q] = clampDepth(z[q]);
137 				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
138 				depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
139 			}
140 		}
141 
142 		If(depthPass || !earlyFragmentTests)
143 		{
144 			if(earlyFragmentTests)
145 			{
146 				writeDepth(zBuffer, x, zMask, samples);
147 			}
148 
149 			Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
150 
151 			// Centroid locations
152 			Float4 XXXX = Float4(0.0f);
153 			Float4 YYYY = Float4(0.0f);
154 
155 			if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
156 			{
157 				Float4 WWWW(1.0e-9f);
158 
159 				for(unsigned int q : samples)
160 				{
161 					XXXX += *Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]);
162 					YYYY += *Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]);
163 					WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
164 				}
165 
166 				WWWW = Rcp(WWWW, Precision::Relaxed);
167 				XXXX *= WWWW;
168 				YYYY *= WWWW;
169 
170 				XXXX += xxxx;
171 				YYYY += yyyy;
172 			}
173 
174 			if(interpolateW())
175 			{
176 				w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
177 				rhw = reciprocal(w, false, false, true);
178 
179 				if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
180 				{
181 					rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, w), false, false));
182 				}
183 			}
184 
185 			if(spirvShader)
186 			{
187 				if(shaderContainsInterpolation)  // TODO(b/194714095)
188 				{
189 					routine.interpolationData.primitive = primitive;
190 
191 					routine.interpolationData.x = xxxx;
192 					routine.interpolationData.y = yyyy;
193 					routine.interpolationData.rhw = rhw;
194 
195 					routine.interpolationData.xCentroid = XXXX;
196 					routine.interpolationData.yCentroid = YYYY;
197 					routine.interpolationData.rhwCentroid = rhwCentroid;
198 				}
199 
200 				if(perSampleShading && (state.multiSampleCount > 1))
201 				{
202 					xxxx += Float4(Constants::SampleLocationsX[samples[0]]);
203 					yyyy += Float4(Constants::SampleLocationsY[samples[0]]);
204 				}
205 
206 				int packedInterpolant = 0;
207 				for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
208 				{
209 					auto const &input = spirvShader->inputs[interfaceInterpolant];
210 					if(input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
211 					{
212 						if(input.Centroid && state.enableMultiSampling)
213 						{
214 							routine.inputs[interfaceInterpolant] =
215 							    SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid,
216 							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
217 							                                  input.Flat, !input.NoPerspective);
218 						}
219 						else if(perSampleShading)
220 						{
221 							routine.inputs[interfaceInterpolant] =
222 							    SpirvRoutine::interpolateAtXY(xxxx, yyyy, rhw,
223 							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
224 							                                  input.Flat, !input.NoPerspective);
225 						}
226 						else
227 						{
228 							routine.inputs[interfaceInterpolant] =
229 							    interpolate(xxxx, Dv[interfaceInterpolant], rhw,
230 							                primitive + OFFSET(Primitive, V[packedInterpolant]),
231 							                input.Flat, !input.NoPerspective);
232 						}
233 						packedInterpolant++;
234 					}
235 				}
236 
237 				setBuiltins(x, y, unclampedZ, w, cMask, samples);
238 
239 				for(uint32_t i = 0; i < state.numClipDistances; i++)
240 				{
241 					auto distance = interpolate(xxxx, DclipDistance[i], rhw,
242 					                            primitive + OFFSET(Primitive, clipDistance[i]),
243 					                            false, true);
244 
245 					auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
246 					for(unsigned int q : samples)
247 					{
248 						// FIXME(b/148105887): Fragments discarded by clipping do not exist at
249 						// all -- they should not be counted in queries or have their Z/S effects
250 						// performed when early fragment tests are enabled.
251 						cMask[q] &= clipMask;
252 					}
253 
254 					if(spirvShader->getUsedCapabilities().ClipDistance)
255 					{
256 						auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
257 						if(it != spirvShader->inputBuiltins.end())
258 						{
259 							if(i < it->second.SizeInComponents)
260 							{
261 								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
262 							}
263 						}
264 					}
265 				}
266 
267 				if(spirvShader->getUsedCapabilities().CullDistance)
268 				{
269 					auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
270 					if(it != spirvShader->inputBuiltins.end())
271 					{
272 						for(uint32_t i = 0; i < state.numCullDistances; i++)
273 						{
274 							if(i < it->second.SizeInComponents)
275 							{
276 								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
277 								    interpolate(xxxx, DcullDistance[i], rhw,
278 								                primitive + OFFSET(Primitive, cullDistance[i]),
279 								                false, true);
280 							}
281 						}
282 					}
283 				}
284 			}
285 
286 			if(spirvShader)
287 			{
288 				executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
289 			}
290 
291 			Bool alphaPass = alphaTest(cMask, samples);
292 
293 			if((spirvShader && spirvShader->getAnalysis().ContainsKill) || state.alphaToCoverage)
294 			{
295 				for(unsigned int q : samples)
296 				{
297 					zMask[q] &= cMask[q];
298 					sMask[q] &= cMask[q];
299 				}
300 			}
301 
302 			If(alphaPass)
303 			{
304 				if(!earlyFragmentTests)
305 				{
306 					for(unsigned int q : samples)
307 					{
308 						z[q] = clampDepth(z[q]);
309 						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
310 						depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
311 					}
312 				}
313 
314 				If(depthPass)
315 				{
316 					if(!earlyFragmentTests)
317 					{
318 						writeDepth(zBuffer, x, zMask, samples);
319 					}
320 
321 					blendColor(cBuffer, x, sMask, zMask, cMask, samples);
322 
323 					occlusionSampleCount(zMask, sMask, samples);
324 				}
325 			}
326 		}
327 
328 		writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
329 	}
330 }
331 
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)332 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
333 {
334 	if(!state.stencilActive)
335 	{
336 		return;
337 	}
338 
339 	for(unsigned int q : samples)
340 	{
341 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
342 
343 		Pointer<Byte> buffer = sBuffer + x;
344 
345 		if(q > 0)
346 		{
347 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
348 		}
349 
350 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
351 		Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
352 		value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
353 		Byte8 valueBack = value;
354 
355 		if(state.frontStencil.compareMask != 0xff)
356 		{
357 			value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
358 		}
359 
360 		stencilTest(value, state.frontStencil.compareOp, false);
361 
362 		if(state.backStencil.compareMask != 0xff)
363 		{
364 			valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
365 		}
366 
367 		stencilTest(valueBack, state.backStencil.compareOp, true);
368 
369 		value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
370 		valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
371 		value |= valueBack;
372 
373 		sMask[q] &= SignMask(value);
374 	}
375 }
376 
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)377 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
378 {
379 	Byte8 equal;
380 
381 	switch(stencilCompareMode)
382 	{
383 	case VK_COMPARE_OP_ALWAYS:
384 		value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
385 		break;
386 	case VK_COMPARE_OP_NEVER:
387 		value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
388 		break;
389 	case VK_COMPARE_OP_LESS:  // a < b ~ b > a
390 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
391 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
392 		break;
393 	case VK_COMPARE_OP_EQUAL:
394 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
395 		break;
396 	case VK_COMPARE_OP_NOT_EQUAL:  // a != b ~ !(a == b)
397 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
398 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
399 		break;
400 	case VK_COMPARE_OP_LESS_OR_EQUAL:  // a <= b ~ (b > a) || (a == b)
401 		equal = value;
402 		equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
403 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
404 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
405 		value |= equal;
406 		break;
407 	case VK_COMPARE_OP_GREATER:  // a > b
408 		equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
409 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
410 		equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
411 		value = equal;
412 		break;
413 	case VK_COMPARE_OP_GREATER_OR_EQUAL:  // a >= b ~ !(a < b) ~ !(b > a)
414 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
415 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
416 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
417 		break;
418 	default:
419 		UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
420 	}
421 }
422 
depthTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)423 Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
424 {
425 	Float4 Z = z;
426 
427 	Pointer<Byte> buffer = zBuffer + 4 * x;
428 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
429 
430 	if(q > 0)
431 	{
432 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
433 	}
434 
435 	Float4 zValue;
436 
437 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
438 	{
439 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
440 	}
441 
442 	Int4 zTest;
443 
444 	switch(state.depthCompareMode)
445 	{
446 	case VK_COMPARE_OP_ALWAYS:
447 		// Optimized
448 		break;
449 	case VK_COMPARE_OP_NEVER:
450 		// Optimized
451 		break;
452 	case VK_COMPARE_OP_EQUAL:
453 		zTest = CmpEQ(zValue, Z);
454 		break;
455 	case VK_COMPARE_OP_NOT_EQUAL:
456 		zTest = CmpNEQ(zValue, Z);
457 		break;
458 	case VK_COMPARE_OP_LESS:
459 		zTest = CmpNLE(zValue, Z);
460 		break;
461 	case VK_COMPARE_OP_GREATER_OR_EQUAL:
462 		zTest = CmpLE(zValue, Z);
463 		break;
464 	case VK_COMPARE_OP_LESS_OR_EQUAL:
465 		zTest = CmpNLT(zValue, Z);
466 		break;
467 	case VK_COMPARE_OP_GREATER:
468 		zTest = CmpLT(zValue, Z);
469 		break;
470 	default:
471 		UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
472 	}
473 
474 	switch(state.depthCompareMode)
475 	{
476 	case VK_COMPARE_OP_ALWAYS:
477 		zMask = cMask;
478 		break;
479 	case VK_COMPARE_OP_NEVER:
480 		zMask = 0x0;
481 		break;
482 	default:
483 		zMask = SignMask(zTest) & cMask;
484 		break;
485 	}
486 
487 	if(state.stencilActive)
488 	{
489 		zMask &= sMask;
490 	}
491 
492 	return zMask != 0;
493 }
494 
depthTest16(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)495 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
496 {
497 	Short4 Z = convertFixed16(z, true);
498 
499 	Pointer<Byte> buffer = zBuffer + 2 * x;
500 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
501 
502 	if(q > 0)
503 	{
504 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
505 	}
506 
507 	Short4 zValue;
508 
509 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
510 	{
511 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
512 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
513 	}
514 
515 	Int4 zTest;
516 
517 	// Bias values to make unsigned compares out of Reactor's (due SSE's) signed compares only
518 	zValue = zValue - Short4(0x8000u);
519 	Z = Z - Short4(0x8000u);
520 
521 	switch(state.depthCompareMode)
522 	{
523 	case VK_COMPARE_OP_ALWAYS:
524 		// Optimized
525 		break;
526 	case VK_COMPARE_OP_NEVER:
527 		// Optimized
528 		break;
529 	case VK_COMPARE_OP_EQUAL:
530 		zTest = Int4(CmpEQ(zValue, Z));
531 		break;
532 	case VK_COMPARE_OP_NOT_EQUAL:
533 		zTest = ~Int4(CmpEQ(zValue, Z));
534 		break;
535 	case VK_COMPARE_OP_LESS:
536 		zTest = Int4(CmpGT(zValue, Z));
537 		break;
538 	case VK_COMPARE_OP_GREATER_OR_EQUAL:
539 		zTest = ~Int4(CmpGT(zValue, Z));
540 		break;
541 	case VK_COMPARE_OP_LESS_OR_EQUAL:
542 		zTest = ~Int4(CmpGT(Z, zValue));
543 		break;
544 	case VK_COMPARE_OP_GREATER:
545 		zTest = Int4(CmpGT(Z, zValue));
546 		break;
547 	default:
548 		UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
549 	}
550 
551 	switch(state.depthCompareMode)
552 	{
553 	case VK_COMPARE_OP_ALWAYS:
554 		zMask = cMask;
555 		break;
556 	case VK_COMPARE_OP_NEVER:
557 		zMask = 0x0;
558 		break;
559 	default:
560 		zMask = SignMask(zTest) & cMask;
561 		break;
562 	}
563 
564 	if(state.stencilActive)
565 	{
566 		zMask &= sMask;
567 	}
568 
569 	return zMask != 0;
570 }
571 
// Returns `z` limited to [state.minDepthClamp, state.maxDepthClamp] when depth
// clamping is enabled, and `z` unchanged otherwise.
Float4 PixelRoutine::clampDepth(const Float4 &z)
{
	if(state.depthClamp)
	{
		return Min(Max(z, Float4(state.minDepthClamp)), Float4(state.maxDepthClamp));
	}

	return z;
}
581 
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)582 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
583 {
584 	if(!state.depthTestActive)
585 	{
586 		return true;
587 	}
588 
589 	switch(state.depthFormat)
590 	{
591 	case VK_FORMAT_D16_UNORM:
592 		return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
593 	case VK_FORMAT_D32_SFLOAT:
594 	case VK_FORMAT_D32_SFLOAT_S8_UINT:
595 		return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
596 	default:
597 		UNSUPPORTED("Depth format: %d", int(state.depthFormat));
598 		return false;
599 	}
600 }
601 
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)602 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
603 {
604 	Pointer<Byte> buffer = zBuffer + 2 * x;
605 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
606 
607 	if(q > 0)
608 	{
609 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
610 	}
611 
612 	Float4 minDepthBound(state.minDepthBounds);
613 	Float4 maxDepthBound(state.maxDepthBounds);
614 
615 	Int2 z;
616 	z = Insert(z, *Pointer<Int>(buffer), 0);
617 	z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
618 
619 	Float4 zValue = convertFloat32(As<UShort4>(z));
620 	return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
621 }
622 
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)623 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
624 {
625 	Pointer<Byte> buffer = zBuffer + 4 * x;
626 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
627 
628 	if(q > 0)
629 	{
630 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
631 	}
632 
633 	Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
634 	return Int4(CmpLE(Float4(state.minDepthBounds), zValue) & CmpLE(zValue, Float4(state.maxDepthBounds)));
635 }
636 
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)637 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
638 {
639 	if(!state.depthBoundsTestActive)
640 	{
641 		return;
642 	}
643 
644 	Int4 zTest;
645 	switch(state.depthFormat)
646 	{
647 	case VK_FORMAT_D16_UNORM:
648 		zTest = depthBoundsTest16(zBuffer, q, x);
649 		break;
650 	case VK_FORMAT_D32_SFLOAT:
651 	case VK_FORMAT_D32_SFLOAT_S8_UINT:
652 		zTest = depthBoundsTest32F(zBuffer, q, x);
653 		break;
654 	default:
655 		UNSUPPORTED("Depth format: %d", int(state.depthFormat));
656 		break;
657 	}
658 
659 	if(!state.depthTestActive)
660 	{
661 		cMask &= zMask & SignMask(zTest);
662 	}
663 	else
664 	{
665 		zMask &= cMask & SignMask(zTest);
666 	}
667 }
668 
alphaToCoverage(Int cMask[4],const Float4 & alpha,const SampleSet & samples)669 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha, const SampleSet &samples)
670 {
671 	static const int a2c[4] = {
672 		OFFSET(DrawData, a2c0),
673 		OFFSET(DrawData, a2c1),
674 		OFFSET(DrawData, a2c2),
675 		OFFSET(DrawData, a2c3),
676 	};
677 
678 	for(unsigned int q : samples)
679 	{
680 		Int4 coverage = CmpNLT(alpha, *Pointer<Float4>(data + a2c[q]));
681 		Int aMask = SignMask(coverage);
682 		cMask[q] &= aMask;
683 	}
684 }
685 
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)686 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
687 {
688 	Float4 Z = z;
689 
690 	Pointer<Byte> buffer = zBuffer + 4 * x;
691 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
692 
693 	if(q > 0)
694 	{
695 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
696 	}
697 
698 	Float4 zValue;
699 
700 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
701 	{
702 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
703 	}
704 
705 	Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
706 	zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
707 	Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
708 
709 	*Pointer<Float2>(buffer) = Float2(Z.xy);
710 	*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
711 }
712 
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)713 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
714 {
715 	Short4 Z = As<Short4>(convertFixed16(z, true));
716 
717 	Pointer<Byte> buffer = zBuffer + 2 * x;
718 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
719 
720 	if(q > 0)
721 	{
722 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
723 	}
724 
725 	Short4 zValue;
726 
727 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
728 	{
729 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
730 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
731 	}
732 
733 	Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
734 	zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
735 	Z = Z | zValue;
736 
737 	*Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
738 	*Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
739 }
740 
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)741 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
742 {
743 	if(!state.depthWriteEnable)
744 	{
745 		return;
746 	}
747 
748 	for(unsigned int q : samples)
749 	{
750 		switch(state.depthFormat)
751 		{
752 		case VK_FORMAT_D16_UNORM:
753 			writeDepth16(zBuffer, q, x, z[q], zMask[q]);
754 			break;
755 		case VK_FORMAT_D32_SFLOAT:
756 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
757 			writeDepth32F(zBuffer, q, x, z[q], zMask[q]);
758 			break;
759 		default:
760 			UNSUPPORTED("Depth format: %d", int(state.depthFormat));
761 			break;
762 		}
763 	}
764 }
765 
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)766 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
767 {
768 	if(!state.occlusionEnabled)
769 	{
770 		return;
771 	}
772 
773 	for(unsigned int q : samples)
774 	{
775 		occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
776 	}
777 }
778 
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)779 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
780 {
781 	if(!state.stencilActive)
782 	{
783 		return;
784 	}
785 
786 	if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
787 	{
788 		if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
789 		{
790 			return;
791 		}
792 	}
793 
794 	if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
795 	{
796 		return;
797 	}
798 
799 	for(unsigned int q : samples)
800 	{
801 		Pointer<Byte> buffer = sBuffer + x;
802 
803 		if(q > 0)
804 		{
805 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
806 		}
807 
808 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
809 		Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
810 		bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
811 		Byte8 newValue;
812 		stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
813 
814 		if((state.frontStencil.writeMask & 0xFF) != 0xFF)  // Assume 8-bit stencil buffer
815 		{
816 			Byte8 maskedValue = bufferValue;
817 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
818 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
819 			newValue |= maskedValue;
820 		}
821 
822 		Byte8 newValueBack;
823 
824 		stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask[q], sMask[q]);
825 
826 		if((state.backStencil.writeMask & 0xFF) != 0xFF)  // Assume 8-bit stencil buffer
827 		{
828 			Byte8 maskedValue = bufferValue;
829 			newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
830 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
831 			newValueBack |= maskedValue;
832 		}
833 
834 		newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
835 		newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
836 		newValue |= newValueBack;
837 
838 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
839 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
840 		newValue |= bufferValue;
841 
842 		*Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
843 		*Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
844 	}
845 }
846 
stencilOperation(Byte8 & newValue,const Byte8 & bufferValue,const PixelProcessor::States::StencilOpState & ops,bool isBack,const Int & zMask,const Int & sMask)847 void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
848 {
849 	Byte8 &pass = newValue;
850 	Byte8 fail;
851 	Byte8 zFail;
852 
853 	stencilOperation(pass, bufferValue, ops.passOp, isBack);
854 
855 	if(ops.depthFailOp != ops.passOp)
856 	{
857 		stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
858 	}
859 
860 	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
861 	{
862 		stencilOperation(fail, bufferValue, ops.failOp, isBack);
863 	}
864 
865 	if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
866 	{
867 		if(state.depthTestActive && ops.depthFailOp != ops.passOp)  // zMask valid and values not the same
868 		{
869 			pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
870 			zFail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
871 			pass |= zFail;
872 		}
873 
874 		pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
875 		fail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
876 		pass |= fail;
877 	}
878 }
879 
stencilReplaceRef(bool isBack)880 Byte8 PixelRoutine::stencilReplaceRef(bool isBack)
881 {
882 	if(spirvShader)
883 	{
884 		auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
885 		if(it != spirvShader->outputBuiltins.end())
886 		{
887 			UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
888 			// TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
889 			//                     following line by either adding a rr::Shuffle() variant to do
890 			//                     it explicitly or adding a Byte4(Int4) constructor would work.
891 			sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
892 
893 			UInt2 sRefDuplicated;
894 			sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
895 			sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
896 			return As<Byte8>(sRefDuplicated);
897 		}
898 	}
899 
900 	return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
901 }
902 
stencilOperation(Byte8 & output,const Byte8 & bufferValue,VkStencilOp operation,bool isBack)903 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
904 {
905 	switch(operation)
906 	{
907 	case VK_STENCIL_OP_KEEP:
908 		output = bufferValue;
909 		break;
910 	case VK_STENCIL_OP_ZERO:
911 		output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
912 		break;
913 	case VK_STENCIL_OP_REPLACE:
914 		output = stencilReplaceRef(isBack);
915 		break;
916 	case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
917 		output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
918 		break;
919 	case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
920 		output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
921 		break;
922 	case VK_STENCIL_OP_INVERT:
923 		output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
924 		break;
925 	case VK_STENCIL_OP_INCREMENT_AND_WRAP:
926 		output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
927 		break;
928 	case VK_STENCIL_OP_DECREMENT_AND_WRAP:
929 		output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
930 		break;
931 	default:
932 		UNSUPPORTED("VkStencilOp: %d", int(operation));
933 	}
934 }
935 
isSRGB(int index) const936 bool PixelRoutine::isSRGB(int index) const
937 {
938 	return vk::Format(state.colorFormat[index]).isSRGBformat();
939 }
940 
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)941 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
942 {
943 	Short4 c01;
944 	Short4 c23;
945 	Pointer<Byte> buffer = cBuffer;
946 	Pointer<Byte> buffer2;
947 
948 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
949 
950 	switch(state.colorFormat[index])
951 	{
952 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
953 		buffer += 2 * x;
954 		buffer2 = buffer + pitchB;
955 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
956 
957 		pixel.x = (c01 & Short4(0xF000u));
958 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
959 		pixel.z = (c01 & Short4(0x00F0u)) << 8;
960 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
961 
962 		// Expand to 16 bit range
963 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
964 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
965 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
966 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
967 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
968 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
969 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
970 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
971 		break;
972 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
973 		buffer += 2 * x;
974 		buffer2 = buffer + pitchB;
975 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
976 
977 		pixel.z = (c01 & Short4(0xF000u));
978 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
979 		pixel.x = (c01 & Short4(0x00F0u)) << 8;
980 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
981 
982 		// Expand to 16 bit range
983 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
984 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
985 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
986 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
987 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
988 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
989 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
990 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
991 		break;
992 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
993 		buffer += 2 * x;
994 		buffer2 = buffer + pitchB;
995 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
996 
997 		pixel.w = (c01 & Short4(0xF000u));
998 		pixel.z = (c01 & Short4(0x0F00u)) << 4;
999 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
1000 		pixel.x = (c01 & Short4(0x000Fu)) << 12;
1001 
1002 		// Expand to 16 bit range
1003 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1004 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1005 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1006 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1007 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1008 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1009 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1010 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1011 		break;
1012 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1013 		buffer += 2 * x;
1014 		buffer2 = buffer + pitchB;
1015 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1016 
1017 		pixel.w = (c01 & Short4(0xF000u));
1018 		pixel.x = (c01 & Short4(0x0F00u)) << 4;
1019 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
1020 		pixel.z = (c01 & Short4(0x000Fu)) << 12;
1021 
1022 		// Expand to 16 bit range
1023 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1024 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1025 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1026 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1027 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1028 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1029 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1030 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1031 		break;
1032 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1033 		buffer += 2 * x;
1034 		buffer2 = buffer + pitchB;
1035 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1036 
1037 		pixel.x = (c01 & Short4(0xF800u));
1038 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
1039 		pixel.z = (c01 & Short4(0x003Eu)) << 10;
1040 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1041 
1042 		// Expand to 16 bit range
1043 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1044 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1045 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1046 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1047 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1048 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1049 		break;
1050 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1051 		buffer += 2 * x;
1052 		buffer2 = buffer + pitchB;
1053 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1054 
1055 		pixel.z = (c01 & Short4(0xF800u));
1056 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
1057 		pixel.x = (c01 & Short4(0x003Eu)) << 10;
1058 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1059 
1060 		// Expand to 16 bit range
1061 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1062 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1063 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1064 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1065 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1066 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1067 		break;
1068 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1069 		buffer += 2 * x;
1070 		buffer2 = buffer + pitchB;
1071 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1072 
1073 		pixel.x = (c01 & Short4(0x7C00u)) << 1;
1074 		pixel.y = (c01 & Short4(0x03E0u)) << 6;
1075 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1076 		pixel.w = (c01 & Short4(0x8000u)) >> 15;
1077 
1078 		// Expand to 16 bit range
1079 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1080 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1081 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1082 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1083 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1084 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1085 		break;
1086 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1087 		buffer += 2 * x;
1088 		buffer2 = buffer + pitchB;
1089 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1090 
1091 		pixel.x = c01 & Short4(0xF800u);
1092 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1093 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1094 		pixel.w = Short4(0xFFFFu);
1095 
1096 		// Expand to 16 bit range
1097 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1098 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1099 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1100 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1101 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1102 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1103 		break;
1104 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1105 		buffer += 2 * x;
1106 		buffer2 = buffer + pitchB;
1107 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1108 
1109 		pixel.z = c01 & Short4(0xF800u);
1110 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1111 		pixel.x = (c01 & Short4(0x001Fu)) << 11;
1112 		pixel.w = Short4(0xFFFFu);
1113 
1114 		// Expand to 16 bit range
1115 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1116 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1117 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1118 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1119 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1120 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1121 		break;
1122 	case VK_FORMAT_B8G8R8A8_UNORM:
1123 	case VK_FORMAT_B8G8R8A8_SRGB:
1124 		buffer += 4 * x;
1125 		c01 = *Pointer<Short4>(buffer);
1126 		buffer += pitchB;
1127 		c23 = *Pointer<Short4>(buffer);
1128 		pixel.z = c01;
1129 		pixel.y = c01;
1130 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1131 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1132 		pixel.x = pixel.z;
1133 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1134 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1135 		pixel.y = pixel.z;
1136 		pixel.w = pixel.x;
1137 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1138 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1139 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1140 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1141 		break;
1142 	case VK_FORMAT_R8G8B8A8_UNORM:
1143 	case VK_FORMAT_R8G8B8A8_SRGB:
1144 		buffer += 4 * x;
1145 		c01 = *Pointer<Short4>(buffer);
1146 		buffer += pitchB;
1147 		c23 = *Pointer<Short4>(buffer);
1148 		pixel.z = c01;
1149 		pixel.y = c01;
1150 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1151 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1152 		pixel.x = pixel.z;
1153 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1154 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1155 		pixel.y = pixel.z;
1156 		pixel.w = pixel.x;
1157 		pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1158 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1159 		pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1160 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1161 		break;
1162 	case VK_FORMAT_R8_UNORM:
1163 		buffer += 1 * x;
1164 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1165 		buffer += pitchB;
1166 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1167 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1168 		pixel.y = Short4(0x0000);
1169 		pixel.z = Short4(0x0000);
1170 		pixel.w = Short4(0xFFFFu);
1171 		break;
1172 	case VK_FORMAT_R8G8_UNORM:
1173 		buffer += 2 * x;
1174 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1175 		buffer += pitchB;
1176 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1177 		pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1178 		pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1179 		pixel.z = Short4(0x0000u);
1180 		pixel.w = Short4(0xFFFFu);
1181 		break;
1182 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1183 		{
1184 			Int4 v = Int4(0);
1185 			buffer += 4 * x;
1186 			v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1187 			v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1188 			buffer += pitchB;
1189 			v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1190 			v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1191 
1192 			pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1193 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1194 			pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1195 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1196 
1197 			// Expand to 16 bit range
1198 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1199 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1200 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1201 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1202 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1203 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1204 		}
1205 		break;
1206 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1207 		{
1208 			Int4 v = Int4(0);
1209 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1210 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1211 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1212 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1213 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1214 
1215 			pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1216 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1217 			pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1218 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1219 
1220 			// Expand to 16 bit range
1221 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1222 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1223 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1224 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1225 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1226 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1227 		}
1228 		break;
1229 	default:
1230 		UNSUPPORTED("VkFormat %d", int(state.colorFormat[index]));
1231 	}
1232 
1233 	if(isSRGB(index))
1234 	{
1235 		sRGBtoLinear16_12_16(pixel);
1236 	}
1237 }
1238 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & current,const Int & sMask,const Int & zMask,const Int & cMask)1239 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)
1240 {
1241 	if(isSRGB(index))
1242 	{
1243 		linearToSRGB16_12_16(current);
1244 	}
1245 
1246 	switch(state.colorFormat[index])
1247 	{
1248 	case VK_FORMAT_B8G8R8A8_UNORM:
1249 	case VK_FORMAT_B8G8R8A8_SRGB:
1250 	case VK_FORMAT_R8G8B8A8_UNORM:
1251 	case VK_FORMAT_R8G8B8A8_SRGB:
1252 	case VK_FORMAT_R8G8_UNORM:
1253 	case VK_FORMAT_R8_UNORM:
1254 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1255 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1256 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1257 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1258 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1259 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1260 		break;
1261 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1262 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1263 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
1264 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
1265 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
1266 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);
1267 		break;
1268 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1269 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1270 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1271 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1272 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
1273 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
1274 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
1275 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 4) + Short4(0x0800);
1276 		break;
1277 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1278 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1279 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1280 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1281 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
1282 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1283 		current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);
1284 		break;
1285 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1286 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1287 		current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1288 		current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);
1289 		current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1290 		break;
1291 	default:
1292 		break;
1293 	}
1294 
1295 	int rgbaWriteMask = state.colorWriteActive(index);
1296 	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1297 
1298 	switch(state.colorFormat[index])
1299 	{
1300 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1301 		{
1302 			current.x = As<UShort4>(current.x & Short4(0xF000));
1303 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1304 			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 8;
1305 			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1306 
1307 			current.x = current.x | current.y | current.z | current.w;
1308 		}
1309 		break;
1310 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1311 		{
1312 			current.z = As<UShort4>(current.z & Short4(0xF000));
1313 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1314 			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 8;
1315 			current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1316 
1317 			current.x = current.x | current.y | current.z | current.w;
1318 		}
1319 		break;
1320 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1321 		{
1322 			current.w = As<UShort4>(current.w & Short4(0xF000));
1323 			current.x = As<UShort4>(current.x & Short4(0xF000)) >> 4;
1324 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1325 			current.z = As<UShort4>(current.z & Short4(0xF000u)) >> 12;
1326 
1327 			current.x = current.x | current.y | current.z | current.w;
1328 		}
1329 		break;
1330 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1331 		{
1332 			current.w = As<UShort4>(current.w & Short4(0xF000));
1333 			current.z = As<UShort4>(current.z & Short4(0xF000)) >> 4;
1334 			current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1335 			current.x = As<UShort4>(current.x & Short4(0xF000u)) >> 12;
1336 
1337 			current.x = current.x | current.y | current.z | current.w;
1338 		}
1339 		break;
1340 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1341 		{
1342 			current.x = As<UShort4>(current.x & Short4(0xF800));
1343 			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1344 			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 10;
1345 			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1346 
1347 			current.x = current.x | current.y | current.z | current.w;
1348 		}
1349 		break;
1350 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1351 		{
1352 			current.z = As<UShort4>(current.z & Short4(0xF800));
1353 			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1354 			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 10;
1355 			current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1356 
1357 			current.x = current.x | current.y | current.z | current.w;
1358 		}
1359 		break;
1360 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1361 		{
1362 			current.w = current.w & Short4(0x8000u);
1363 			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
1364 			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
1365 			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
1366 
1367 			current.x = current.x | current.y | current.z | current.w;
1368 		}
1369 		break;
1370 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1371 		{
1372 			current.x = current.x & Short4(0xF800u);
1373 			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1374 			current.z = As<UShort4>(current.z) >> 11;
1375 
1376 			current.x = current.x | current.y | current.z;
1377 		}
1378 		break;
1379 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1380 		{
1381 			current.z = current.z & Short4(0xF800u);
1382 			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1383 			current.x = As<UShort4>(current.x) >> 11;
1384 
1385 			current.x = current.x | current.y | current.z;
1386 		}
1387 		break;
1388 	case VK_FORMAT_B8G8R8A8_UNORM:
1389 	case VK_FORMAT_B8G8R8A8_SRGB:
1390 		if(rgbaWriteMask == 0x7)
1391 		{
1392 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1393 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1394 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1395 
1396 			current.z = As<Short4>(PackUnsigned(current.z, current.x));
1397 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
1398 
1399 			current.x = current.z;
1400 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1401 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1402 			current.y = current.z;
1403 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1404 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1405 		}
1406 		else
1407 		{
1408 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1409 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1410 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1411 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1412 
1413 			current.z = As<Short4>(PackUnsigned(current.z, current.x));
1414 			current.y = As<Short4>(PackUnsigned(current.y, current.w));
1415 
1416 			current.x = current.z;
1417 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1418 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1419 			current.y = current.z;
1420 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1421 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1422 		}
1423 		break;
1424 	case VK_FORMAT_R8G8B8A8_UNORM:
1425 	case VK_FORMAT_R8G8B8A8_SRGB:
1426 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1427 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1428 		if(rgbaWriteMask == 0x7)
1429 		{
1430 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1431 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1432 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1433 
1434 			current.z = As<Short4>(PackUnsigned(current.x, current.z));
1435 			current.y = As<Short4>(PackUnsigned(current.y, current.y));
1436 
1437 			current.x = current.z;
1438 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1439 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1440 			current.y = current.z;
1441 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1442 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1443 		}
1444 		else
1445 		{
1446 			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447 			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448 			current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449 			current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1450 
1451 			current.z = As<Short4>(PackUnsigned(current.x, current.z));
1452 			current.y = As<Short4>(PackUnsigned(current.y, current.w));
1453 
1454 			current.x = current.z;
1455 			current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1456 			current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1457 			current.y = current.z;
1458 			current.z = As<Short4>(UnpackLow(current.z, current.x));
1459 			current.y = As<Short4>(UnpackHigh(current.y, current.x));
1460 		}
1461 		break;
1462 	case VK_FORMAT_R8G8_UNORM:
1463 		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1464 		current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1465 		current.x = As<Short4>(PackUnsigned(current.x, current.x));
1466 		current.y = As<Short4>(PackUnsigned(current.y, current.y));
1467 		current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1468 		break;
1469 	case VK_FORMAT_R8_UNORM:
1470 		current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1471 		current.x = As<Short4>(PackUnsigned(current.x, current.x));
1472 		break;
1473 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1474 		{
1475 			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1476 			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1477 			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1478 			auto a = (Int4(current.w) >> 14) & Int4(0x3);
1479 			Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
1480 			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
1481 			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
1482 			current.x = UnpackLow(c02, c13);
1483 			current.y = UnpackHigh(c02, c13);
1484 		}
1485 		break;
1486 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1487 		{
1488 			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1489 			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1490 			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1491 			auto a = (Int4(current.w) >> 14) & Int4(0x3);
1492 			Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;
1493 			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
1494 			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
1495 			current.x = UnpackLow(c02, c13);
1496 			current.y = UnpackHigh(c02, c13);
1497 		}
1498 		break;
1499 	default:
1500 		UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1501 	}
1502 
1503 	Short4 c01 = current.z;
1504 	Short4 c23 = current.y;
1505 
1506 	Int xMask;  // Combination of all masks
1507 
1508 	if(state.depthTestActive)
1509 	{
1510 		xMask = zMask;
1511 	}
1512 	else
1513 	{
1514 		xMask = cMask;
1515 	}
1516 
1517 	if(state.stencilActive)
1518 	{
1519 		xMask &= sMask;
1520 	}
1521 
1522 	Pointer<Byte> buffer = cBuffer;
1523 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1524 
1525 	switch(state.colorFormat[index])
1526 	{
1527 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1528 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1529 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1530 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1531 		{
1532 			buffer += 2 * x;
1533 			Int value = *Pointer<Int>(buffer);
1534 
1535 			Int channelMask;
1536 			switch(state.colorFormat[index])
1537 			{
1538 			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1539 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[bgraWriteMask & 0xF][0]));
1540 				break;
1541 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1542 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4bgraQ[bgraWriteMask & 0xF][0]));
1543 				break;
1544 			case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1545 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[bgraWriteMask & 0xF][0]));
1546 				break;
1547 			case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1548 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4abgrQ[bgraWriteMask & 0xF][0]));
1549 				break;
1550 			default:
1551 				UNREACHABLE("Format: %s", vk::Stringify(state.colorFormat[index]).c_str());
1552 			}
1553 
1554 			Int c01 = Extract(As<Int2>(current.x), 0);
1555 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1556 			if(bgraWriteMask != 0x0000000F)
1557 			{
1558 				mask01 &= channelMask;
1559 			}
1560 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1561 
1562 			buffer += pitchB;
1563 			value = *Pointer<Int>(buffer);
1564 
1565 			Int c23 = Extract(As<Int2>(current.x), 1);
1566 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1567 			if(bgraWriteMask != 0x0000000F)
1568 			{
1569 				mask23 &= channelMask;
1570 			}
1571 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1572 		}
1573 		break;
1574 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1575 		{
1576 			buffer += 2 * x;
1577 			Int value = *Pointer<Int>(buffer);
1578 
1579 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[bgraWriteMask & 0xF][0]));
1580 
1581 			Int c01 = Extract(As<Int2>(current.x), 0);
1582 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1583 			if(bgraWriteMask != 0x0000000F)
1584 			{
1585 				mask01 &= channelMask;
1586 			}
1587 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1588 
1589 			buffer += pitchB;
1590 			value = *Pointer<Int>(buffer);
1591 
1592 			Int c23 = Extract(As<Int2>(current.x), 1);
1593 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1594 			if(bgraWriteMask != 0x0000000F)
1595 			{
1596 				mask23 &= channelMask;
1597 			}
1598 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1599 		}
1600 		break;
1601 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1602 		{
1603 			buffer += 2 * x;
1604 			Int value = *Pointer<Int>(buffer);
1605 
1606 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[bgraWriteMask & 0xF][0]));
1607 
1608 			Int c01 = Extract(As<Int2>(current.x), 0);
1609 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1610 			if(bgraWriteMask != 0x0000000F)
1611 			{
1612 				mask01 &= channelMask;
1613 			}
1614 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1615 
1616 			buffer += pitchB;
1617 			value = *Pointer<Int>(buffer);
1618 
1619 			Int c23 = Extract(As<Int2>(current.x), 1);
1620 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1621 			if(bgraWriteMask != 0x0000000F)
1622 			{
1623 				mask23 &= channelMask;
1624 			}
1625 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1626 		}
1627 		break;
1628 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1629 		{
1630 			buffer += 2 * x;
1631 			Int value = *Pointer<Int>(buffer);
1632 
1633 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[bgraWriteMask & 0xF][0]));
1634 
1635 			Int c01 = Extract(As<Int2>(current.x), 0);
1636 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1637 			if(bgraWriteMask != 0x0000000F)
1638 			{
1639 				mask01 &= channelMask;
1640 			}
1641 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1642 
1643 			buffer += pitchB;
1644 			value = *Pointer<Int>(buffer);
1645 
1646 			Int c23 = Extract(As<Int2>(current.x), 1);
1647 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1648 			if(bgraWriteMask != 0x0000000F)
1649 			{
1650 				mask23 &= channelMask;
1651 			}
1652 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1653 		}
1654 		break;
1655 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1656 		{
1657 			buffer += 2 * x;
1658 			Int value = *Pointer<Int>(buffer);
1659 
1660 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[bgraWriteMask & 0x7][0]));
1661 
1662 			Int c01 = Extract(As<Int2>(current.x), 0);
1663 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1664 			if((bgraWriteMask & 0x00000007) != 0x00000007)
1665 			{
1666 				mask01 &= channelMask;
1667 			}
1668 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1669 
1670 			buffer += pitchB;
1671 			value = *Pointer<Int>(buffer);
1672 
1673 			Int c23 = Extract(As<Int2>(current.x), 1);
1674 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1675 			if((bgraWriteMask & 0x00000007) != 0x00000007)
1676 			{
1677 				mask23 &= channelMask;
1678 			}
1679 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1680 		}
1681 		break;
1682 	case VK_FORMAT_B8G8R8A8_UNORM:
1683 	case VK_FORMAT_B8G8R8A8_SRGB:
1684 		{
1685 			buffer += x * 4;
1686 			Short4 value = *Pointer<Short4>(buffer);
1687 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[bgraWriteMask][0]));
1688 
1689 			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1690 			if(bgraWriteMask != 0x0000000F)
1691 			{
1692 				mask01 &= channelMask;
1693 			}
1694 			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1695 
1696 			buffer += pitchB;
1697 			value = *Pointer<Short4>(buffer);
1698 
1699 			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1700 			if(bgraWriteMask != 0x0000000F)
1701 			{
1702 				mask23 &= channelMask;
1703 			}
1704 			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1705 		}
1706 		break;
1707 	case VK_FORMAT_R8G8B8A8_UNORM:
1708 	case VK_FORMAT_R8G8B8A8_SRGB:
1709 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1710 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1711 		{
1712 			buffer += x * 4;
1713 			Short4 value = *Pointer<Short4>(buffer);
1714 			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
1715 
1716 			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1717 			if(rgbaWriteMask != 0x0000000F)
1718 			{
1719 				mask01 &= channelMask;
1720 			}
1721 			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1722 
1723 			buffer += pitchB;
1724 			value = *Pointer<Short4>(buffer);
1725 
1726 			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1727 			if(rgbaWriteMask != 0x0000000F)
1728 			{
1729 				mask23 &= channelMask;
1730 			}
1731 			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1732 		}
1733 		break;
1734 	case VK_FORMAT_R8G8_UNORM:
1735 		if((rgbaWriteMask & 0x00000003) != 0x0)
1736 		{
1737 			buffer += 2 * x;
1738 			Int2 value;
1739 			value = Insert(value, *Pointer<Int>(buffer), 0);
1740 			value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);
1741 
1742 			Int2 packedCol = As<Int2>(current.x);
1743 
1744 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1745 			if((rgbaWriteMask & 0x3) != 0x3)
1746 			{
1747 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1748 				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1749 				mergedMask &= rgbaMask;
1750 			}
1751 
1752 			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1753 
1754 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1755 			*Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
1756 		}
1757 		break;
1758 	case VK_FORMAT_R8_UNORM:
1759 		if(rgbaWriteMask & 0x00000001)
1760 		{
1761 			buffer += 1 * x;
1762 			Short4 value;
1763 			value = Insert(value, *Pointer<Short>(buffer), 0);
1764 			value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);
1765 
1766 			current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1767 			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1768 			current.x |= value;
1769 
1770 			*Pointer<Short>(buffer) = Extract(current.x, 0);
1771 			*Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
1772 		}
1773 		break;
1774 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1775 		rgbaWriteMask = bgraWriteMask;
1776 		// [[fallthrough]]
1777 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1778 		{
1779 			buffer += 4 * x;
1780 
1781 			Int2 value = *Pointer<Int2>(buffer, 16);
1782 			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1783 			if(rgbaWriteMask != 0xF)
1784 			{
1785 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1786 			}
1787 			*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
1788 
1789 			buffer += pitchB;
1790 
1791 			value = *Pointer<Int2>(buffer, 16);
1792 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1793 			if(rgbaWriteMask != 0xF)
1794 			{
1795 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1796 			}
1797 			*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
1798 		}
1799 		break;
1800 	default:
1801 		UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1802 	}
1803 }
1804 
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1805 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1806 {
1807 	bool inverse = (modifier == OneMinus);
1808 
1809 	if(format.isUnsignedNormalized())
1810 	{
1811 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU[component]))
1812 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU[component]));
1813 	}
1814 	else if(format.isSignedNormalized())
1815 	{
1816 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS[component]))
1817 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS[component]));
1818 	}
1819 	else  // Floating-point format
1820 	{
1821 		ASSERT(format.isFloatFormat());
1822 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF[component]))
1823 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF[component]));
1824 	}
1825 }
1826 
blendFactorRGB(Vector4f & blendFactor,const Vector4f & sourceColor,const Vector4f & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1827 void PixelRoutine::blendFactorRGB(Vector4f &blendFactor, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1828 {
1829 	switch(colorBlendFactor)
1830 	{
1831 	case VK_BLEND_FACTOR_ZERO:
1832 		blendFactor.x = Float4(0);
1833 		blendFactor.y = Float4(0);
1834 		blendFactor.z = Float4(0);
1835 		break;
1836 	case VK_BLEND_FACTOR_ONE:
1837 		blendFactor.x = Float4(1);
1838 		blendFactor.y = Float4(1);
1839 		blendFactor.z = Float4(1);
1840 		break;
1841 	case VK_BLEND_FACTOR_SRC_COLOR:
1842 		blendFactor.x = sourceColor.x;
1843 		blendFactor.y = sourceColor.y;
1844 		blendFactor.z = sourceColor.z;
1845 		break;
1846 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1847 		blendFactor.x = Float4(1.0f) - sourceColor.x;
1848 		blendFactor.y = Float4(1.0f) - sourceColor.y;
1849 		blendFactor.z = Float4(1.0f) - sourceColor.z;
1850 		break;
1851 	case VK_BLEND_FACTOR_DST_COLOR:
1852 		blendFactor.x = destColor.x;
1853 		blendFactor.y = destColor.y;
1854 		blendFactor.z = destColor.z;
1855 		break;
1856 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1857 		blendFactor.x = Float4(1.0f) - destColor.x;
1858 		blendFactor.y = Float4(1.0f) - destColor.y;
1859 		blendFactor.z = Float4(1.0f) - destColor.z;
1860 		break;
1861 	case VK_BLEND_FACTOR_SRC_ALPHA:
1862 		blendFactor.x = sourceColor.w;
1863 		blendFactor.y = sourceColor.w;
1864 		blendFactor.z = sourceColor.w;
1865 		break;
1866 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1867 		blendFactor.x = Float4(1.0f) - sourceColor.w;
1868 		blendFactor.y = Float4(1.0f) - sourceColor.w;
1869 		blendFactor.z = Float4(1.0f) - sourceColor.w;
1870 		break;
1871 	case VK_BLEND_FACTOR_DST_ALPHA:
1872 		blendFactor.x = destColor.w;
1873 		blendFactor.y = destColor.w;
1874 		blendFactor.z = destColor.w;
1875 		break;
1876 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1877 		blendFactor.x = Float4(1.0f) - destColor.w;
1878 		blendFactor.y = Float4(1.0f) - destColor.w;
1879 		blendFactor.z = Float4(1.0f) - destColor.w;
1880 		break;
1881 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1882 		blendFactor.x = Float4(1.0f) - destColor.w;
1883 		blendFactor.x = Min(blendFactor.x, sourceColor.w);
1884 		blendFactor.y = blendFactor.x;
1885 		blendFactor.z = blendFactor.x;
1886 		break;
1887 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1888 		blendFactor.x = Float4(blendConstant(format, 0));
1889 		blendFactor.y = Float4(blendConstant(format, 1));
1890 		blendFactor.z = Float4(blendConstant(format, 2));
1891 		break;
1892 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1893 		blendFactor.x = Float4(blendConstant(format, 3));
1894 		blendFactor.y = Float4(blendConstant(format, 3));
1895 		blendFactor.z = Float4(blendConstant(format, 3));
1896 		break;
1897 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1898 		blendFactor.x = Float4(blendConstant(format, 0, OneMinus));
1899 		blendFactor.y = Float4(blendConstant(format, 1, OneMinus));
1900 		blendFactor.z = Float4(blendConstant(format, 2, OneMinus));
1901 		break;
1902 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1903 		blendFactor.x = Float4(blendConstant(format, 3, OneMinus));
1904 		blendFactor.y = Float4(blendConstant(format, 3, OneMinus));
1905 		blendFactor.z = Float4(blendConstant(format, 3, OneMinus));
1906 		break;
1907 
1908 	default:
1909 		UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1910 	}
1911 
1912 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1913 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1914 	//  operations. If the color attachment is floating-point, no clamping occurs."
1915 	if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1916 	{
1917 		if(format.isUnsignedNormalized())
1918 		{
1919 			blendFactor.x = Min(Max(blendFactor.x, Float4(0.0f)), Float4(1.0f));
1920 			blendFactor.y = Min(Max(blendFactor.y, Float4(0.0f)), Float4(1.0f));
1921 			blendFactor.z = Min(Max(blendFactor.z, Float4(0.0f)), Float4(1.0f));
1922 		}
1923 		else if(format.isSignedNormalized())
1924 		{
1925 			blendFactor.x = Min(Max(blendFactor.x, Float4(-1.0f)), Float4(1.0f));
1926 			blendFactor.y = Min(Max(blendFactor.y, Float4(-1.0f)), Float4(1.0f));
1927 			blendFactor.z = Min(Max(blendFactor.z, Float4(-1.0f)), Float4(1.0f));
1928 		}
1929 	}
1930 }
1931 
blendFactorAlpha(Float4 & blendFactorAlpha,const Float4 & sourceAlpha,const Float4 & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1932 void PixelRoutine::blendFactorAlpha(Float4 &blendFactorAlpha, const Float4 &sourceAlpha, const Float4 &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1933 {
1934 	switch(alphaBlendFactor)
1935 	{
1936 	case VK_BLEND_FACTOR_ZERO:
1937 		blendFactorAlpha = Float4(0);
1938 		break;
1939 	case VK_BLEND_FACTOR_ONE:
1940 		blendFactorAlpha = Float4(1);
1941 		break;
1942 	case VK_BLEND_FACTOR_SRC_COLOR:
1943 		blendFactorAlpha = sourceAlpha;
1944 		break;
1945 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1946 		blendFactorAlpha = Float4(1.0f) - sourceAlpha;
1947 		break;
1948 	case VK_BLEND_FACTOR_DST_COLOR:
1949 		blendFactorAlpha = destAlpha;
1950 		break;
1951 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1952 		blendFactorAlpha = Float4(1.0f) - destAlpha;
1953 		break;
1954 	case VK_BLEND_FACTOR_SRC_ALPHA:
1955 		blendFactorAlpha = sourceAlpha;
1956 		break;
1957 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1958 		blendFactorAlpha = Float4(1.0f) - sourceAlpha;
1959 		break;
1960 	case VK_BLEND_FACTOR_DST_ALPHA:
1961 		blendFactorAlpha = destAlpha;
1962 		break;
1963 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1964 		blendFactorAlpha = Float4(1.0f) - destAlpha;
1965 		break;
1966 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1967 		blendFactorAlpha = Float4(1.0f);
1968 		break;
1969 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1970 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1971 		blendFactorAlpha = Float4(blendConstant(format, 3));
1972 		break;
1973 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1974 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1975 		blendFactorAlpha = Float4(blendConstant(format, 3, OneMinus));
1976 		break;
1977 	default:
1978 		UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1979 	}
1980 
1981 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1982 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1983 	//  operations. If the color attachment is floating-point, no clamping occurs."
1984 	if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1985 	{
1986 		if(format.isUnsignedNormalized())
1987 		{
1988 			blendFactorAlpha = Min(Max(blendFactorAlpha, Float4(0.0f)), Float4(1.0f));
1989 		}
1990 		else if(format.isSignedNormalized())
1991 		{
1992 			blendFactorAlpha = Min(Max(blendFactorAlpha, Float4(-1.0f)), Float4(1.0f));
1993 		}
1994 	}
1995 }
1996 
blendOpOverlay(Float4 & src,Float4 & dst)1997 Float4 PixelRoutine::blendOpOverlay(Float4 &src, Float4 &dst)
1998 {
1999 	Int4 largeDst = CmpGT(dst, Float4(0.5f));
2000 	return As<Float4>(
2001 	    (~largeDst &
2002 	     As<Int4>(Float4(2.0f) * src * dst)) |
2003 	    (largeDst &
2004 	     As<Int4>(Float4(1.0f) - (Float4(2.0f) * (Float4(1.0f) - src) * (Float4(1.0f) - dst)))));
2005 }
2006 
blendOpColorDodge(Float4 & src,Float4 & dst)2007 Float4 PixelRoutine::blendOpColorDodge(Float4 &src, Float4 &dst)
2008 {
2009 	Int4 srcBelowOne = CmpLT(src, Float4(1.0f));
2010 	Int4 positiveDst = CmpGT(dst, Float4(0.0f));
2011 	return As<Float4>(positiveDst & ((~srcBelowOne &
2012 	                                  As<Int4>(Float4(1.0f))) |
2013 	                                 (srcBelowOne &
2014 	                                  As<Int4>(Min(Float4(1.0f), (dst / (Float4(1.0f) - src)))))));
2015 }
2016 
blendOpColorBurn(Float4 & src,Float4 & dst)2017 Float4 PixelRoutine::blendOpColorBurn(Float4 &src, Float4 &dst)
2018 {
2019 	Int4 dstBelowOne = CmpLT(dst, Float4(1.0f));
2020 	Int4 positiveSrc = CmpGT(src, Float4(0.0f));
2021 	return As<Float4>(
2022 	    (~dstBelowOne &
2023 	     As<Int4>(Float4(1.0f))) |
2024 	    (dstBelowOne & positiveSrc &
2025 	     As<Int4>(Float4(1.0f) - Min(Float4(1.0f), (Float4(1.0f) - dst) / src))));
2026 }
2027 
blendOpHardlight(Float4 & src,Float4 & dst)2028 Float4 PixelRoutine::blendOpHardlight(Float4 &src, Float4 &dst)
2029 {
2030 	Int4 largeSrc = CmpGT(src, Float4(0.5f));
2031 	return As<Float4>(
2032 	    (~largeSrc &
2033 	     As<Int4>(Float4(2.0f) * src * dst)) |
2034 	    (largeSrc &
2035 	     As<Int4>(Float4(1.0f) - (Float4(2.0f) * (Float4(1.0f) - src) * (Float4(1.0f) - dst)))));
2036 }
2037 
blendOpSoftlight(Float4 & src,Float4 & dst)2038 Float4 PixelRoutine::blendOpSoftlight(Float4 &src, Float4 &dst)
2039 {
2040 	Int4 largeSrc = CmpGT(src, Float4(0.5f));
2041 	Int4 largeDst = CmpGT(dst, Float4(0.25f));
2042 
2043 	return As<Float4>(
2044 	    (~largeSrc &
2045 	     As<Int4>(dst - ((Float4(1.0f) - (Float4(2.0f) * src)) * dst * (Float4(1.0f) - dst)))) |
2046 	    (largeSrc & ((~largeDst &
2047 	                  As<Int4>(dst + (((Float4(2.0f) * src) - Float4(1.0f)) * dst * ((((Float4(16.0f) * dst) - Float4(12.0f)) * dst) + Float4(3.0f))))) |
2048 	                 (largeDst &
2049 	                  As<Int4>(dst + (((Float4(2.0f) * src) - Float4(1.0f)) * (Sqrt(dst) - dst)))))));
2050 }
2051 
maxRGB(Vector4f & c)2052 Float4 PixelRoutine::maxRGB(Vector4f &c)
2053 {
2054 	return Max(Max(c.x, c.y), c.z);
2055 }
2056 
minRGB(Vector4f & c)2057 Float4 PixelRoutine::minRGB(Vector4f &c)
2058 {
2059 	return Min(Min(c.x, c.y), c.z);
2060 }
2061 
setLumSat(Vector4f & cbase,Vector4f & csat,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2062 void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2063 {
2064 	Float4 minbase = minRGB(cbase);
2065 	Float4 sbase = maxRGB(cbase) - minbase;
2066 	Float4 ssat = maxRGB(csat) - minRGB(csat);
2067 	Int4 isNonZero = CmpGT(sbase, Float4(0.0f));
2068 	Vector4f color;
2069 	color.x = As<Float4>(isNonZero & As<Int4>((cbase.x - minbase) * ssat / sbase));
2070 	color.y = As<Float4>(isNonZero & As<Int4>((cbase.y - minbase) * ssat / sbase));
2071 	color.z = As<Float4>(isNonZero & As<Int4>((cbase.z - minbase) * ssat / sbase));
2072 	setLum(color, clum, x, y, z);
2073 }
2074 
lumRGB(Vector4f & c)2075 Float4 PixelRoutine::lumRGB(Vector4f &c)
2076 {
2077 	return c.x * Float4(0.3f) + c.y * Float4(0.59f) + c.z * Float4(0.11f);
2078 }
2079 
computeLum(Float4 & color,Float4 & lum,Float4 & mincol,Float4 & maxcol,Int4 & negative,Int4 & aboveOne)2080 Float4 PixelRoutine::computeLum(Float4 &color, Float4 &lum, Float4 &mincol, Float4 &maxcol, Int4 &negative, Int4 &aboveOne)
2081 {
2082 	return As<Float4>(
2083 	    (negative &
2084 	     As<Int4>(lum + ((color - lum) * lum) / (lum - mincol))) |
2085 	    (~negative &
2086 	     ((aboveOne &
2087 	       As<Int4>(lum + ((color - lum) * (Float4(1.0f) - lum)) / (Float4(maxcol) - lum))) |
2088 	      (~aboveOne &
2089 	       As<Int4>(color)))));
2090 }
2091 
setLum(Vector4f & cbase,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2092 void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2093 {
2094 	Float4 lbase = lumRGB(cbase);
2095 	Float4 llum = lumRGB(clum);
2096 	Float4 ldiff = llum - lbase;
2097 
2098 	Vector4f color;
2099 	color.x = cbase.x + ldiff;
2100 	color.y = cbase.y + ldiff;
2101 	color.z = cbase.z + ldiff;
2102 
2103 	Float4 lum = lumRGB(color);
2104 	Float4 mincol = minRGB(color);
2105 	Float4 maxcol = maxRGB(color);
2106 
2107 	Int4 negative = CmpLT(mincol, Float4(0.0f));
2108 	Int4 aboveOne = CmpGT(maxcol, Float4(1.0f));
2109 
2110 	x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
2111 	y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
2112 	z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
2113 }
2114 
premultiply(Vector4f & c)2115 void PixelRoutine::premultiply(Vector4f &c)
2116 {
2117 	Int4 nonZeroAlpha = CmpNEQ(c.w, Float4(0.0f));
2118 	c.x = As<Float4>(nonZeroAlpha & As<Int4>(c.x / c.w));
2119 	c.y = As<Float4>(nonZeroAlpha & As<Int4>(c.y / c.w));
2120 	c.z = As<Float4>(nonZeroAlpha & As<Int4>(c.z / c.w));
2121 }
2122 
computeAdvancedBlendMode(int index,const Vector4f & src,const Vector4f & dst,const Vector4f & srcFactor,const Vector4f & dstFactor)2123 Vector4f PixelRoutine::computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor)
2124 {
2125 	Vector4f srcColor = src;
2126 	srcColor.x *= srcFactor.x;
2127 	srcColor.y *= srcFactor.y;
2128 	srcColor.z *= srcFactor.z;
2129 	srcColor.w *= srcFactor.w;
2130 
2131 	Vector4f dstColor = dst;
2132 	dstColor.x *= dstFactor.x;
2133 	dstColor.y *= dstFactor.y;
2134 	dstColor.z *= dstFactor.z;
2135 	dstColor.w *= dstFactor.w;
2136 
2137 	premultiply(srcColor);
2138 	premultiply(dstColor);
2139 
2140 	Vector4f blendedColor;
2141 
2142 	switch(state.blendState[index].blendOperation)
2143 	{
2144 	case VK_BLEND_OP_MULTIPLY_EXT:
2145 		blendedColor.x = (srcColor.x * dstColor.x);
2146 		blendedColor.y = (srcColor.y * dstColor.y);
2147 		blendedColor.z = (srcColor.z * dstColor.z);
2148 		break;
2149 	case VK_BLEND_OP_SCREEN_EXT:
2150 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x);
2151 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y);
2152 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z);
2153 		break;
2154 	case VK_BLEND_OP_OVERLAY_EXT:
2155 		blendedColor.x = blendOpOverlay(srcColor.x, dstColor.x);
2156 		blendedColor.y = blendOpOverlay(srcColor.y, dstColor.y);
2157 		blendedColor.z = blendOpOverlay(srcColor.z, dstColor.z);
2158 		break;
2159 	case VK_BLEND_OP_DARKEN_EXT:
2160 		blendedColor.x = Min(srcColor.x, dstColor.x);
2161 		blendedColor.y = Min(srcColor.y, dstColor.y);
2162 		blendedColor.z = Min(srcColor.z, dstColor.z);
2163 		break;
2164 	case VK_BLEND_OP_LIGHTEN_EXT:
2165 		blendedColor.x = Max(srcColor.x, dstColor.x);
2166 		blendedColor.y = Max(srcColor.y, dstColor.y);
2167 		blendedColor.z = Max(srcColor.z, dstColor.z);
2168 		break;
2169 	case VK_BLEND_OP_COLORDODGE_EXT:
2170 		blendedColor.x = blendOpColorDodge(srcColor.x, dstColor.x);
2171 		blendedColor.y = blendOpColorDodge(srcColor.y, dstColor.y);
2172 		blendedColor.z = blendOpColorDodge(srcColor.z, dstColor.z);
2173 		break;
2174 	case VK_BLEND_OP_COLORBURN_EXT:
2175 		blendedColor.x = blendOpColorBurn(srcColor.x, dstColor.x);
2176 		blendedColor.y = blendOpColorBurn(srcColor.y, dstColor.y);
2177 		blendedColor.z = blendOpColorBurn(srcColor.z, dstColor.z);
2178 		break;
2179 	case VK_BLEND_OP_HARDLIGHT_EXT:
2180 		blendedColor.x = blendOpHardlight(srcColor.x, dstColor.x);
2181 		blendedColor.y = blendOpHardlight(srcColor.y, dstColor.y);
2182 		blendedColor.z = blendOpHardlight(srcColor.z, dstColor.z);
2183 		break;
2184 	case VK_BLEND_OP_SOFTLIGHT_EXT:
2185 		blendedColor.x = blendOpSoftlight(srcColor.x, dstColor.x);
2186 		blendedColor.y = blendOpSoftlight(srcColor.y, dstColor.y);
2187 		blendedColor.z = blendOpSoftlight(srcColor.z, dstColor.z);
2188 		break;
2189 	case VK_BLEND_OP_DIFFERENCE_EXT:
2190 		blendedColor.x = Abs(srcColor.x - dstColor.x);
2191 		blendedColor.y = Abs(srcColor.y - dstColor.y);
2192 		blendedColor.z = Abs(srcColor.z - dstColor.z);
2193 		break;
2194 	case VK_BLEND_OP_EXCLUSION_EXT:
2195 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x * Float4(2.0f));
2196 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y * Float4(2.0f));
2197 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z * Float4(2.0f));
2198 		break;
2199 	case VK_BLEND_OP_HSL_HUE_EXT:
2200 		setLumSat(srcColor, dstColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
2201 		break;
2202 	case VK_BLEND_OP_HSL_SATURATION_EXT:
2203 		setLumSat(dstColor, srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
2204 		break;
2205 	case VK_BLEND_OP_HSL_COLOR_EXT:
2206 		setLum(srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
2207 		break;
2208 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2209 		setLum(dstColor, srcColor, blendedColor.x, blendedColor.y, blendedColor.z);
2210 		break;
2211 	default:
2212 		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
2213 		break;
2214 	}
2215 
2216 	Float4 p = srcColor.w * dstColor.w;
2217 	blendedColor.x *= p;
2218 	blendedColor.y *= p;
2219 	blendedColor.z *= p;
2220 
2221 	p = srcColor.w * (Float4(1.0f) - dstColor.w);
2222 	blendedColor.x += srcColor.x * p;
2223 	blendedColor.y += srcColor.y * p;
2224 	blendedColor.z += srcColor.z * p;
2225 
2226 	p = dstColor.w * (Float4(1.0f) - srcColor.w);
2227 	blendedColor.x += dstColor.x * p;
2228 	blendedColor.y += dstColor.y * p;
2229 	blendedColor.z += dstColor.z * p;
2230 
2231 	return blendedColor;
2232 }
2233 
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)2234 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
2235 {
2236 	switch(blendFactor)
2237 	{
2238 	case VK_BLEND_FACTOR_ZERO:
2239 	case VK_BLEND_FACTOR_ONE:
2240 		return false;
2241 	case VK_BLEND_FACTOR_SRC_COLOR:
2242 	case VK_BLEND_FACTOR_SRC_ALPHA:
2243 		// Source values have been clamped after fragment shader execution if the attachment format is normalized.
2244 		return false;
2245 	case VK_BLEND_FACTOR_DST_COLOR:
2246 	case VK_BLEND_FACTOR_DST_ALPHA:
2247 		// Dest values have a valid range due to being read from the attachment.
2248 		return false;
2249 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
2250 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
2251 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
2252 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
2253 		// For signed formats, negative values cause the result to exceed 1.0.
2254 		return format.isSignedNormalized();
2255 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
2256 		// min(As, 1 - Ad)
2257 		return false;
2258 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
2259 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
2260 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
2261 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
2262 		return false;
2263 
2264 	default:
2265 		UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
2266 		return false;
2267 	}
2268 }
2269 
alphaBlend(int index,const Pointer<Byte> & cBuffer,const Vector4f & sourceColor,const Int & x)2270 Vector4f PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x)
2271 {
2272 	if(!state.blendState[index].alphaBlendEnable)
2273 	{
2274 		return sourceColor;
2275 	}
2276 
2277 	vk::Format format = state.colorFormat[index];
2278 	ASSERT(format.supportsColorAttachmentBlend());
2279 
2280 	Pointer<Byte> buffer = cBuffer;
2281 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2282 
2283 	// destColor holds four texel color values.
2284 	// Note: Despite the type being Vector4f, the colors may be stored as
2285 	// integers. Half-floats are stored as full 32-bit floats.
2286 	// Non-float and non-fixed point formats are not alpha blended.
2287 	Vector4f destColor;
2288 
2289 	switch(format)
2290 	{
2291 	case VK_FORMAT_R32_SINT:
2292 	case VK_FORMAT_R32_UINT:
2293 	case VK_FORMAT_R32_SFLOAT:
2294 		// FIXME: movlps
2295 		buffer += 4 * x;
2296 		destColor.x.x = *Pointer<Float>(buffer + 0);
2297 		destColor.x.y = *Pointer<Float>(buffer + 4);
2298 		buffer += pitchB;
2299 		// FIXME: movhps
2300 		destColor.x.z = *Pointer<Float>(buffer + 0);
2301 		destColor.x.w = *Pointer<Float>(buffer + 4);
2302 		destColor.y = destColor.z = destColor.w = Float4(1.0f);
2303 		break;
2304 	case VK_FORMAT_R32G32_SINT:
2305 	case VK_FORMAT_R32G32_UINT:
2306 	case VK_FORMAT_R32G32_SFLOAT:
2307 		buffer += 8 * x;
2308 		destColor.x = *Pointer<Float4>(buffer, 16);
2309 		buffer += pitchB;
2310 		destColor.y = *Pointer<Float4>(buffer, 16);
2311 		destColor.z = destColor.x;
2312 		destColor.x = ShuffleLowHigh(destColor.x, destColor.y, 0x0202);
2313 		destColor.z = ShuffleLowHigh(destColor.z, destColor.y, 0x1313);
2314 		destColor.y = destColor.z;
2315 		destColor.z = destColor.w = Float4(1.0f);
2316 		break;
2317 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2318 	case VK_FORMAT_R32G32B32A32_SINT:
2319 	case VK_FORMAT_R32G32B32A32_UINT:
2320 		buffer += 16 * x;
2321 		destColor.x = *Pointer<Float4>(buffer + 0, 16);
2322 		destColor.y = *Pointer<Float4>(buffer + 16, 16);
2323 		buffer += pitchB;
2324 		destColor.z = *Pointer<Float4>(buffer + 0, 16);
2325 		destColor.w = *Pointer<Float4>(buffer + 16, 16);
2326 		transpose4x4(destColor.x, destColor.y, destColor.z, destColor.w);
2327 		break;
2328 	case VK_FORMAT_R16_UNORM:
2329 		buffer += 2 * x;
2330 		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2331 		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
2332 		buffer += pitchB;
2333 		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2334 		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
2335 		destColor.x *= Float4(1.0f / 0xFFFF);
2336 		destColor.y = destColor.z = destColor.w = Float4(1.0f);
2337 		break;
2338 	case VK_FORMAT_R16_SFLOAT:
2339 		buffer += 2 * x;
2340 		destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2341 		destColor.x.y = Float(*Pointer<Half>(buffer + 2));
2342 		buffer += pitchB;
2343 		destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2344 		destColor.x.w = Float(*Pointer<Half>(buffer + 2));
2345 		destColor.y = destColor.z = destColor.w = Float4(1.0f);
2346 		break;
2347 	case VK_FORMAT_R16G16_UNORM:
2348 		buffer += 4 * x;
2349 		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2350 		destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
2351 		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
2352 		destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
2353 		buffer += pitchB;
2354 		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2355 		destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
2356 		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
2357 		destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
2358 		destColor.x *= Float4(1.0f / 0xFFFF);
2359 		destColor.y *= Float4(1.0f / 0xFFFF);
2360 		destColor.z = destColor.w = Float4(1.0f);
2361 		break;
2362 	case VK_FORMAT_R16G16_SFLOAT:
2363 		buffer += 4 * x;
2364 		destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2365 		destColor.y.x = Float(*Pointer<Half>(buffer + 2));
2366 		destColor.x.y = Float(*Pointer<Half>(buffer + 4));
2367 		destColor.y.y = Float(*Pointer<Half>(buffer + 6));
2368 		buffer += pitchB;
2369 		destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2370 		destColor.y.z = Float(*Pointer<Half>(buffer + 2));
2371 		destColor.x.w = Float(*Pointer<Half>(buffer + 4));
2372 		destColor.y.w = Float(*Pointer<Half>(buffer + 6));
2373 		destColor.z = destColor.w = Float4(1.0f);
2374 		break;
2375 	case VK_FORMAT_R16G16B16A16_UNORM:
2376 		buffer += 8 * x;
2377 		destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2378 		destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2379 		destColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2380 		destColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2381 		destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2382 		destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2383 		destColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2384 		destColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2385 		buffer += pitchB;
2386 		destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2387 		destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2388 		destColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2389 		destColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2390 		destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2391 		destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2392 		destColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2393 		destColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2394 		destColor.x *= Float4(1.0f / 0xFFFF);
2395 		destColor.y *= Float4(1.0f / 0xFFFF);
2396 		destColor.z *= Float4(1.0f / 0xFFFF);
2397 		destColor.w *= Float4(1.0f / 0xFFFF);
2398 		break;
2399 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2400 		buffer += 8 * x;
2401 		destColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
2402 		destColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
2403 		destColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
2404 		destColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
2405 		destColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
2406 		destColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
2407 		destColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
2408 		destColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
2409 		buffer += pitchB;
2410 		destColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
2411 		destColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
2412 		destColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
2413 		destColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
2414 		destColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
2415 		destColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
2416 		destColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
2417 		destColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
2418 		break;
2419 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2420 		buffer += 4 * x;
2421 		destColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2422 		destColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2423 		buffer += pitchB;
2424 		destColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2425 		destColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2426 		transpose4x3(destColor.x, destColor.y, destColor.z, destColor.w);
2427 		destColor.w = Float4(1.0f);
2428 		break;
2429 	default:
2430 		{
2431 			// Attempt to read an integer based format and convert it to float
2432 			Vector4s color;
2433 			readPixel(index, cBuffer, x, color);
2434 			destColor.x = convertFloat32(As<UShort4>(color.x));
2435 			destColor.y = convertFloat32(As<UShort4>(color.y));
2436 			destColor.z = convertFloat32(As<UShort4>(color.z));
2437 			destColor.w = convertFloat32(As<UShort4>(color.w));
2438 		}
2439 		break;
2440 	}
2441 
2442 	Vector4f sourceFactor;
2443 	Vector4f destFactor;
2444 
2445 	blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
2446 	blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
2447 	blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
2448 	blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
2449 
2450 	Vector4f blendedColor;
2451 
2452 	switch(state.blendState[index].blendOperation)
2453 	{
2454 	case VK_BLEND_OP_ADD:
2455 		blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
2456 		blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
2457 		blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
2458 		break;
2459 	case VK_BLEND_OP_SUBTRACT:
2460 		blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
2461 		blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
2462 		blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
2463 		break;
2464 	case VK_BLEND_OP_REVERSE_SUBTRACT:
2465 		blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
2466 		blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
2467 		blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
2468 		break;
2469 	case VK_BLEND_OP_MIN:
2470 		blendedColor.x = Min(sourceColor.x, destColor.x);
2471 		blendedColor.y = Min(sourceColor.y, destColor.y);
2472 		blendedColor.z = Min(sourceColor.z, destColor.z);
2473 		break;
2474 	case VK_BLEND_OP_MAX:
2475 		blendedColor.x = Max(sourceColor.x, destColor.x);
2476 		blendedColor.y = Max(sourceColor.y, destColor.y);
2477 		blendedColor.z = Max(sourceColor.z, destColor.z);
2478 		break;
2479 	case VK_BLEND_OP_SRC_EXT:
2480 		blendedColor.x = sourceColor.x * sourceFactor.x;  // TODO(b/204583457)
2481 		blendedColor.y = sourceColor.y * sourceFactor.y;  // TODO(b/204583457)
2482 		blendedColor.z = sourceColor.z * sourceFactor.z;  // TODO(b/204583457)
2483 		break;
2484 	case VK_BLEND_OP_DST_EXT:
2485 		blendedColor.x = destColor.x * destFactor.x;  // TODO(b/204583457)
2486 		blendedColor.y = destColor.y * destFactor.y;  // TODO(b/204583457)
2487 		blendedColor.z = destColor.z * destFactor.z;  // TODO(b/204583457)
2488 		break;
2489 	case VK_BLEND_OP_ZERO_EXT:
2490 		blendedColor.x = Float4(0.0f);
2491 		blendedColor.y = Float4(0.0f);
2492 		blendedColor.z = Float4(0.0f);
2493 		break;
2494 	case VK_BLEND_OP_MULTIPLY_EXT:
2495 	case VK_BLEND_OP_SCREEN_EXT:
2496 	case VK_BLEND_OP_OVERLAY_EXT:
2497 	case VK_BLEND_OP_DARKEN_EXT:
2498 	case VK_BLEND_OP_LIGHTEN_EXT:
2499 	case VK_BLEND_OP_COLORDODGE_EXT:
2500 	case VK_BLEND_OP_COLORBURN_EXT:
2501 	case VK_BLEND_OP_HARDLIGHT_EXT:
2502 	case VK_BLEND_OP_SOFTLIGHT_EXT:
2503 	case VK_BLEND_OP_DIFFERENCE_EXT:
2504 	case VK_BLEND_OP_EXCLUSION_EXT:
2505 	case VK_BLEND_OP_HSL_HUE_EXT:
2506 	case VK_BLEND_OP_HSL_SATURATION_EXT:
2507 	case VK_BLEND_OP_HSL_COLOR_EXT:
2508 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2509 		blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
2510 		break;
2511 	default:
2512 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
2513 	}
2514 
2515 	switch(state.blendState[index].blendOperationAlpha)
2516 	{
2517 	case VK_BLEND_OP_ADD:
2518 		blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
2519 		break;
2520 	case VK_BLEND_OP_SUBTRACT:
2521 		blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
2522 		break;
2523 	case VK_BLEND_OP_REVERSE_SUBTRACT:
2524 		blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
2525 		break;
2526 	case VK_BLEND_OP_MIN:
2527 		blendedColor.w = Min(sourceColor.w, destColor.w);
2528 		break;
2529 	case VK_BLEND_OP_MAX:
2530 		blendedColor.w = Max(sourceColor.w, destColor.w);
2531 		break;
2532 	case VK_BLEND_OP_SRC_EXT:
2533 		blendedColor.w = sourceColor.w * sourceFactor.w;  // TODO(b/204583457)
2534 		break;
2535 	case VK_BLEND_OP_DST_EXT:
2536 		blendedColor.w = destColor.w * destFactor.w;  // TODO(b/204583457)
2537 		break;
2538 	case VK_BLEND_OP_ZERO_EXT:
2539 		blendedColor.w = Float4(0.0f);
2540 		break;
2541 	case VK_BLEND_OP_MULTIPLY_EXT:
2542 		// All of the currently supported advanced blend modes compute the alpha the same way
2543 		// Use VK_BLEND_OP_MULTIPLY_EXT as a placeholder
2544 		blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
2545 		break;
2546 	default:
2547 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
2548 	}
2549 
2550 	return blendedColor;
2551 }
2552 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)2553 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
2554 {
2555 	vk::Format format = state.colorFormat[index];
2556 	switch(format)
2557 	{
2558 	case VK_FORMAT_R16G16B16A16_UNORM:
2559 		color.w = Min(Max(color.w, Float4(0.0f)), Float4(1.0f));  // TODO(b/204560089): Omit clamp if redundant
2560 		color.w = As<Float4>(RoundInt(color.w * Float4(0xFFFF)));
2561 		color.z = Min(Max(color.z, Float4(0.0f)), Float4(1.0f));  // TODO(b/204560089): Omit clamp if redundant
2562 		color.z = As<Float4>(RoundInt(color.z * Float4(0xFFFF)));
2563 		// [[fallthrough]]
2564 	case VK_FORMAT_R16G16_UNORM:
2565 		color.y = Min(Max(color.y, Float4(0.0f)), Float4(1.0f));  // TODO(b/204560089): Omit clamp if redundant
2566 		color.y = As<Float4>(RoundInt(color.y * Float4(0xFFFF)));
2567 		//[[fallthrough]]
2568 	case VK_FORMAT_R16_UNORM:
2569 		color.x = Min(Max(color.x, Float4(0.0f)), Float4(1.0f));  // TODO(b/204560089): Omit clamp if redundant
2570 		color.x = As<Float4>(RoundInt(color.x * Float4(0xFFFF)));
2571 		break;
2572 	default:
2573 		// TODO(b/204560089): Omit clamp if redundant
2574 		if(format.isUnsignedNormalized())
2575 		{
2576 			color.x = Min(Max(color.x, Float4(0.0f)), Float4(1.0f));
2577 			color.y = Min(Max(color.y, Float4(0.0f)), Float4(1.0f));
2578 			color.z = Min(Max(color.z, Float4(0.0f)), Float4(1.0f));
2579 			color.w = Min(Max(color.w, Float4(0.0f)), Float4(1.0f));
2580 		}
2581 		else if(format.isSignedNormalized())
2582 		{
2583 			color.x = Min(Max(color.x, Float4(-1.0f)), Float4(1.0f));
2584 			color.y = Min(Max(color.y, Float4(-1.0f)), Float4(1.0f));
2585 			color.z = Min(Max(color.z, Float4(-1.0f)), Float4(1.0f));
2586 			color.w = Min(Max(color.w, Float4(-1.0f)), Float4(1.0f));
2587 		}
2588 	}
2589 
2590 	switch(format)
2591 	{
2592 	case VK_FORMAT_R16_SFLOAT:
2593 	case VK_FORMAT_R32_SFLOAT:
2594 	case VK_FORMAT_R32_SINT:
2595 	case VK_FORMAT_R32_UINT:
2596 	case VK_FORMAT_R16_UNORM:
2597 	case VK_FORMAT_R16_SINT:
2598 	case VK_FORMAT_R16_UINT:
2599 	case VK_FORMAT_R8_SINT:
2600 	case VK_FORMAT_R8_UINT:
2601 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2602 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2603 		break;
2604 	case VK_FORMAT_R16G16_SFLOAT:
2605 	case VK_FORMAT_R32G32_SFLOAT:
2606 	case VK_FORMAT_R32G32_SINT:
2607 	case VK_FORMAT_R32G32_UINT:
2608 	case VK_FORMAT_R16G16_UNORM:
2609 	case VK_FORMAT_R16G16_SINT:
2610 	case VK_FORMAT_R16G16_UINT:
2611 	case VK_FORMAT_R8G8_SINT:
2612 	case VK_FORMAT_R8G8_UINT:
2613 		color.z = color.x;
2614 		color.x = UnpackLow(color.x, color.y);
2615 		color.z = UnpackHigh(color.z, color.y);
2616 		color.y = color.z;
2617 		break;
2618 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2619 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2620 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2621 	case VK_FORMAT_R32G32B32A32_SINT:
2622 	case VK_FORMAT_R32G32B32A32_UINT:
2623 	case VK_FORMAT_R16G16B16A16_UNORM:
2624 	case VK_FORMAT_R16G16B16A16_SINT:
2625 	case VK_FORMAT_R16G16B16A16_UINT:
2626 	case VK_FORMAT_R8G8B8A8_SINT:
2627 	case VK_FORMAT_R8G8B8A8_UINT:
2628 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2629 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2630 		transpose4x4(color.x, color.y, color.z, color.w);
2631 		break;
2632 	default:
2633 		UNSUPPORTED("VkFormat: %d", int(format));
2634 	}
2635 
2636 	int rgbaWriteMask = state.colorWriteActive(index);
2637 	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
2638 
2639 	Int xMask;  // Combination of all masks
2640 
2641 	if(state.depthTestActive)
2642 	{
2643 		xMask = zMask;
2644 	}
2645 	else
2646 	{
2647 		xMask = cMask;
2648 	}
2649 
2650 	if(state.stencilActive)
2651 	{
2652 		xMask &= sMask;
2653 	}
2654 
2655 	Pointer<Byte> buffer = cBuffer;
2656 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2657 	Float4 value;
2658 
2659 	switch(format)
2660 	{
2661 	case VK_FORMAT_R32_SFLOAT:
2662 	case VK_FORMAT_R32_SINT:
2663 	case VK_FORMAT_R32_UINT:
2664 		if(rgbaWriteMask & 0x00000001)
2665 		{
2666 			buffer += 4 * x;
2667 
2668 			// FIXME: movlps
2669 			value.x = *Pointer<Float>(buffer + 0);
2670 			value.y = *Pointer<Float>(buffer + 4);
2671 
2672 			buffer += pitchB;
2673 
2674 			// FIXME: movhps
2675 			value.z = *Pointer<Float>(buffer + 0);
2676 			value.w = *Pointer<Float>(buffer + 4);
2677 
2678 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2679 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2680 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2681 
2682 			// FIXME: movhps
2683 			*Pointer<Float>(buffer + 0) = color.x.z;
2684 			*Pointer<Float>(buffer + 4) = color.x.w;
2685 
2686 			buffer -= pitchB;
2687 
2688 			// FIXME: movlps
2689 			*Pointer<Float>(buffer + 0) = color.x.x;
2690 			*Pointer<Float>(buffer + 4) = color.x.y;
2691 		}
2692 		break;
2693 	case VK_FORMAT_R16_SFLOAT:
2694 		if(rgbaWriteMask & 0x00000001)
2695 		{
2696 			buffer += 2 * x;
2697 
2698 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2699 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2700 
2701 			buffer += pitchB;
2702 
2703 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2704 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2705 
2706 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2707 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2708 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2709 
2710 			*Pointer<Half>(buffer + 0) = Half(color.x.z);
2711 			*Pointer<Half>(buffer + 2) = Half(color.x.w);
2712 
2713 			buffer -= pitchB;
2714 
2715 			*Pointer<Half>(buffer + 0) = Half(color.x.x);
2716 			*Pointer<Half>(buffer + 2) = Half(color.x.y);
2717 		}
2718 		break;
2719 	case VK_FORMAT_R16_UNORM:
2720 	case VK_FORMAT_R16_SINT:
2721 	case VK_FORMAT_R16_UINT:
2722 		if(rgbaWriteMask & 0x00000001)
2723 		{
2724 			buffer += 2 * x;
2725 
2726 			UShort4 xyzw;
2727 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2728 
2729 			buffer += pitchB;
2730 
2731 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2732 			value = As<Float4>(Int4(xyzw));
2733 
2734 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2735 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2736 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2737 
2738 			Float component = color.x.z;
2739 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2740 			component = color.x.w;
2741 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2742 
2743 			buffer -= pitchB;
2744 
2745 			component = color.x.x;
2746 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2747 			component = color.x.y;
2748 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2749 		}
2750 		break;
2751 	case VK_FORMAT_R8_SINT:
2752 	case VK_FORMAT_R8_UINT:
2753 		if(rgbaWriteMask & 0x00000001)
2754 		{
2755 			buffer += x;
2756 
2757 			UInt xyzw, packedCol;
2758 
2759 			xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2760 			buffer += pitchB;
2761 			xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2762 
2763 			Short4 tmpCol = Short4(As<Int4>(color.x));
2764 			if(format == VK_FORMAT_R8_SINT)
2765 			{
2766 				tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2767 			}
2768 			else
2769 			{
2770 				tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2771 			}
2772 			packedCol = Extract(As<Int2>(tmpCol), 0);
2773 
2774 			packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2775 			            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2776 
2777 			*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2778 			buffer -= pitchB;
2779 			*Pointer<UShort>(buffer) = UShort(packedCol);
2780 		}
2781 		break;
2782 	case VK_FORMAT_R32G32_SFLOAT:
2783 	case VK_FORMAT_R32G32_SINT:
2784 	case VK_FORMAT_R32G32_UINT:
2785 		buffer += 8 * x;
2786 
2787 		value = *Pointer<Float4>(buffer);
2788 
2789 		if((rgbaWriteMask & 0x00000003) != 0x00000003)
2790 		{
2791 			Float4 masked = value;
2792 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2793 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2794 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2795 		}
2796 
2797 		color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2798 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2799 		color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2800 		*Pointer<Float4>(buffer) = color.x;
2801 
2802 		buffer += pitchB;
2803 
2804 		value = *Pointer<Float4>(buffer);
2805 
2806 		if((rgbaWriteMask & 0x00000003) != 0x00000003)
2807 		{
2808 			Float4 masked;
2809 
2810 			masked = value;
2811 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2812 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2813 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2814 		}
2815 
2816 		color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2817 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2818 		color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2819 		*Pointer<Float4>(buffer) = color.y;
2820 		break;
2821 	case VK_FORMAT_R16G16_SFLOAT:
2822 		if((rgbaWriteMask & 0x00000003) != 0x0)
2823 		{
2824 			buffer += 4 * x;
2825 
2826 			UInt2 rgbaMask;
2827 			UInt2 packedCol;
2828 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2829 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2830 
2831 			UShort4 value = *Pointer<UShort4>(buffer);
2832 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2833 			if((rgbaWriteMask & 0x3) != 0x3)
2834 			{
2835 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2836 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2837 				mergedMask &= rgbaMask;
2838 			}
2839 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2840 
2841 			buffer += pitchB;
2842 
2843 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2844 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2845 			value = *Pointer<UShort4>(buffer);
2846 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2847 			if((rgbaWriteMask & 0x3) != 0x3)
2848 			{
2849 				mergedMask &= rgbaMask;
2850 			}
2851 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2852 		}
2853 		break;
2854 	case VK_FORMAT_R16G16_UNORM:
2855 	case VK_FORMAT_R16G16_SINT:
2856 	case VK_FORMAT_R16G16_UINT:
2857 		if((rgbaWriteMask & 0x00000003) != 0x0)
2858 		{
2859 			buffer += 4 * x;
2860 
2861 			UInt2 rgbaMask;
2862 			UShort4 packedCol = UShort4(As<Int4>(color.x));
2863 			UShort4 value = *Pointer<UShort4>(buffer);
2864 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2865 			if((rgbaWriteMask & 0x3) != 0x3)
2866 			{
2867 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2868 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2869 				mergedMask &= rgbaMask;
2870 			}
2871 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2872 
2873 			buffer += pitchB;
2874 
2875 			packedCol = UShort4(As<Int4>(color.y));
2876 			value = *Pointer<UShort4>(buffer);
2877 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2878 			if((rgbaWriteMask & 0x3) != 0x3)
2879 			{
2880 				mergedMask &= rgbaMask;
2881 			}
2882 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2883 		}
2884 		break;
2885 	case VK_FORMAT_R8G8_SINT:
2886 	case VK_FORMAT_R8G8_UINT:
2887 		if((rgbaWriteMask & 0x00000003) != 0x0)
2888 		{
2889 			buffer += 2 * x;
2890 
2891 			Int2 xyzw, packedCol;
2892 
2893 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2894 			buffer += pitchB;
2895 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2896 
2897 			if(format == VK_FORMAT_R8G8_SINT)
2898 			{
2899 				packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2900 			}
2901 			else
2902 			{
2903 				packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2904 			}
2905 
2906 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2907 			if((rgbaWriteMask & 0x3) != 0x3)
2908 			{
2909 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2910 				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2911 				mergedMask &= rgbaMask;
2912 			}
2913 
2914 			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2915 
2916 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2917 			buffer -= pitchB;
2918 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2919 		}
2920 		break;
2921 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2922 	case VK_FORMAT_R32G32B32A32_SINT:
2923 	case VK_FORMAT_R32G32B32A32_UINT:
2924 		buffer += 16 * x;
2925 
2926 		{
2927 			value = *Pointer<Float4>(buffer, 16);
2928 
2929 			if(rgbaWriteMask != 0x0000000F)
2930 			{
2931 				Float4 masked = value;
2932 				color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2933 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2934 				color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2935 			}
2936 
2937 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2938 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2939 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2940 			*Pointer<Float4>(buffer, 16) = color.x;
2941 		}
2942 
2943 		{
2944 			value = *Pointer<Float4>(buffer + 16, 16);
2945 
2946 			if(rgbaWriteMask != 0x0000000F)
2947 			{
2948 				Float4 masked = value;
2949 				color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2950 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2951 				color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2952 			}
2953 
2954 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2955 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2956 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2957 			*Pointer<Float4>(buffer + 16, 16) = color.y;
2958 		}
2959 
2960 		buffer += pitchB;
2961 
2962 		{
2963 			value = *Pointer<Float4>(buffer, 16);
2964 
2965 			if(rgbaWriteMask != 0x0000000F)
2966 			{
2967 				Float4 masked = value;
2968 				color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2969 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2970 				color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2971 			}
2972 
2973 			color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2974 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2975 			color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2976 			*Pointer<Float4>(buffer, 16) = color.z;
2977 		}
2978 
2979 		{
2980 			value = *Pointer<Float4>(buffer + 16, 16);
2981 
2982 			if(rgbaWriteMask != 0x0000000F)
2983 			{
2984 				Float4 masked = value;
2985 				color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2986 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2987 				color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2988 			}
2989 
2990 			color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2991 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2992 			color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2993 			*Pointer<Float4>(buffer + 16, 16) = color.w;
2994 		}
2995 		break;
2996 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2997 		if((rgbaWriteMask & 0x0000000F) != 0x0)
2998 		{
2999 			buffer += 8 * x;
3000 
3001 			UInt4 rgbaMask;
3002 			UInt4 value = *Pointer<UInt4>(buffer);
3003 			UInt4 packedCol;
3004 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
3005 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
3006 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
3007 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
3008 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3009 			if((rgbaWriteMask & 0xF) != 0xF)
3010 			{
3011 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3012 				rgbaMask = UInt4(tmpMask, tmpMask);
3013 				mergedMask &= rgbaMask;
3014 			}
3015 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3016 
3017 			buffer += pitchB;
3018 
3019 			value = *Pointer<UInt4>(buffer);
3020 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
3021 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
3022 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
3023 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
3024 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3025 			if((rgbaWriteMask & 0xF) != 0xF)
3026 			{
3027 				mergedMask &= rgbaMask;
3028 			}
3029 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3030 		}
3031 		break;
3032 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
3033 		if((rgbaWriteMask & 0x7) != 0x0)
3034 		{
3035 			buffer += 4 * x;
3036 
3037 			UInt4 packedCol;
3038 			packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
3039 			packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
3040 			packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
3041 			packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
3042 
3043 			UInt4 value;
3044 			value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
3045 			value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
3046 			buffer += pitchB;
3047 			value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
3048 			value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
3049 
3050 			UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0][0]) + xMask * 16, 16);
3051 			if((rgbaWriteMask & 0x7) != 0x7)
3052 			{
3053 				mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[rgbaWriteMask & 0x7][0]), 16);
3054 			}
3055 			value = (packedCol & mask) | (value & ~mask);
3056 
3057 			*Pointer<UInt>(buffer + 0) = value.z;
3058 			*Pointer<UInt>(buffer + 4) = value.w;
3059 			buffer -= pitchB;
3060 			*Pointer<UInt>(buffer + 0) = value.x;
3061 			*Pointer<UInt>(buffer + 4) = value.y;
3062 		}
3063 		break;
3064 	case VK_FORMAT_R16G16B16A16_UNORM:
3065 	case VK_FORMAT_R16G16B16A16_SINT:
3066 	case VK_FORMAT_R16G16B16A16_UINT:
3067 		if((rgbaWriteMask & 0x0000000F) != 0x0)
3068 		{
3069 			buffer += 8 * x;
3070 
3071 			UInt4 rgbaMask;
3072 			UShort8 value = *Pointer<UShort8>(buffer);
3073 			UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
3074 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3075 			if((rgbaWriteMask & 0xF) != 0xF)
3076 			{
3077 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3078 				rgbaMask = UInt4(tmpMask, tmpMask);
3079 				mergedMask &= rgbaMask;
3080 			}
3081 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3082 
3083 			buffer += pitchB;
3084 
3085 			value = *Pointer<UShort8>(buffer);
3086 			packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
3087 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3088 			if((rgbaWriteMask & 0xF) != 0xF)
3089 			{
3090 				mergedMask &= rgbaMask;
3091 			}
3092 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3093 		}
3094 		break;
3095 	case VK_FORMAT_R8G8B8A8_SINT:
3096 	case VK_FORMAT_R8G8B8A8_UINT:
3097 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
3098 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
3099 		if((rgbaWriteMask & 0x0000000F) != 0x0)
3100 		{
3101 			UInt2 value, packedCol, mergedMask;
3102 
3103 			buffer += 4 * x;
3104 
3105 			bool isSigned = (format == VK_FORMAT_R8G8B8A8_SINT) || (format == VK_FORMAT_A8B8G8R8_SINT_PACK32);
3106 
3107 			if(isSigned)
3108 			{
3109 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3110 			}
3111 			else
3112 			{
3113 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3114 			}
3115 			value = *Pointer<UInt2>(buffer, 16);
3116 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3117 			if(rgbaWriteMask != 0xF)
3118 			{
3119 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3120 			}
3121 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3122 
3123 			buffer += pitchB;
3124 
3125 			if(isSigned)
3126 			{
3127 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3128 			}
3129 			else
3130 			{
3131 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3132 			}
3133 			value = *Pointer<UInt2>(buffer, 16);
3134 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3135 			if(rgbaWriteMask != 0xF)
3136 			{
3137 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3138 			}
3139 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3140 		}
3141 		break;
3142 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3143 		if((rgbaWriteMask & 0x0000000F) != 0x0)
3144 		{
3145 			Int2 mergedMask, packedCol, value;
3146 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3147 			              ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
3148 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3149 			              ((As<Int4>(color.x) & Int4(0x3ff)));
3150 
3151 			buffer += 4 * x;
3152 			value = *Pointer<Int2>(buffer, 16);
3153 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3154 			if(rgbaWriteMask != 0xF)
3155 			{
3156 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3157 			}
3158 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3159 
3160 			buffer += pitchB;
3161 
3162 			value = *Pointer<Int2>(buffer, 16);
3163 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3164 			if(rgbaWriteMask != 0xF)
3165 			{
3166 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3167 			}
3168 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3169 		}
3170 		break;
3171 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3172 		if((bgraWriteMask & 0x0000000F) != 0x0)
3173 		{
3174 			Int2 mergedMask, packedCol, value;
3175 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3176 			              ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
3177 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3178 			              ((As<Int4>(color.z) & Int4(0x3ff)));
3179 
3180 			buffer += 4 * x;
3181 			value = *Pointer<Int2>(buffer, 16);
3182 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3183 			if(bgraWriteMask != 0xF)
3184 			{
3185 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3186 			}
3187 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3188 
3189 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
3190 
3191 			value = *Pointer<Int2>(buffer, 16);
3192 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3193 			if(bgraWriteMask != 0xF)
3194 			{
3195 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3196 			}
3197 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3198 		}
3199 		break;
3200 	default:
3201 		UNSUPPORTED("VkFormat: %d", int(format));
3202 	}
3203 }
3204 
convertFixed16(const Float4 & cf,bool saturate)3205 UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
3206 {
3207 	return UShort4(cf * Float4(0xFFFF), saturate);
3208 }
3209 
convertFloat32(const UShort4 & cf)3210 Float4 PixelRoutine::convertFloat32(const UShort4 &cf)
3211 {
3212 	return Float4(cf) * Float4(1.0f / 65535.0f);
3213 }
3214 
sRGBtoLinear16_12_16(Vector4s & c)3215 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
3216 {
3217 	Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);
3218 
3219 	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3220 	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3221 	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3222 
3223 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3224 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3225 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3226 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3227 
3228 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3229 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3230 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3231 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3232 
3233 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3234 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3235 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3236 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3237 }
3238 
linearToSRGB16_12_16(Vector4s & c)3239 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
3240 {
3241 	c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3242 	c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3243 	c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3244 
3245 	linearToSRGB12_16(c);
3246 }
3247 
linearToSRGB12_16(Vector4s & c)3248 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
3249 {
3250 	Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);
3251 
3252 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3253 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3254 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3255 	c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3256 
3257 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3258 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3259 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3260 	c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3261 
3262 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3263 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3264 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3265 	c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3266 }
3267 
// Approximates x^2.2 as the weighted blend 0.73 * x^2 + 0.27 * x^3, with the
// result clamped to the [0, 1] range.
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	const auto squared = x * x;
	const auto cubed = squared * x;
	const auto blended = squared * Float4(0.73f) + cubed * Float4(0.27f);

	return Min(Max(blended, Float4(0.0f)), Float4(1.0f));
}
3275 
3276 }  // namespace sw
3277