• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "System/Math.hpp"
24 #include "Vulkan/VkPipelineLayout.hpp"
25 #include "Vulkan/VkStringify.hpp"
26 
27 namespace sw {
28 
PixelRoutine(const PixelProcessor::State & state,const vk::PipelineLayout * pipelineLayout,const SpirvShader * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)29 PixelRoutine::PixelRoutine(
30     const PixelProcessor::State &state,
31     const vk::PipelineLayout *pipelineLayout,
32     const SpirvShader *spirvShader,
33     const vk::DescriptorSet::Bindings &descriptorSets)
34     : QuadRasterizer(state, spirvShader)
35     , routine(pipelineLayout)
36     , descriptorSets(descriptorSets)
37     , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
38     , shaderContainsSampleQualifier(spirvShader && spirvShader->getAnalysis().ContainsSampleQualifier)
39     , perSampleShading((state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f)) ||
40                        shaderContainsSampleQualifier || shaderContainsInterpolation)  // TODO(b/194714095)
41     , invocationCount(perSampleShading ? state.multiSampleCount : 1)
42 {
43 	if(spirvShader)
44 	{
45 		spirvShader->emitProlog(&routine);
46 	}
47 }
48 
~PixelRoutine()49 PixelRoutine::~PixelRoutine()
50 {
51 }
52 
getSampleSet(int invocation) const53 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
54 {
55 	unsigned int sampleBegin = perSampleShading ? invocation : 0;
56 	unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
57 
58 	SampleSet samples;
59 
60 	for(unsigned int q = sampleBegin; q < sampleEnd; q++)
61 	{
62 		if(state.multiSampleMask & (1 << q))
63 		{
64 			samples.push_back(q);
65 		}
66 	}
67 
68 	return samples;
69 }
70 
// Generates the pixel pipeline for one quad. For each shading invocation this
// runs the stencil test, interpolates depth, performs the depth and depth-bounds
// tests (before the shader when early fragment tests apply, after it otherwise),
// interpolates the shader inputs, executes the fragment shader, applies the
// alpha test, and finally writes depth, stencil and blended color.
//
// cBuffer, zBuffer and sBuffer are the color, depth and stencil buffer pointers.
// cMask holds the per-sample coverage masks; it is narrowed in place by clip
// distances and is passed by mutable reference to the depth-bounds test and
// shader execution. x and y are forwarded to the tests, writes and builtins.
void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
{
	// Depth/stencil tests run ahead of the shader when there is no shader at all,
	// or when the shader declares the EarlyFragmentTests execution mode.
	const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;

	Int zMask[4];  // Depth mask
	Int sMask[4];  // Stencil mask
	SIMD::Float unclampedZ[4];  // Interpolated (and biased) depth before clampDepth(), handed to setBuiltins()

	for(int invocation = 0; invocation < invocationCount; invocation++)
	{
		SampleSet samples = getSampleSet(invocation);

		// Nothing to do for an invocation whose samples are all masked out.
		if(samples.empty())
		{
			continue;
		}

		// Depth and stencil masks start out equal to the coverage mask.
		for(unsigned int q : samples)
		{
			zMask[q] = cMask[q];
			sMask[q] = cMask[q];
		}

		stencilTest(sBuffer, x, sMask, samples);

		// Only assigned below when interpolateW() holds and centroid data is needed.
		SIMD::Float rhwCentroid;

		// Compute the x coordinate of each fragment in the SIMD group.
		const auto xMorton = SIMD::Float([](int i) { return float(compactEvenBits(i)); });  // 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, ...
		xFragment = SIMD::Float(Float(x)) + xMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)));

		if(interpolateZ())
		{
			for(unsigned int q : samples)
			{
				// Note: this local 'x' shadows the Int parameter 'x' within the loop body.
				SIMD::Float x = xFragment;

				if(state.enableMultiSampling)
				{
					// Offset by this sample's entry in the SampleLocationsX table.
					x -= SIMD::Float(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsX) + q * sizeof(float)));
				}

				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);

				if(state.depthBias)
				{
					z[q] += SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, zBias)));
				}

				// Remember the value before any depth clamping is applied.
				unclampedZ[q] = z[q];
			}
		}

		Bool depthPass = false;

		if(earlyFragmentTests)
		{
			for(unsigned int q : samples)
			{
				z[q] = clampDepth(z[q]);
				// depthPass accumulates across samples: set if any sample passes.
				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
				depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
			}

			writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
		}

		// With late tests the shader always has to run before depth can be decided.
		If(depthPass || !earlyFragmentTests)
		{
			if(earlyFragmentTests)
			{
				writeDepth(zBuffer, x, zMask, samples);
				occlusionSampleCount(zMask, sMask, samples);
			}

			// TODO(b/236162233): Use SIMD::Float2
			SIMD::Float xCentroid = 0.0f;
			SIMD::Float yCentroid = 0.0f;

			if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
			{
				// Starts at a tiny non-zero value so the reciprocal below is never taken of exactly zero.
				SIMD::Float weight = 1.0e-9f;

				for(unsigned int q : samples)
				{
					ASSERT(SIMD::Width == 4);
					// Table lookups indexed by this sample's coverage mask (16 bytes per entry).
					xCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]));
					yCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]));
					weight += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]));
				}

				// Normalize the accumulated offsets by the accumulated weight.
				weight = Rcp(weight, true /* relaxedPrecision */);
				xCentroid *= weight;
				yCentroid *= weight;

				// Make the centroid offsets relative to the fragment position.
				xCentroid += xFragment;
				yCentroid += yFragment;
			}

			if(interpolateW())
			{
				w = interpolate(xFragment, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
				rhw = reciprocal(w, false, true);

				if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
				{
					// NOTE(review): rhwCentroid is passed in here before it has been assigned;
					// presumably it is unused for SpirvRoutine::Linear interpolation -- confirm.
					rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid, primitive + OFFSET(Primitive, w), SpirvRoutine::Linear));
				}
			}

			if(spirvShader)
			{
				if(shaderContainsInterpolation)  // TODO(b/194714095)
				{
					// Hand the interpolation inputs computed above to the shader routine.
					routine.interpolationData.primitive = primitive;

					routine.interpolationData.x = xFragment;
					routine.interpolationData.y = yFragment;
					routine.interpolationData.rhw = rhw;

					routine.interpolationData.xCentroid = xCentroid;
					routine.interpolationData.yCentroid = yCentroid;
					routine.interpolationData.rhwCentroid = rhwCentroid;
				}

				SIMD::Float xSample = xFragment;
				SIMD::Float ySample = yFragment;

				if(perSampleShading && (state.multiSampleCount > 1))
				{
					// Only the first sample of the set is used for the offset.
					xSample += SampleLocationsX[samples[0]];
					ySample += SampleLocationsY[samples[0]];
				}

				// packedInterpolant counts only the used inputs and indexes Primitive::V and
				// routine.inputsInterpolation; interfaceInterpolant indexes the shader's
				// inputs, routine.inputs and Dv.
				int packedInterpolant = 0;
				for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
				{
					const auto &input = spirvShader->inputs[interfaceInterpolant];
					if(input.Type != Spirv::ATTRIBTYPE_UNUSED)
					{
						routine.inputsInterpolation[packedInterpolant] = input.Flat ? SpirvRoutine::Flat : (input.NoPerspective ? SpirvRoutine::Linear : SpirvRoutine::Perspective);
						if(input.Centroid && state.enableMultiSampling)
						{
							// Centroid-qualified input: evaluate at the centroid position.
							routine.inputs[interfaceInterpolant] =
							    SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid,
							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
							                                  routine.inputsInterpolation[packedInterpolant]);
						}
						else if(perSampleShading)
						{
							// Per-sample shading: evaluate at the sample position.
							routine.inputs[interfaceInterpolant] =
							    SpirvRoutine::interpolateAtXY(xSample, ySample, rhw,
							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
							                                  routine.inputsInterpolation[packedInterpolant]);
						}
						else
						{
							// Default: evaluate at the fragment position.
							routine.inputs[interfaceInterpolant] =
							    interpolate(xFragment, Dv[interfaceInterpolant], rhw,
							                primitive + OFFSET(Primitive, V[packedInterpolant]),
							                input.Flat, !input.NoPerspective);
						}
						packedInterpolant++;
					}
				}

				setBuiltins(x, y, unclampedZ, w, cMask, samples);

				for(uint32_t i = 0; i < state.numClipDistances; i++)
				{
					auto distance = interpolate(xFragment, DclipDistance[i], rhw,
					                            primitive + OFFSET(Primitive, clipDistance[i]),
					                            false, true);

					// Fragments with a negative clip distance lose their coverage.
					auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
					for(unsigned int q : samples)
					{
						// FIXME(b/148105887): Fragments discarded by clipping do not exist at
						// all -- they should not be counted in queries or have their Z/S effects
						// performed when early fragment tests are enabled.
						cMask[q] &= clipMask;
					}

					if(spirvShader->getUsedCapabilities().ClipDistance)
					{
						// Expose the interpolated distance through the ClipDistance input builtin, if declared.
						auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
						if(it != spirvShader->inputBuiltins.end())
						{
							if(i < it->second.SizeInComponents)
							{
								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
							}
						}
					}
				}

				if(spirvShader->getUsedCapabilities().CullDistance)
				{
					// Same for the CullDistance input builtin; cull distances do not affect coverage here.
					auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
					if(it != spirvShader->inputBuiltins.end())
					{
						for(uint32_t i = 0; i < state.numCullDistances; i++)
						{
							if(i < it->second.SizeInComponents)
							{
								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
								    interpolate(xFragment, DcullDistance[i], rhw,
								                primitive + OFFSET(Primitive, cullDistance[i]),
								                false, true);
							}
						}
					}
				}
			}

			if(spirvShader)
			{
				// With late tests the stencil/depth masks are not final yet, so cMask is
				// passed in their place.
				executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
			}

			Bool alphaPass = alphaTest(cMask, samples);

			if((spirvShader && spirvShader->coverageModified()) || state.alphaToCoverage)
			{
				// Coverage may have shrunk; propagate that to the depth and stencil masks.
				for(unsigned int q : samples)
				{
					zMask[q] &= cMask[q];
					sMask[q] &= cMask[q];
				}
			}

			If(alphaPass)
			{
				if(!earlyFragmentTests)
				{
					// Late depth and depth-bounds tests, mirroring the early path above.
					for(unsigned int q : samples)
					{
						z[q] = clampDepth(z[q]);
						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
						depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
					}
				}

				If(depthPass)
				{
					if(!earlyFragmentTests)
					{
						writeDepth(zBuffer, x, zMask, samples);
						occlusionSampleCount(zMask, sMask, samples);
					}

					blendColor(cBuffer, x, sMask, zMask, cMask, samples);
				}
			}
		}

		// With late tests the stencil write is emitted last, outside the depthPass block.
		if(!earlyFragmentTests)
		{
			writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
		}
	}
}
333 
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)334 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
335 {
336 	if(!state.stencilActive)
337 	{
338 		return;
339 	}
340 
341 	for(unsigned int q : samples)
342 	{
343 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
344 
345 		Pointer<Byte> buffer = sBuffer + x;
346 
347 		if(q > 0)
348 		{
349 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
350 		}
351 
352 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
353 		Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
354 		value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
355 		Byte8 valueBack = value;
356 
357 		if(state.frontStencil.useCompareMask)
358 		{
359 			value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
360 		}
361 
362 		stencilTest(value, state.frontStencil.compareOp, false);
363 
364 		if(state.backStencil.useCompareMask)
365 		{
366 			valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
367 		}
368 
369 		stencilTest(valueBack, state.backStencil.compareOp, true);
370 
371 		value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
372 		valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
373 		value |= valueBack;
374 
375 		sMask[q] &= SignMask(value);
376 	}
377 }
378 
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)379 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
380 {
381 	Byte8 equal;
382 
383 	switch(stencilCompareMode)
384 	{
385 	case VK_COMPARE_OP_ALWAYS:
386 		value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
387 		break;
388 	case VK_COMPARE_OP_NEVER:
389 		value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
390 		break;
391 	case VK_COMPARE_OP_LESS:  // a < b ~ b > a
392 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
393 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
394 		break;
395 	case VK_COMPARE_OP_EQUAL:
396 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
397 		break;
398 	case VK_COMPARE_OP_NOT_EQUAL:  // a != b ~ !(a == b)
399 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
400 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
401 		break;
402 	case VK_COMPARE_OP_LESS_OR_EQUAL:  // a <= b ~ (b > a) || (a == b)
403 		equal = value;
404 		equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
405 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
406 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
407 		value |= equal;
408 		break;
409 	case VK_COMPARE_OP_GREATER:  // a > b
410 		equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
411 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
412 		equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
413 		value = equal;
414 		break;
415 	case VK_COMPARE_OP_GREATER_OR_EQUAL:  // a >= b ~ !(a < b) ~ !(b > a)
416 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
417 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
418 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
419 		break;
420 	default:
421 		UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
422 	}
423 }
424 
readDepth32F(const Pointer<Byte> & zBuffer,int q,const Int & x) const425 SIMD::Float PixelRoutine::readDepth32F(const Pointer<Byte> &zBuffer, int q, const Int &x) const
426 {
427 	ASSERT(SIMD::Width == 4);
428 	Pointer<Byte> buffer = zBuffer + 4 * x;
429 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
430 
431 	if(q > 0)
432 	{
433 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
434 	}
435 
436 	Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
437 	return SIMD::Float(zValue);
438 }
439 
readDepth16(const Pointer<Byte> & zBuffer,int q,const Int & x) const440 SIMD::Float PixelRoutine::readDepth16(const Pointer<Byte> &zBuffer, int q, const Int &x) const
441 {
442 	ASSERT(SIMD::Width == 4);
443 	Pointer<Byte> buffer = zBuffer + 2 * x;
444 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
445 
446 	if(q > 0)
447 	{
448 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
449 	}
450 
451 	UShort4 zValue16;
452 	zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer), 0));
453 	zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer + pitch), 1));
454 	Float4 zValue = Float4(zValue16);
455 	return SIMD::Float(zValue);
456 }
457 
// Clamps z to [state.minDepthClamp, state.maxDepthClamp] when depth clamping is
// enabled; otherwise returns z unchanged.
SIMD::Float PixelRoutine::clampDepth(const SIMD::Float &z)
{
	if(state.depthClamp)
	{
		return Min(Max(z, state.minDepthClamp), state.maxDepthClamp);
	}

	return z;
}
467 
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const SIMD::Float & z,const Int & sMask,Int & zMask,const Int & cMask)468 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
469 {
470 	if(!state.depthTestActive)
471 	{
472 		return true;
473 	}
474 
475 	SIMD::Float Z;
476 	SIMD::Float zValue;
477 
478 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
479 	{
480 		switch(state.depthFormat)
481 		{
482 		case VK_FORMAT_D16_UNORM:
483 			Z = Min(Max(Round(z * 0xFFFF), 0.0f), 0xFFFF);
484 			zValue = readDepth16(zBuffer, q, x);
485 			break;
486 		case VK_FORMAT_D32_SFLOAT:
487 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
488 			Z = z;
489 			zValue = readDepth32F(zBuffer, q, x);
490 			break;
491 		default:
492 			UNSUPPORTED("Depth format: %d", int(state.depthFormat));
493 			return false;
494 		}
495 	}
496 
497 	SIMD::Int zTest;
498 
499 	switch(state.depthCompareMode)
500 	{
501 	case VK_COMPARE_OP_ALWAYS:
502 		// Optimized
503 		break;
504 	case VK_COMPARE_OP_NEVER:
505 		// Optimized
506 		break;
507 	case VK_COMPARE_OP_EQUAL:
508 		zTest = CmpEQ(zValue, Z);
509 		break;
510 	case VK_COMPARE_OP_NOT_EQUAL:
511 		zTest = CmpNEQ(zValue, Z);
512 		break;
513 	case VK_COMPARE_OP_LESS:
514 		zTest = CmpNLE(zValue, Z);
515 		break;
516 	case VK_COMPARE_OP_GREATER_OR_EQUAL:
517 		zTest = CmpLE(zValue, Z);
518 		break;
519 	case VK_COMPARE_OP_LESS_OR_EQUAL:
520 		zTest = CmpNLT(zValue, Z);
521 		break;
522 	case VK_COMPARE_OP_GREATER:
523 		zTest = CmpLT(zValue, Z);
524 		break;
525 	default:
526 		UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
527 	}
528 
529 	switch(state.depthCompareMode)
530 	{
531 	case VK_COMPARE_OP_ALWAYS:
532 		zMask = cMask;
533 		break;
534 	case VK_COMPARE_OP_NEVER:
535 		zMask = 0x0;
536 		break;
537 	default:
538 		zMask = SignMask(zTest) & cMask;
539 		break;
540 	}
541 
542 	if(state.stencilActive)
543 	{
544 		zMask &= sMask;
545 	}
546 
547 	return zMask != 0;
548 }
549 
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)550 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
551 {
552 	Pointer<Byte> buffer = zBuffer + 2 * x;
553 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
554 
555 	if(q > 0)
556 	{
557 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
558 	}
559 
560 	Float4 minDepthBound(state.minDepthBounds);
561 	Float4 maxDepthBound(state.maxDepthBounds);
562 
563 	Int2 z;
564 	z = Insert(z, *Pointer<Int>(buffer), 0);
565 	z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
566 
567 	Float4 zValue = Float4(As<UShort4>(z)) * (1.0f / 0xFFFF);
568 	return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
569 }
570 
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)571 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
572 {
573 	Pointer<Byte> buffer = zBuffer + 4 * x;
574 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
575 
576 	if(q > 0)
577 	{
578 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
579 	}
580 
581 	Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
582 	return Int4(CmpLE(state.minDepthBounds, zValue) & CmpLE(zValue, state.maxDepthBounds));
583 }
584 
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)585 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
586 {
587 	if(!state.depthBoundsTestActive)
588 	{
589 		return;
590 	}
591 
592 	Int4 zTest;
593 	switch(state.depthFormat)
594 	{
595 	case VK_FORMAT_D16_UNORM:
596 		zTest = depthBoundsTest16(zBuffer, q, x);
597 		break;
598 	case VK_FORMAT_D32_SFLOAT:
599 	case VK_FORMAT_D32_SFLOAT_S8_UINT:
600 		zTest = depthBoundsTest32F(zBuffer, q, x);
601 		break;
602 	default:
603 		UNSUPPORTED("Depth format: %d", int(state.depthFormat));
604 		break;
605 	}
606 
607 	if(!state.depthTestActive)
608 	{
609 		cMask &= zMask & SignMask(zTest);
610 	}
611 	else
612 	{
613 		zMask &= cMask & SignMask(zTest);
614 	}
615 }
616 
alphaToCoverage(Int cMask[4],const SIMD::Float & alpha,const SampleSet & samples)617 void PixelRoutine::alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples)
618 {
619 	static const int a2c[4] = {
620 		OFFSET(DrawData, a2c0),
621 		OFFSET(DrawData, a2c1),
622 		OFFSET(DrawData, a2c2),
623 		OFFSET(DrawData, a2c3),
624 	};
625 
626 	for(unsigned int q : samples)
627 	{
628 		SIMD::Int coverage = CmpNLT(alpha, SIMD::Float(*Pointer<Float>(data + a2c[q])));
629 		Int aMask = SignMask(coverage);
630 		cMask[q] &= aMask;
631 	}
632 }
633 
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)634 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
635 {
636 	Float4 Z = z;
637 
638 	Pointer<Byte> buffer = zBuffer + 4 * x;
639 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
640 
641 	if(q > 0)
642 	{
643 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
644 	}
645 
646 	Float4 zValue;
647 
648 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
649 	{
650 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
651 	}
652 
653 	Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
654 	zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
655 	Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
656 
657 	*Pointer<Float2>(buffer) = Float2(Z.xy);
658 	*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
659 }
660 
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)661 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
662 {
663 	Short4 Z = UShort4(Round(z * 0xFFFF), true);
664 
665 	Pointer<Byte> buffer = zBuffer + 2 * x;
666 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
667 
668 	if(q > 0)
669 	{
670 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
671 	}
672 
673 	Short4 zValue;
674 
675 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
676 	{
677 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
678 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
679 	}
680 
681 	Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
682 	zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
683 	Z = Z | zValue;
684 
685 	*Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
686 	*Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
687 }
688 
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)689 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
690 {
691 	if(!state.depthWriteEnable)
692 	{
693 		return;
694 	}
695 
696 	for(unsigned int q : samples)
697 	{
698 		ASSERT(SIMD::Width == 4);
699 		switch(state.depthFormat)
700 		{
701 		case VK_FORMAT_D16_UNORM:
702 			writeDepth16(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
703 			break;
704 		case VK_FORMAT_D32_SFLOAT:
705 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
706 			writeDepth32F(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
707 			break;
708 		default:
709 			UNSUPPORTED("Depth format: %d", int(state.depthFormat));
710 			break;
711 		}
712 	}
713 }
714 
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)715 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
716 {
717 	if(!state.occlusionEnabled)
718 	{
719 		return;
720 	}
721 
722 	for(unsigned int q : samples)
723 	{
724 		occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
725 	}
726 }
727 
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)728 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
729 {
730 	if(!state.stencilActive)
731 	{
732 		return;
733 	}
734 
735 	if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
736 	{
737 		if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
738 		{
739 			return;
740 		}
741 	}
742 
743 	if(!state.frontStencil.writeEnabled && !state.backStencil.writeEnabled)
744 	{
745 		return;
746 	}
747 
748 	for(unsigned int q : samples)
749 	{
750 		Pointer<Byte> buffer = sBuffer + x;
751 
752 		if(q > 0)
753 		{
754 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
755 		}
756 
757 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
758 		Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
759 		bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
760 		Byte8 newValue = stencilOperation(bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
761 
762 		if(state.frontStencil.useWriteMask)  // Assume 8-bit stencil buffer
763 		{
764 			Byte8 maskedValue = bufferValue;
765 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
766 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
767 			newValue |= maskedValue;
768 		}
769 
770 		Byte8 newValueBack = stencilOperation(bufferValue, state.backStencil, true, zMask[q], sMask[q]);
771 
772 		if(state.backStencil.useWriteMask)  // Assume 8-bit stencil buffer
773 		{
774 			Byte8 maskedValue = bufferValue;
775 			newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
776 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
777 			newValueBack |= maskedValue;
778 		}
779 
780 		newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
781 		newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
782 		newValue |= newValueBack;
783 
784 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
785 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
786 		newValue |= bufferValue;
787 
788 		*Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
789 		*Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
790 	}
791 }
792 
// Computes the new stencil values for one face by combining the results of the
// pass, depth-fail and stencil-fail operations. zMask selects pass over
// depth-fail, then sMask selects that result over stencil-fail. Operations that
// cannot produce a different value are skipped.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
{
	// chosen = (chosen & maskB4Q entry) | (fallback & invMaskB4Q entry), both entries selected by 'mask'.
	const auto selectByMask = [this](Byte8 &chosen, Byte8 &fallback, const Int &mask) {
		chosen &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * mask);
		fallback &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * mask);
		chosen |= fallback;
	};

	Byte8 result = stencilOperation(bufferValue, ops.passOp, isBack);

	const bool depthFailDiffers = state.depthTestActive && (ops.depthFailOp != ops.passOp);  // zMask valid and values not the same

	if(depthFailDiffers)
	{
		Byte8 depthFailed = stencilOperation(bufferValue, ops.depthFailOp, isBack);
		selectByMask(result, depthFailed, zMask);
	}

	const bool stencilFailDiffers = (ops.failOp != ops.passOp) ||
	                                (state.depthTestActive && ops.failOp != ops.depthFailOp);

	if(stencilFailDiffers)
	{
		Byte8 stencilFailed = stencilOperation(bufferValue, ops.failOp, isBack);
		selectByMask(result, stencilFailed, sMask);
	}

	return result;
}
817 
hasStencilReplaceRef() const818 bool PixelRoutine::hasStencilReplaceRef() const
819 {
820 	return spirvShader &&
821 	       (spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT) !=
822 	        spirvShader->outputBuiltins.end());
823 }
824 
stencilReplaceRef()825 Byte8 PixelRoutine::stencilReplaceRef()
826 {
827 	ASSERT(spirvShader);
828 
829 	auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
830 	ASSERT(it != spirvShader->outputBuiltins.end());
831 
832 	UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
833 	// TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
834 	//                     following line by either adding a rr::Shuffle() variant to do
835 	//                     it explicitly or adding a Byte4(Int4) constructor would work.
836 	sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
837 
838 	UInt2 sRefDuplicated;
839 	sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
840 	sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
841 	return As<Byte8>(sRefDuplicated);
842 }
843 
// Applies a single stencil operation to bufferValue and returns the result.
// When the shader exports FragStencilRefEXT, the shader-provided value is
// returned regardless of 'operation'. An unsupported operation is reported and
// yields zero.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
{
	if(hasStencilReplaceRef())
	{
		return stencilReplaceRef();
	}

	switch(operation)
	{
	case VK_STENCIL_OP_KEEP:
		return bufferValue;
	case VK_STENCIL_OP_ZERO:
		return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
	case VK_STENCIL_OP_REPLACE:
		// The reference value of the selected face.
		return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
	case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
		return AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
	case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
		return SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
	case VK_STENCIL_OP_INVERT:
		return bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
	case VK_STENCIL_OP_INCREMENT_AND_WRAP:
		return bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
	case VK_STENCIL_OP_DECREMENT_AND_WRAP:
		return bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
	default:
		UNSUPPORTED("VkStencilOp: %d", int(operation));
	}

	// Only reached for an unsupported operation.
	return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
}
877 
isSRGB(int index) const878 bool PixelRoutine::isSRGB(int index) const
879 {
880 	return vk::Format(state.colorFormat[index]).isSRGBformat();
881 }
882 
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)883 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
884 {
885 	Short4 c01;
886 	Short4 c23;
887 	Pointer<Byte> buffer = cBuffer;
888 	Pointer<Byte> buffer2;
889 
890 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
891 
892 	vk::Format format = state.colorFormat[index];
893 	switch(format)
894 	{
895 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
896 		buffer += 2 * x;
897 		buffer2 = buffer + pitchB;
898 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
899 
900 		pixel.x = (c01 & Short4(0xF000u));
901 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
902 		pixel.z = (c01 & Short4(0x00F0u)) << 8;
903 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
904 
905 		// Expand to 16 bit range
906 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
907 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
908 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
909 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
910 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
911 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
912 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
913 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
914 		break;
915 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
916 		buffer += 2 * x;
917 		buffer2 = buffer + pitchB;
918 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
919 
920 		pixel.z = (c01 & Short4(0xF000u));
921 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
922 		pixel.x = (c01 & Short4(0x00F0u)) << 8;
923 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
924 
925 		// Expand to 16 bit range
926 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
927 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
928 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
929 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
930 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
931 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
932 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
933 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
934 		break;
935 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
936 		buffer += 2 * x;
937 		buffer2 = buffer + pitchB;
938 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
939 
940 		pixel.w = (c01 & Short4(0xF000u));
941 		pixel.z = (c01 & Short4(0x0F00u)) << 4;
942 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
943 		pixel.x = (c01 & Short4(0x000Fu)) << 12;
944 
945 		// Expand to 16 bit range
946 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
947 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
948 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
949 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
950 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
951 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
952 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
953 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
954 		break;
955 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
956 		buffer += 2 * x;
957 		buffer2 = buffer + pitchB;
958 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
959 
960 		pixel.w = (c01 & Short4(0xF000u));
961 		pixel.x = (c01 & Short4(0x0F00u)) << 4;
962 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
963 		pixel.z = (c01 & Short4(0x000Fu)) << 12;
964 
965 		// Expand to 16 bit range
966 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
967 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
968 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
969 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
970 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
971 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
972 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
973 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
974 		break;
975 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
976 		buffer += 2 * x;
977 		buffer2 = buffer + pitchB;
978 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
979 
980 		pixel.x = (c01 & Short4(0xF800u));
981 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
982 		pixel.z = (c01 & Short4(0x003Eu)) << 10;
983 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
984 
985 		// Expand to 16 bit range
986 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
987 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
988 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
989 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
990 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
991 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
992 		break;
993 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
994 		buffer += 2 * x;
995 		buffer2 = buffer + pitchB;
996 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
997 
998 		pixel.z = (c01 & Short4(0xF800u));
999 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
1000 		pixel.x = (c01 & Short4(0x003Eu)) << 10;
1001 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1002 
1003 		// Expand to 16 bit range
1004 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1005 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1006 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1007 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1008 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1009 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1010 		break;
1011 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1012 		buffer += 2 * x;
1013 		buffer2 = buffer + pitchB;
1014 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1015 
1016 		pixel.x = (c01 & Short4(0x7C00u)) << 1;
1017 		pixel.y = (c01 & Short4(0x03E0u)) << 6;
1018 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1019 		pixel.w = (c01 & Short4(0x8000u)) >> 15;
1020 
1021 		// Expand to 16 bit range
1022 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1023 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1024 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1025 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1026 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1027 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1028 		break;
1029 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1030 		buffer += 2 * x;
1031 		buffer2 = buffer + pitchB;
1032 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1033 
1034 		pixel.x = c01 & Short4(0xF800u);
1035 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1036 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1037 		pixel.w = Short4(0xFFFFu);
1038 
1039 		// Expand to 16 bit range
1040 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1041 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1042 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1043 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1044 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1045 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1046 		break;
1047 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1048 		buffer += 2 * x;
1049 		buffer2 = buffer + pitchB;
1050 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1051 
1052 		pixel.z = c01 & Short4(0xF800u);
1053 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1054 		pixel.x = (c01 & Short4(0x001Fu)) << 11;
1055 		pixel.w = Short4(0xFFFFu);
1056 
1057 		// Expand to 16 bit range
1058 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1059 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1060 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1061 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1062 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1063 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1064 		break;
1065 	case VK_FORMAT_B8G8R8A8_UNORM:
1066 	case VK_FORMAT_B8G8R8A8_SRGB:
1067 		buffer += 4 * x;
1068 		c01 = *Pointer<Short4>(buffer);
1069 		buffer += pitchB;
1070 		c23 = *Pointer<Short4>(buffer);
1071 		pixel.z = c01;
1072 		pixel.y = c01;
1073 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1074 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1075 		pixel.x = pixel.z;
1076 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1077 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1078 		pixel.y = pixel.z;
1079 		pixel.w = pixel.x;
1080 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1081 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1082 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1083 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1084 		break;
1085 	case VK_FORMAT_R8G8B8A8_UNORM:
1086 	case VK_FORMAT_R8G8B8A8_SRGB:
1087 		buffer += 4 * x;
1088 		c01 = *Pointer<Short4>(buffer);
1089 		buffer += pitchB;
1090 		c23 = *Pointer<Short4>(buffer);
1091 		pixel.z = c01;
1092 		pixel.y = c01;
1093 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1094 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1095 		pixel.x = pixel.z;
1096 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1097 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1098 		pixel.y = pixel.z;
1099 		pixel.w = pixel.x;
1100 		pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1101 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1102 		pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1103 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1104 		break;
1105 	case VK_FORMAT_R8_UNORM:
1106 		buffer += 1 * x;
1107 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1108 		buffer += pitchB;
1109 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1110 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1111 		pixel.y = Short4(0x0000);
1112 		pixel.z = Short4(0x0000);
1113 		pixel.w = Short4(0xFFFFu);
1114 		break;
1115 	case VK_FORMAT_R8G8_UNORM:
1116 		buffer += 2 * x;
1117 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1118 		buffer += pitchB;
1119 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1120 		pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1121 		pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1122 		pixel.z = Short4(0x0000u);
1123 		pixel.w = Short4(0xFFFFu);
1124 		break;
1125 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1126 		{
1127 			Int4 v = Int4(0);
1128 			buffer += 4 * x;
1129 			v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1130 			v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1131 			buffer += pitchB;
1132 			v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1133 			v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1134 
1135 			pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1136 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1137 			pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1138 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1139 
1140 			// Expand to 16 bit range
1141 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1142 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1143 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1144 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1145 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1146 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1147 		}
1148 		break;
1149 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1150 		{
1151 			Int4 v = Int4(0);
1152 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1153 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1154 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1155 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1156 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1157 
1158 			pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1159 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1160 			pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1161 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1162 
1163 			// Expand to 16 bit range
1164 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1165 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1166 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1167 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1168 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1169 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1170 		}
1171 		break;
1172 	default:
1173 		UNSUPPORTED("VkFormat %d", int(format));
1174 	}
1175 }
1176 
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1177 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1178 {
1179 	bool inverse = (modifier == OneMinus);
1180 
1181 	if(format.isUnsignedNormalized())
1182 	{
1183 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU.v[component]))
1184 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU.v[component]));
1185 	}
1186 	else if(format.isSignedNormalized())
1187 	{
1188 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS.v[component]))
1189 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS.v[component]));
1190 	}
1191 	else  // Floating-point format
1192 	{
1193 		ASSERT(format.isFloatFormat());
1194 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF.v[component]))
1195 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF.v[component]));
1196 	}
1197 }
1198 
blendFactorRGB(SIMD::Float4 & blendFactor,const SIMD::Float4 & sourceColor,const SIMD::Float4 & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1199 void PixelRoutine::blendFactorRGB(SIMD::Float4 &blendFactor, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1200 {
1201 	switch(colorBlendFactor)
1202 	{
1203 	case VK_BLEND_FACTOR_ZERO:
1204 		blendFactor.x = 0.0f;
1205 		blendFactor.y = 0.0f;
1206 		blendFactor.z = 0.0f;
1207 		break;
1208 	case VK_BLEND_FACTOR_ONE:
1209 		blendFactor.x = 1.0f;
1210 		blendFactor.y = 1.0f;
1211 		blendFactor.z = 1.0f;
1212 		break;
1213 	case VK_BLEND_FACTOR_SRC_COLOR:
1214 		blendFactor.x = sourceColor.x;
1215 		blendFactor.y = sourceColor.y;
1216 		blendFactor.z = sourceColor.z;
1217 		break;
1218 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1219 		blendFactor.x = 1.0f - sourceColor.x;
1220 		blendFactor.y = 1.0f - sourceColor.y;
1221 		blendFactor.z = 1.0f - sourceColor.z;
1222 		break;
1223 	case VK_BLEND_FACTOR_DST_COLOR:
1224 		blendFactor.x = destColor.x;
1225 		blendFactor.y = destColor.y;
1226 		blendFactor.z = destColor.z;
1227 		break;
1228 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1229 		blendFactor.x = 1.0f - destColor.x;
1230 		blendFactor.y = 1.0f - destColor.y;
1231 		blendFactor.z = 1.0f - destColor.z;
1232 		break;
1233 	case VK_BLEND_FACTOR_SRC_ALPHA:
1234 		blendFactor.x = sourceColor.w;
1235 		blendFactor.y = sourceColor.w;
1236 		blendFactor.z = sourceColor.w;
1237 		break;
1238 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1239 		blendFactor.x = 1.0f - sourceColor.w;
1240 		blendFactor.y = 1.0f - sourceColor.w;
1241 		blendFactor.z = 1.0f - sourceColor.w;
1242 		break;
1243 	case VK_BLEND_FACTOR_DST_ALPHA:
1244 		blendFactor.x = destColor.w;
1245 		blendFactor.y = destColor.w;
1246 		blendFactor.z = destColor.w;
1247 		break;
1248 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1249 		blendFactor.x = 1.0f - destColor.w;
1250 		blendFactor.y = 1.0f - destColor.w;
1251 		blendFactor.z = 1.0f - destColor.w;
1252 		break;
1253 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1254 		blendFactor.x = 1.0f - destColor.w;
1255 		blendFactor.x = Min(blendFactor.x, sourceColor.w);
1256 		blendFactor.y = blendFactor.x;
1257 		blendFactor.z = blendFactor.x;
1258 		break;
1259 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1260 		blendFactor.x = blendConstant(format, 0);
1261 		blendFactor.y = blendConstant(format, 1);
1262 		blendFactor.z = blendConstant(format, 2);
1263 		break;
1264 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1265 		blendFactor.x = blendConstant(format, 3);
1266 		blendFactor.y = blendConstant(format, 3);
1267 		blendFactor.z = blendConstant(format, 3);
1268 		break;
1269 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1270 		blendFactor.x = blendConstant(format, 0, OneMinus);
1271 		blendFactor.y = blendConstant(format, 1, OneMinus);
1272 		blendFactor.z = blendConstant(format, 2, OneMinus);
1273 		break;
1274 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1275 		blendFactor.x = blendConstant(format, 3, OneMinus);
1276 		blendFactor.y = blendConstant(format, 3, OneMinus);
1277 		blendFactor.z = blendConstant(format, 3, OneMinus);
1278 		break;
1279 
1280 	default:
1281 		UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1282 	}
1283 
1284 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1285 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1286 	//  operations. If the color attachment is floating-point, no clamping occurs."
1287 	if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1288 	{
1289 		if(format.isUnsignedNormalized())
1290 		{
1291 			blendFactor.x = Min(Max(blendFactor.x, 0.0f), 1.0f);
1292 			blendFactor.y = Min(Max(blendFactor.y, 0.0f), 1.0f);
1293 			blendFactor.z = Min(Max(blendFactor.z, 0.0f), 1.0f);
1294 		}
1295 		else if(format.isSignedNormalized())
1296 		{
1297 			blendFactor.x = Min(Max(blendFactor.x, -1.0f), 1.0f);
1298 			blendFactor.y = Min(Max(blendFactor.y, -1.0f), 1.0f);
1299 			blendFactor.z = Min(Max(blendFactor.z, -1.0f), 1.0f);
1300 		}
1301 	}
1302 }
1303 
blendFactorAlpha(SIMD::Float & blendFactorAlpha,const SIMD::Float & sourceAlpha,const SIMD::Float & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1304 void PixelRoutine::blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1305 {
1306 	switch(alphaBlendFactor)
1307 	{
1308 	case VK_BLEND_FACTOR_ZERO:
1309 		blendFactorAlpha = 0.0f;
1310 		break;
1311 	case VK_BLEND_FACTOR_ONE:
1312 		blendFactorAlpha = 1.0f;
1313 		break;
1314 	case VK_BLEND_FACTOR_SRC_COLOR:
1315 		blendFactorAlpha = sourceAlpha;
1316 		break;
1317 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1318 		blendFactorAlpha = 1.0f - sourceAlpha;
1319 		break;
1320 	case VK_BLEND_FACTOR_DST_COLOR:
1321 		blendFactorAlpha = destAlpha;
1322 		break;
1323 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1324 		blendFactorAlpha = 1.0f - destAlpha;
1325 		break;
1326 	case VK_BLEND_FACTOR_SRC_ALPHA:
1327 		blendFactorAlpha = sourceAlpha;
1328 		break;
1329 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1330 		blendFactorAlpha = 1.0f - sourceAlpha;
1331 		break;
1332 	case VK_BLEND_FACTOR_DST_ALPHA:
1333 		blendFactorAlpha = destAlpha;
1334 		break;
1335 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1336 		blendFactorAlpha = 1.0f - destAlpha;
1337 		break;
1338 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1339 		blendFactorAlpha = 1.0f;
1340 		break;
1341 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1342 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1343 		blendFactorAlpha = blendConstant(format, 3);
1344 		break;
1345 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1346 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1347 		blendFactorAlpha = blendConstant(format, 3, OneMinus);
1348 		break;
1349 	default:
1350 		UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1351 	}
1352 
1353 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1354 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1355 	//  operations. If the color attachment is floating-point, no clamping occurs."
1356 	if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1357 	{
1358 		if(format.isUnsignedNormalized())
1359 		{
1360 			blendFactorAlpha = Min(Max(blendFactorAlpha, 0.0f), 1.0f);
1361 		}
1362 		else if(format.isSignedNormalized())
1363 		{
1364 			blendFactorAlpha = Min(Max(blendFactorAlpha, -1.0f), 1.0f);
1365 		}
1366 	}
1367 }
1368 
blendOpOverlay(SIMD::Float & src,SIMD::Float & dst)1369 SIMD::Float PixelRoutine::blendOpOverlay(SIMD::Float &src, SIMD::Float &dst)
1370 {
1371 	SIMD::Int largeDst = CmpGT(dst, 0.5f);
1372 	return As<SIMD::Float>(
1373 	    (~largeDst & As<SIMD::Int>(2.0f * src * dst)) |
1374 	    (largeDst & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1375 }
1376 
blendOpColorDodge(SIMD::Float & src,SIMD::Float & dst)1377 SIMD::Float PixelRoutine::blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst)
1378 {
1379 	SIMD::Int srcBelowOne = CmpLT(src, 1.0f);
1380 	SIMD::Int positiveDst = CmpGT(dst, 0.0f);
1381 	return As<SIMD::Float>(positiveDst & ((~srcBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1382 	                                      (srcBelowOne & As<SIMD::Int>(Min(1.0f, (dst / (1.0f - src)))))));
1383 }
1384 
blendOpColorBurn(SIMD::Float & src,SIMD::Float & dst)1385 SIMD::Float PixelRoutine::blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst)
1386 {
1387 	SIMD::Int dstBelowOne = CmpLT(dst, 1.0f);
1388 	SIMD::Int positiveSrc = CmpGT(src, 0.0f);
1389 	return As<SIMD::Float>(
1390 	    (~dstBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1391 	    (dstBelowOne & positiveSrc & As<SIMD::Int>(1.0f - Min(1.0f, (1.0f - dst) / src))));
1392 }
1393 
blendOpHardlight(SIMD::Float & src,SIMD::Float & dst)1394 SIMD::Float PixelRoutine::blendOpHardlight(SIMD::Float &src, SIMD::Float &dst)
1395 {
1396 	SIMD::Int largeSrc = CmpGT(src, 0.5f);
1397 	return As<SIMD::Float>(
1398 	    (~largeSrc & As<SIMD::Int>(2.0f * src * dst)) |
1399 	    (largeSrc & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1400 }
1401 
blendOpSoftlight(SIMD::Float & src,SIMD::Float & dst)1402 SIMD::Float PixelRoutine::blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst)
1403 {
1404 	SIMD::Int largeSrc = CmpGT(src, 0.5f);
1405 	SIMD::Int largeDst = CmpGT(dst, 0.25f);
1406 
1407 	return As<SIMD::Float>(
1408 	    (~largeSrc & As<SIMD::Int>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
1409 	    (largeSrc & ((~largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
1410 	                 (largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
1411 }
1412 
maxRGB(SIMD::Float4 & c)1413 SIMD::Float PixelRoutine::maxRGB(SIMD::Float4 &c)
1414 {
1415 	return Max(Max(c.x, c.y), c.z);
1416 }
1417 
minRGB(SIMD::Float4 & c)1418 SIMD::Float PixelRoutine::minRGB(SIMD::Float4 &c)
1419 {
1420 	return Min(Min(c.x, c.y), c.z);
1421 }
1422 
setLumSat(SIMD::Float4 & cbase,SIMD::Float4 & csat,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1423 void PixelRoutine::setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1424 {
1425 	SIMD::Float minbase = minRGB(cbase);
1426 	SIMD::Float sbase = maxRGB(cbase) - minbase;
1427 	SIMD::Float ssat = maxRGB(csat) - minRGB(csat);
1428 	SIMD::Int isNonZero = CmpGT(sbase, 0.0f);
1429 	SIMD::Float4 color;
1430 	color.x = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.x - minbase) * ssat / sbase));
1431 	color.y = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.y - minbase) * ssat / sbase));
1432 	color.z = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.z - minbase) * ssat / sbase));
1433 	setLum(color, clum, x, y, z);
1434 }
1435 
lumRGB(SIMD::Float4 & c)1436 SIMD::Float PixelRoutine::lumRGB(SIMD::Float4 &c)
1437 {
1438 	return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
1439 }
1440 
computeLum(SIMD::Float & color,SIMD::Float & lum,SIMD::Float & mincol,SIMD::Float & maxcol,SIMD::Int & negative,SIMD::Int & aboveOne)1441 SIMD::Float PixelRoutine::computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne)
1442 {
1443 	return As<SIMD::Float>(
1444 	    (negative & As<SIMD::Int>(lum + ((color - lum) * lum) / (lum - mincol))) |
1445 	    (~negative & ((aboveOne & As<SIMD::Int>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
1446 	                  (~aboveOne & As<SIMD::Int>(color)))));
1447 }
1448 
setLum(SIMD::Float4 & cbase,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1449 void PixelRoutine::setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1450 {
1451 	SIMD::Float lbase = lumRGB(cbase);
1452 	SIMD::Float llum = lumRGB(clum);
1453 	SIMD::Float ldiff = llum - lbase;
1454 
1455 	SIMD::Float4 color;
1456 	color.x = cbase.x + ldiff;
1457 	color.y = cbase.y + ldiff;
1458 	color.z = cbase.z + ldiff;
1459 
1460 	SIMD::Float lum = lumRGB(color);
1461 	SIMD::Float mincol = minRGB(color);
1462 	SIMD::Float maxcol = maxRGB(color);
1463 
1464 	SIMD::Int negative = CmpLT(mincol, 0.0f);
1465 	SIMD::Int aboveOne = CmpGT(maxcol, 1.0f);
1466 
1467 	x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
1468 	y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
1469 	z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
1470 }
1471 
premultiply(SIMD::Float4 & c)1472 void PixelRoutine::premultiply(SIMD::Float4 &c)
1473 {
1474 	SIMD::Int nonZeroAlpha = CmpNEQ(c.w, 0.0f);
1475 	c.x = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.x / c.w));
1476 	c.y = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.y / c.w));
1477 	c.z = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.z / c.w));
1478 }
1479 
computeAdvancedBlendMode(int index,const SIMD::Float4 & src,const SIMD::Float4 & dst,const SIMD::Float4 & srcFactor,const SIMD::Float4 & dstFactor)1480 SIMD::Float4 PixelRoutine::computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor)
1481 {
1482 	SIMD::Float4 srcColor = src;
1483 	srcColor.x *= srcFactor.x;
1484 	srcColor.y *= srcFactor.y;
1485 	srcColor.z *= srcFactor.z;
1486 	srcColor.w *= srcFactor.w;
1487 
1488 	SIMD::Float4 dstColor = dst;
1489 	dstColor.x *= dstFactor.x;
1490 	dstColor.y *= dstFactor.y;
1491 	dstColor.z *= dstFactor.z;
1492 	dstColor.w *= dstFactor.w;
1493 
1494 	premultiply(srcColor);
1495 	premultiply(dstColor);
1496 
1497 	SIMD::Float4 blendedColor;
1498 
1499 	switch(state.blendState[index].blendOperation)
1500 	{
1501 	case VK_BLEND_OP_MULTIPLY_EXT:
1502 		blendedColor.x = (srcColor.x * dstColor.x);
1503 		blendedColor.y = (srcColor.y * dstColor.y);
1504 		blendedColor.z = (srcColor.z * dstColor.z);
1505 		break;
1506 	case VK_BLEND_OP_SCREEN_EXT:
1507 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x);
1508 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y);
1509 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z);
1510 		break;
1511 	case VK_BLEND_OP_OVERLAY_EXT:
1512 		blendedColor.x = blendOpOverlay(srcColor.x, dstColor.x);
1513 		blendedColor.y = blendOpOverlay(srcColor.y, dstColor.y);
1514 		blendedColor.z = blendOpOverlay(srcColor.z, dstColor.z);
1515 		break;
1516 	case VK_BLEND_OP_DARKEN_EXT:
1517 		blendedColor.x = Min(srcColor.x, dstColor.x);
1518 		blendedColor.y = Min(srcColor.y, dstColor.y);
1519 		blendedColor.z = Min(srcColor.z, dstColor.z);
1520 		break;
1521 	case VK_BLEND_OP_LIGHTEN_EXT:
1522 		blendedColor.x = Max(srcColor.x, dstColor.x);
1523 		blendedColor.y = Max(srcColor.y, dstColor.y);
1524 		blendedColor.z = Max(srcColor.z, dstColor.z);
1525 		break;
1526 	case VK_BLEND_OP_COLORDODGE_EXT:
1527 		blendedColor.x = blendOpColorDodge(srcColor.x, dstColor.x);
1528 		blendedColor.y = blendOpColorDodge(srcColor.y, dstColor.y);
1529 		blendedColor.z = blendOpColorDodge(srcColor.z, dstColor.z);
1530 		break;
1531 	case VK_BLEND_OP_COLORBURN_EXT:
1532 		blendedColor.x = blendOpColorBurn(srcColor.x, dstColor.x);
1533 		blendedColor.y = blendOpColorBurn(srcColor.y, dstColor.y);
1534 		blendedColor.z = blendOpColorBurn(srcColor.z, dstColor.z);
1535 		break;
1536 	case VK_BLEND_OP_HARDLIGHT_EXT:
1537 		blendedColor.x = blendOpHardlight(srcColor.x, dstColor.x);
1538 		blendedColor.y = blendOpHardlight(srcColor.y, dstColor.y);
1539 		blendedColor.z = blendOpHardlight(srcColor.z, dstColor.z);
1540 		break;
1541 	case VK_BLEND_OP_SOFTLIGHT_EXT:
1542 		blendedColor.x = blendOpSoftlight(srcColor.x, dstColor.x);
1543 		blendedColor.y = blendOpSoftlight(srcColor.y, dstColor.y);
1544 		blendedColor.z = blendOpSoftlight(srcColor.z, dstColor.z);
1545 		break;
1546 	case VK_BLEND_OP_DIFFERENCE_EXT:
1547 		blendedColor.x = Abs(srcColor.x - dstColor.x);
1548 		blendedColor.y = Abs(srcColor.y - dstColor.y);
1549 		blendedColor.z = Abs(srcColor.z - dstColor.z);
1550 		break;
1551 	case VK_BLEND_OP_EXCLUSION_EXT:
1552 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x * 2.0f);
1553 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y * 2.0f);
1554 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z * 2.0f);
1555 		break;
1556 	case VK_BLEND_OP_HSL_HUE_EXT:
1557 		setLumSat(srcColor, dstColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
1558 		break;
1559 	case VK_BLEND_OP_HSL_SATURATION_EXT:
1560 		setLumSat(dstColor, srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
1561 		break;
1562 	case VK_BLEND_OP_HSL_COLOR_EXT:
1563 		setLum(srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
1564 		break;
1565 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1566 		setLum(dstColor, srcColor, blendedColor.x, blendedColor.y, blendedColor.z);
1567 		break;
1568 	default:
1569 		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
1570 		break;
1571 	}
1572 
1573 	SIMD::Float p = srcColor.w * dstColor.w;
1574 	blendedColor.x *= p;
1575 	blendedColor.y *= p;
1576 	blendedColor.z *= p;
1577 
1578 	p = srcColor.w * (1.0f - dstColor.w);
1579 	blendedColor.x += srcColor.x * p;
1580 	blendedColor.y += srcColor.y * p;
1581 	blendedColor.z += srcColor.z * p;
1582 
1583 	p = dstColor.w * (1.0f - srcColor.w);
1584 	blendedColor.x += dstColor.x * p;
1585 	blendedColor.y += dstColor.y * p;
1586 	blendedColor.z += dstColor.z * p;
1587 
1588 	return blendedColor;
1589 }
1590 
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)1591 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
1592 {
1593 	switch(blendFactor)
1594 	{
1595 	case VK_BLEND_FACTOR_ZERO:
1596 	case VK_BLEND_FACTOR_ONE:
1597 		return false;
1598 	case VK_BLEND_FACTOR_SRC_COLOR:
1599 	case VK_BLEND_FACTOR_SRC_ALPHA:
1600 		// Source values have been clamped after fragment shader execution if the attachment format is normalized.
1601 		return false;
1602 	case VK_BLEND_FACTOR_DST_COLOR:
1603 	case VK_BLEND_FACTOR_DST_ALPHA:
1604 		// Dest values have a valid range due to being read from the attachment.
1605 		return false;
1606 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1607 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1608 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1609 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1610 		// For signed formats, negative values cause the result to exceed 1.0.
1611 		return format.isSignedNormalized();
1612 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1613 		// min(As, 1 - Ad)
1614 		return false;
1615 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1616 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1617 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1618 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1619 		return false;
1620 
1621 	default:
1622 		UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
1623 		return false;
1624 	}
1625 }
1626 
alphaBlend(int index,const Pointer<Byte> & cBuffer,const SIMD::Float4 & sourceColor,const Int & x)1627 SIMD::Float4 PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x)
1628 {
1629 	if(!state.blendState[index].alphaBlendEnable)
1630 	{
1631 		return sourceColor;
1632 	}
1633 
1634 	vk::Format format = state.colorFormat[index];
1635 	ASSERT(format.supportsColorAttachmentBlend());
1636 
1637 	Pointer<Byte> buffer = cBuffer;
1638 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1639 
1640 	// texelColor holds four texel color values.
1641 	// Note: Despite the type being Vector4f, the colors may be stored as
1642 	// integers. Half-floats are stored as full 32-bit floats.
1643 	// Non-float and non-fixed point formats are not alpha blended.
1644 	Vector4f texelColor;
1645 
1646 	switch(format)
1647 	{
1648 	case VK_FORMAT_R32_SINT:
1649 	case VK_FORMAT_R32_UINT:
1650 	case VK_FORMAT_R32_SFLOAT:
1651 		// FIXME: movlps
1652 		buffer += 4 * x;
1653 		texelColor.x.x = *Pointer<Float>(buffer + 0);
1654 		texelColor.x.y = *Pointer<Float>(buffer + 4);
1655 		buffer += pitchB;
1656 		// FIXME: movhps
1657 		texelColor.x.z = *Pointer<Float>(buffer + 0);
1658 		texelColor.x.w = *Pointer<Float>(buffer + 4);
1659 		texelColor.y = texelColor.z = texelColor.w = 1.0f;
1660 		break;
1661 	case VK_FORMAT_R32G32_SINT:
1662 	case VK_FORMAT_R32G32_UINT:
1663 	case VK_FORMAT_R32G32_SFLOAT:
1664 		buffer += 8 * x;
1665 		texelColor.x = *Pointer<Float4>(buffer, 16);
1666 		buffer += pitchB;
1667 		texelColor.y = *Pointer<Float4>(buffer, 16);
1668 		texelColor.z = texelColor.x;
1669 		texelColor.x = ShuffleLowHigh(texelColor.x, texelColor.y, 0x0202);
1670 		texelColor.z = ShuffleLowHigh(texelColor.z, texelColor.y, 0x1313);
1671 		texelColor.y = texelColor.z;
1672 		texelColor.z = texelColor.w = 1.0f;
1673 		break;
1674 	case VK_FORMAT_R32G32B32A32_SFLOAT:
1675 	case VK_FORMAT_R32G32B32A32_SINT:
1676 	case VK_FORMAT_R32G32B32A32_UINT:
1677 		buffer += 16 * x;
1678 		texelColor.x = *Pointer<Float4>(buffer + 0, 16);
1679 		texelColor.y = *Pointer<Float4>(buffer + 16, 16);
1680 		buffer += pitchB;
1681 		texelColor.z = *Pointer<Float4>(buffer + 0, 16);
1682 		texelColor.w = *Pointer<Float4>(buffer + 16, 16);
1683 		transpose4x4(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1684 		break;
1685 	case VK_FORMAT_R16_UNORM:
1686 		buffer += 2 * x;
1687 		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1688 		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
1689 		buffer += pitchB;
1690 		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1691 		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
1692 		texelColor.x *= (1.0f / 0xFFFF);
1693 		texelColor.y = texelColor.z = texelColor.w = 1.0f;
1694 		break;
1695 	case VK_FORMAT_R16_SFLOAT:
1696 		buffer += 2 * x;
1697 		texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1698 		texelColor.x.y = Float(*Pointer<Half>(buffer + 2));
1699 		buffer += pitchB;
1700 		texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1701 		texelColor.x.w = Float(*Pointer<Half>(buffer + 2));
1702 		texelColor.y = texelColor.z = texelColor.w = 1.0f;
1703 		break;
1704 	case VK_FORMAT_R16G16_UNORM:
1705 		buffer += 4 * x;
1706 		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1707 		texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
1708 		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
1709 		texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
1710 		buffer += pitchB;
1711 		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1712 		texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
1713 		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
1714 		texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
1715 		texelColor.x *= (1.0f / 0xFFFF);
1716 		texelColor.y *= (1.0f / 0xFFFF);
1717 		texelColor.z = texelColor.w = 1.0f;
1718 		break;
1719 	case VK_FORMAT_R16G16_SFLOAT:
1720 		buffer += 4 * x;
1721 		texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1722 		texelColor.y.x = Float(*Pointer<Half>(buffer + 2));
1723 		texelColor.x.y = Float(*Pointer<Half>(buffer + 4));
1724 		texelColor.y.y = Float(*Pointer<Half>(buffer + 6));
1725 		buffer += pitchB;
1726 		texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1727 		texelColor.y.z = Float(*Pointer<Half>(buffer + 2));
1728 		texelColor.x.w = Float(*Pointer<Half>(buffer + 4));
1729 		texelColor.y.w = Float(*Pointer<Half>(buffer + 6));
1730 		texelColor.z = texelColor.w = 1.0f;
1731 		break;
1732 	case VK_FORMAT_R16G16B16A16_UNORM:
1733 		buffer += 8 * x;
1734 		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1735 		texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1736 		texelColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1737 		texelColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1738 		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1739 		texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1740 		texelColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1741 		texelColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1742 		buffer += pitchB;
1743 		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1744 		texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1745 		texelColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1746 		texelColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1747 		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1748 		texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1749 		texelColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1750 		texelColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1751 		texelColor.x *= (1.0f / 0xFFFF);
1752 		texelColor.y *= (1.0f / 0xFFFF);
1753 		texelColor.z *= (1.0f / 0xFFFF);
1754 		texelColor.w *= (1.0f / 0xFFFF);
1755 		break;
1756 	case VK_FORMAT_R16G16B16A16_SFLOAT:
1757 		buffer += 8 * x;
1758 		texelColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
1759 		texelColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
1760 		texelColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
1761 		texelColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
1762 		texelColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
1763 		texelColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
1764 		texelColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
1765 		texelColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
1766 		buffer += pitchB;
1767 		texelColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
1768 		texelColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
1769 		texelColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
1770 		texelColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
1771 		texelColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
1772 		texelColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
1773 		texelColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
1774 		texelColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
1775 		break;
1776 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
1777 		buffer += 4 * x;
1778 		texelColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1779 		texelColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1780 		buffer += pitchB;
1781 		texelColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1782 		texelColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1783 		transpose4x3(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1784 		texelColor.w = 1.0f;
1785 		break;
1786 	default:
1787 		{
1788 			// Attempt to read an integer based format and convert it to float
1789 			Vector4s color;
1790 			readPixel(index, cBuffer, x, color);
1791 			texelColor.x = Float4(As<UShort4>(color.x)) * (1.0f / 0xFFFF);
1792 			texelColor.y = Float4(As<UShort4>(color.y)) * (1.0f / 0xFFFF);
1793 			texelColor.z = Float4(As<UShort4>(color.z)) * (1.0f / 0xFFFF);
1794 			texelColor.w = Float4(As<UShort4>(color.w)) * (1.0f / 0xFFFF);
1795 
1796 			if(isSRGB(index))
1797 			{
1798 				texelColor.x = sRGBtoLinear(texelColor.x);
1799 				texelColor.y = sRGBtoLinear(texelColor.y);
1800 				texelColor.z = sRGBtoLinear(texelColor.z);
1801 			}
1802 		}
1803 		break;
1804 	}
1805 
1806 	ASSERT(SIMD::Width == 4);
1807 	SIMD::Float4 destColor;
1808 	destColor.x = texelColor.x;
1809 	destColor.y = texelColor.y;
1810 	destColor.z = texelColor.z;
1811 	destColor.w = texelColor.w;
1812 
1813 	SIMD::Float4 sourceFactor;
1814 	SIMD::Float4 destFactor;
1815 
1816 	blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
1817 	blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
1818 	blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
1819 	blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
1820 
1821 	SIMD::Float4 blendedColor;
1822 
1823 	switch(state.blendState[index].blendOperation)
1824 	{
1825 	case VK_BLEND_OP_ADD:
1826 		blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
1827 		blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
1828 		blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
1829 		break;
1830 	case VK_BLEND_OP_SUBTRACT:
1831 		blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
1832 		blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
1833 		blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
1834 		break;
1835 	case VK_BLEND_OP_REVERSE_SUBTRACT:
1836 		blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
1837 		blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
1838 		blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
1839 		break;
1840 	case VK_BLEND_OP_MIN:
1841 		blendedColor.x = Min(sourceColor.x, destColor.x);
1842 		blendedColor.y = Min(sourceColor.y, destColor.y);
1843 		blendedColor.z = Min(sourceColor.z, destColor.z);
1844 		break;
1845 	case VK_BLEND_OP_MAX:
1846 		blendedColor.x = Max(sourceColor.x, destColor.x);
1847 		blendedColor.y = Max(sourceColor.y, destColor.y);
1848 		blendedColor.z = Max(sourceColor.z, destColor.z);
1849 		break;
1850 	case VK_BLEND_OP_SRC_EXT:
1851 		blendedColor.x = sourceColor.x;
1852 		blendedColor.y = sourceColor.y;
1853 		blendedColor.z = sourceColor.z;
1854 		break;
1855 	case VK_BLEND_OP_DST_EXT:
1856 		blendedColor.x = destColor.x;
1857 		blendedColor.y = destColor.y;
1858 		blendedColor.z = destColor.z;
1859 		break;
1860 	case VK_BLEND_OP_ZERO_EXT:
1861 		blendedColor.x = 0.0f;
1862 		blendedColor.y = 0.0f;
1863 		blendedColor.z = 0.0f;
1864 		break;
1865 	case VK_BLEND_OP_MULTIPLY_EXT:
1866 	case VK_BLEND_OP_SCREEN_EXT:
1867 	case VK_BLEND_OP_OVERLAY_EXT:
1868 	case VK_BLEND_OP_DARKEN_EXT:
1869 	case VK_BLEND_OP_LIGHTEN_EXT:
1870 	case VK_BLEND_OP_COLORDODGE_EXT:
1871 	case VK_BLEND_OP_COLORBURN_EXT:
1872 	case VK_BLEND_OP_HARDLIGHT_EXT:
1873 	case VK_BLEND_OP_SOFTLIGHT_EXT:
1874 	case VK_BLEND_OP_DIFFERENCE_EXT:
1875 	case VK_BLEND_OP_EXCLUSION_EXT:
1876 	case VK_BLEND_OP_HSL_HUE_EXT:
1877 	case VK_BLEND_OP_HSL_SATURATION_EXT:
1878 	case VK_BLEND_OP_HSL_COLOR_EXT:
1879 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1880 		blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
1881 		break;
1882 	default:
1883 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
1884 	}
1885 
1886 	switch(state.blendState[index].blendOperationAlpha)
1887 	{
1888 	case VK_BLEND_OP_ADD:
1889 		blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
1890 		break;
1891 	case VK_BLEND_OP_SUBTRACT:
1892 		blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
1893 		break;
1894 	case VK_BLEND_OP_REVERSE_SUBTRACT:
1895 		blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
1896 		break;
1897 	case VK_BLEND_OP_MIN:
1898 		blendedColor.w = Min(sourceColor.w, destColor.w);
1899 		break;
1900 	case VK_BLEND_OP_MAX:
1901 		blendedColor.w = Max(sourceColor.w, destColor.w);
1902 		break;
1903 	case VK_BLEND_OP_SRC_EXT:
1904 		blendedColor.w = sourceColor.w;
1905 		break;
1906 	case VK_BLEND_OP_DST_EXT:
1907 		blendedColor.w = destColor.w;
1908 		break;
1909 	case VK_BLEND_OP_ZERO_EXT:
1910 		blendedColor.w = 0.0f;
1911 		break;
1912 	case VK_BLEND_OP_MULTIPLY_EXT:
1913 	case VK_BLEND_OP_SCREEN_EXT:
1914 	case VK_BLEND_OP_OVERLAY_EXT:
1915 	case VK_BLEND_OP_DARKEN_EXT:
1916 	case VK_BLEND_OP_LIGHTEN_EXT:
1917 	case VK_BLEND_OP_COLORDODGE_EXT:
1918 	case VK_BLEND_OP_COLORBURN_EXT:
1919 	case VK_BLEND_OP_HARDLIGHT_EXT:
1920 	case VK_BLEND_OP_SOFTLIGHT_EXT:
1921 	case VK_BLEND_OP_DIFFERENCE_EXT:
1922 	case VK_BLEND_OP_EXCLUSION_EXT:
1923 	case VK_BLEND_OP_HSL_HUE_EXT:
1924 	case VK_BLEND_OP_HSL_SATURATION_EXT:
1925 	case VK_BLEND_OP_HSL_COLOR_EXT:
1926 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1927 		// All of the currently supported 'advanced blend modes' compute the alpha the same way.
1928 		blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
1929 		break;
1930 	default:
1931 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
1932 	}
1933 
1934 	return blendedColor;
1935 }
1936 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)1937 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
1938 {
1939 	if(isSRGB(index))
1940 	{
1941 		color.x = linearToSRGB(color.x);
1942 		color.y = linearToSRGB(color.y);
1943 		color.z = linearToSRGB(color.z);
1944 	}
1945 
1946 	vk::Format format = state.colorFormat[index];
1947 	switch(format)
1948 	{
1949 	case VK_FORMAT_B8G8R8A8_UNORM:
1950 	case VK_FORMAT_B8G8R8A8_SRGB:
1951 	case VK_FORMAT_R8G8B8A8_UNORM:
1952 	case VK_FORMAT_R8G8B8A8_SRGB:
1953 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1954 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1955 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1956 		color.w = As<Float4>(RoundInt(color.w * 0xFF));
1957 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1958 		color.z = As<Float4>(RoundInt(color.z * 0xFF));
1959 		// [[fallthrough]]
1960 	case VK_FORMAT_R8G8_UNORM:
1961 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1962 		color.y = As<Float4>(RoundInt(color.y * 0xFF));
1963 		//[[fallthrough]]
1964 	case VK_FORMAT_R8_UNORM:
1965 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1966 		color.x = As<Float4>(RoundInt(color.x * 0xFF));
1967 		break;
1968 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1969 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1970 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1971 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1972 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1973 		color.w = As<Float4>(RoundInt(color.w * 0xF));
1974 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1975 		color.z = As<Float4>(RoundInt(color.z * 0xF));
1976 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1977 		color.y = As<Float4>(RoundInt(color.y * 0xF));
1978 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1979 		color.x = As<Float4>(RoundInt(color.x * 0xF));
1980 		break;
1981 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1982 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1983 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1984 		color.z = As<Float4>(RoundInt(color.z * 0x1F));
1985 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1986 		color.y = As<Float4>(RoundInt(color.y * 0x3F));
1987 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1988 		color.x = As<Float4>(RoundInt(color.x * 0x1F));
1989 		break;
1990 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1991 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1992 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1993 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1994 		color.w = As<Float4>(RoundInt(color.w));
1995 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1996 		color.z = As<Float4>(RoundInt(color.z * 0x1F));
1997 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1998 		color.y = As<Float4>(RoundInt(color.y * 0x1F));
1999 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2000 		color.x = As<Float4>(RoundInt(color.x * 0x1F));
2001 		break;
2002 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2003 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2004 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2005 		color.w = As<Float4>(RoundInt(color.w * 0x3));
2006 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2007 		color.z = As<Float4>(RoundInt(color.z * 0x3FF));
2008 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2009 		color.y = As<Float4>(RoundInt(color.y * 0x3FF));
2010 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2011 		color.x = As<Float4>(RoundInt(color.x * 0x3FF));
2012 		break;
2013 	case VK_FORMAT_R16G16B16A16_UNORM:
2014 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2015 		color.w = As<Float4>(RoundInt(color.w * 0xFFFF));
2016 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2017 		color.z = As<Float4>(RoundInt(color.z * 0xFFFF));
2018 		// [[fallthrough]]
2019 	case VK_FORMAT_R16G16_UNORM:
2020 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2021 		color.y = As<Float4>(RoundInt(color.y * 0xFFFF));
2022 		//[[fallthrough]]
2023 	case VK_FORMAT_R16_UNORM:
2024 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2025 		color.x = As<Float4>(RoundInt(color.x * 0xFFFF));
2026 		break;
2027 	default:
2028 		// TODO(b/204560089): Omit clamp if redundant
2029 		if(format.isUnsignedNormalized())
2030 		{
2031 			color.x = Min(Max(color.x, 0.0f), 1.0f);
2032 			color.y = Min(Max(color.y, 0.0f), 1.0f);
2033 			color.z = Min(Max(color.z, 0.0f), 1.0f);
2034 			color.w = Min(Max(color.w, 0.0f), 1.0f);
2035 		}
2036 		else if(format.isSignedNormalized())
2037 		{
2038 			color.x = Min(Max(color.x, -1.0f), 1.0f);
2039 			color.y = Min(Max(color.y, -1.0f), 1.0f);
2040 			color.z = Min(Max(color.z, -1.0f), 1.0f);
2041 			color.w = Min(Max(color.w, -1.0f), 1.0f);
2042 		}
2043 	}
2044 
2045 	switch(format)
2046 	{
2047 	case VK_FORMAT_R16_SFLOAT:
2048 	case VK_FORMAT_R32_SFLOAT:
2049 	case VK_FORMAT_R32_SINT:
2050 	case VK_FORMAT_R32_UINT:
2051 	case VK_FORMAT_R16_UNORM:
2052 	case VK_FORMAT_R16_SINT:
2053 	case VK_FORMAT_R16_UINT:
2054 	case VK_FORMAT_R8_SINT:
2055 	case VK_FORMAT_R8_UINT:
2056 	case VK_FORMAT_R8_UNORM:
2057 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2058 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2059 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2060 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2061 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2062 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2063 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2064 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2065 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
2066 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2067 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2068 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2069 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
2070 		break;
2071 	case VK_FORMAT_R16G16_SFLOAT:
2072 	case VK_FORMAT_R32G32_SFLOAT:
2073 	case VK_FORMAT_R32G32_SINT:
2074 	case VK_FORMAT_R32G32_UINT:
2075 	case VK_FORMAT_R16G16_UNORM:
2076 	case VK_FORMAT_R16G16_SINT:
2077 	case VK_FORMAT_R16G16_UINT:
2078 	case VK_FORMAT_R8G8_SINT:
2079 	case VK_FORMAT_R8G8_UINT:
2080 	case VK_FORMAT_R8G8_UNORM:
2081 		color.z = color.x;
2082 		color.x = UnpackLow(color.x, color.y);
2083 		color.z = UnpackHigh(color.z, color.y);
2084 		color.y = color.z;
2085 		break;
2086 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2087 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2088 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2089 	case VK_FORMAT_R32G32B32A32_SINT:
2090 	case VK_FORMAT_R32G32B32A32_UINT:
2091 	case VK_FORMAT_R16G16B16A16_UNORM:
2092 	case VK_FORMAT_R16G16B16A16_SINT:
2093 	case VK_FORMAT_R16G16B16A16_UINT:
2094 	case VK_FORMAT_R8G8B8A8_SINT:
2095 	case VK_FORMAT_R8G8B8A8_UINT:
2096 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2097 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2098 	case VK_FORMAT_R8G8B8A8_UNORM:
2099 	case VK_FORMAT_R8G8B8A8_SRGB:
2100 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2101 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2102 		transpose4x4(color.x, color.y, color.z, color.w);
2103 		break;
2104 	case VK_FORMAT_B8G8R8A8_UNORM:
2105 	case VK_FORMAT_B8G8R8A8_SRGB:
2106 		transpose4x4zyxw(color.z, color.y, color.x, color.w);
2107 		break;
2108 	default:
2109 		UNSUPPORTED("VkFormat: %d", int(format));
2110 	}
2111 
2112 	int writeMask = state.colorWriteActive(index);
2113 	if(format.isBGRformat())
2114 	{
2115 		// For BGR formats, flip R and B channels in the channels mask
2116 		writeMask = (writeMask & 0x0000000A) | (writeMask & 0x00000001) << 2 | (writeMask & 0x00000004) >> 2;
2117 	}
2118 
2119 	Int xMask;  // Combination of all masks
2120 
2121 	if(state.depthTestActive)
2122 	{
2123 		xMask = zMask;
2124 	}
2125 	else
2126 	{
2127 		xMask = cMask;
2128 	}
2129 
2130 	if(state.stencilActive)
2131 	{
2132 		xMask &= sMask;
2133 	}
2134 
2135 	Pointer<Byte> buffer = cBuffer;
2136 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2137 	Float4 value;
2138 
2139 	switch(format)
2140 	{
2141 	case VK_FORMAT_R32_SFLOAT:
2142 	case VK_FORMAT_R32_SINT:
2143 	case VK_FORMAT_R32_UINT:
2144 		if(writeMask & 0x00000001)
2145 		{
2146 			buffer += 4 * x;
2147 
2148 			// FIXME: movlps
2149 			value.x = *Pointer<Float>(buffer + 0);
2150 			value.y = *Pointer<Float>(buffer + 4);
2151 
2152 			buffer += pitchB;
2153 
2154 			// FIXME: movhps
2155 			value.z = *Pointer<Float>(buffer + 0);
2156 			value.w = *Pointer<Float>(buffer + 4);
2157 
2158 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2159 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2160 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2161 
2162 			// FIXME: movhps
2163 			*Pointer<Float>(buffer + 0) = color.x.z;
2164 			*Pointer<Float>(buffer + 4) = color.x.w;
2165 
2166 			buffer -= pitchB;
2167 
2168 			// FIXME: movlps
2169 			*Pointer<Float>(buffer + 0) = color.x.x;
2170 			*Pointer<Float>(buffer + 4) = color.x.y;
2171 		}
2172 		break;
2173 	case VK_FORMAT_R16_SFLOAT:
2174 		if(writeMask & 0x00000001)
2175 		{
2176 			buffer += 2 * x;
2177 
2178 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2179 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2180 
2181 			buffer += pitchB;
2182 
2183 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2184 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2185 
2186 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2187 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2188 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2189 
2190 			*Pointer<Half>(buffer + 0) = Half(color.x.z);
2191 			*Pointer<Half>(buffer + 2) = Half(color.x.w);
2192 
2193 			buffer -= pitchB;
2194 
2195 			*Pointer<Half>(buffer + 0) = Half(color.x.x);
2196 			*Pointer<Half>(buffer + 2) = Half(color.x.y);
2197 		}
2198 		break;
2199 	case VK_FORMAT_R16_UNORM:
2200 	case VK_FORMAT_R16_SINT:
2201 	case VK_FORMAT_R16_UINT:
2202 		if(writeMask & 0x00000001)
2203 		{
2204 			buffer += 2 * x;
2205 
2206 			UShort4 xyzw;
2207 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2208 
2209 			buffer += pitchB;
2210 
2211 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2212 			value = As<Float4>(Int4(xyzw));
2213 
2214 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2215 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2216 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2217 
2218 			Float component = color.x.z;
2219 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2220 			component = color.x.w;
2221 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2222 
2223 			buffer -= pitchB;
2224 
2225 			component = color.x.x;
2226 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2227 			component = color.x.y;
2228 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2229 		}
2230 		break;
2231 	case VK_FORMAT_R8_SINT:
2232 	case VK_FORMAT_R8_UINT:
2233 	case VK_FORMAT_R8_UNORM:
2234 		if(writeMask & 0x00000001)
2235 		{
2236 			buffer += x;
2237 
2238 			UInt xyzw, packedCol;
2239 
2240 			xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFFu;
2241 			buffer += pitchB;
2242 			xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2243 
2244 			Short4 tmpCol = Short4(As<Int4>(color.x));
2245 			if(format == VK_FORMAT_R8_SINT)
2246 			{
2247 				tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2248 			}
2249 			else
2250 			{
2251 				tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2252 			}
2253 			packedCol = Extract(As<Int2>(tmpCol), 0);
2254 
2255 			packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2256 			            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2257 
2258 			*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2259 			buffer -= pitchB;
2260 			*Pointer<UShort>(buffer) = UShort(packedCol);
2261 		}
2262 		break;
2263 	case VK_FORMAT_R32G32_SFLOAT:
2264 	case VK_FORMAT_R32G32_SINT:
2265 	case VK_FORMAT_R32G32_UINT:
2266 		buffer += 8 * x;
2267 
2268 		value = *Pointer<Float4>(buffer);
2269 
2270 		if((writeMask & 0x00000003) != 0x00000003)
2271 		{
2272 			Float4 masked = value;
2273 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2274 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2275 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2276 		}
2277 
2278 		color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2279 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2280 		color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2281 		*Pointer<Float4>(buffer) = color.x;
2282 
2283 		buffer += pitchB;
2284 
2285 		value = *Pointer<Float4>(buffer);
2286 
2287 		if((writeMask & 0x00000003) != 0x00000003)
2288 		{
2289 			Float4 masked;
2290 
2291 			masked = value;
2292 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2293 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2294 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2295 		}
2296 
2297 		color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2298 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2299 		color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2300 		*Pointer<Float4>(buffer) = color.y;
2301 		break;
2302 	case VK_FORMAT_R16G16_SFLOAT:
2303 		if((writeMask & 0x00000003) != 0x0)
2304 		{
2305 			buffer += 4 * x;
2306 
2307 			UInt2 rgbaMask;
2308 			UInt2 packedCol;
2309 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2310 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2311 
2312 			UShort4 value = *Pointer<UShort4>(buffer);
2313 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2314 			if((writeMask & 0x3) != 0x3)
2315 			{
2316 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2317 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2318 				mergedMask &= rgbaMask;
2319 			}
2320 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2321 
2322 			buffer += pitchB;
2323 
2324 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2325 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2326 			value = *Pointer<UShort4>(buffer);
2327 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2328 			if((writeMask & 0x3) != 0x3)
2329 			{
2330 				mergedMask &= rgbaMask;
2331 			}
2332 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2333 		}
2334 		break;
2335 	case VK_FORMAT_R16G16_UNORM:
2336 	case VK_FORMAT_R16G16_SINT:
2337 	case VK_FORMAT_R16G16_UINT:
2338 		if((writeMask & 0x00000003) != 0x0)
2339 		{
2340 			buffer += 4 * x;
2341 
2342 			UInt2 rgbaMask;
2343 			UShort4 packedCol = UShort4(As<Int4>(color.x));
2344 			UShort4 value = *Pointer<UShort4>(buffer);
2345 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2346 			if((writeMask & 0x3) != 0x3)
2347 			{
2348 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2349 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2350 				mergedMask &= rgbaMask;
2351 			}
2352 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2353 
2354 			buffer += pitchB;
2355 
2356 			packedCol = UShort4(As<Int4>(color.y));
2357 			value = *Pointer<UShort4>(buffer);
2358 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2359 			if((writeMask & 0x3) != 0x3)
2360 			{
2361 				mergedMask &= rgbaMask;
2362 			}
2363 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2364 		}
2365 		break;
2366 	case VK_FORMAT_R8G8_SINT:
2367 	case VK_FORMAT_R8G8_UINT:
2368 	case VK_FORMAT_R8G8_UNORM:
2369 		if((writeMask & 0x00000003) != 0x0)
2370 		{
2371 			buffer += 2 * x;
2372 
2373 			Int2 xyzw, packedCol;
2374 
2375 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2376 			buffer += pitchB;
2377 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2378 
2379 			if(format == VK_FORMAT_R8G8_SINT)
2380 			{
2381 				packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2382 			}
2383 			else
2384 			{
2385 				packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2386 			}
2387 
2388 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2389 			if((writeMask & 0x3) != 0x3)
2390 			{
2391 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (writeMask & 0x3)]));
2392 				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2393 				mergedMask &= rgbaMask;
2394 			}
2395 
2396 			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2397 
2398 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2399 			buffer -= pitchB;
2400 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2401 		}
2402 		break;
2403 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2404 	case VK_FORMAT_R32G32B32A32_SINT:
2405 	case VK_FORMAT_R32G32B32A32_UINT:
2406 		buffer += 16 * x;
2407 
2408 		{
2409 			value = *Pointer<Float4>(buffer, 16);
2410 
2411 			if(writeMask != 0x0000000F)
2412 			{
2413 				Float4 masked = value;
2414 				color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2415 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2416 				color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2417 			}
2418 
2419 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2420 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2421 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2422 			*Pointer<Float4>(buffer, 16) = color.x;
2423 		}
2424 
2425 		{
2426 			value = *Pointer<Float4>(buffer + 16, 16);
2427 
2428 			if(writeMask != 0x0000000F)
2429 			{
2430 				Float4 masked = value;
2431 				color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2432 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2433 				color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2434 			}
2435 
2436 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2437 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2438 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2439 			*Pointer<Float4>(buffer + 16, 16) = color.y;
2440 		}
2441 
2442 		buffer += pitchB;
2443 
2444 		{
2445 			value = *Pointer<Float4>(buffer, 16);
2446 
2447 			if(writeMask != 0x0000000F)
2448 			{
2449 				Float4 masked = value;
2450 				color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2451 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2452 				color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2453 			}
2454 
2455 			color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2456 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2457 			color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2458 			*Pointer<Float4>(buffer, 16) = color.z;
2459 		}
2460 
2461 		{
2462 			value = *Pointer<Float4>(buffer + 16, 16);
2463 
2464 			if(writeMask != 0x0000000F)
2465 			{
2466 				Float4 masked = value;
2467 				color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2468 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2469 				color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2470 			}
2471 
2472 			color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2473 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2474 			color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2475 			*Pointer<Float4>(buffer + 16, 16) = color.w;
2476 		}
2477 		break;
2478 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2479 		if((writeMask & 0x0000000F) != 0x0)
2480 		{
2481 			buffer += 8 * x;
2482 
2483 			UInt4 rgbaMask;
2484 			UInt4 value = *Pointer<UInt4>(buffer);
2485 			UInt4 packedCol;
2486 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2487 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2488 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
2489 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
2490 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2491 			if((writeMask & 0xF) != 0xF)
2492 			{
2493 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2494 				rgbaMask = UInt4(tmpMask, tmpMask);
2495 				mergedMask &= rgbaMask;
2496 			}
2497 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2498 
2499 			buffer += pitchB;
2500 
2501 			value = *Pointer<UInt4>(buffer);
2502 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
2503 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
2504 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
2505 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
2506 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2507 			if((writeMask & 0xF) != 0xF)
2508 			{
2509 				mergedMask &= rgbaMask;
2510 			}
2511 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2512 		}
2513 		break;
2514 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2515 		if((writeMask & 0x7) != 0x0)
2516 		{
2517 			buffer += 4 * x;
2518 
2519 			UInt4 packedCol;
2520 			packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
2521 			packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
2522 			packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
2523 			packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
2524 
2525 			UInt4 value;
2526 			value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
2527 			value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
2528 			buffer += pitchB;
2529 			value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
2530 			value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
2531 
2532 			UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0]) + xMask * 16, 16);
2533 			if((writeMask & 0x7) != 0x7)
2534 			{
2535 				mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[writeMask & 0x7]), 16);
2536 			}
2537 			value = (packedCol & mask) | (value & ~mask);
2538 
2539 			*Pointer<UInt>(buffer + 0) = value.z;
2540 			*Pointer<UInt>(buffer + 4) = value.w;
2541 			buffer -= pitchB;
2542 			*Pointer<UInt>(buffer + 0) = value.x;
2543 			*Pointer<UInt>(buffer + 4) = value.y;
2544 		}
2545 		break;
2546 	case VK_FORMAT_R16G16B16A16_UNORM:
2547 	case VK_FORMAT_R16G16B16A16_SINT:
2548 	case VK_FORMAT_R16G16B16A16_UINT:
2549 		if((writeMask & 0x0000000F) != 0x0)
2550 		{
2551 			buffer += 8 * x;
2552 
2553 			UInt4 rgbaMask;
2554 			UShort8 value = *Pointer<UShort8>(buffer);
2555 			UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
2556 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2557 			if((writeMask & 0xF) != 0xF)
2558 			{
2559 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2560 				rgbaMask = UInt4(tmpMask, tmpMask);
2561 				mergedMask &= rgbaMask;
2562 			}
2563 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2564 
2565 			buffer += pitchB;
2566 
2567 			value = *Pointer<UShort8>(buffer);
2568 			packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
2569 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2570 			if((writeMask & 0xF) != 0xF)
2571 			{
2572 				mergedMask &= rgbaMask;
2573 			}
2574 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2575 		}
2576 		break;
2577 	case VK_FORMAT_B8G8R8A8_UNORM:
2578 	case VK_FORMAT_B8G8R8A8_SRGB:
2579 	case VK_FORMAT_R8G8B8A8_SINT:
2580 	case VK_FORMAT_R8G8B8A8_UINT:
2581 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2582 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2583 	case VK_FORMAT_R8G8B8A8_UNORM:
2584 	case VK_FORMAT_R8G8B8A8_SRGB:
2585 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2586 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2587 		if((writeMask & 0x0000000F) != 0x0)
2588 		{
2589 			UInt2 value, packedCol, mergedMask;
2590 
2591 			buffer += 4 * x;
2592 
2593 			bool isSigned = !format.isUnsigned();
2594 
2595 			if(isSigned)
2596 			{
2597 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2598 			}
2599 			else
2600 			{
2601 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2602 			}
2603 			value = *Pointer<UInt2>(buffer, 16);
2604 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2605 			if(writeMask != 0xF)
2606 			{
2607 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2608 			}
2609 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2610 
2611 			buffer += pitchB;
2612 
2613 			if(isSigned)
2614 			{
2615 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2616 			}
2617 			else
2618 			{
2619 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2620 			}
2621 			value = *Pointer<UInt2>(buffer, 16);
2622 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2623 			if(writeMask != 0xF)
2624 			{
2625 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2626 			}
2627 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2628 		}
2629 		break;
2630 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2631 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2632 		if((writeMask & 0x0000000F) != 0x0)
2633 		{
2634 			Int2 mergedMask, packedCol, value;
2635 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2636 			              ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
2637 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2638 			              ((As<Int4>(color.x) & Int4(0x3ff)));
2639 
2640 			buffer += 4 * x;
2641 			value = *Pointer<Int2>(buffer, 16);
2642 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2643 			if(writeMask != 0xF)
2644 			{
2645 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2646 			}
2647 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2648 
2649 			buffer += pitchB;
2650 
2651 			value = *Pointer<Int2>(buffer, 16);
2652 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2653 			if(writeMask != 0xF)
2654 			{
2655 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2656 			}
2657 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2658 		}
2659 		break;
2660 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2661 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2662 		if((writeMask & 0x0000000F) != 0x0)
2663 		{
2664 			Int2 mergedMask, packedCol, value;
2665 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2666 			              ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
2667 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2668 			              ((As<Int4>(color.z) & Int4(0x3ff)));
2669 
2670 			buffer += 4 * x;
2671 			value = *Pointer<Int2>(buffer, 16);
2672 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2673 			if(writeMask != 0xF)
2674 			{
2675 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2676 			}
2677 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2678 
2679 			buffer += pitchB;
2680 
2681 			value = *Pointer<Int2>(buffer, 16);
2682 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2683 			if(writeMask != 0xF)
2684 			{
2685 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2686 			}
2687 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2688 		}
2689 		break;
2690 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2691 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2692 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2693 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2694 		{
2695 			buffer += 2 * x;
2696 			Int value = *Pointer<Int>(buffer);
2697 
2698 			Int channelMask;
2699 			Short4 current;
2700 			switch(format)
2701 			{
2702 			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2703 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2704 				current = (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 12 |
2705 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2706 				          (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 4 |
2707 				          (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2708 				break;
2709 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2710 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2711 				current = (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 12 |
2712 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2713 				          (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 4 |
2714 				          (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2715 				break;
2716 			case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2717 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2718 				current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2719 				          (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 8 |
2720 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2721 				          (UShort4(As<Int4>(color.z)) & UShort4(0xF));
2722 				break;
2723 			case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2724 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2725 				current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2726 				          (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 8 |
2727 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2728 				          (UShort4(As<Int4>(color.x)) & UShort4(0xF));
2729 				break;
2730 			default:
2731 				UNREACHABLE("Format: %s", vk::Stringify(format).c_str());
2732 			}
2733 
2734 			Int c01 = Extract(As<Int2>(current), 0);
2735 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2736 			if(writeMask != 0x0000000F)
2737 			{
2738 				mask01 &= channelMask;
2739 			}
2740 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2741 
2742 			buffer += pitchB;
2743 			value = *Pointer<Int>(buffer);
2744 
2745 			Int c23 = Extract(As<Int2>(current), 1);
2746 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2747 			if(writeMask != 0x0000000F)
2748 			{
2749 				mask23 &= channelMask;
2750 			}
2751 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2752 		}
2753 		break;
2754 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2755 		{
2756 			buffer += 2 * x;
2757 			Int value = *Pointer<Int>(buffer);
2758 
2759 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[writeMask][0]));
2760 			Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11 |
2761 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2762 			                 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 1 |
2763 			                 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2764 
2765 			Int c01 = Extract(As<Int2>(current), 0);
2766 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2767 			if(writeMask != 0x0000000F)
2768 			{
2769 				mask01 &= channelMask;
2770 			}
2771 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2772 
2773 			buffer += pitchB;
2774 			value = *Pointer<Int>(buffer);
2775 
2776 			Int c23 = Extract(As<Int2>(current), 1);
2777 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2778 			if(writeMask != 0x0000000F)
2779 			{
2780 				mask23 &= channelMask;
2781 			}
2782 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2783 		}
2784 		break;
2785 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2786 		{
2787 			buffer += 2 * x;
2788 			Int value = *Pointer<Int>(buffer);
2789 
2790 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[writeMask][0]));
2791 			Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11 |
2792 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2793 			                 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 1 |
2794 			                 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2795 
2796 			Int c01 = Extract(As<Int2>(current), 0);
2797 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2798 			if(writeMask != 0x0000000F)
2799 			{
2800 				mask01 &= channelMask;
2801 			}
2802 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2803 
2804 			buffer += pitchB;
2805 			value = *Pointer<Int>(buffer);
2806 
2807 			Int c23 = Extract(As<Int2>(current), 1);
2808 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2809 			if(writeMask != 0x0000000F)
2810 			{
2811 				mask23 &= channelMask;
2812 			}
2813 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2814 		}
2815 		break;
2816 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2817 		{
2818 			buffer += 2 * x;
2819 			Int value = *Pointer<Int>(buffer);
2820 
2821 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[writeMask][0]));
2822 			Short4 current = (UShort4(As<Int4>(color.w)) & UShort4(0x1)) << 15 |
2823 			                 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 10 |
2824 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 5 |
2825 			                 (UShort4(As<Int4>(color.z)) & UShort4(0x1F));
2826 
2827 			Int c01 = Extract(As<Int2>(current), 0);
2828 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2829 			if(writeMask != 0x0000000F)
2830 			{
2831 				mask01 &= channelMask;
2832 			}
2833 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2834 
2835 			buffer += pitchB;
2836 			value = *Pointer<Int>(buffer);
2837 
2838 			Int c23 = Extract(As<Int2>(current), 1);
2839 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2840 			if(writeMask != 0x0000000F)
2841 			{
2842 				mask23 &= channelMask;
2843 			}
2844 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2845 		}
2846 		break;
2847 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
2848 		{
2849 			buffer += 2 * x;
2850 			Int value = *Pointer<Int>(buffer);
2851 
2852 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2853 			Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) |
2854 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2855 			                 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11;
2856 
2857 			Int c01 = Extract(As<Int2>(current), 0);
2858 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2859 			if((writeMask & 0x00000007) != 0x00000007)
2860 			{
2861 				mask01 &= channelMask;
2862 			}
2863 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2864 
2865 			buffer += pitchB;
2866 			value = *Pointer<Int>(buffer);
2867 
2868 			Int c23 = Extract(As<Int2>(current), 1);
2869 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2870 			if((writeMask & 0x00000007) != 0x00000007)
2871 			{
2872 				mask23 &= channelMask;
2873 			}
2874 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2875 		}
2876 		break;
2877 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
2878 		{
2879 			buffer += 2 * x;
2880 			Int value = *Pointer<Int>(buffer);
2881 
2882 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2883 			Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) |
2884 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2885 			                 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11;
2886 
2887 			Int c01 = Extract(As<Int2>(current), 0);
2888 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2889 			if((writeMask & 0x00000007) != 0x00000007)
2890 			{
2891 				mask01 &= channelMask;
2892 			}
2893 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2894 
2895 			buffer += pitchB;
2896 			value = *Pointer<Int>(buffer);
2897 
2898 			Int c23 = Extract(As<Int2>(current), 1);
2899 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2900 			if((writeMask & 0x00000007) != 0x00000007)
2901 			{
2902 				mask23 &= channelMask;
2903 			}
2904 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2905 		}
2906 		break;
2907 	default:
2908 		UNSUPPORTED("VkFormat: %d", int(format));
2909 	}
2910 }
2911 
2912 }  // namespace sw
2913