1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "System/Math.hpp"
24 #include "Vulkan/VkPipelineLayout.hpp"
25 #include "Vulkan/VkStringify.hpp"
26
27 namespace sw {
28
PixelRoutine(const PixelProcessor::State & state,const vk::PipelineLayout * pipelineLayout,const SpirvShader * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)29 PixelRoutine::PixelRoutine(
30 const PixelProcessor::State &state,
31 const vk::PipelineLayout *pipelineLayout,
32 const SpirvShader *spirvShader,
33 const vk::DescriptorSet::Bindings &descriptorSets)
34 : QuadRasterizer(state, spirvShader)
35 , routine(pipelineLayout)
36 , descriptorSets(descriptorSets)
37 , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
38 , shaderContainsSampleQualifier(spirvShader && spirvShader->getAnalysis().ContainsSampleQualifier)
39 , perSampleShading((state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f)) ||
40 shaderContainsSampleQualifier || shaderContainsInterpolation) // TODO(b/194714095)
41 , invocationCount(perSampleShading ? state.multiSampleCount : 1)
42 {
43 if(spirvShader)
44 {
45 spirvShader->emitProlog(&routine);
46 }
47 }
48
~PixelRoutine()49 PixelRoutine::~PixelRoutine()
50 {
51 }
52
getSampleSet(int invocation) const53 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
54 {
55 unsigned int sampleBegin = perSampleShading ? invocation : 0;
56 unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
57
58 SampleSet samples;
59
60 for(unsigned int q = sampleBegin; q < sampleEnd; q++)
61 {
62 if(state.multiSampleMask & (1 << q))
63 {
64 samples.push_back(q);
65 }
66 }
67
68 return samples;
69 }
70
quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)71 void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
72 {
73 const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;
74
75 Int zMask[4]; // Depth mask
76 Int sMask[4]; // Stencil mask
77 SIMD::Float unclampedZ[4];
78
79 for(int invocation = 0; invocation < invocationCount; invocation++)
80 {
81 SampleSet samples = getSampleSet(invocation);
82
83 if(samples.empty())
84 {
85 continue;
86 }
87
88 for(unsigned int q : samples)
89 {
90 zMask[q] = cMask[q];
91 sMask[q] = cMask[q];
92 }
93
94 stencilTest(sBuffer, x, sMask, samples);
95
96 SIMD::Float rhwCentroid;
97
98 // Compute the x coordinate of each fragment in the SIMD group.
99 const auto xMorton = SIMD::Float([](int i) { return float(compactEvenBits(i)); }); // 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, ...
100 xFragment = SIMD::Float(Float(x)) + xMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)));
101
102 if(interpolateZ())
103 {
104 for(unsigned int q : samples)
105 {
106 SIMD::Float x = xFragment;
107
108 if(state.enableMultiSampling)
109 {
110 x -= SIMD::Float(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsX) + q * sizeof(float)));
111 }
112
113 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
114
115 if(state.depthBias)
116 {
117 z[q] += SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, zBias)));
118 }
119
120 unclampedZ[q] = z[q];
121 }
122 }
123
124 Bool depthPass = false;
125
126 if(earlyFragmentTests)
127 {
128 for(unsigned int q : samples)
129 {
130 z[q] = clampDepth(z[q]);
131 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
132 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
133 }
134
135 writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
136 }
137
138 If(depthPass || !earlyFragmentTests)
139 {
140 if(earlyFragmentTests)
141 {
142 writeDepth(zBuffer, x, zMask, samples);
143 occlusionSampleCount(zMask, sMask, samples);
144 }
145
146 // TODO(b/236162233): Use SIMD::Float2
147 SIMD::Float xCentroid = 0.0f;
148 SIMD::Float yCentroid = 0.0f;
149
150 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
151 {
152 SIMD::Float weight = 1.0e-9f;
153
154 for(unsigned int q : samples)
155 {
156 ASSERT(SIMD::Width == 4);
157 xCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]));
158 yCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]));
159 weight += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]));
160 }
161
162 weight = Rcp(weight, true /* relaxedPrecision */);
163 xCentroid *= weight;
164 yCentroid *= weight;
165
166 xCentroid += xFragment;
167 yCentroid += yFragment;
168 }
169
170 if(interpolateW())
171 {
172 w = interpolate(xFragment, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
173 rhw = reciprocal(w, false, true);
174
175 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
176 {
177 rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid, primitive + OFFSET(Primitive, w), SpirvRoutine::Linear));
178 }
179 }
180
181 if(spirvShader)
182 {
183 if(shaderContainsInterpolation) // TODO(b/194714095)
184 {
185 routine.interpolationData.primitive = primitive;
186
187 routine.interpolationData.x = xFragment;
188 routine.interpolationData.y = yFragment;
189 routine.interpolationData.rhw = rhw;
190
191 routine.interpolationData.xCentroid = xCentroid;
192 routine.interpolationData.yCentroid = yCentroid;
193 routine.interpolationData.rhwCentroid = rhwCentroid;
194 }
195
196 SIMD::Float xSample = xFragment;
197 SIMD::Float ySample = yFragment;
198
199 if(perSampleShading && (state.multiSampleCount > 1))
200 {
201 xSample += SampleLocationsX[samples[0]];
202 ySample += SampleLocationsY[samples[0]];
203 }
204
205 int packedInterpolant = 0;
206 for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
207 {
208 const auto &input = spirvShader->inputs[interfaceInterpolant];
209 if(input.Type != Spirv::ATTRIBTYPE_UNUSED)
210 {
211 routine.inputsInterpolation[packedInterpolant] = input.Flat ? SpirvRoutine::Flat : (input.NoPerspective ? SpirvRoutine::Linear : SpirvRoutine::Perspective);
212 if(input.Centroid && state.enableMultiSampling)
213 {
214 routine.inputs[interfaceInterpolant] =
215 SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid,
216 primitive + OFFSET(Primitive, V[packedInterpolant]),
217 routine.inputsInterpolation[packedInterpolant]);
218 }
219 else if(perSampleShading)
220 {
221 routine.inputs[interfaceInterpolant] =
222 SpirvRoutine::interpolateAtXY(xSample, ySample, rhw,
223 primitive + OFFSET(Primitive, V[packedInterpolant]),
224 routine.inputsInterpolation[packedInterpolant]);
225 }
226 else
227 {
228 routine.inputs[interfaceInterpolant] =
229 interpolate(xFragment, Dv[interfaceInterpolant], rhw,
230 primitive + OFFSET(Primitive, V[packedInterpolant]),
231 input.Flat, !input.NoPerspective);
232 }
233 packedInterpolant++;
234 }
235 }
236
237 setBuiltins(x, y, unclampedZ, w, cMask, samples);
238
239 for(uint32_t i = 0; i < state.numClipDistances; i++)
240 {
241 auto distance = interpolate(xFragment, DclipDistance[i], rhw,
242 primitive + OFFSET(Primitive, clipDistance[i]),
243 false, true);
244
245 auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
246 for(unsigned int q : samples)
247 {
248 // FIXME(b/148105887): Fragments discarded by clipping do not exist at
249 // all -- they should not be counted in queries or have their Z/S effects
250 // performed when early fragment tests are enabled.
251 cMask[q] &= clipMask;
252 }
253
254 if(spirvShader->getUsedCapabilities().ClipDistance)
255 {
256 auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
257 if(it != spirvShader->inputBuiltins.end())
258 {
259 if(i < it->second.SizeInComponents)
260 {
261 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
262 }
263 }
264 }
265 }
266
267 if(spirvShader->getUsedCapabilities().CullDistance)
268 {
269 auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
270 if(it != spirvShader->inputBuiltins.end())
271 {
272 for(uint32_t i = 0; i < state.numCullDistances; i++)
273 {
274 if(i < it->second.SizeInComponents)
275 {
276 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
277 interpolate(xFragment, DcullDistance[i], rhw,
278 primitive + OFFSET(Primitive, cullDistance[i]),
279 false, true);
280 }
281 }
282 }
283 }
284 }
285
286 if(spirvShader)
287 {
288 executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
289 }
290
291 Bool alphaPass = alphaTest(cMask, samples);
292
293 if((spirvShader && spirvShader->coverageModified()) || state.alphaToCoverage)
294 {
295 for(unsigned int q : samples)
296 {
297 zMask[q] &= cMask[q];
298 sMask[q] &= cMask[q];
299 }
300 }
301
302 If(alphaPass)
303 {
304 if(!earlyFragmentTests)
305 {
306 for(unsigned int q : samples)
307 {
308 z[q] = clampDepth(z[q]);
309 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
310 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
311 }
312 }
313
314 If(depthPass)
315 {
316 if(!earlyFragmentTests)
317 {
318 writeDepth(zBuffer, x, zMask, samples);
319 occlusionSampleCount(zMask, sMask, samples);
320 }
321
322 blendColor(cBuffer, x, sMask, zMask, cMask, samples);
323 }
324 }
325 }
326
327 if(!earlyFragmentTests)
328 {
329 writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
330 }
331 }
332 }
333
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)334 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
335 {
336 if(!state.stencilActive)
337 {
338 return;
339 }
340
341 for(unsigned int q : samples)
342 {
343 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
344
345 Pointer<Byte> buffer = sBuffer + x;
346
347 if(q > 0)
348 {
349 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
350 }
351
352 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
353 Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
354 value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
355 Byte8 valueBack = value;
356
357 if(state.frontStencil.useCompareMask)
358 {
359 value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
360 }
361
362 stencilTest(value, state.frontStencil.compareOp, false);
363
364 if(state.backStencil.useCompareMask)
365 {
366 valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
367 }
368
369 stencilTest(valueBack, state.backStencil.compareOp, true);
370
371 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
372 valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
373 value |= valueBack;
374
375 sMask[q] &= SignMask(value);
376 }
377 }
378
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)379 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
380 {
381 Byte8 equal;
382
383 switch(stencilCompareMode)
384 {
385 case VK_COMPARE_OP_ALWAYS:
386 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
387 break;
388 case VK_COMPARE_OP_NEVER:
389 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
390 break;
391 case VK_COMPARE_OP_LESS: // a < b ~ b > a
392 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
393 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
394 break;
395 case VK_COMPARE_OP_EQUAL:
396 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
397 break;
398 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b)
399 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
400 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
401 break;
402 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ (b > a) || (a == b)
403 equal = value;
404 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
405 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
406 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
407 value |= equal;
408 break;
409 case VK_COMPARE_OP_GREATER: // a > b
410 equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
411 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
412 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
413 value = equal;
414 break;
415 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ !(a < b) ~ !(b > a)
416 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
417 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
418 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
419 break;
420 default:
421 UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
422 }
423 }
424
readDepth32F(const Pointer<Byte> & zBuffer,int q,const Int & x) const425 SIMD::Float PixelRoutine::readDepth32F(const Pointer<Byte> &zBuffer, int q, const Int &x) const
426 {
427 ASSERT(SIMD::Width == 4);
428 Pointer<Byte> buffer = zBuffer + 4 * x;
429 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
430
431 if(q > 0)
432 {
433 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
434 }
435
436 Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
437 return SIMD::Float(zValue);
438 }
439
readDepth16(const Pointer<Byte> & zBuffer,int q,const Int & x) const440 SIMD::Float PixelRoutine::readDepth16(const Pointer<Byte> &zBuffer, int q, const Int &x) const
441 {
442 ASSERT(SIMD::Width == 4);
443 Pointer<Byte> buffer = zBuffer + 2 * x;
444 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
445
446 if(q > 0)
447 {
448 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
449 }
450
451 UShort4 zValue16;
452 zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer), 0));
453 zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer + pitch), 1));
454 Float4 zValue = Float4(zValue16);
455 return SIMD::Float(zValue);
456 }
457
// Clamps `z` to [state.minDepthClamp, state.maxDepthClamp] when depth clamping is
// enabled; otherwise returns `z` unchanged.
SIMD::Float PixelRoutine::clampDepth(const SIMD::Float &z)
{
    if(state.depthClamp)
    {
        return Min(Max(z, state.minDepthClamp), state.maxDepthClamp);
    }

    return z;
}
467
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const SIMD::Float & z,const Int & sMask,Int & zMask,const Int & cMask)468 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
469 {
470 if(!state.depthTestActive)
471 {
472 return true;
473 }
474
475 SIMD::Float Z;
476 SIMD::Float zValue;
477
478 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
479 {
480 switch(state.depthFormat)
481 {
482 case VK_FORMAT_D16_UNORM:
483 Z = Min(Max(Round(z * 0xFFFF), 0.0f), 0xFFFF);
484 zValue = readDepth16(zBuffer, q, x);
485 break;
486 case VK_FORMAT_D32_SFLOAT:
487 case VK_FORMAT_D32_SFLOAT_S8_UINT:
488 Z = z;
489 zValue = readDepth32F(zBuffer, q, x);
490 break;
491 default:
492 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
493 return false;
494 }
495 }
496
497 SIMD::Int zTest;
498
499 switch(state.depthCompareMode)
500 {
501 case VK_COMPARE_OP_ALWAYS:
502 // Optimized
503 break;
504 case VK_COMPARE_OP_NEVER:
505 // Optimized
506 break;
507 case VK_COMPARE_OP_EQUAL:
508 zTest = CmpEQ(zValue, Z);
509 break;
510 case VK_COMPARE_OP_NOT_EQUAL:
511 zTest = CmpNEQ(zValue, Z);
512 break;
513 case VK_COMPARE_OP_LESS:
514 zTest = CmpNLE(zValue, Z);
515 break;
516 case VK_COMPARE_OP_GREATER_OR_EQUAL:
517 zTest = CmpLE(zValue, Z);
518 break;
519 case VK_COMPARE_OP_LESS_OR_EQUAL:
520 zTest = CmpNLT(zValue, Z);
521 break;
522 case VK_COMPARE_OP_GREATER:
523 zTest = CmpLT(zValue, Z);
524 break;
525 default:
526 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
527 }
528
529 switch(state.depthCompareMode)
530 {
531 case VK_COMPARE_OP_ALWAYS:
532 zMask = cMask;
533 break;
534 case VK_COMPARE_OP_NEVER:
535 zMask = 0x0;
536 break;
537 default:
538 zMask = SignMask(zTest) & cMask;
539 break;
540 }
541
542 if(state.stencilActive)
543 {
544 zMask &= sMask;
545 }
546
547 return zMask != 0;
548 }
549
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)550 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
551 {
552 Pointer<Byte> buffer = zBuffer + 2 * x;
553 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
554
555 if(q > 0)
556 {
557 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
558 }
559
560 Float4 minDepthBound(state.minDepthBounds);
561 Float4 maxDepthBound(state.maxDepthBounds);
562
563 Int2 z;
564 z = Insert(z, *Pointer<Int>(buffer), 0);
565 z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
566
567 Float4 zValue = Float4(As<UShort4>(z)) * (1.0f / 0xFFFF);
568 return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
569 }
570
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)571 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
572 {
573 Pointer<Byte> buffer = zBuffer + 4 * x;
574 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
575
576 if(q > 0)
577 {
578 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
579 }
580
581 Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
582 return Int4(CmpLE(state.minDepthBounds, zValue) & CmpLE(zValue, state.maxDepthBounds));
583 }
584
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)585 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
586 {
587 if(!state.depthBoundsTestActive)
588 {
589 return;
590 }
591
592 Int4 zTest;
593 switch(state.depthFormat)
594 {
595 case VK_FORMAT_D16_UNORM:
596 zTest = depthBoundsTest16(zBuffer, q, x);
597 break;
598 case VK_FORMAT_D32_SFLOAT:
599 case VK_FORMAT_D32_SFLOAT_S8_UINT:
600 zTest = depthBoundsTest32F(zBuffer, q, x);
601 break;
602 default:
603 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
604 break;
605 }
606
607 if(!state.depthTestActive)
608 {
609 cMask &= zMask & SignMask(zTest);
610 }
611 else
612 {
613 zMask &= cMask & SignMask(zTest);
614 }
615 }
616
alphaToCoverage(Int cMask[4],const SIMD::Float & alpha,const SampleSet & samples)617 void PixelRoutine::alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples)
618 {
619 static const int a2c[4] = {
620 OFFSET(DrawData, a2c0),
621 OFFSET(DrawData, a2c1),
622 OFFSET(DrawData, a2c2),
623 OFFSET(DrawData, a2c3),
624 };
625
626 for(unsigned int q : samples)
627 {
628 SIMD::Int coverage = CmpNLT(alpha, SIMD::Float(*Pointer<Float>(data + a2c[q])));
629 Int aMask = SignMask(coverage);
630 cMask[q] &= aMask;
631 }
632 }
633
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)634 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
635 {
636 Float4 Z = z;
637
638 Pointer<Byte> buffer = zBuffer + 4 * x;
639 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
640
641 if(q > 0)
642 {
643 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
644 }
645
646 Float4 zValue;
647
648 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
649 {
650 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
651 }
652
653 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
654 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
655 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
656
657 *Pointer<Float2>(buffer) = Float2(Z.xy);
658 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
659 }
660
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)661 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
662 {
663 Short4 Z = UShort4(Round(z * 0xFFFF), true);
664
665 Pointer<Byte> buffer = zBuffer + 2 * x;
666 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
667
668 if(q > 0)
669 {
670 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
671 }
672
673 Short4 zValue;
674
675 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
676 {
677 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
678 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
679 }
680
681 Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
682 zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
683 Z = Z | zValue;
684
685 *Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
686 *Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
687 }
688
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)689 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
690 {
691 if(!state.depthWriteEnable)
692 {
693 return;
694 }
695
696 for(unsigned int q : samples)
697 {
698 ASSERT(SIMD::Width == 4);
699 switch(state.depthFormat)
700 {
701 case VK_FORMAT_D16_UNORM:
702 writeDepth16(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
703 break;
704 case VK_FORMAT_D32_SFLOAT:
705 case VK_FORMAT_D32_SFLOAT_S8_UINT:
706 writeDepth32F(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
707 break;
708 default:
709 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
710 break;
711 }
712 }
713 }
714
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)715 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
716 {
717 if(!state.occlusionEnabled)
718 {
719 return;
720 }
721
722 for(unsigned int q : samples)
723 {
724 occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
725 }
726 }
727
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)728 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
729 {
730 if(!state.stencilActive)
731 {
732 return;
733 }
734
735 if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
736 {
737 if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
738 {
739 return;
740 }
741 }
742
743 if(!state.frontStencil.writeEnabled && !state.backStencil.writeEnabled)
744 {
745 return;
746 }
747
748 for(unsigned int q : samples)
749 {
750 Pointer<Byte> buffer = sBuffer + x;
751
752 if(q > 0)
753 {
754 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
755 }
756
757 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
758 Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
759 bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
760 Byte8 newValue = stencilOperation(bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
761
762 if(state.frontStencil.useWriteMask) // Assume 8-bit stencil buffer
763 {
764 Byte8 maskedValue = bufferValue;
765 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
766 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
767 newValue |= maskedValue;
768 }
769
770 Byte8 newValueBack = stencilOperation(bufferValue, state.backStencil, true, zMask[q], sMask[q]);
771
772 if(state.backStencil.useWriteMask) // Assume 8-bit stencil buffer
773 {
774 Byte8 maskedValue = bufferValue;
775 newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
776 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
777 newValueBack |= maskedValue;
778 }
779
780 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
781 newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
782 newValue |= newValueBack;
783
784 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
785 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
786 newValue |= bufferValue;
787
788 *Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
789 *Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
790 }
791 }
792
// Computes the new stencil values for one face by blending the results of the
// pass, depth-fail and stencil-fail operations per pixel.
//
// Pixels set in `zMask` take the pass result, the others the depth-fail result;
// then pixels cleared in `sMask` take the stencil-fail result. Operations that
// are identical to one another are not computed separately.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
{
    Byte8 result = stencilOperation(bufferValue, ops.passOp, isBack);

    const bool depthFailDiffers = state.depthTestActive && ops.depthFailOp != ops.passOp;  // zMask valid and values not the same
    if(depthFailDiffers)
    {
        Byte8 depthFailed = stencilOperation(bufferValue, ops.depthFailOp, isBack);

        result &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
        depthFailed &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
        result |= depthFailed;
    }

    const bool stencilFailDiffers = ops.failOp != ops.passOp || (state.depthTestActive && ops.failOp != ops.depthFailOp);
    if(stencilFailDiffers)
    {
        Byte8 stencilFailed = stencilOperation(bufferValue, ops.failOp, isBack);

        result &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
        stencilFailed &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
        result |= stencilFailed;
    }

    return result;
}
817
hasStencilReplaceRef() const818 bool PixelRoutine::hasStencilReplaceRef() const
819 {
820 return spirvShader &&
821 (spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT) !=
822 spirvShader->outputBuiltins.end());
823 }
824
stencilReplaceRef()825 Byte8 PixelRoutine::stencilReplaceRef()
826 {
827 ASSERT(spirvShader);
828
829 auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
830 ASSERT(it != spirvShader->outputBuiltins.end());
831
832 UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
833 // TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
834 // following line by either adding a rr::Shuffle() variant to do
835 // it explicitly or adding a Byte4(Int4) constructor would work.
836 sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
837
838 UInt2 sRefDuplicated;
839 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
840 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
841 return As<Byte8>(sRefDuplicated);
842 }
843
// Applies a single stencil operation to the buffer values and returns the result.
//
// When the shader exports a stencil reference, that value is returned for every
// operation. For an unsupported operation, UNSUPPORTED is reported and zero is
// returned.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
{
    if(hasStencilReplaceRef())
    {
        return stencilReplaceRef();
    }

    switch(operation)
    {
    case VK_STENCIL_OP_KEEP:
        return bufferValue;
    case VK_STENCIL_OP_ZERO:
        return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    case VK_STENCIL_OP_REPLACE:
        return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
    case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
        return AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
        return SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
    case VK_STENCIL_OP_INVERT:
        return bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    case VK_STENCIL_OP_INCREMENT_AND_WRAP:
        return bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    case VK_STENCIL_OP_DECREMENT_AND_WRAP:
        return bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
    default:
        UNSUPPORTED("VkStencilOp: %d", int(operation));
    }

    // Only reached for an unsupported operation.
    return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
}
877
isSRGB(int index) const878 bool PixelRoutine::isSRGB(int index) const
879 {
880 return vk::Format(state.colorFormat[index]).isSRGBformat();
881 }
882
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)883 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
884 {
885 Short4 c01;
886 Short4 c23;
887 Pointer<Byte> buffer = cBuffer;
888 Pointer<Byte> buffer2;
889
890 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
891
892 vk::Format format = state.colorFormat[index];
893 switch(format)
894 {
895 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
896 buffer += 2 * x;
897 buffer2 = buffer + pitchB;
898 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
899
900 pixel.x = (c01 & Short4(0xF000u));
901 pixel.y = (c01 & Short4(0x0F00u)) << 4;
902 pixel.z = (c01 & Short4(0x00F0u)) << 8;
903 pixel.w = (c01 & Short4(0x000Fu)) << 12;
904
905 // Expand to 16 bit range
906 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
907 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
908 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
909 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
910 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
911 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
912 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
913 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
914 break;
915 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
916 buffer += 2 * x;
917 buffer2 = buffer + pitchB;
918 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
919
920 pixel.z = (c01 & Short4(0xF000u));
921 pixel.y = (c01 & Short4(0x0F00u)) << 4;
922 pixel.x = (c01 & Short4(0x00F0u)) << 8;
923 pixel.w = (c01 & Short4(0x000Fu)) << 12;
924
925 // Expand to 16 bit range
926 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
927 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
928 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
929 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
930 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
931 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
932 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
933 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
934 break;
935 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
936 buffer += 2 * x;
937 buffer2 = buffer + pitchB;
938 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
939
940 pixel.w = (c01 & Short4(0xF000u));
941 pixel.z = (c01 & Short4(0x0F00u)) << 4;
942 pixel.y = (c01 & Short4(0x00F0u)) << 8;
943 pixel.x = (c01 & Short4(0x000Fu)) << 12;
944
945 // Expand to 16 bit range
946 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
947 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
948 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
949 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
950 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
951 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
952 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
953 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
954 break;
955 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
956 buffer += 2 * x;
957 buffer2 = buffer + pitchB;
958 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
959
960 pixel.w = (c01 & Short4(0xF000u));
961 pixel.x = (c01 & Short4(0x0F00u)) << 4;
962 pixel.y = (c01 & Short4(0x00F0u)) << 8;
963 pixel.z = (c01 & Short4(0x000Fu)) << 12;
964
965 // Expand to 16 bit range
966 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
967 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
968 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
969 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
970 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
971 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
972 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
973 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
974 break;
975 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
976 buffer += 2 * x;
977 buffer2 = buffer + pitchB;
978 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
979
980 pixel.x = (c01 & Short4(0xF800u));
981 pixel.y = (c01 & Short4(0x07C0u)) << 5;
982 pixel.z = (c01 & Short4(0x003Eu)) << 10;
983 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
984
985 // Expand to 16 bit range
986 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
987 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
988 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
989 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
990 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
991 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
992 break;
993 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
994 buffer += 2 * x;
995 buffer2 = buffer + pitchB;
996 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
997
998 pixel.z = (c01 & Short4(0xF800u));
999 pixel.y = (c01 & Short4(0x07C0u)) << 5;
1000 pixel.x = (c01 & Short4(0x003Eu)) << 10;
1001 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1002
1003 // Expand to 16 bit range
1004 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1005 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1006 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1007 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1008 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1009 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1010 break;
1011 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1012 buffer += 2 * x;
1013 buffer2 = buffer + pitchB;
1014 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1015
1016 pixel.x = (c01 & Short4(0x7C00u)) << 1;
1017 pixel.y = (c01 & Short4(0x03E0u)) << 6;
1018 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1019 pixel.w = (c01 & Short4(0x8000u)) >> 15;
1020
1021 // Expand to 16 bit range
1022 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1023 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1024 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1025 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1026 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1027 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1028 break;
1029 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1030 buffer += 2 * x;
1031 buffer2 = buffer + pitchB;
1032 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1033
1034 pixel.x = c01 & Short4(0xF800u);
1035 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1036 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1037 pixel.w = Short4(0xFFFFu);
1038
1039 // Expand to 16 bit range
1040 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1041 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1042 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1043 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1044 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1045 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1046 break;
1047 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1048 buffer += 2 * x;
1049 buffer2 = buffer + pitchB;
1050 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1051
1052 pixel.z = c01 & Short4(0xF800u);
1053 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1054 pixel.x = (c01 & Short4(0x001Fu)) << 11;
1055 pixel.w = Short4(0xFFFFu);
1056
1057 // Expand to 16 bit range
1058 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1059 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1060 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1061 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1062 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1063 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1064 break;
1065 case VK_FORMAT_B8G8R8A8_UNORM:
1066 case VK_FORMAT_B8G8R8A8_SRGB:
1067 buffer += 4 * x;
1068 c01 = *Pointer<Short4>(buffer);
1069 buffer += pitchB;
1070 c23 = *Pointer<Short4>(buffer);
1071 pixel.z = c01;
1072 pixel.y = c01;
1073 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1074 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1075 pixel.x = pixel.z;
1076 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1077 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1078 pixel.y = pixel.z;
1079 pixel.w = pixel.x;
1080 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1081 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1082 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1083 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1084 break;
1085 case VK_FORMAT_R8G8B8A8_UNORM:
1086 case VK_FORMAT_R8G8B8A8_SRGB:
1087 buffer += 4 * x;
1088 c01 = *Pointer<Short4>(buffer);
1089 buffer += pitchB;
1090 c23 = *Pointer<Short4>(buffer);
1091 pixel.z = c01;
1092 pixel.y = c01;
1093 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1094 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1095 pixel.x = pixel.z;
1096 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1097 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1098 pixel.y = pixel.z;
1099 pixel.w = pixel.x;
1100 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1101 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1102 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1103 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1104 break;
1105 case VK_FORMAT_R8_UNORM:
1106 buffer += 1 * x;
1107 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1108 buffer += pitchB;
1109 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1110 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1111 pixel.y = Short4(0x0000);
1112 pixel.z = Short4(0x0000);
1113 pixel.w = Short4(0xFFFFu);
1114 break;
1115 case VK_FORMAT_R8G8_UNORM:
1116 buffer += 2 * x;
1117 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1118 buffer += pitchB;
1119 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1120 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1121 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1122 pixel.z = Short4(0x0000u);
1123 pixel.w = Short4(0xFFFFu);
1124 break;
1125 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1126 {
1127 Int4 v = Int4(0);
1128 buffer += 4 * x;
1129 v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1130 v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1131 buffer += pitchB;
1132 v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1133 v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1134
1135 pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1136 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1137 pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1138 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1139
1140 // Expand to 16 bit range
1141 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1142 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1143 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1144 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1145 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1146 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1147 }
1148 break;
1149 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1150 {
1151 Int4 v = Int4(0);
1152 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1153 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1154 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1155 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1156 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1157
1158 pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1159 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1160 pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1161 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1162
1163 // Expand to 16 bit range
1164 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1165 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1166 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1167 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1168 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1169 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1170 }
1171 break;
1172 default:
1173 UNSUPPORTED("VkFormat %d", int(format));
1174 }
1175 }
1176
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1177 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1178 {
1179 bool inverse = (modifier == OneMinus);
1180
1181 if(format.isUnsignedNormalized())
1182 {
1183 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU.v[component]))
1184 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU.v[component]));
1185 }
1186 else if(format.isSignedNormalized())
1187 {
1188 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS.v[component]))
1189 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS.v[component]));
1190 }
1191 else // Floating-point format
1192 {
1193 ASSERT(format.isFloatFormat());
1194 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF.v[component]))
1195 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF.v[component]));
1196 }
1197 }
1198
blendFactorRGB(SIMD::Float4 & blendFactor,const SIMD::Float4 & sourceColor,const SIMD::Float4 & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1199 void PixelRoutine::blendFactorRGB(SIMD::Float4 &blendFactor, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1200 {
1201 switch(colorBlendFactor)
1202 {
1203 case VK_BLEND_FACTOR_ZERO:
1204 blendFactor.x = 0.0f;
1205 blendFactor.y = 0.0f;
1206 blendFactor.z = 0.0f;
1207 break;
1208 case VK_BLEND_FACTOR_ONE:
1209 blendFactor.x = 1.0f;
1210 blendFactor.y = 1.0f;
1211 blendFactor.z = 1.0f;
1212 break;
1213 case VK_BLEND_FACTOR_SRC_COLOR:
1214 blendFactor.x = sourceColor.x;
1215 blendFactor.y = sourceColor.y;
1216 blendFactor.z = sourceColor.z;
1217 break;
1218 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1219 blendFactor.x = 1.0f - sourceColor.x;
1220 blendFactor.y = 1.0f - sourceColor.y;
1221 blendFactor.z = 1.0f - sourceColor.z;
1222 break;
1223 case VK_BLEND_FACTOR_DST_COLOR:
1224 blendFactor.x = destColor.x;
1225 blendFactor.y = destColor.y;
1226 blendFactor.z = destColor.z;
1227 break;
1228 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1229 blendFactor.x = 1.0f - destColor.x;
1230 blendFactor.y = 1.0f - destColor.y;
1231 blendFactor.z = 1.0f - destColor.z;
1232 break;
1233 case VK_BLEND_FACTOR_SRC_ALPHA:
1234 blendFactor.x = sourceColor.w;
1235 blendFactor.y = sourceColor.w;
1236 blendFactor.z = sourceColor.w;
1237 break;
1238 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1239 blendFactor.x = 1.0f - sourceColor.w;
1240 blendFactor.y = 1.0f - sourceColor.w;
1241 blendFactor.z = 1.0f - sourceColor.w;
1242 break;
1243 case VK_BLEND_FACTOR_DST_ALPHA:
1244 blendFactor.x = destColor.w;
1245 blendFactor.y = destColor.w;
1246 blendFactor.z = destColor.w;
1247 break;
1248 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1249 blendFactor.x = 1.0f - destColor.w;
1250 blendFactor.y = 1.0f - destColor.w;
1251 blendFactor.z = 1.0f - destColor.w;
1252 break;
1253 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1254 blendFactor.x = 1.0f - destColor.w;
1255 blendFactor.x = Min(blendFactor.x, sourceColor.w);
1256 blendFactor.y = blendFactor.x;
1257 blendFactor.z = blendFactor.x;
1258 break;
1259 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1260 blendFactor.x = blendConstant(format, 0);
1261 blendFactor.y = blendConstant(format, 1);
1262 blendFactor.z = blendConstant(format, 2);
1263 break;
1264 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1265 blendFactor.x = blendConstant(format, 3);
1266 blendFactor.y = blendConstant(format, 3);
1267 blendFactor.z = blendConstant(format, 3);
1268 break;
1269 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1270 blendFactor.x = blendConstant(format, 0, OneMinus);
1271 blendFactor.y = blendConstant(format, 1, OneMinus);
1272 blendFactor.z = blendConstant(format, 2, OneMinus);
1273 break;
1274 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1275 blendFactor.x = blendConstant(format, 3, OneMinus);
1276 blendFactor.y = blendConstant(format, 3, OneMinus);
1277 blendFactor.z = blendConstant(format, 3, OneMinus);
1278 break;
1279
1280 default:
1281 UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1282 }
1283
1284 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1285 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1286 // operations. If the color attachment is floating-point, no clamping occurs."
1287 if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1288 {
1289 if(format.isUnsignedNormalized())
1290 {
1291 blendFactor.x = Min(Max(blendFactor.x, 0.0f), 1.0f);
1292 blendFactor.y = Min(Max(blendFactor.y, 0.0f), 1.0f);
1293 blendFactor.z = Min(Max(blendFactor.z, 0.0f), 1.0f);
1294 }
1295 else if(format.isSignedNormalized())
1296 {
1297 blendFactor.x = Min(Max(blendFactor.x, -1.0f), 1.0f);
1298 blendFactor.y = Min(Max(blendFactor.y, -1.0f), 1.0f);
1299 blendFactor.z = Min(Max(blendFactor.z, -1.0f), 1.0f);
1300 }
1301 }
1302 }
1303
blendFactorAlpha(SIMD::Float & blendFactorAlpha,const SIMD::Float & sourceAlpha,const SIMD::Float & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1304 void PixelRoutine::blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1305 {
1306 switch(alphaBlendFactor)
1307 {
1308 case VK_BLEND_FACTOR_ZERO:
1309 blendFactorAlpha = 0.0f;
1310 break;
1311 case VK_BLEND_FACTOR_ONE:
1312 blendFactorAlpha = 1.0f;
1313 break;
1314 case VK_BLEND_FACTOR_SRC_COLOR:
1315 blendFactorAlpha = sourceAlpha;
1316 break;
1317 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1318 blendFactorAlpha = 1.0f - sourceAlpha;
1319 break;
1320 case VK_BLEND_FACTOR_DST_COLOR:
1321 blendFactorAlpha = destAlpha;
1322 break;
1323 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1324 blendFactorAlpha = 1.0f - destAlpha;
1325 break;
1326 case VK_BLEND_FACTOR_SRC_ALPHA:
1327 blendFactorAlpha = sourceAlpha;
1328 break;
1329 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1330 blendFactorAlpha = 1.0f - sourceAlpha;
1331 break;
1332 case VK_BLEND_FACTOR_DST_ALPHA:
1333 blendFactorAlpha = destAlpha;
1334 break;
1335 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1336 blendFactorAlpha = 1.0f - destAlpha;
1337 break;
1338 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1339 blendFactorAlpha = 1.0f;
1340 break;
1341 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1342 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1343 blendFactorAlpha = blendConstant(format, 3);
1344 break;
1345 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1346 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1347 blendFactorAlpha = blendConstant(format, 3, OneMinus);
1348 break;
1349 default:
1350 UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1351 }
1352
1353 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1354 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1355 // operations. If the color attachment is floating-point, no clamping occurs."
1356 if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1357 {
1358 if(format.isUnsignedNormalized())
1359 {
1360 blendFactorAlpha = Min(Max(blendFactorAlpha, 0.0f), 1.0f);
1361 }
1362 else if(format.isSignedNormalized())
1363 {
1364 blendFactorAlpha = Min(Max(blendFactorAlpha, -1.0f), 1.0f);
1365 }
1366 }
1367 }
1368
blendOpOverlay(SIMD::Float & src,SIMD::Float & dst)1369 SIMD::Float PixelRoutine::blendOpOverlay(SIMD::Float &src, SIMD::Float &dst)
1370 {
1371 SIMD::Int largeDst = CmpGT(dst, 0.5f);
1372 return As<SIMD::Float>(
1373 (~largeDst & As<SIMD::Int>(2.0f * src * dst)) |
1374 (largeDst & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1375 }
1376
blendOpColorDodge(SIMD::Float & src,SIMD::Float & dst)1377 SIMD::Float PixelRoutine::blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst)
1378 {
1379 SIMD::Int srcBelowOne = CmpLT(src, 1.0f);
1380 SIMD::Int positiveDst = CmpGT(dst, 0.0f);
1381 return As<SIMD::Float>(positiveDst & ((~srcBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1382 (srcBelowOne & As<SIMD::Int>(Min(1.0f, (dst / (1.0f - src)))))));
1383 }
1384
blendOpColorBurn(SIMD::Float & src,SIMD::Float & dst)1385 SIMD::Float PixelRoutine::blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst)
1386 {
1387 SIMD::Int dstBelowOne = CmpLT(dst, 1.0f);
1388 SIMD::Int positiveSrc = CmpGT(src, 0.0f);
1389 return As<SIMD::Float>(
1390 (~dstBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1391 (dstBelowOne & positiveSrc & As<SIMD::Int>(1.0f - Min(1.0f, (1.0f - dst) / src))));
1392 }
1393
blendOpHardlight(SIMD::Float & src,SIMD::Float & dst)1394 SIMD::Float PixelRoutine::blendOpHardlight(SIMD::Float &src, SIMD::Float &dst)
1395 {
1396 SIMD::Int largeSrc = CmpGT(src, 0.5f);
1397 return As<SIMD::Float>(
1398 (~largeSrc & As<SIMD::Int>(2.0f * src * dst)) |
1399 (largeSrc & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1400 }
1401
blendOpSoftlight(SIMD::Float & src,SIMD::Float & dst)1402 SIMD::Float PixelRoutine::blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst)
1403 {
1404 SIMD::Int largeSrc = CmpGT(src, 0.5f);
1405 SIMD::Int largeDst = CmpGT(dst, 0.25f);
1406
1407 return As<SIMD::Float>(
1408 (~largeSrc & As<SIMD::Int>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
1409 (largeSrc & ((~largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
1410 (largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
1411 }
1412
maxRGB(SIMD::Float4 & c)1413 SIMD::Float PixelRoutine::maxRGB(SIMD::Float4 &c)
1414 {
1415 return Max(Max(c.x, c.y), c.z);
1416 }
1417
minRGB(SIMD::Float4 & c)1418 SIMD::Float PixelRoutine::minRGB(SIMD::Float4 &c)
1419 {
1420 return Min(Min(c.x, c.y), c.z);
1421 }
1422
setLumSat(SIMD::Float4 & cbase,SIMD::Float4 & csat,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1423 void PixelRoutine::setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1424 {
1425 SIMD::Float minbase = minRGB(cbase);
1426 SIMD::Float sbase = maxRGB(cbase) - minbase;
1427 SIMD::Float ssat = maxRGB(csat) - minRGB(csat);
1428 SIMD::Int isNonZero = CmpGT(sbase, 0.0f);
1429 SIMD::Float4 color;
1430 color.x = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.x - minbase) * ssat / sbase));
1431 color.y = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.y - minbase) * ssat / sbase));
1432 color.z = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.z - minbase) * ssat / sbase));
1433 setLum(color, clum, x, y, z);
1434 }
1435
lumRGB(SIMD::Float4 & c)1436 SIMD::Float PixelRoutine::lumRGB(SIMD::Float4 &c)
1437 {
1438 return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
1439 }
1440
computeLum(SIMD::Float & color,SIMD::Float & lum,SIMD::Float & mincol,SIMD::Float & maxcol,SIMD::Int & negative,SIMD::Int & aboveOne)1441 SIMD::Float PixelRoutine::computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne)
1442 {
1443 return As<SIMD::Float>(
1444 (negative & As<SIMD::Int>(lum + ((color - lum) * lum) / (lum - mincol))) |
1445 (~negative & ((aboveOne & As<SIMD::Int>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
1446 (~aboveOne & As<SIMD::Int>(color)))));
1447 }
1448
setLum(SIMD::Float4 & cbase,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1449 void PixelRoutine::setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1450 {
1451 SIMD::Float lbase = lumRGB(cbase);
1452 SIMD::Float llum = lumRGB(clum);
1453 SIMD::Float ldiff = llum - lbase;
1454
1455 SIMD::Float4 color;
1456 color.x = cbase.x + ldiff;
1457 color.y = cbase.y + ldiff;
1458 color.z = cbase.z + ldiff;
1459
1460 SIMD::Float lum = lumRGB(color);
1461 SIMD::Float mincol = minRGB(color);
1462 SIMD::Float maxcol = maxRGB(color);
1463
1464 SIMD::Int negative = CmpLT(mincol, 0.0f);
1465 SIMD::Int aboveOne = CmpGT(maxcol, 1.0f);
1466
1467 x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
1468 y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
1469 z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
1470 }
1471
premultiply(SIMD::Float4 & c)1472 void PixelRoutine::premultiply(SIMD::Float4 &c)
1473 {
1474 SIMD::Int nonZeroAlpha = CmpNEQ(c.w, 0.0f);
1475 c.x = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.x / c.w));
1476 c.y = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.y / c.w));
1477 c.z = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.z / c.w));
1478 }
1479
// Evaluates the advanced blend operation configured for color attachment
// 'index' and returns the blended RGB.
//
// 'src' and 'dst' are first multiplied component-wise by 'srcFactor' and
// 'dstFactor', then their RGB is divided by their alpha (see premultiply()).
// With Cs/As and Cd/Ad denoting the resulting source and destination color and
// alpha, and f the selected blend function, each RGB channel of the result is
//
//     f(Cs, Cd) * As * Ad  +  Cs * As * (1 - Ad)  +  Cd * Ad * (1 - As)
//
// The w component of the returned value is not written by this function.
// Unknown operations are reported through UNSUPPORTED.
SIMD::Float4 PixelRoutine::computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor)
{
	// Returns 'color' multiplied component-wise by 'factor'.
	auto scaled = [](const SIMD::Float4 &color, const SIMD::Float4 &factor) {
		SIMD::Float4 product;
		product.x = color.x * factor.x;
		product.y = color.y * factor.y;
		product.z = color.z * factor.z;
		product.w = color.w * factor.w;
		return product;
	};

	SIMD::Float4 source = scaled(src, srcFactor);
	SIMD::Float4 dest = scaled(dst, dstFactor);

	premultiply(source);
	premultiply(dest);

	SIMD::Float4 result;

	switch(state.blendState[index].blendOperation)
	{
	case VK_BLEND_OP_MULTIPLY_EXT:
		result.x = (source.x * dest.x);
		result.y = (source.y * dest.y);
		result.z = (source.z * dest.z);
		break;
	case VK_BLEND_OP_SCREEN_EXT:
		result.x = source.x + dest.x - (source.x * dest.x);
		result.y = source.y + dest.y - (source.y * dest.y);
		result.z = source.z + dest.z - (source.z * dest.z);
		break;
	case VK_BLEND_OP_OVERLAY_EXT:
		result.x = blendOpOverlay(source.x, dest.x);
		result.y = blendOpOverlay(source.y, dest.y);
		result.z = blendOpOverlay(source.z, dest.z);
		break;
	case VK_BLEND_OP_DARKEN_EXT:
		result.x = Min(source.x, dest.x);
		result.y = Min(source.y, dest.y);
		result.z = Min(source.z, dest.z);
		break;
	case VK_BLEND_OP_LIGHTEN_EXT:
		result.x = Max(source.x, dest.x);
		result.y = Max(source.y, dest.y);
		result.z = Max(source.z, dest.z);
		break;
	case VK_BLEND_OP_COLORDODGE_EXT:
		result.x = blendOpColorDodge(source.x, dest.x);
		result.y = blendOpColorDodge(source.y, dest.y);
		result.z = blendOpColorDodge(source.z, dest.z);
		break;
	case VK_BLEND_OP_COLORBURN_EXT:
		result.x = blendOpColorBurn(source.x, dest.x);
		result.y = blendOpColorBurn(source.y, dest.y);
		result.z = blendOpColorBurn(source.z, dest.z);
		break;
	case VK_BLEND_OP_HARDLIGHT_EXT:
		result.x = blendOpHardlight(source.x, dest.x);
		result.y = blendOpHardlight(source.y, dest.y);
		result.z = blendOpHardlight(source.z, dest.z);
		break;
	case VK_BLEND_OP_SOFTLIGHT_EXT:
		result.x = blendOpSoftlight(source.x, dest.x);
		result.y = blendOpSoftlight(source.y, dest.y);
		result.z = blendOpSoftlight(source.z, dest.z);
		break;
	case VK_BLEND_OP_DIFFERENCE_EXT:
		result.x = Abs(source.x - dest.x);
		result.y = Abs(source.y - dest.y);
		result.z = Abs(source.z - dest.z);
		break;
	case VK_BLEND_OP_EXCLUSION_EXT:
		result.x = source.x + dest.x - (source.x * dest.x * 2.0f);
		result.y = source.y + dest.y - (source.y * dest.y * 2.0f);
		result.z = source.z + dest.z - (source.z * dest.z * 2.0f);
		break;
	case VK_BLEND_OP_HSL_HUE_EXT:
		// Source as base; saturation range and luminosity from the destination.
		setLumSat(source, dest, dest, result.x, result.y, result.z);
		break;
	case VK_BLEND_OP_HSL_SATURATION_EXT:
		// Destination as base; saturation range from the source, luminosity from the destination.
		setLumSat(dest, source, dest, result.x, result.y, result.z);
		break;
	case VK_BLEND_OP_HSL_COLOR_EXT:
		// Source as base; luminosity from the destination.
		setLum(source, dest, result.x, result.y, result.z);
		break;
	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
		// Destination as base; luminosity from the source.
		setLum(dest, source, result.x, result.y, result.z);
		break;
	default:
		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
		break;
	}

	// Weights of the three terms of the final sum.
	SIMD::Float weightBoth = source.w * dest.w;
	SIMD::Float weightSourceOnly = source.w * (1.0f - dest.w);
	SIMD::Float weightDestOnly = dest.w * (1.0f - source.w);

	// Combines the blend function result with the source and destination
	// contributions, accumulating the terms in the order blended, source, destination.
	auto composite = [&](SIMD::Float &blended, SIMD::Float &sourceChannel, SIMD::Float &destChannel) {
		blended = ((blended * weightBoth) + (sourceChannel * weightSourceOnly)) + (destChannel * weightDestOnly);
	};

	composite(result.x, source.x, dest.x);
	composite(result.y, source.y, dest.y);
	composite(result.z, source.z, dest.z);

	return result;
}
1590
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)1591 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
1592 {
1593 switch(blendFactor)
1594 {
1595 case VK_BLEND_FACTOR_ZERO:
1596 case VK_BLEND_FACTOR_ONE:
1597 return false;
1598 case VK_BLEND_FACTOR_SRC_COLOR:
1599 case VK_BLEND_FACTOR_SRC_ALPHA:
1600 // Source values have been clamped after fragment shader execution if the attachment format is normalized.
1601 return false;
1602 case VK_BLEND_FACTOR_DST_COLOR:
1603 case VK_BLEND_FACTOR_DST_ALPHA:
1604 // Dest values have a valid range due to being read from the attachment.
1605 return false;
1606 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1607 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1608 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1609 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1610 // For signed formats, negative values cause the result to exceed 1.0.
1611 return format.isSignedNormalized();
1612 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1613 // min(As, 1 - Ad)
1614 return false;
1615 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1616 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1617 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1618 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1619 return false;
1620
1621 default:
1622 UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
1623 return false;
1624 }
1625 }
1626
alphaBlend(int index,const Pointer<Byte> & cBuffer,const SIMD::Float4 & sourceColor,const Int & x)1627 SIMD::Float4 PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x)
1628 {
1629 if(!state.blendState[index].alphaBlendEnable)
1630 {
1631 return sourceColor;
1632 }
1633
1634 vk::Format format = state.colorFormat[index];
1635 ASSERT(format.supportsColorAttachmentBlend());
1636
1637 Pointer<Byte> buffer = cBuffer;
1638 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1639
1640 // texelColor holds four texel color values.
1641 // Note: Despite the type being Vector4f, the colors may be stored as
1642 // integers. Half-floats are stored as full 32-bit floats.
1643 // Non-float and non-fixed point formats are not alpha blended.
1644 Vector4f texelColor;
1645
1646 switch(format)
1647 {
1648 case VK_FORMAT_R32_SINT:
1649 case VK_FORMAT_R32_UINT:
1650 case VK_FORMAT_R32_SFLOAT:
1651 // FIXME: movlps
1652 buffer += 4 * x;
1653 texelColor.x.x = *Pointer<Float>(buffer + 0);
1654 texelColor.x.y = *Pointer<Float>(buffer + 4);
1655 buffer += pitchB;
1656 // FIXME: movhps
1657 texelColor.x.z = *Pointer<Float>(buffer + 0);
1658 texelColor.x.w = *Pointer<Float>(buffer + 4);
1659 texelColor.y = texelColor.z = texelColor.w = 1.0f;
1660 break;
1661 case VK_FORMAT_R32G32_SINT:
1662 case VK_FORMAT_R32G32_UINT:
1663 case VK_FORMAT_R32G32_SFLOAT:
1664 buffer += 8 * x;
1665 texelColor.x = *Pointer<Float4>(buffer, 16);
1666 buffer += pitchB;
1667 texelColor.y = *Pointer<Float4>(buffer, 16);
1668 texelColor.z = texelColor.x;
1669 texelColor.x = ShuffleLowHigh(texelColor.x, texelColor.y, 0x0202);
1670 texelColor.z = ShuffleLowHigh(texelColor.z, texelColor.y, 0x1313);
1671 texelColor.y = texelColor.z;
1672 texelColor.z = texelColor.w = 1.0f;
1673 break;
1674 case VK_FORMAT_R32G32B32A32_SFLOAT:
1675 case VK_FORMAT_R32G32B32A32_SINT:
1676 case VK_FORMAT_R32G32B32A32_UINT:
1677 buffer += 16 * x;
1678 texelColor.x = *Pointer<Float4>(buffer + 0, 16);
1679 texelColor.y = *Pointer<Float4>(buffer + 16, 16);
1680 buffer += pitchB;
1681 texelColor.z = *Pointer<Float4>(buffer + 0, 16);
1682 texelColor.w = *Pointer<Float4>(buffer + 16, 16);
1683 transpose4x4(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1684 break;
1685 case VK_FORMAT_R16_UNORM:
1686 buffer += 2 * x;
1687 texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1688 texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
1689 buffer += pitchB;
1690 texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1691 texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
1692 texelColor.x *= (1.0f / 0xFFFF);
1693 texelColor.y = texelColor.z = texelColor.w = 1.0f;
1694 break;
1695 case VK_FORMAT_R16_SFLOAT:
1696 buffer += 2 * x;
1697 texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1698 texelColor.x.y = Float(*Pointer<Half>(buffer + 2));
1699 buffer += pitchB;
1700 texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1701 texelColor.x.w = Float(*Pointer<Half>(buffer + 2));
1702 texelColor.y = texelColor.z = texelColor.w = 1.0f;
1703 break;
1704 case VK_FORMAT_R16G16_UNORM:
1705 buffer += 4 * x;
1706 texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1707 texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
1708 texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
1709 texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
1710 buffer += pitchB;
1711 texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1712 texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
1713 texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
1714 texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
1715 texelColor.x *= (1.0f / 0xFFFF);
1716 texelColor.y *= (1.0f / 0xFFFF);
1717 texelColor.z = texelColor.w = 1.0f;
1718 break;
1719 case VK_FORMAT_R16G16_SFLOAT:
1720 buffer += 4 * x;
1721 texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1722 texelColor.y.x = Float(*Pointer<Half>(buffer + 2));
1723 texelColor.x.y = Float(*Pointer<Half>(buffer + 4));
1724 texelColor.y.y = Float(*Pointer<Half>(buffer + 6));
1725 buffer += pitchB;
1726 texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1727 texelColor.y.z = Float(*Pointer<Half>(buffer + 2));
1728 texelColor.x.w = Float(*Pointer<Half>(buffer + 4));
1729 texelColor.y.w = Float(*Pointer<Half>(buffer + 6));
1730 texelColor.z = texelColor.w = 1.0f;
1731 break;
1732 case VK_FORMAT_R16G16B16A16_UNORM:
1733 buffer += 8 * x;
1734 texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1735 texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1736 texelColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1737 texelColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1738 texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1739 texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1740 texelColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1741 texelColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1742 buffer += pitchB;
1743 texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1744 texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1745 texelColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1746 texelColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1747 texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1748 texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1749 texelColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1750 texelColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1751 texelColor.x *= (1.0f / 0xFFFF);
1752 texelColor.y *= (1.0f / 0xFFFF);
1753 texelColor.z *= (1.0f / 0xFFFF);
1754 texelColor.w *= (1.0f / 0xFFFF);
1755 break;
1756 case VK_FORMAT_R16G16B16A16_SFLOAT:
1757 buffer += 8 * x;
1758 texelColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
1759 texelColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
1760 texelColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
1761 texelColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
1762 texelColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
1763 texelColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
1764 texelColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
1765 texelColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
1766 buffer += pitchB;
1767 texelColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
1768 texelColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
1769 texelColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
1770 texelColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
1771 texelColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
1772 texelColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
1773 texelColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
1774 texelColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
1775 break;
1776 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
1777 buffer += 4 * x;
1778 texelColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1779 texelColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1780 buffer += pitchB;
1781 texelColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1782 texelColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1783 transpose4x3(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1784 texelColor.w = 1.0f;
1785 break;
1786 default:
1787 {
1788 // Attempt to read an integer based format and convert it to float
1789 Vector4s color;
1790 readPixel(index, cBuffer, x, color);
1791 texelColor.x = Float4(As<UShort4>(color.x)) * (1.0f / 0xFFFF);
1792 texelColor.y = Float4(As<UShort4>(color.y)) * (1.0f / 0xFFFF);
1793 texelColor.z = Float4(As<UShort4>(color.z)) * (1.0f / 0xFFFF);
1794 texelColor.w = Float4(As<UShort4>(color.w)) * (1.0f / 0xFFFF);
1795
1796 if(isSRGB(index))
1797 {
1798 texelColor.x = sRGBtoLinear(texelColor.x);
1799 texelColor.y = sRGBtoLinear(texelColor.y);
1800 texelColor.z = sRGBtoLinear(texelColor.z);
1801 }
1802 }
1803 break;
1804 }
1805
1806 ASSERT(SIMD::Width == 4);
1807 SIMD::Float4 destColor;
1808 destColor.x = texelColor.x;
1809 destColor.y = texelColor.y;
1810 destColor.z = texelColor.z;
1811 destColor.w = texelColor.w;
1812
1813 SIMD::Float4 sourceFactor;
1814 SIMD::Float4 destFactor;
1815
1816 blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
1817 blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
1818 blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
1819 blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
1820
1821 SIMD::Float4 blendedColor;
1822
1823 switch(state.blendState[index].blendOperation)
1824 {
1825 case VK_BLEND_OP_ADD:
1826 blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
1827 blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
1828 blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
1829 break;
1830 case VK_BLEND_OP_SUBTRACT:
1831 blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
1832 blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
1833 blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
1834 break;
1835 case VK_BLEND_OP_REVERSE_SUBTRACT:
1836 blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
1837 blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
1838 blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
1839 break;
1840 case VK_BLEND_OP_MIN:
1841 blendedColor.x = Min(sourceColor.x, destColor.x);
1842 blendedColor.y = Min(sourceColor.y, destColor.y);
1843 blendedColor.z = Min(sourceColor.z, destColor.z);
1844 break;
1845 case VK_BLEND_OP_MAX:
1846 blendedColor.x = Max(sourceColor.x, destColor.x);
1847 blendedColor.y = Max(sourceColor.y, destColor.y);
1848 blendedColor.z = Max(sourceColor.z, destColor.z);
1849 break;
1850 case VK_BLEND_OP_SRC_EXT:
1851 blendedColor.x = sourceColor.x;
1852 blendedColor.y = sourceColor.y;
1853 blendedColor.z = sourceColor.z;
1854 break;
1855 case VK_BLEND_OP_DST_EXT:
1856 blendedColor.x = destColor.x;
1857 blendedColor.y = destColor.y;
1858 blendedColor.z = destColor.z;
1859 break;
1860 case VK_BLEND_OP_ZERO_EXT:
1861 blendedColor.x = 0.0f;
1862 blendedColor.y = 0.0f;
1863 blendedColor.z = 0.0f;
1864 break;
1865 case VK_BLEND_OP_MULTIPLY_EXT:
1866 case VK_BLEND_OP_SCREEN_EXT:
1867 case VK_BLEND_OP_OVERLAY_EXT:
1868 case VK_BLEND_OP_DARKEN_EXT:
1869 case VK_BLEND_OP_LIGHTEN_EXT:
1870 case VK_BLEND_OP_COLORDODGE_EXT:
1871 case VK_BLEND_OP_COLORBURN_EXT:
1872 case VK_BLEND_OP_HARDLIGHT_EXT:
1873 case VK_BLEND_OP_SOFTLIGHT_EXT:
1874 case VK_BLEND_OP_DIFFERENCE_EXT:
1875 case VK_BLEND_OP_EXCLUSION_EXT:
1876 case VK_BLEND_OP_HSL_HUE_EXT:
1877 case VK_BLEND_OP_HSL_SATURATION_EXT:
1878 case VK_BLEND_OP_HSL_COLOR_EXT:
1879 case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1880 blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
1881 break;
1882 default:
1883 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
1884 }
1885
1886 switch(state.blendState[index].blendOperationAlpha)
1887 {
1888 case VK_BLEND_OP_ADD:
1889 blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
1890 break;
1891 case VK_BLEND_OP_SUBTRACT:
1892 blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
1893 break;
1894 case VK_BLEND_OP_REVERSE_SUBTRACT:
1895 blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
1896 break;
1897 case VK_BLEND_OP_MIN:
1898 blendedColor.w = Min(sourceColor.w, destColor.w);
1899 break;
1900 case VK_BLEND_OP_MAX:
1901 blendedColor.w = Max(sourceColor.w, destColor.w);
1902 break;
1903 case VK_BLEND_OP_SRC_EXT:
1904 blendedColor.w = sourceColor.w;
1905 break;
1906 case VK_BLEND_OP_DST_EXT:
1907 blendedColor.w = destColor.w;
1908 break;
1909 case VK_BLEND_OP_ZERO_EXT:
1910 blendedColor.w = 0.0f;
1911 break;
1912 case VK_BLEND_OP_MULTIPLY_EXT:
1913 case VK_BLEND_OP_SCREEN_EXT:
1914 case VK_BLEND_OP_OVERLAY_EXT:
1915 case VK_BLEND_OP_DARKEN_EXT:
1916 case VK_BLEND_OP_LIGHTEN_EXT:
1917 case VK_BLEND_OP_COLORDODGE_EXT:
1918 case VK_BLEND_OP_COLORBURN_EXT:
1919 case VK_BLEND_OP_HARDLIGHT_EXT:
1920 case VK_BLEND_OP_SOFTLIGHT_EXT:
1921 case VK_BLEND_OP_DIFFERENCE_EXT:
1922 case VK_BLEND_OP_EXCLUSION_EXT:
1923 case VK_BLEND_OP_HSL_HUE_EXT:
1924 case VK_BLEND_OP_HSL_SATURATION_EXT:
1925 case VK_BLEND_OP_HSL_COLOR_EXT:
1926 case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1927 // All of the currently supported 'advanced blend modes' compute the alpha the same way.
1928 blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
1929 break;
1930 default:
1931 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
1932 }
1933
1934 return blendedColor;
1935 }
1936
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)1937 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
1938 {
1939 if(isSRGB(index))
1940 {
1941 color.x = linearToSRGB(color.x);
1942 color.y = linearToSRGB(color.y);
1943 color.z = linearToSRGB(color.z);
1944 }
1945
1946 vk::Format format = state.colorFormat[index];
1947 switch(format)
1948 {
1949 case VK_FORMAT_B8G8R8A8_UNORM:
1950 case VK_FORMAT_B8G8R8A8_SRGB:
1951 case VK_FORMAT_R8G8B8A8_UNORM:
1952 case VK_FORMAT_R8G8B8A8_SRGB:
1953 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1954 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1955 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1956 color.w = As<Float4>(RoundInt(color.w * 0xFF));
1957 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1958 color.z = As<Float4>(RoundInt(color.z * 0xFF));
1959 // [[fallthrough]]
1960 case VK_FORMAT_R8G8_UNORM:
1961 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1962 color.y = As<Float4>(RoundInt(color.y * 0xFF));
1963 //[[fallthrough]]
1964 case VK_FORMAT_R8_UNORM:
1965 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1966 color.x = As<Float4>(RoundInt(color.x * 0xFF));
1967 break;
1968 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1969 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1970 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1971 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1972 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1973 color.w = As<Float4>(RoundInt(color.w * 0xF));
1974 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1975 color.z = As<Float4>(RoundInt(color.z * 0xF));
1976 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1977 color.y = As<Float4>(RoundInt(color.y * 0xF));
1978 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1979 color.x = As<Float4>(RoundInt(color.x * 0xF));
1980 break;
1981 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1982 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1983 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1984 color.z = As<Float4>(RoundInt(color.z * 0x1F));
1985 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1986 color.y = As<Float4>(RoundInt(color.y * 0x3F));
1987 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1988 color.x = As<Float4>(RoundInt(color.x * 0x1F));
1989 break;
1990 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1991 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1992 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1993 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1994 color.w = As<Float4>(RoundInt(color.w));
1995 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1996 color.z = As<Float4>(RoundInt(color.z * 0x1F));
1997 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1998 color.y = As<Float4>(RoundInt(color.y * 0x1F));
1999 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2000 color.x = As<Float4>(RoundInt(color.x * 0x1F));
2001 break;
2002 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2003 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2004 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2005 color.w = As<Float4>(RoundInt(color.w * 0x3));
2006 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2007 color.z = As<Float4>(RoundInt(color.z * 0x3FF));
2008 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2009 color.y = As<Float4>(RoundInt(color.y * 0x3FF));
2010 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2011 color.x = As<Float4>(RoundInt(color.x * 0x3FF));
2012 break;
2013 case VK_FORMAT_R16G16B16A16_UNORM:
2014 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2015 color.w = As<Float4>(RoundInt(color.w * 0xFFFF));
2016 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2017 color.z = As<Float4>(RoundInt(color.z * 0xFFFF));
2018 // [[fallthrough]]
2019 case VK_FORMAT_R16G16_UNORM:
2020 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2021 color.y = As<Float4>(RoundInt(color.y * 0xFFFF));
2022 //[[fallthrough]]
2023 case VK_FORMAT_R16_UNORM:
2024 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2025 color.x = As<Float4>(RoundInt(color.x * 0xFFFF));
2026 break;
2027 default:
2028 // TODO(b/204560089): Omit clamp if redundant
2029 if(format.isUnsignedNormalized())
2030 {
2031 color.x = Min(Max(color.x, 0.0f), 1.0f);
2032 color.y = Min(Max(color.y, 0.0f), 1.0f);
2033 color.z = Min(Max(color.z, 0.0f), 1.0f);
2034 color.w = Min(Max(color.w, 0.0f), 1.0f);
2035 }
2036 else if(format.isSignedNormalized())
2037 {
2038 color.x = Min(Max(color.x, -1.0f), 1.0f);
2039 color.y = Min(Max(color.y, -1.0f), 1.0f);
2040 color.z = Min(Max(color.z, -1.0f), 1.0f);
2041 color.w = Min(Max(color.w, -1.0f), 1.0f);
2042 }
2043 }
2044
2045 switch(format)
2046 {
2047 case VK_FORMAT_R16_SFLOAT:
2048 case VK_FORMAT_R32_SFLOAT:
2049 case VK_FORMAT_R32_SINT:
2050 case VK_FORMAT_R32_UINT:
2051 case VK_FORMAT_R16_UNORM:
2052 case VK_FORMAT_R16_SINT:
2053 case VK_FORMAT_R16_UINT:
2054 case VK_FORMAT_R8_SINT:
2055 case VK_FORMAT_R8_UINT:
2056 case VK_FORMAT_R8_UNORM:
2057 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2058 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2059 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2060 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2061 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2062 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2063 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2064 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2065 case VK_FORMAT_B5G6R5_UNORM_PACK16:
2066 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2067 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2068 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2069 case VK_FORMAT_R5G6B5_UNORM_PACK16:
2070 break;
2071 case VK_FORMAT_R16G16_SFLOAT:
2072 case VK_FORMAT_R32G32_SFLOAT:
2073 case VK_FORMAT_R32G32_SINT:
2074 case VK_FORMAT_R32G32_UINT:
2075 case VK_FORMAT_R16G16_UNORM:
2076 case VK_FORMAT_R16G16_SINT:
2077 case VK_FORMAT_R16G16_UINT:
2078 case VK_FORMAT_R8G8_SINT:
2079 case VK_FORMAT_R8G8_UINT:
2080 case VK_FORMAT_R8G8_UNORM:
2081 color.z = color.x;
2082 color.x = UnpackLow(color.x, color.y);
2083 color.z = UnpackHigh(color.z, color.y);
2084 color.y = color.z;
2085 break;
2086 case VK_FORMAT_R16G16B16A16_SFLOAT:
2087 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2088 case VK_FORMAT_R32G32B32A32_SFLOAT:
2089 case VK_FORMAT_R32G32B32A32_SINT:
2090 case VK_FORMAT_R32G32B32A32_UINT:
2091 case VK_FORMAT_R16G16B16A16_UNORM:
2092 case VK_FORMAT_R16G16B16A16_SINT:
2093 case VK_FORMAT_R16G16B16A16_UINT:
2094 case VK_FORMAT_R8G8B8A8_SINT:
2095 case VK_FORMAT_R8G8B8A8_UINT:
2096 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2097 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2098 case VK_FORMAT_R8G8B8A8_UNORM:
2099 case VK_FORMAT_R8G8B8A8_SRGB:
2100 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2101 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2102 transpose4x4(color.x, color.y, color.z, color.w);
2103 break;
2104 case VK_FORMAT_B8G8R8A8_UNORM:
2105 case VK_FORMAT_B8G8R8A8_SRGB:
2106 transpose4x4zyxw(color.z, color.y, color.x, color.w);
2107 break;
2108 default:
2109 UNSUPPORTED("VkFormat: %d", int(format));
2110 }
2111
2112 int writeMask = state.colorWriteActive(index);
2113 if(format.isBGRformat())
2114 {
2115 // For BGR formats, flip R and B channels in the channels mask
2116 writeMask = (writeMask & 0x0000000A) | (writeMask & 0x00000001) << 2 | (writeMask & 0x00000004) >> 2;
2117 }
2118
2119 Int xMask; // Combination of all masks
2120
2121 if(state.depthTestActive)
2122 {
2123 xMask = zMask;
2124 }
2125 else
2126 {
2127 xMask = cMask;
2128 }
2129
2130 if(state.stencilActive)
2131 {
2132 xMask &= sMask;
2133 }
2134
2135 Pointer<Byte> buffer = cBuffer;
2136 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2137 Float4 value;
2138
2139 switch(format)
2140 {
2141 case VK_FORMAT_R32_SFLOAT:
2142 case VK_FORMAT_R32_SINT:
2143 case VK_FORMAT_R32_UINT:
2144 if(writeMask & 0x00000001)
2145 {
2146 buffer += 4 * x;
2147
2148 // FIXME: movlps
2149 value.x = *Pointer<Float>(buffer + 0);
2150 value.y = *Pointer<Float>(buffer + 4);
2151
2152 buffer += pitchB;
2153
2154 // FIXME: movhps
2155 value.z = *Pointer<Float>(buffer + 0);
2156 value.w = *Pointer<Float>(buffer + 4);
2157
2158 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2159 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2160 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2161
2162 // FIXME: movhps
2163 *Pointer<Float>(buffer + 0) = color.x.z;
2164 *Pointer<Float>(buffer + 4) = color.x.w;
2165
2166 buffer -= pitchB;
2167
2168 // FIXME: movlps
2169 *Pointer<Float>(buffer + 0) = color.x.x;
2170 *Pointer<Float>(buffer + 4) = color.x.y;
2171 }
2172 break;
2173 case VK_FORMAT_R16_SFLOAT:
2174 if(writeMask & 0x00000001)
2175 {
2176 buffer += 2 * x;
2177
2178 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2179 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2180
2181 buffer += pitchB;
2182
2183 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2184 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2185
2186 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2187 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2188 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2189
2190 *Pointer<Half>(buffer + 0) = Half(color.x.z);
2191 *Pointer<Half>(buffer + 2) = Half(color.x.w);
2192
2193 buffer -= pitchB;
2194
2195 *Pointer<Half>(buffer + 0) = Half(color.x.x);
2196 *Pointer<Half>(buffer + 2) = Half(color.x.y);
2197 }
2198 break;
2199 case VK_FORMAT_R16_UNORM:
2200 case VK_FORMAT_R16_SINT:
2201 case VK_FORMAT_R16_UINT:
2202 if(writeMask & 0x00000001)
2203 {
2204 buffer += 2 * x;
2205
2206 UShort4 xyzw;
2207 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2208
2209 buffer += pitchB;
2210
2211 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2212 value = As<Float4>(Int4(xyzw));
2213
2214 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2215 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2216 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2217
2218 Float component = color.x.z;
2219 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2220 component = color.x.w;
2221 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2222
2223 buffer -= pitchB;
2224
2225 component = color.x.x;
2226 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2227 component = color.x.y;
2228 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2229 }
2230 break;
2231 case VK_FORMAT_R8_SINT:
2232 case VK_FORMAT_R8_UINT:
2233 case VK_FORMAT_R8_UNORM:
2234 if(writeMask & 0x00000001)
2235 {
2236 buffer += x;
2237
2238 UInt xyzw, packedCol;
2239
2240 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFFu;
2241 buffer += pitchB;
2242 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2243
2244 Short4 tmpCol = Short4(As<Int4>(color.x));
2245 if(format == VK_FORMAT_R8_SINT)
2246 {
2247 tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2248 }
2249 else
2250 {
2251 tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2252 }
2253 packedCol = Extract(As<Int2>(tmpCol), 0);
2254
2255 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2256 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2257
2258 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2259 buffer -= pitchB;
2260 *Pointer<UShort>(buffer) = UShort(packedCol);
2261 }
2262 break;
2263 case VK_FORMAT_R32G32_SFLOAT:
2264 case VK_FORMAT_R32G32_SINT:
2265 case VK_FORMAT_R32G32_UINT:
2266 buffer += 8 * x;
2267
2268 value = *Pointer<Float4>(buffer);
2269
2270 if((writeMask & 0x00000003) != 0x00000003)
2271 {
2272 Float4 masked = value;
2273 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2274 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2275 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2276 }
2277
2278 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2279 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2280 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2281 *Pointer<Float4>(buffer) = color.x;
2282
2283 buffer += pitchB;
2284
2285 value = *Pointer<Float4>(buffer);
2286
2287 if((writeMask & 0x00000003) != 0x00000003)
2288 {
2289 Float4 masked;
2290
2291 masked = value;
2292 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2293 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2294 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2295 }
2296
2297 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2298 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2299 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2300 *Pointer<Float4>(buffer) = color.y;
2301 break;
2302 case VK_FORMAT_R16G16_SFLOAT:
2303 if((writeMask & 0x00000003) != 0x0)
2304 {
2305 buffer += 4 * x;
2306
2307 UInt2 rgbaMask;
2308 UInt2 packedCol;
2309 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2310 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2311
2312 UShort4 value = *Pointer<UShort4>(buffer);
2313 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2314 if((writeMask & 0x3) != 0x3)
2315 {
2316 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2317 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2318 mergedMask &= rgbaMask;
2319 }
2320 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2321
2322 buffer += pitchB;
2323
2324 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2325 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2326 value = *Pointer<UShort4>(buffer);
2327 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2328 if((writeMask & 0x3) != 0x3)
2329 {
2330 mergedMask &= rgbaMask;
2331 }
2332 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2333 }
2334 break;
2335 case VK_FORMAT_R16G16_UNORM:
2336 case VK_FORMAT_R16G16_SINT:
2337 case VK_FORMAT_R16G16_UINT:
2338 if((writeMask & 0x00000003) != 0x0)
2339 {
2340 buffer += 4 * x;
2341
2342 UInt2 rgbaMask;
2343 UShort4 packedCol = UShort4(As<Int4>(color.x));
2344 UShort4 value = *Pointer<UShort4>(buffer);
2345 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2346 if((writeMask & 0x3) != 0x3)
2347 {
2348 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2349 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2350 mergedMask &= rgbaMask;
2351 }
2352 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2353
2354 buffer += pitchB;
2355
2356 packedCol = UShort4(As<Int4>(color.y));
2357 value = *Pointer<UShort4>(buffer);
2358 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2359 if((writeMask & 0x3) != 0x3)
2360 {
2361 mergedMask &= rgbaMask;
2362 }
2363 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2364 }
2365 break;
2366 case VK_FORMAT_R8G8_SINT:
2367 case VK_FORMAT_R8G8_UINT:
2368 case VK_FORMAT_R8G8_UNORM:
2369 if((writeMask & 0x00000003) != 0x0)
2370 {
2371 buffer += 2 * x;
2372
2373 Int2 xyzw, packedCol;
2374
2375 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2376 buffer += pitchB;
2377 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2378
2379 if(format == VK_FORMAT_R8G8_SINT)
2380 {
2381 packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2382 }
2383 else
2384 {
2385 packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2386 }
2387
2388 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2389 if((writeMask & 0x3) != 0x3)
2390 {
2391 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (writeMask & 0x3)]));
2392 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2393 mergedMask &= rgbaMask;
2394 }
2395
2396 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2397
2398 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2399 buffer -= pitchB;
2400 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2401 }
2402 break;
2403 case VK_FORMAT_R32G32B32A32_SFLOAT:
2404 case VK_FORMAT_R32G32B32A32_SINT:
2405 case VK_FORMAT_R32G32B32A32_UINT:
2406 buffer += 16 * x;
2407
2408 {
2409 value = *Pointer<Float4>(buffer, 16);
2410
2411 if(writeMask != 0x0000000F)
2412 {
2413 Float4 masked = value;
2414 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2415 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2416 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2417 }
2418
2419 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2420 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2421 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2422 *Pointer<Float4>(buffer, 16) = color.x;
2423 }
2424
2425 {
2426 value = *Pointer<Float4>(buffer + 16, 16);
2427
2428 if(writeMask != 0x0000000F)
2429 {
2430 Float4 masked = value;
2431 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2432 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2433 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2434 }
2435
2436 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2437 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2438 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2439 *Pointer<Float4>(buffer + 16, 16) = color.y;
2440 }
2441
2442 buffer += pitchB;
2443
2444 {
2445 value = *Pointer<Float4>(buffer, 16);
2446
2447 if(writeMask != 0x0000000F)
2448 {
2449 Float4 masked = value;
2450 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2451 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2452 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2453 }
2454
2455 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2456 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2457 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2458 *Pointer<Float4>(buffer, 16) = color.z;
2459 }
2460
2461 {
2462 value = *Pointer<Float4>(buffer + 16, 16);
2463
2464 if(writeMask != 0x0000000F)
2465 {
2466 Float4 masked = value;
2467 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2468 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2469 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2470 }
2471
2472 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2473 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2474 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2475 *Pointer<Float4>(buffer + 16, 16) = color.w;
2476 }
2477 break;
2478 case VK_FORMAT_R16G16B16A16_SFLOAT:
2479 if((writeMask & 0x0000000F) != 0x0)
2480 {
2481 buffer += 8 * x;
2482
2483 UInt4 rgbaMask;
2484 UInt4 value = *Pointer<UInt4>(buffer);
2485 UInt4 packedCol;
2486 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2487 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2488 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
2489 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
2490 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2491 if((writeMask & 0xF) != 0xF)
2492 {
2493 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2494 rgbaMask = UInt4(tmpMask, tmpMask);
2495 mergedMask &= rgbaMask;
2496 }
2497 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2498
2499 buffer += pitchB;
2500
2501 value = *Pointer<UInt4>(buffer);
2502 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
2503 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
2504 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
2505 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
2506 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2507 if((writeMask & 0xF) != 0xF)
2508 {
2509 mergedMask &= rgbaMask;
2510 }
2511 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2512 }
2513 break;
2514 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2515 if((writeMask & 0x7) != 0x0)
2516 {
2517 buffer += 4 * x;
2518
2519 UInt4 packedCol;
2520 packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
2521 packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
2522 packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
2523 packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
2524
2525 UInt4 value;
2526 value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
2527 value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
2528 buffer += pitchB;
2529 value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
2530 value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
2531
2532 UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0]) + xMask * 16, 16);
2533 if((writeMask & 0x7) != 0x7)
2534 {
2535 mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[writeMask & 0x7]), 16);
2536 }
2537 value = (packedCol & mask) | (value & ~mask);
2538
2539 *Pointer<UInt>(buffer + 0) = value.z;
2540 *Pointer<UInt>(buffer + 4) = value.w;
2541 buffer -= pitchB;
2542 *Pointer<UInt>(buffer + 0) = value.x;
2543 *Pointer<UInt>(buffer + 4) = value.y;
2544 }
2545 break;
2546 case VK_FORMAT_R16G16B16A16_UNORM:
2547 case VK_FORMAT_R16G16B16A16_SINT:
2548 case VK_FORMAT_R16G16B16A16_UINT:
2549 if((writeMask & 0x0000000F) != 0x0)
2550 {
2551 buffer += 8 * x;
2552
2553 UInt4 rgbaMask;
2554 UShort8 value = *Pointer<UShort8>(buffer);
2555 UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
2556 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2557 if((writeMask & 0xF) != 0xF)
2558 {
2559 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2560 rgbaMask = UInt4(tmpMask, tmpMask);
2561 mergedMask &= rgbaMask;
2562 }
2563 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2564
2565 buffer += pitchB;
2566
2567 value = *Pointer<UShort8>(buffer);
2568 packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
2569 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2570 if((writeMask & 0xF) != 0xF)
2571 {
2572 mergedMask &= rgbaMask;
2573 }
2574 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2575 }
2576 break;
2577 case VK_FORMAT_B8G8R8A8_UNORM:
2578 case VK_FORMAT_B8G8R8A8_SRGB:
2579 case VK_FORMAT_R8G8B8A8_SINT:
2580 case VK_FORMAT_R8G8B8A8_UINT:
2581 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2582 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2583 case VK_FORMAT_R8G8B8A8_UNORM:
2584 case VK_FORMAT_R8G8B8A8_SRGB:
2585 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2586 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2587 if((writeMask & 0x0000000F) != 0x0)
2588 {
2589 UInt2 value, packedCol, mergedMask;
2590
2591 buffer += 4 * x;
2592
2593 bool isSigned = !format.isUnsigned();
2594
2595 if(isSigned)
2596 {
2597 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2598 }
2599 else
2600 {
2601 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2602 }
2603 value = *Pointer<UInt2>(buffer, 16);
2604 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2605 if(writeMask != 0xF)
2606 {
2607 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2608 }
2609 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2610
2611 buffer += pitchB;
2612
2613 if(isSigned)
2614 {
2615 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2616 }
2617 else
2618 {
2619 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2620 }
2621 value = *Pointer<UInt2>(buffer, 16);
2622 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2623 if(writeMask != 0xF)
2624 {
2625 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2626 }
2627 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2628 }
2629 break;
2630 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2631 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2632 if((writeMask & 0x0000000F) != 0x0)
2633 {
2634 Int2 mergedMask, packedCol, value;
2635 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2636 ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
2637 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2638 ((As<Int4>(color.x) & Int4(0x3ff)));
2639
2640 buffer += 4 * x;
2641 value = *Pointer<Int2>(buffer, 16);
2642 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2643 if(writeMask != 0xF)
2644 {
2645 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2646 }
2647 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2648
2649 buffer += pitchB;
2650
2651 value = *Pointer<Int2>(buffer, 16);
2652 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2653 if(writeMask != 0xF)
2654 {
2655 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2656 }
2657 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2658 }
2659 break;
2660 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2661 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2662 if((writeMask & 0x0000000F) != 0x0)
2663 {
2664 Int2 mergedMask, packedCol, value;
2665 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2666 ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
2667 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2668 ((As<Int4>(color.z) & Int4(0x3ff)));
2669
2670 buffer += 4 * x;
2671 value = *Pointer<Int2>(buffer, 16);
2672 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2673 if(writeMask != 0xF)
2674 {
2675 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2676 }
2677 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2678
2679 buffer += pitchB;
2680
2681 value = *Pointer<Int2>(buffer, 16);
2682 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2683 if(writeMask != 0xF)
2684 {
2685 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2686 }
2687 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2688 }
2689 break;
2690 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2691 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2692 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2693 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2694 {
2695 buffer += 2 * x;
2696 Int value = *Pointer<Int>(buffer);
2697
2698 Int channelMask;
2699 Short4 current;
2700 switch(format)
2701 {
2702 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2703 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2704 current = (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 12 |
2705 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2706 (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 4 |
2707 (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2708 break;
2709 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2710 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2711 current = (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 12 |
2712 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2713 (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 4 |
2714 (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2715 break;
2716 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2717 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2718 current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2719 (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 8 |
2720 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2721 (UShort4(As<Int4>(color.z)) & UShort4(0xF));
2722 break;
2723 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2724 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2725 current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2726 (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 8 |
2727 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2728 (UShort4(As<Int4>(color.x)) & UShort4(0xF));
2729 break;
2730 default:
2731 UNREACHABLE("Format: %s", vk::Stringify(format).c_str());
2732 }
2733
2734 Int c01 = Extract(As<Int2>(current), 0);
2735 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2736 if(writeMask != 0x0000000F)
2737 {
2738 mask01 &= channelMask;
2739 }
2740 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2741
2742 buffer += pitchB;
2743 value = *Pointer<Int>(buffer);
2744
2745 Int c23 = Extract(As<Int2>(current), 1);
2746 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2747 if(writeMask != 0x0000000F)
2748 {
2749 mask23 &= channelMask;
2750 }
2751 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2752 }
2753 break;
2754 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2755 {
2756 buffer += 2 * x;
2757 Int value = *Pointer<Int>(buffer);
2758
2759 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[writeMask][0]));
2760 Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11 |
2761 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2762 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 1 |
2763 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2764
2765 Int c01 = Extract(As<Int2>(current), 0);
2766 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2767 if(writeMask != 0x0000000F)
2768 {
2769 mask01 &= channelMask;
2770 }
2771 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2772
2773 buffer += pitchB;
2774 value = *Pointer<Int>(buffer);
2775
2776 Int c23 = Extract(As<Int2>(current), 1);
2777 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2778 if(writeMask != 0x0000000F)
2779 {
2780 mask23 &= channelMask;
2781 }
2782 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2783 }
2784 break;
2785 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2786 {
2787 buffer += 2 * x;
2788 Int value = *Pointer<Int>(buffer);
2789
2790 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[writeMask][0]));
2791 Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11 |
2792 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2793 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 1 |
2794 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2795
2796 Int c01 = Extract(As<Int2>(current), 0);
2797 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2798 if(writeMask != 0x0000000F)
2799 {
2800 mask01 &= channelMask;
2801 }
2802 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2803
2804 buffer += pitchB;
2805 value = *Pointer<Int>(buffer);
2806
2807 Int c23 = Extract(As<Int2>(current), 1);
2808 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2809 if(writeMask != 0x0000000F)
2810 {
2811 mask23 &= channelMask;
2812 }
2813 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2814 }
2815 break;
2816 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2817 {
2818 buffer += 2 * x;
2819 Int value = *Pointer<Int>(buffer);
2820
2821 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[writeMask][0]));
2822 Short4 current = (UShort4(As<Int4>(color.w)) & UShort4(0x1)) << 15 |
2823 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 10 |
2824 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 5 |
2825 (UShort4(As<Int4>(color.z)) & UShort4(0x1F));
2826
2827 Int c01 = Extract(As<Int2>(current), 0);
2828 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2829 if(writeMask != 0x0000000F)
2830 {
2831 mask01 &= channelMask;
2832 }
2833 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2834
2835 buffer += pitchB;
2836 value = *Pointer<Int>(buffer);
2837
2838 Int c23 = Extract(As<Int2>(current), 1);
2839 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2840 if(writeMask != 0x0000000F)
2841 {
2842 mask23 &= channelMask;
2843 }
2844 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2845 }
2846 break;
2847 case VK_FORMAT_R5G6B5_UNORM_PACK16:
2848 {
2849 buffer += 2 * x;
2850 Int value = *Pointer<Int>(buffer);
2851
2852 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2853 Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) |
2854 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2855 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11;
2856
2857 Int c01 = Extract(As<Int2>(current), 0);
2858 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2859 if((writeMask & 0x00000007) != 0x00000007)
2860 {
2861 mask01 &= channelMask;
2862 }
2863 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2864
2865 buffer += pitchB;
2866 value = *Pointer<Int>(buffer);
2867
2868 Int c23 = Extract(As<Int2>(current), 1);
2869 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2870 if((writeMask & 0x00000007) != 0x00000007)
2871 {
2872 mask23 &= channelMask;
2873 }
2874 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2875 }
2876 break;
2877 case VK_FORMAT_B5G6R5_UNORM_PACK16:
2878 {
2879 buffer += 2 * x;
2880 Int value = *Pointer<Int>(buffer);
2881
2882 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2883 Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) |
2884 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2885 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11;
2886
2887 Int c01 = Extract(As<Int2>(current), 0);
2888 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2889 if((writeMask & 0x00000007) != 0x00000007)
2890 {
2891 mask01 &= channelMask;
2892 }
2893 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2894
2895 buffer += pitchB;
2896 value = *Pointer<Int>(buffer);
2897
2898 Int c23 = Extract(As<Int2>(current), 1);
2899 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2900 if((writeMask & 0x00000007) != 0x00000007)
2901 {
2902 mask23 &= channelMask;
2903 }
2904 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2905 }
2906 break;
2907 default:
2908 UNSUPPORTED("VkFormat: %d", int(format));
2909 }
2910 }
2911
2912 } // namespace sw
2913