1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "Vulkan/VkPipelineLayout.hpp"
24 #include "Vulkan/VkStringify.hpp"
25
26 namespace sw {
27
PixelRoutine(const PixelProcessor::State & state,vk::PipelineLayout const * pipelineLayout,SpirvShader const * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)28 PixelRoutine::PixelRoutine(
29 const PixelProcessor::State &state,
30 vk::PipelineLayout const *pipelineLayout,
31 SpirvShader const *spirvShader,
32 const vk::DescriptorSet::Bindings &descriptorSets)
33 : QuadRasterizer(state, spirvShader)
34 , routine(pipelineLayout)
35 , descriptorSets(descriptorSets)
36 , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
37 , shaderContainsSampleQualifier(spirvShader && spirvShader->getAnalysis().ContainsSampleQualifier)
38 , perSampleShading((state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f)) ||
39 shaderContainsSampleQualifier || shaderContainsInterpolation) // TODO(b/194714095)
40 , invocationCount(perSampleShading ? state.multiSampleCount : 1)
41 {
42 if(spirvShader)
43 {
44 spirvShader->emitProlog(&routine);
45 }
46 }
47
~PixelRoutine()48 PixelRoutine::~PixelRoutine()
49 {
50 }
51
getSampleSet(int invocation) const52 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
53 {
54 unsigned int sampleBegin = perSampleShading ? invocation : 0;
55 unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
56
57 SampleSet samples;
58
59 for(unsigned int q = sampleBegin; q < sampleEnd; q++)
60 {
61 if(state.multiSampleMask & (1 << q))
62 {
63 samples.push_back(q);
64 }
65 }
66
67 return samples;
68 }
69
quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)70 void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
71 {
72 const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;
73
74 Int zMask[4]; // Depth mask
75 Int sMask[4]; // Stencil mask
76 Float4 unclampedZ[4];
77
78 for(int invocation = 0; invocation < invocationCount; invocation++)
79 {
80 SampleSet samples = getSampleSet(invocation);
81
82 if(samples.empty())
83 {
84 continue;
85 }
86
87 for(unsigned int q : samples)
88 {
89 zMask[q] = cMask[q];
90 sMask[q] = cMask[q];
91 }
92
93 stencilTest(sBuffer, x, sMask, samples);
94
95 Float4 f;
96 Float4 rhwCentroid;
97
98 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
99
100 if(interpolateZ())
101 {
102 for(unsigned int q : samples)
103 {
104 Float4 x = xxxx;
105
106 if(state.enableMultiSampling)
107 {
108 x -= *Pointer<Float4>(constants + OFFSET(Constants, X) + q * sizeof(float4));
109 }
110
111 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
112
113 if(state.depthBias)
114 {
115 z[q] += *Pointer<Float4>(primitive + OFFSET(Primitive, zBias), 16);
116 }
117
118 unclampedZ[q] = z[q];
119 }
120 }
121
122 Bool depthPass = false;
123
124 if(earlyFragmentTests)
125 {
126 for(unsigned int q : samples)
127 {
128 z[q] = clampDepth(z[q]);
129 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
130 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
131 }
132 }
133
134 If(depthPass || !earlyFragmentTests)
135 {
136 if(earlyFragmentTests)
137 {
138 writeDepth(zBuffer, x, zMask, samples);
139 }
140
141 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
142
143 // Centroid locations
144 Float4 XXXX = 0.0f;
145 Float4 YYYY = 0.0f;
146
147 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
148 {
149 Float4 WWWW(1.0e-9f);
150
151 for(unsigned int q : samples)
152 {
153 XXXX += *Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]);
154 YYYY += *Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]);
155 WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
156 }
157
158 WWWW = Rcp(WWWW, true /* relaxedPrecision */);
159 XXXX *= WWWW;
160 YYYY *= WWWW;
161
162 XXXX += xxxx;
163 YYYY += yyyy;
164 }
165
166 if(interpolateW())
167 {
168 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
169 rhw = reciprocal(w, false, true);
170
171 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
172 {
173 rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, w), false, false));
174 }
175 }
176
177 if(spirvShader)
178 {
179 if(shaderContainsInterpolation) // TODO(b/194714095)
180 {
181 routine.interpolationData.primitive = primitive;
182
183 routine.interpolationData.x = xxxx;
184 routine.interpolationData.y = yyyy;
185 routine.interpolationData.rhw = rhw;
186
187 routine.interpolationData.xCentroid = XXXX;
188 routine.interpolationData.yCentroid = YYYY;
189 routine.interpolationData.rhwCentroid = rhwCentroid;
190 }
191
192 if(perSampleShading && (state.multiSampleCount > 1))
193 {
194 xxxx += Constants::SampleLocationsX[samples[0]];
195 yyyy += Constants::SampleLocationsY[samples[0]];
196 }
197
198 int packedInterpolant = 0;
199 for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
200 {
201 auto const &input = spirvShader->inputs[interfaceInterpolant];
202 if(input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
203 {
204 if(input.Centroid && state.enableMultiSampling)
205 {
206 routine.inputs[interfaceInterpolant] =
207 SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid,
208 primitive + OFFSET(Primitive, V[packedInterpolant]),
209 input.Flat, !input.NoPerspective);
210 }
211 else if(perSampleShading)
212 {
213 routine.inputs[interfaceInterpolant] =
214 SpirvRoutine::interpolateAtXY(xxxx, yyyy, rhw,
215 primitive + OFFSET(Primitive, V[packedInterpolant]),
216 input.Flat, !input.NoPerspective);
217 }
218 else
219 {
220 routine.inputs[interfaceInterpolant] =
221 interpolate(xxxx, Dv[interfaceInterpolant], rhw,
222 primitive + OFFSET(Primitive, V[packedInterpolant]),
223 input.Flat, !input.NoPerspective);
224 }
225 packedInterpolant++;
226 }
227 }
228
229 setBuiltins(x, y, unclampedZ, w, cMask, samples);
230
231 for(uint32_t i = 0; i < state.numClipDistances; i++)
232 {
233 auto distance = interpolate(xxxx, DclipDistance[i], rhw,
234 primitive + OFFSET(Primitive, clipDistance[i]),
235 false, true);
236
237 auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
238 for(unsigned int q : samples)
239 {
240 // FIXME(b/148105887): Fragments discarded by clipping do not exist at
241 // all -- they should not be counted in queries or have their Z/S effects
242 // performed when early fragment tests are enabled.
243 cMask[q] &= clipMask;
244 }
245
246 if(spirvShader->getUsedCapabilities().ClipDistance)
247 {
248 auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
249 if(it != spirvShader->inputBuiltins.end())
250 {
251 if(i < it->second.SizeInComponents)
252 {
253 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
254 }
255 }
256 }
257 }
258
259 if(spirvShader->getUsedCapabilities().CullDistance)
260 {
261 auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
262 if(it != spirvShader->inputBuiltins.end())
263 {
264 for(uint32_t i = 0; i < state.numCullDistances; i++)
265 {
266 if(i < it->second.SizeInComponents)
267 {
268 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
269 interpolate(xxxx, DcullDistance[i], rhw,
270 primitive + OFFSET(Primitive, cullDistance[i]),
271 false, true);
272 }
273 }
274 }
275 }
276 }
277
278 if(spirvShader)
279 {
280 executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
281 }
282
283 Bool alphaPass = alphaTest(cMask, samples);
284
285 if((spirvShader && spirvShader->getAnalysis().ContainsDiscard) || state.alphaToCoverage)
286 {
287 for(unsigned int q : samples)
288 {
289 zMask[q] &= cMask[q];
290 sMask[q] &= cMask[q];
291 }
292 }
293
294 If(alphaPass)
295 {
296 if(!earlyFragmentTests)
297 {
298 for(unsigned int q : samples)
299 {
300 z[q] = clampDepth(z[q]);
301 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
302 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
303 }
304 }
305
306 If(depthPass)
307 {
308 if(!earlyFragmentTests)
309 {
310 writeDepth(zBuffer, x, zMask, samples);
311 }
312
313 blendColor(cBuffer, x, sMask, zMask, cMask, samples);
314
315 occlusionSampleCount(zMask, sMask, samples);
316 }
317 }
318 }
319
320 writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
321 }
322 }
323
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)324 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
325 {
326 if(!state.stencilActive)
327 {
328 return;
329 }
330
331 for(unsigned int q : samples)
332 {
333 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
334
335 Pointer<Byte> buffer = sBuffer + x;
336
337 if(q > 0)
338 {
339 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
340 }
341
342 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
343 Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
344 value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
345 Byte8 valueBack = value;
346
347 if(state.frontStencil.compareMask != 0xff)
348 {
349 value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
350 }
351
352 stencilTest(value, state.frontStencil.compareOp, false);
353
354 if(state.backStencil.compareMask != 0xff)
355 {
356 valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
357 }
358
359 stencilTest(valueBack, state.backStencil.compareOp, true);
360
361 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
362 valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
363 value |= valueBack;
364
365 sMask[q] &= SignMask(value);
366 }
367 }
368
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)369 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
370 {
371 Byte8 equal;
372
373 switch(stencilCompareMode)
374 {
375 case VK_COMPARE_OP_ALWAYS:
376 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
377 break;
378 case VK_COMPARE_OP_NEVER:
379 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
380 break;
381 case VK_COMPARE_OP_LESS: // a < b ~ b > a
382 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
383 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
384 break;
385 case VK_COMPARE_OP_EQUAL:
386 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
387 break;
388 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b)
389 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
390 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
391 break;
392 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ (b > a) || (a == b)
393 equal = value;
394 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
395 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
396 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
397 value |= equal;
398 break;
399 case VK_COMPARE_OP_GREATER: // a > b
400 equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
401 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
402 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
403 value = equal;
404 break;
405 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ !(a < b) ~ !(b > a)
406 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
407 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
408 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
409 break;
410 default:
411 UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
412 }
413 }
414
depthTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)415 Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
416 {
417 Float4 Z = z;
418
419 Pointer<Byte> buffer = zBuffer + 4 * x;
420 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
421
422 if(q > 0)
423 {
424 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
425 }
426
427 Float4 zValue;
428
429 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
430 {
431 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
432 }
433
434 Int4 zTest;
435
436 switch(state.depthCompareMode)
437 {
438 case VK_COMPARE_OP_ALWAYS:
439 // Optimized
440 break;
441 case VK_COMPARE_OP_NEVER:
442 // Optimized
443 break;
444 case VK_COMPARE_OP_EQUAL:
445 zTest = CmpEQ(zValue, Z);
446 break;
447 case VK_COMPARE_OP_NOT_EQUAL:
448 zTest = CmpNEQ(zValue, Z);
449 break;
450 case VK_COMPARE_OP_LESS:
451 zTest = CmpNLE(zValue, Z);
452 break;
453 case VK_COMPARE_OP_GREATER_OR_EQUAL:
454 zTest = CmpLE(zValue, Z);
455 break;
456 case VK_COMPARE_OP_LESS_OR_EQUAL:
457 zTest = CmpNLT(zValue, Z);
458 break;
459 case VK_COMPARE_OP_GREATER:
460 zTest = CmpLT(zValue, Z);
461 break;
462 default:
463 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
464 }
465
466 switch(state.depthCompareMode)
467 {
468 case VK_COMPARE_OP_ALWAYS:
469 zMask = cMask;
470 break;
471 case VK_COMPARE_OP_NEVER:
472 zMask = 0x0;
473 break;
474 default:
475 zMask = SignMask(zTest) & cMask;
476 break;
477 }
478
479 if(state.stencilActive)
480 {
481 zMask &= sMask;
482 }
483
484 return zMask != 0;
485 }
486
depthTest16(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)487 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
488 {
489 Short4 Z = convertFixed16(z, true);
490
491 Pointer<Byte> buffer = zBuffer + 2 * x;
492 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
493
494 if(q > 0)
495 {
496 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
497 }
498
499 Short4 zValue;
500
501 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
502 {
503 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
504 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
505 }
506
507 Int4 zTest;
508
509 // Bias values to make unsigned compares out of Reactor's (due SSE's) signed compares only
510 zValue = zValue - Short4(0x8000u);
511 Z = Z - Short4(0x8000u);
512
513 switch(state.depthCompareMode)
514 {
515 case VK_COMPARE_OP_ALWAYS:
516 // Optimized
517 break;
518 case VK_COMPARE_OP_NEVER:
519 // Optimized
520 break;
521 case VK_COMPARE_OP_EQUAL:
522 zTest = Int4(CmpEQ(zValue, Z));
523 break;
524 case VK_COMPARE_OP_NOT_EQUAL:
525 zTest = ~Int4(CmpEQ(zValue, Z));
526 break;
527 case VK_COMPARE_OP_LESS:
528 zTest = Int4(CmpGT(zValue, Z));
529 break;
530 case VK_COMPARE_OP_GREATER_OR_EQUAL:
531 zTest = ~Int4(CmpGT(zValue, Z));
532 break;
533 case VK_COMPARE_OP_LESS_OR_EQUAL:
534 zTest = ~Int4(CmpGT(Z, zValue));
535 break;
536 case VK_COMPARE_OP_GREATER:
537 zTest = Int4(CmpGT(Z, zValue));
538 break;
539 default:
540 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
541 }
542
543 switch(state.depthCompareMode)
544 {
545 case VK_COMPARE_OP_ALWAYS:
546 zMask = cMask;
547 break;
548 case VK_COMPARE_OP_NEVER:
549 zMask = 0x0;
550 break;
551 default:
552 zMask = SignMask(zTest) & cMask;
553 break;
554 }
555
556 if(state.stencilActive)
557 {
558 zMask &= sMask;
559 }
560
561 return zMask != 0;
562 }
563
// Returns z limited to [state.minDepthClamp, state.maxDepthClamp] when depth
// clamping is enabled; otherwise returns z unchanged.
Float4 PixelRoutine::clampDepth(const Float4 &z)
{
	if(state.depthClamp)
	{
		return Min(Max(z, state.minDepthClamp), state.maxDepthClamp);
	}

	return z;
}
573
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)574 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
575 {
576 if(!state.depthTestActive)
577 {
578 return true;
579 }
580
581 switch(state.depthFormat)
582 {
583 case VK_FORMAT_D16_UNORM:
584 return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
585 case VK_FORMAT_D32_SFLOAT:
586 case VK_FORMAT_D32_SFLOAT_S8_UINT:
587 return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
588 default:
589 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
590 return false;
591 }
592 }
593
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)594 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
595 {
596 Pointer<Byte> buffer = zBuffer + 2 * x;
597 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
598
599 if(q > 0)
600 {
601 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
602 }
603
604 Float4 minDepthBound(state.minDepthBounds);
605 Float4 maxDepthBound(state.maxDepthBounds);
606
607 Int2 z;
608 z = Insert(z, *Pointer<Int>(buffer), 0);
609 z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
610
611 Float4 zValue = convertFloat32(As<UShort4>(z));
612 return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
613 }
614
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)615 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
616 {
617 Pointer<Byte> buffer = zBuffer + 4 * x;
618 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
619
620 if(q > 0)
621 {
622 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
623 }
624
625 Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
626 return Int4(CmpLE(state.minDepthBounds, zValue) & CmpLE(zValue, state.maxDepthBounds));
627 }
628
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)629 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
630 {
631 if(!state.depthBoundsTestActive)
632 {
633 return;
634 }
635
636 Int4 zTest;
637 switch(state.depthFormat)
638 {
639 case VK_FORMAT_D16_UNORM:
640 zTest = depthBoundsTest16(zBuffer, q, x);
641 break;
642 case VK_FORMAT_D32_SFLOAT:
643 case VK_FORMAT_D32_SFLOAT_S8_UINT:
644 zTest = depthBoundsTest32F(zBuffer, q, x);
645 break;
646 default:
647 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
648 break;
649 }
650
651 if(!state.depthTestActive)
652 {
653 cMask &= zMask & SignMask(zTest);
654 }
655 else
656 {
657 zMask &= cMask & SignMask(zTest);
658 }
659 }
660
alphaToCoverage(Int cMask[4],const Float4 & alpha,const SampleSet & samples)661 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha, const SampleSet &samples)
662 {
663 static const int a2c[4] = {
664 OFFSET(DrawData, a2c0),
665 OFFSET(DrawData, a2c1),
666 OFFSET(DrawData, a2c2),
667 OFFSET(DrawData, a2c3),
668 };
669
670 for(unsigned int q : samples)
671 {
672 Int4 coverage = CmpNLT(alpha, *Pointer<Float4>(data + a2c[q]));
673 Int aMask = SignMask(coverage);
674 cMask[q] &= aMask;
675 }
676 }
677
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)678 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
679 {
680 Float4 Z = z;
681
682 Pointer<Byte> buffer = zBuffer + 4 * x;
683 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
684
685 if(q > 0)
686 {
687 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
688 }
689
690 Float4 zValue;
691
692 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
693 {
694 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
695 }
696
697 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
698 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
699 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
700
701 *Pointer<Float2>(buffer) = Float2(Z.xy);
702 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
703 }
704
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)705 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
706 {
707 Short4 Z = As<Short4>(convertFixed16(z, true));
708
709 Pointer<Byte> buffer = zBuffer + 2 * x;
710 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
711
712 if(q > 0)
713 {
714 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
715 }
716
717 Short4 zValue;
718
719 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
720 {
721 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
722 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
723 }
724
725 Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
726 zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
727 Z = Z | zValue;
728
729 *Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
730 *Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
731 }
732
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)733 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
734 {
735 if(!state.depthWriteEnable)
736 {
737 return;
738 }
739
740 for(unsigned int q : samples)
741 {
742 switch(state.depthFormat)
743 {
744 case VK_FORMAT_D16_UNORM:
745 writeDepth16(zBuffer, q, x, z[q], zMask[q]);
746 break;
747 case VK_FORMAT_D32_SFLOAT:
748 case VK_FORMAT_D32_SFLOAT_S8_UINT:
749 writeDepth32F(zBuffer, q, x, z[q], zMask[q]);
750 break;
751 default:
752 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
753 break;
754 }
755 }
756 }
757
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)758 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
759 {
760 if(!state.occlusionEnabled)
761 {
762 return;
763 }
764
765 for(unsigned int q : samples)
766 {
767 occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
768 }
769 }
770
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)771 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
772 {
773 if(!state.stencilActive)
774 {
775 return;
776 }
777
778 if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
779 {
780 if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
781 {
782 return;
783 }
784 }
785
786 if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
787 {
788 return;
789 }
790
791 for(unsigned int q : samples)
792 {
793 Pointer<Byte> buffer = sBuffer + x;
794
795 if(q > 0)
796 {
797 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
798 }
799
800 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
801 Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
802 bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
803 Byte8 newValue;
804 stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
805
806 if((state.frontStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
807 {
808 Byte8 maskedValue = bufferValue;
809 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
810 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
811 newValue |= maskedValue;
812 }
813
814 Byte8 newValueBack;
815
816 stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask[q], sMask[q]);
817
818 if((state.backStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
819 {
820 Byte8 maskedValue = bufferValue;
821 newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
822 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
823 newValueBack |= maskedValue;
824 }
825
826 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
827 newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
828 newValue |= newValueBack;
829
830 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
831 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
832 newValue |= bufferValue;
833
834 *Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
835 *Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
836 }
837 }
838
stencilOperation(Byte8 & newValue,const Byte8 & bufferValue,const PixelProcessor::States::StencilOpState & ops,bool isBack,const Int & zMask,const Int & sMask)839 void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
840 {
841 Byte8 &pass = newValue;
842 Byte8 fail;
843 Byte8 zFail;
844
845 stencilOperation(pass, bufferValue, ops.passOp, isBack);
846
847 if(ops.depthFailOp != ops.passOp)
848 {
849 stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
850 }
851
852 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
853 {
854 stencilOperation(fail, bufferValue, ops.failOp, isBack);
855 }
856
857 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
858 {
859 if(state.depthTestActive && ops.depthFailOp != ops.passOp) // zMask valid and values not the same
860 {
861 pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
862 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
863 pass |= zFail;
864 }
865
866 pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
867 fail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
868 pass |= fail;
869 }
870 }
871
hasStencilReplaceRef() const872 bool PixelRoutine::hasStencilReplaceRef() const
873 {
874 return spirvShader &&
875 (spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT) !=
876 spirvShader->outputBuiltins.end());
877 }
878
stencilReplaceRef()879 Byte8 PixelRoutine::stencilReplaceRef()
880 {
881 ASSERT(spirvShader);
882
883 auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
884 ASSERT(it != spirvShader->outputBuiltins.end());
885
886 UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
887 // TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
888 // following line by either adding a rr::Shuffle() variant to do
889 // it explicitly or adding a Byte4(Int4) constructor would work.
890 sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
891
892 UInt2 sRefDuplicated;
893 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
894 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
895 return As<Byte8>(sRefDuplicated);
896 }
897
stencilOperation(Byte8 & output,const Byte8 & bufferValue,VkStencilOp operation,bool isBack)898 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
899 {
900 if(hasStencilReplaceRef())
901 {
902 output = stencilReplaceRef();
903 }
904 else
905 {
906 switch(operation)
907 {
908 case VK_STENCIL_OP_KEEP:
909 output = bufferValue;
910 break;
911 case VK_STENCIL_OP_ZERO:
912 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
913 break;
914 case VK_STENCIL_OP_REPLACE:
915 output = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
916 break;
917 case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
918 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
919 break;
920 case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
921 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
922 break;
923 case VK_STENCIL_OP_INVERT:
924 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
925 break;
926 case VK_STENCIL_OP_INCREMENT_AND_WRAP:
927 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
928 break;
929 case VK_STENCIL_OP_DECREMENT_AND_WRAP:
930 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
931 break;
932 default:
933 UNSUPPORTED("VkStencilOp: %d", int(operation));
934 }
935 }
936 }
937
isSRGB(int index) const938 bool PixelRoutine::isSRGB(int index) const
939 {
940 return vk::Format(state.colorFormat[index]).isSRGBformat();
941 }
942
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)943 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
944 {
945 Short4 c01;
946 Short4 c23;
947 Pointer<Byte> buffer = cBuffer;
948 Pointer<Byte> buffer2;
949
950 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
951
952 switch(state.colorFormat[index])
953 {
954 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
955 buffer += 2 * x;
956 buffer2 = buffer + pitchB;
957 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
958
959 pixel.x = (c01 & Short4(0xF000u));
960 pixel.y = (c01 & Short4(0x0F00u)) << 4;
961 pixel.z = (c01 & Short4(0x00F0u)) << 8;
962 pixel.w = (c01 & Short4(0x000Fu)) << 12;
963
964 // Expand to 16 bit range
965 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
966 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
967 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
968 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
969 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
970 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
971 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
972 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
973 break;
974 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
975 buffer += 2 * x;
976 buffer2 = buffer + pitchB;
977 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
978
979 pixel.z = (c01 & Short4(0xF000u));
980 pixel.y = (c01 & Short4(0x0F00u)) << 4;
981 pixel.x = (c01 & Short4(0x00F0u)) << 8;
982 pixel.w = (c01 & Short4(0x000Fu)) << 12;
983
984 // Expand to 16 bit range
985 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
986 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
987 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
988 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
989 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
990 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
991 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
992 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
993 break;
994 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
995 buffer += 2 * x;
996 buffer2 = buffer + pitchB;
997 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
998
999 pixel.w = (c01 & Short4(0xF000u));
1000 pixel.z = (c01 & Short4(0x0F00u)) << 4;
1001 pixel.y = (c01 & Short4(0x00F0u)) << 8;
1002 pixel.x = (c01 & Short4(0x000Fu)) << 12;
1003
1004 // Expand to 16 bit range
1005 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1006 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1007 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1008 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1009 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1010 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1011 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1012 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1013 break;
1014 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1015 buffer += 2 * x;
1016 buffer2 = buffer + pitchB;
1017 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1018
1019 pixel.w = (c01 & Short4(0xF000u));
1020 pixel.x = (c01 & Short4(0x0F00u)) << 4;
1021 pixel.y = (c01 & Short4(0x00F0u)) << 8;
1022 pixel.z = (c01 & Short4(0x000Fu)) << 12;
1023
1024 // Expand to 16 bit range
1025 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1026 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1027 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1028 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1029 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1030 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1031 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1032 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1033 break;
1034 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1035 buffer += 2 * x;
1036 buffer2 = buffer + pitchB;
1037 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1038
1039 pixel.x = (c01 & Short4(0xF800u));
1040 pixel.y = (c01 & Short4(0x07C0u)) << 5;
1041 pixel.z = (c01 & Short4(0x003Eu)) << 10;
1042 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1043
1044 // Expand to 16 bit range
1045 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1046 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1047 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1048 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1049 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1050 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1051 break;
1052 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1053 buffer += 2 * x;
1054 buffer2 = buffer + pitchB;
1055 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1056
1057 pixel.z = (c01 & Short4(0xF800u));
1058 pixel.y = (c01 & Short4(0x07C0u)) << 5;
1059 pixel.x = (c01 & Short4(0x003Eu)) << 10;
1060 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1061
1062 // Expand to 16 bit range
1063 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1064 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1065 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1066 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1067 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1068 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1069 break;
1070 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1071 buffer += 2 * x;
1072 buffer2 = buffer + pitchB;
1073 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1074
1075 pixel.x = (c01 & Short4(0x7C00u)) << 1;
1076 pixel.y = (c01 & Short4(0x03E0u)) << 6;
1077 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1078 pixel.w = (c01 & Short4(0x8000u)) >> 15;
1079
1080 // Expand to 16 bit range
1081 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1082 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1083 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1084 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1085 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1086 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1087 break;
1088 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1089 buffer += 2 * x;
1090 buffer2 = buffer + pitchB;
1091 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1092
1093 pixel.x = c01 & Short4(0xF800u);
1094 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1095 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1096 pixel.w = Short4(0xFFFFu);
1097
1098 // Expand to 16 bit range
1099 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1100 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1101 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1102 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1103 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1104 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1105 break;
1106 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1107 buffer += 2 * x;
1108 buffer2 = buffer + pitchB;
1109 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1110
1111 pixel.z = c01 & Short4(0xF800u);
1112 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1113 pixel.x = (c01 & Short4(0x001Fu)) << 11;
1114 pixel.w = Short4(0xFFFFu);
1115
1116 // Expand to 16 bit range
1117 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1118 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1119 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1120 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1121 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1122 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1123 break;
1124 case VK_FORMAT_B8G8R8A8_UNORM:
1125 case VK_FORMAT_B8G8R8A8_SRGB:
1126 buffer += 4 * x;
1127 c01 = *Pointer<Short4>(buffer);
1128 buffer += pitchB;
1129 c23 = *Pointer<Short4>(buffer);
1130 pixel.z = c01;
1131 pixel.y = c01;
1132 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1133 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1134 pixel.x = pixel.z;
1135 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1136 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1137 pixel.y = pixel.z;
1138 pixel.w = pixel.x;
1139 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1140 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1141 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1142 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1143 break;
1144 case VK_FORMAT_R8G8B8A8_UNORM:
1145 case VK_FORMAT_R8G8B8A8_SRGB:
1146 buffer += 4 * x;
1147 c01 = *Pointer<Short4>(buffer);
1148 buffer += pitchB;
1149 c23 = *Pointer<Short4>(buffer);
1150 pixel.z = c01;
1151 pixel.y = c01;
1152 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1153 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1154 pixel.x = pixel.z;
1155 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1156 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1157 pixel.y = pixel.z;
1158 pixel.w = pixel.x;
1159 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1160 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1161 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1162 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1163 break;
1164 case VK_FORMAT_R8_UNORM:
1165 buffer += 1 * x;
1166 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1167 buffer += pitchB;
1168 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1169 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1170 pixel.y = Short4(0x0000);
1171 pixel.z = Short4(0x0000);
1172 pixel.w = Short4(0xFFFFu);
1173 break;
1174 case VK_FORMAT_R8G8_UNORM:
1175 buffer += 2 * x;
1176 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1177 buffer += pitchB;
1178 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1179 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1180 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1181 pixel.z = Short4(0x0000u);
1182 pixel.w = Short4(0xFFFFu);
1183 break;
1184 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1185 {
1186 Int4 v = Int4(0);
1187 buffer += 4 * x;
1188 v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1189 v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1190 buffer += pitchB;
1191 v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1192 v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1193
1194 pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1195 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1196 pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1197 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1198
1199 // Expand to 16 bit range
1200 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1201 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1202 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1203 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1204 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1205 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1206 }
1207 break;
1208 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1209 {
1210 Int4 v = Int4(0);
1211 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1212 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1213 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1214 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1215 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1216
1217 pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1218 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1219 pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1220 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1221
1222 // Expand to 16 bit range
1223 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1224 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1225 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1226 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1227 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1228 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1229 }
1230 break;
1231 default:
1232 UNSUPPORTED("VkFormat %d", int(state.colorFormat[index]));
1233 }
1234
1235 if(isSRGB(index))
1236 {
1237 sRGBtoLinear16_12_16(pixel);
1238 }
1239 }
1240
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & current,const Int & sMask,const Int & zMask,const Int & cMask)1241 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s ¤t, const Int &sMask, const Int &zMask, const Int &cMask)
1242 {
1243 if(isSRGB(index))
1244 {
1245 linearToSRGB16_12_16(current);
1246 }
1247
1248 switch(state.colorFormat[index])
1249 {
1250 case VK_FORMAT_B8G8R8A8_UNORM:
1251 case VK_FORMAT_B8G8R8A8_SRGB:
1252 case VK_FORMAT_R8G8B8A8_UNORM:
1253 case VK_FORMAT_R8G8B8A8_SRGB:
1254 case VK_FORMAT_R8G8_UNORM:
1255 case VK_FORMAT_R8_UNORM:
1256 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1257 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1258 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1259 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1260 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1261 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1262 break;
1263 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1264 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1265 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
1266 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
1267 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
1268 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);
1269 break;
1270 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1271 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1272 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1273 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1274 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
1275 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
1276 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
1277 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 4) + Short4(0x0800);
1278 break;
1279 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1280 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1281 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1282 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1283 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
1284 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1285 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);
1286 break;
1287 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1288 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1289 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1290 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);
1291 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1292 break;
1293 default:
1294 break;
1295 }
1296
1297 int rgbaWriteMask = state.colorWriteActive(index);
1298 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1299
1300 switch(state.colorFormat[index])
1301 {
1302 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1303 {
1304 current.x = As<UShort4>(current.x & Short4(0xF000));
1305 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1306 current.z = As<UShort4>(current.z & Short4(0xF000)) >> 8;
1307 current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1308
1309 current.x = current.x | current.y | current.z | current.w;
1310 }
1311 break;
1312 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1313 {
1314 current.z = As<UShort4>(current.z & Short4(0xF000));
1315 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1316 current.x = As<UShort4>(current.x & Short4(0xF000)) >> 8;
1317 current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1318
1319 current.x = current.x | current.y | current.z | current.w;
1320 }
1321 break;
1322 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1323 {
1324 current.w = As<UShort4>(current.w & Short4(0xF000));
1325 current.x = As<UShort4>(current.x & Short4(0xF000)) >> 4;
1326 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1327 current.z = As<UShort4>(current.z & Short4(0xF000u)) >> 12;
1328
1329 current.x = current.x | current.y | current.z | current.w;
1330 }
1331 break;
1332 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1333 {
1334 current.w = As<UShort4>(current.w & Short4(0xF000));
1335 current.z = As<UShort4>(current.z & Short4(0xF000)) >> 4;
1336 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1337 current.x = As<UShort4>(current.x & Short4(0xF000u)) >> 12;
1338
1339 current.x = current.x | current.y | current.z | current.w;
1340 }
1341 break;
1342 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1343 {
1344 current.x = As<UShort4>(current.x & Short4(0xF800));
1345 current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1346 current.z = As<UShort4>(current.z & Short4(0xF800)) >> 10;
1347 current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1348
1349 current.x = current.x | current.y | current.z | current.w;
1350 }
1351 break;
1352 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1353 {
1354 current.z = As<UShort4>(current.z & Short4(0xF800));
1355 current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1356 current.x = As<UShort4>(current.x & Short4(0xF800)) >> 10;
1357 current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1358
1359 current.x = current.x | current.y | current.z | current.w;
1360 }
1361 break;
1362 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1363 {
1364 current.w = current.w & Short4(0x8000u);
1365 current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
1366 current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
1367 current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
1368
1369 current.x = current.x | current.y | current.z | current.w;
1370 }
1371 break;
1372 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1373 {
1374 current.x = current.x & Short4(0xF800u);
1375 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1376 current.z = As<UShort4>(current.z) >> 11;
1377
1378 current.x = current.x | current.y | current.z;
1379 }
1380 break;
1381 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1382 {
1383 current.z = current.z & Short4(0xF800u);
1384 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1385 current.x = As<UShort4>(current.x) >> 11;
1386
1387 current.x = current.x | current.y | current.z;
1388 }
1389 break;
1390 case VK_FORMAT_B8G8R8A8_UNORM:
1391 case VK_FORMAT_B8G8R8A8_SRGB:
1392 if(rgbaWriteMask == 0x7)
1393 {
1394 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1395 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1396 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1397
1398 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1399 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1400
1401 current.x = current.z;
1402 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1403 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1404 current.y = current.z;
1405 current.z = As<Short4>(UnpackLow(current.z, current.x));
1406 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1407 }
1408 else
1409 {
1410 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1411 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1412 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1413 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1414
1415 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1416 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1417
1418 current.x = current.z;
1419 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1420 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1421 current.y = current.z;
1422 current.z = As<Short4>(UnpackLow(current.z, current.x));
1423 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1424 }
1425 break;
1426 case VK_FORMAT_R8G8B8A8_UNORM:
1427 case VK_FORMAT_R8G8B8A8_SRGB:
1428 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1429 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1430 if(rgbaWriteMask == 0x7)
1431 {
1432 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1433 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1434 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1435
1436 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1437 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1438
1439 current.x = current.z;
1440 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1441 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1442 current.y = current.z;
1443 current.z = As<Short4>(UnpackLow(current.z, current.x));
1444 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1445 }
1446 else
1447 {
1448 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1449 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1450 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1451 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1452
1453 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1454 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1455
1456 current.x = current.z;
1457 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1458 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1459 current.y = current.z;
1460 current.z = As<Short4>(UnpackLow(current.z, current.x));
1461 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1462 }
1463 break;
1464 case VK_FORMAT_R8G8_UNORM:
1465 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1466 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1467 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1468 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1469 current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1470 break;
1471 case VK_FORMAT_R8_UNORM:
1472 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1473 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1474 break;
1475 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1476 {
1477 auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1478 auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1479 auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1480 auto a = (Int4(current.w) >> 14) & Int4(0x3);
1481 Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
1482 auto c02 = As<Int2>(Int4(packed.xzzz)); // TODO: auto c02 = packed.xz;
1483 auto c13 = As<Int2>(Int4(packed.ywww)); // TODO: auto c13 = packed.yw;
1484 current.x = UnpackLow(c02, c13);
1485 current.y = UnpackHigh(c02, c13);
1486 }
1487 break;
1488 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1489 {
1490 auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1491 auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1492 auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1493 auto a = (Int4(current.w) >> 14) & Int4(0x3);
1494 Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;
1495 auto c02 = As<Int2>(Int4(packed.xzzz)); // TODO: auto c02 = packed.xz;
1496 auto c13 = As<Int2>(Int4(packed.ywww)); // TODO: auto c13 = packed.yw;
1497 current.x = UnpackLow(c02, c13);
1498 current.y = UnpackHigh(c02, c13);
1499 }
1500 break;
1501 default:
1502 UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1503 }
1504
1505 Short4 c01 = current.z;
1506 Short4 c23 = current.y;
1507
1508 Int xMask; // Combination of all masks
1509
1510 if(state.depthTestActive)
1511 {
1512 xMask = zMask;
1513 }
1514 else
1515 {
1516 xMask = cMask;
1517 }
1518
1519 if(state.stencilActive)
1520 {
1521 xMask &= sMask;
1522 }
1523
1524 Pointer<Byte> buffer = cBuffer;
1525 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1526
1527 switch(state.colorFormat[index])
1528 {
1529 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1530 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1531 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1532 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1533 {
1534 buffer += 2 * x;
1535 Int value = *Pointer<Int>(buffer);
1536
1537 Int channelMask;
1538 switch(state.colorFormat[index])
1539 {
1540 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1541 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[bgraWriteMask & 0xF][0]));
1542 break;
1543 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1544 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4bgraQ[bgraWriteMask & 0xF][0]));
1545 break;
1546 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1547 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[bgraWriteMask & 0xF][0]));
1548 break;
1549 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1550 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4abgrQ[bgraWriteMask & 0xF][0]));
1551 break;
1552 default:
1553 UNREACHABLE("Format: %s", vk::Stringify(state.colorFormat[index]).c_str());
1554 }
1555
1556 Int c01 = Extract(As<Int2>(current.x), 0);
1557 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1558 if(bgraWriteMask != 0x0000000F)
1559 {
1560 mask01 &= channelMask;
1561 }
1562 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1563
1564 buffer += pitchB;
1565 value = *Pointer<Int>(buffer);
1566
1567 Int c23 = Extract(As<Int2>(current.x), 1);
1568 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1569 if(bgraWriteMask != 0x0000000F)
1570 {
1571 mask23 &= channelMask;
1572 }
1573 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1574 }
1575 break;
1576 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1577 {
1578 buffer += 2 * x;
1579 Int value = *Pointer<Int>(buffer);
1580
1581 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[bgraWriteMask & 0xF][0]));
1582
1583 Int c01 = Extract(As<Int2>(current.x), 0);
1584 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1585 if(bgraWriteMask != 0x0000000F)
1586 {
1587 mask01 &= channelMask;
1588 }
1589 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1590
1591 buffer += pitchB;
1592 value = *Pointer<Int>(buffer);
1593
1594 Int c23 = Extract(As<Int2>(current.x), 1);
1595 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1596 if(bgraWriteMask != 0x0000000F)
1597 {
1598 mask23 &= channelMask;
1599 }
1600 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1601 }
1602 break;
1603 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1604 {
1605 buffer += 2 * x;
1606 Int value = *Pointer<Int>(buffer);
1607
1608 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[bgraWriteMask & 0xF][0]));
1609
1610 Int c01 = Extract(As<Int2>(current.x), 0);
1611 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1612 if(bgraWriteMask != 0x0000000F)
1613 {
1614 mask01 &= channelMask;
1615 }
1616 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1617
1618 buffer += pitchB;
1619 value = *Pointer<Int>(buffer);
1620
1621 Int c23 = Extract(As<Int2>(current.x), 1);
1622 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1623 if(bgraWriteMask != 0x0000000F)
1624 {
1625 mask23 &= channelMask;
1626 }
1627 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1628 }
1629 break;
1630 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1631 {
1632 buffer += 2 * x;
1633 Int value = *Pointer<Int>(buffer);
1634
1635 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[bgraWriteMask & 0xF][0]));
1636
1637 Int c01 = Extract(As<Int2>(current.x), 0);
1638 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1639 if(bgraWriteMask != 0x0000000F)
1640 {
1641 mask01 &= channelMask;
1642 }
1643 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1644
1645 buffer += pitchB;
1646 value = *Pointer<Int>(buffer);
1647
1648 Int c23 = Extract(As<Int2>(current.x), 1);
1649 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1650 if(bgraWriteMask != 0x0000000F)
1651 {
1652 mask23 &= channelMask;
1653 }
1654 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1655 }
1656 break;
1657 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1658 {
1659 buffer += 2 * x;
1660 Int value = *Pointer<Int>(buffer);
1661
1662 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[bgraWriteMask & 0x7][0]));
1663
1664 Int c01 = Extract(As<Int2>(current.x), 0);
1665 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1666 if((bgraWriteMask & 0x00000007) != 0x00000007)
1667 {
1668 mask01 &= channelMask;
1669 }
1670 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1671
1672 buffer += pitchB;
1673 value = *Pointer<Int>(buffer);
1674
1675 Int c23 = Extract(As<Int2>(current.x), 1);
1676 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1677 if((bgraWriteMask & 0x00000007) != 0x00000007)
1678 {
1679 mask23 &= channelMask;
1680 }
1681 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1682 }
1683 break;
1684 case VK_FORMAT_B8G8R8A8_UNORM:
1685 case VK_FORMAT_B8G8R8A8_SRGB:
1686 {
1687 buffer += x * 4;
1688 Short4 value = *Pointer<Short4>(buffer);
1689 Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[bgraWriteMask][0]));
1690
1691 Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1692 if(bgraWriteMask != 0x0000000F)
1693 {
1694 mask01 &= channelMask;
1695 }
1696 *Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1697
1698 buffer += pitchB;
1699 value = *Pointer<Short4>(buffer);
1700
1701 Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1702 if(bgraWriteMask != 0x0000000F)
1703 {
1704 mask23 &= channelMask;
1705 }
1706 *Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1707 }
1708 break;
1709 case VK_FORMAT_R8G8B8A8_UNORM:
1710 case VK_FORMAT_R8G8B8A8_SRGB:
1711 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1712 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1713 {
1714 buffer += x * 4;
1715 Short4 value = *Pointer<Short4>(buffer);
1716 Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
1717
1718 Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1719 if(rgbaWriteMask != 0x0000000F)
1720 {
1721 mask01 &= channelMask;
1722 }
1723 *Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1724
1725 buffer += pitchB;
1726 value = *Pointer<Short4>(buffer);
1727
1728 Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1729 if(rgbaWriteMask != 0x0000000F)
1730 {
1731 mask23 &= channelMask;
1732 }
1733 *Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1734 }
1735 break;
1736 case VK_FORMAT_R8G8_UNORM:
1737 if((rgbaWriteMask & 0x00000003) != 0x0)
1738 {
1739 buffer += 2 * x;
1740 Int2 value;
1741 value = Insert(value, *Pointer<Int>(buffer), 0);
1742 value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);
1743
1744 Int2 packedCol = As<Int2>(current.x);
1745
1746 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1747 if((rgbaWriteMask & 0x3) != 0x3)
1748 {
1749 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1750 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1751 mergedMask &= rgbaMask;
1752 }
1753
1754 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1755
1756 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1757 *Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
1758 }
1759 break;
1760 case VK_FORMAT_R8_UNORM:
1761 if(rgbaWriteMask & 0x00000001)
1762 {
1763 buffer += 1 * x;
1764 Short4 value;
1765 value = Insert(value, *Pointer<Short>(buffer), 0);
1766 value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);
1767
1768 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1769 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1770 current.x |= value;
1771
1772 *Pointer<Short>(buffer) = Extract(current.x, 0);
1773 *Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
1774 }
1775 break;
1776 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1777 rgbaWriteMask = bgraWriteMask;
1778 // [[fallthrough]]
1779 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1780 {
1781 buffer += 4 * x;
1782
1783 Int2 value = *Pointer<Int2>(buffer, 16);
1784 Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1785 if(rgbaWriteMask != 0xF)
1786 {
1787 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1788 }
1789 *Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
1790
1791 buffer += pitchB;
1792
1793 value = *Pointer<Int2>(buffer, 16);
1794 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1795 if(rgbaWriteMask != 0xF)
1796 {
1797 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1798 }
1799 *Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
1800 }
1801 break;
1802 default:
1803 UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1804 }
1805 }
1806
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1807 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1808 {
1809 bool inverse = (modifier == OneMinus);
1810
1811 if(format.isUnsignedNormalized())
1812 {
1813 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU[component]))
1814 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU[component]));
1815 }
1816 else if(format.isSignedNormalized())
1817 {
1818 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS[component]))
1819 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS[component]));
1820 }
1821 else // Floating-point format
1822 {
1823 ASSERT(format.isFloatFormat());
1824 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF[component]))
1825 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF[component]));
1826 }
1827 }
1828
blendFactorRGB(Vector4f & blendFactor,const Vector4f & sourceColor,const Vector4f & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1829 void PixelRoutine::blendFactorRGB(Vector4f &blendFactor, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1830 {
1831 switch(colorBlendFactor)
1832 {
1833 case VK_BLEND_FACTOR_ZERO:
1834 blendFactor.x = 0.0f;
1835 blendFactor.y = 0.0f;
1836 blendFactor.z = 0.0f;
1837 break;
1838 case VK_BLEND_FACTOR_ONE:
1839 blendFactor.x = 1.0f;
1840 blendFactor.y = 1.0f;
1841 blendFactor.z = 1.0f;
1842 break;
1843 case VK_BLEND_FACTOR_SRC_COLOR:
1844 blendFactor.x = sourceColor.x;
1845 blendFactor.y = sourceColor.y;
1846 blendFactor.z = sourceColor.z;
1847 break;
1848 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1849 blendFactor.x = 1.0f - sourceColor.x;
1850 blendFactor.y = 1.0f - sourceColor.y;
1851 blendFactor.z = 1.0f - sourceColor.z;
1852 break;
1853 case VK_BLEND_FACTOR_DST_COLOR:
1854 blendFactor.x = destColor.x;
1855 blendFactor.y = destColor.y;
1856 blendFactor.z = destColor.z;
1857 break;
1858 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1859 blendFactor.x = 1.0f - destColor.x;
1860 blendFactor.y = 1.0f - destColor.y;
1861 blendFactor.z = 1.0f - destColor.z;
1862 break;
1863 case VK_BLEND_FACTOR_SRC_ALPHA:
1864 blendFactor.x = sourceColor.w;
1865 blendFactor.y = sourceColor.w;
1866 blendFactor.z = sourceColor.w;
1867 break;
1868 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1869 blendFactor.x = 1.0f - sourceColor.w;
1870 blendFactor.y = 1.0f - sourceColor.w;
1871 blendFactor.z = 1.0f - sourceColor.w;
1872 break;
1873 case VK_BLEND_FACTOR_DST_ALPHA:
1874 blendFactor.x = destColor.w;
1875 blendFactor.y = destColor.w;
1876 blendFactor.z = destColor.w;
1877 break;
1878 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1879 blendFactor.x = 1.0f - destColor.w;
1880 blendFactor.y = 1.0f - destColor.w;
1881 blendFactor.z = 1.0f - destColor.w;
1882 break;
1883 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1884 blendFactor.x = 1.0f - destColor.w;
1885 blendFactor.x = Min(blendFactor.x, sourceColor.w);
1886 blendFactor.y = blendFactor.x;
1887 blendFactor.z = blendFactor.x;
1888 break;
1889 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1890 blendFactor.x = blendConstant(format, 0);
1891 blendFactor.y = blendConstant(format, 1);
1892 blendFactor.z = blendConstant(format, 2);
1893 break;
1894 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1895 blendFactor.x = blendConstant(format, 3);
1896 blendFactor.y = blendConstant(format, 3);
1897 blendFactor.z = blendConstant(format, 3);
1898 break;
1899 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1900 blendFactor.x = blendConstant(format, 0, OneMinus);
1901 blendFactor.y = blendConstant(format, 1, OneMinus);
1902 blendFactor.z = blendConstant(format, 2, OneMinus);
1903 break;
1904 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1905 blendFactor.x = blendConstant(format, 3, OneMinus);
1906 blendFactor.y = blendConstant(format, 3, OneMinus);
1907 blendFactor.z = blendConstant(format, 3, OneMinus);
1908 break;
1909
1910 default:
1911 UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1912 }
1913
1914 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1915 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1916 // operations. If the color attachment is floating-point, no clamping occurs."
1917 if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1918 {
1919 if(format.isUnsignedNormalized())
1920 {
1921 blendFactor.x = Min(Max(blendFactor.x, 0.0f), 1.0f);
1922 blendFactor.y = Min(Max(blendFactor.y, 0.0f), 1.0f);
1923 blendFactor.z = Min(Max(blendFactor.z, 0.0f), 1.0f);
1924 }
1925 else if(format.isSignedNormalized())
1926 {
1927 blendFactor.x = Min(Max(blendFactor.x, -1.0f), 1.0f);
1928 blendFactor.y = Min(Max(blendFactor.y, -1.0f), 1.0f);
1929 blendFactor.z = Min(Max(blendFactor.z, -1.0f), 1.0f);
1930 }
1931 }
1932 }
1933
blendFactorAlpha(Float4 & blendFactorAlpha,const Float4 & sourceAlpha,const Float4 & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1934 void PixelRoutine::blendFactorAlpha(Float4 &blendFactorAlpha, const Float4 &sourceAlpha, const Float4 &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1935 {
1936 switch(alphaBlendFactor)
1937 {
1938 case VK_BLEND_FACTOR_ZERO:
1939 blendFactorAlpha = 0.0f;
1940 break;
1941 case VK_BLEND_FACTOR_ONE:
1942 blendFactorAlpha = 1.0f;
1943 break;
1944 case VK_BLEND_FACTOR_SRC_COLOR:
1945 blendFactorAlpha = sourceAlpha;
1946 break;
1947 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1948 blendFactorAlpha = 1.0f - sourceAlpha;
1949 break;
1950 case VK_BLEND_FACTOR_DST_COLOR:
1951 blendFactorAlpha = destAlpha;
1952 break;
1953 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1954 blendFactorAlpha = 1.0f - destAlpha;
1955 break;
1956 case VK_BLEND_FACTOR_SRC_ALPHA:
1957 blendFactorAlpha = sourceAlpha;
1958 break;
1959 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1960 blendFactorAlpha = 1.0f - sourceAlpha;
1961 break;
1962 case VK_BLEND_FACTOR_DST_ALPHA:
1963 blendFactorAlpha = destAlpha;
1964 break;
1965 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1966 blendFactorAlpha = 1.0f - destAlpha;
1967 break;
1968 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1969 blendFactorAlpha = 1.0f;
1970 break;
1971 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1972 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1973 blendFactorAlpha = blendConstant(format, 3);
1974 break;
1975 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1976 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1977 blendFactorAlpha = blendConstant(format, 3, OneMinus);
1978 break;
1979 default:
1980 UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1981 }
1982
1983 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1984 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1985 // operations. If the color attachment is floating-point, no clamping occurs."
1986 if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1987 {
1988 if(format.isUnsignedNormalized())
1989 {
1990 blendFactorAlpha = Min(Max(blendFactorAlpha, 0.0f), 1.0f);
1991 }
1992 else if(format.isSignedNormalized())
1993 {
1994 blendFactorAlpha = Min(Max(blendFactorAlpha, -1.0f), 1.0f);
1995 }
1996 }
1997 }
1998
blendOpOverlay(Float4 & src,Float4 & dst)1999 Float4 PixelRoutine::blendOpOverlay(Float4 &src, Float4 &dst)
2000 {
2001 Int4 largeDst = CmpGT(dst, 0.5f);
2002 return As<Float4>(
2003 (~largeDst & As<Int4>(2.0f * src * dst)) |
2004 (largeDst & As<Int4>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
2005 }
2006
blendOpColorDodge(Float4 & src,Float4 & dst)2007 Float4 PixelRoutine::blendOpColorDodge(Float4 &src, Float4 &dst)
2008 {
2009 Int4 srcBelowOne = CmpLT(src, 1.0f);
2010 Int4 positiveDst = CmpGT(dst, 0.0f);
2011 return As<Float4>(positiveDst & ((~srcBelowOne & As<Int4>(Float4(1.0f))) |
2012 (srcBelowOne & As<Int4>(Min(1.0f, (dst / (1.0f - src)))))));
2013 }
2014
blendOpColorBurn(Float4 & src,Float4 & dst)2015 Float4 PixelRoutine::blendOpColorBurn(Float4 &src, Float4 &dst)
2016 {
2017 Int4 dstBelowOne = CmpLT(dst, 1.0f);
2018 Int4 positiveSrc = CmpGT(src, 0.0f);
2019 return As<Float4>(
2020 (~dstBelowOne & As<Int4>(Float4(1.0f))) |
2021 (dstBelowOne & positiveSrc & As<Int4>(1.0f - Min(1.0f, (1.0f - dst) / src))));
2022 }
2023
blendOpHardlight(Float4 & src,Float4 & dst)2024 Float4 PixelRoutine::blendOpHardlight(Float4 &src, Float4 &dst)
2025 {
2026 Int4 largeSrc = CmpGT(src, 0.5f);
2027 return As<Float4>(
2028 (~largeSrc & As<Int4>(2.0f * src * dst)) |
2029 (largeSrc & As<Int4>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
2030 }
2031
blendOpSoftlight(Float4 & src,Float4 & dst)2032 Float4 PixelRoutine::blendOpSoftlight(Float4 &src, Float4 &dst)
2033 {
2034 Int4 largeSrc = CmpGT(src, 0.5f);
2035 Int4 largeDst = CmpGT(dst, 0.25f);
2036
2037 return As<Float4>(
2038 (~largeSrc & As<Int4>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
2039 (largeSrc & ((~largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
2040 (largeDst & As<Int4>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
2041 }
2042
maxRGB(Vector4f & c)2043 Float4 PixelRoutine::maxRGB(Vector4f &c)
2044 {
2045 return Max(Max(c.x, c.y), c.z);
2046 }
2047
minRGB(Vector4f & c)2048 Float4 PixelRoutine::minRGB(Vector4f &c)
2049 {
2050 return Min(Min(c.x, c.y), c.z);
2051 }
2052
setLumSat(Vector4f & cbase,Vector4f & csat,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2053 void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2054 {
2055 Float4 minbase = minRGB(cbase);
2056 Float4 sbase = maxRGB(cbase) - minbase;
2057 Float4 ssat = maxRGB(csat) - minRGB(csat);
2058 Int4 isNonZero = CmpGT(sbase, 0.0f);
2059 Vector4f color;
2060 color.x = As<Float4>(isNonZero & As<Int4>((cbase.x - minbase) * ssat / sbase));
2061 color.y = As<Float4>(isNonZero & As<Int4>((cbase.y - minbase) * ssat / sbase));
2062 color.z = As<Float4>(isNonZero & As<Int4>((cbase.z - minbase) * ssat / sbase));
2063 setLum(color, clum, x, y, z);
2064 }
2065
lumRGB(Vector4f & c)2066 Float4 PixelRoutine::lumRGB(Vector4f &c)
2067 {
2068 return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
2069 }
2070
computeLum(Float4 & color,Float4 & lum,Float4 & mincol,Float4 & maxcol,Int4 & negative,Int4 & aboveOne)2071 Float4 PixelRoutine::computeLum(Float4 &color, Float4 &lum, Float4 &mincol, Float4 &maxcol, Int4 &negative, Int4 &aboveOne)
2072 {
2073 return As<Float4>(
2074 (negative & As<Int4>(lum + ((color - lum) * lum) / (lum - mincol))) |
2075 (~negative & ((aboveOne & As<Int4>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
2076 (~aboveOne & As<Int4>(color)))));
2077 }
2078
setLum(Vector4f & cbase,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2079 void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2080 {
2081 Float4 lbase = lumRGB(cbase);
2082 Float4 llum = lumRGB(clum);
2083 Float4 ldiff = llum - lbase;
2084
2085 Vector4f color;
2086 color.x = cbase.x + ldiff;
2087 color.y = cbase.y + ldiff;
2088 color.z = cbase.z + ldiff;
2089
2090 Float4 lum = lumRGB(color);
2091 Float4 mincol = minRGB(color);
2092 Float4 maxcol = maxRGB(color);
2093
2094 Int4 negative = CmpLT(mincol, 0.0f);
2095 Int4 aboveOne = CmpGT(maxcol, 1.0f);
2096
2097 x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
2098 y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
2099 z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
2100 }
2101
premultiply(Vector4f & c)2102 void PixelRoutine::premultiply(Vector4f &c)
2103 {
2104 Int4 nonZeroAlpha = CmpNEQ(c.w, 0.0f);
2105 c.x = As<Float4>(nonZeroAlpha & As<Int4>(c.x / c.w));
2106 c.y = As<Float4>(nonZeroAlpha & As<Int4>(c.y / c.w));
2107 c.z = As<Float4>(nonZeroAlpha & As<Int4>(c.z / c.w));
2108 }
2109
// Evaluates the RGB result of one of the 'advanced' blend operations for color
// attachment |index|.
//
// The source and destination are first scaled by their blend factors, and their
// RGB is divided by their alpha (see premultiply()). The selected blend function
// f(src, dst) is then combined as:
//   f * (As * Ad) + src * (As * (1 - Ad)) + dst * (Ad * (1 - As))
//
// Only x, y and z of the returned vector are written here.
Vector4f PixelRoutine::computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor)
{
	Vector4f source;
	source.x = src.x * srcFactor.x;
	source.y = src.y * srcFactor.y;
	source.z = src.z * srcFactor.z;
	source.w = src.w * srcFactor.w;

	Vector4f dest;
	dest.x = dst.x * dstFactor.x;
	dest.y = dst.y * dstFactor.y;
	dest.z = dst.z * dstFactor.z;
	dest.w = dst.w * dstFactor.w;

	premultiply(source);
	premultiply(dest);

	Vector4f result;

	const auto blendOp = state.blendState[index].blendOperation;

	switch(blendOp)
	{
	case VK_BLEND_OP_MULTIPLY_EXT:
		result.x = (source.x * dest.x);
		result.y = (source.y * dest.y);
		result.z = (source.z * dest.z);
		break;
	case VK_BLEND_OP_SCREEN_EXT:
		result.x = source.x + dest.x - (source.x * dest.x);
		result.y = source.y + dest.y - (source.y * dest.y);
		result.z = source.z + dest.z - (source.z * dest.z);
		break;
	case VK_BLEND_OP_OVERLAY_EXT:
		result.x = blendOpOverlay(source.x, dest.x);
		result.y = blendOpOverlay(source.y, dest.y);
		result.z = blendOpOverlay(source.z, dest.z);
		break;
	case VK_BLEND_OP_DARKEN_EXT:
		result.x = Min(source.x, dest.x);
		result.y = Min(source.y, dest.y);
		result.z = Min(source.z, dest.z);
		break;
	case VK_BLEND_OP_LIGHTEN_EXT:
		result.x = Max(source.x, dest.x);
		result.y = Max(source.y, dest.y);
		result.z = Max(source.z, dest.z);
		break;
	case VK_BLEND_OP_COLORDODGE_EXT:
		result.x = blendOpColorDodge(source.x, dest.x);
		result.y = blendOpColorDodge(source.y, dest.y);
		result.z = blendOpColorDodge(source.z, dest.z);
		break;
	case VK_BLEND_OP_COLORBURN_EXT:
		result.x = blendOpColorBurn(source.x, dest.x);
		result.y = blendOpColorBurn(source.y, dest.y);
		result.z = blendOpColorBurn(source.z, dest.z);
		break;
	case VK_BLEND_OP_HARDLIGHT_EXT:
		result.x = blendOpHardlight(source.x, dest.x);
		result.y = blendOpHardlight(source.y, dest.y);
		result.z = blendOpHardlight(source.z, dest.z);
		break;
	case VK_BLEND_OP_SOFTLIGHT_EXT:
		result.x = blendOpSoftlight(source.x, dest.x);
		result.y = blendOpSoftlight(source.y, dest.y);
		result.z = blendOpSoftlight(source.z, dest.z);
		break;
	case VK_BLEND_OP_DIFFERENCE_EXT:
		result.x = Abs(source.x - dest.x);
		result.y = Abs(source.y - dest.y);
		result.z = Abs(source.z - dest.z);
		break;
	case VK_BLEND_OP_EXCLUSION_EXT:
		result.x = source.x + dest.x - (source.x * dest.x * 2.0f);
		result.y = source.y + dest.y - (source.y * dest.y * 2.0f);
		result.z = source.z + dest.z - (source.z * dest.z * 2.0f);
		break;

	// The HSL modes differ only in which color supplies the base, the
	// saturation and the luminosity.
	case VK_BLEND_OP_HSL_HUE_EXT:
		setLumSat(source, dest, dest, result.x, result.y, result.z);
		break;
	case VK_BLEND_OP_HSL_SATURATION_EXT:
		setLumSat(dest, source, dest, result.x, result.y, result.z);
		break;
	case VK_BLEND_OP_HSL_COLOR_EXT:
		setLum(source, dest, result.x, result.y, result.z);
		break;
	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
		setLum(dest, source, result.x, result.y, result.z);
		break;
	default:
		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
		break;
	}

	// Weights of the three terms, derived from the two alpha values.
	Float4 bothWeight = source.w * dest.w;
	Float4 sourceOnlyWeight = source.w * (1.0f - dest.w);
	Float4 destOnlyWeight = dest.w * (1.0f - source.w);

	result.x = result.x * bothWeight + source.x * sourceOnlyWeight + dest.x * destOnlyWeight;
	result.y = result.y * bothWeight + source.y * sourceOnlyWeight + dest.y * destOnlyWeight;
	result.z = result.z * bothWeight + source.z * sourceOnlyWeight + dest.z * destOnlyWeight;

	return result;
}
2220
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)2221 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
2222 {
2223 switch(blendFactor)
2224 {
2225 case VK_BLEND_FACTOR_ZERO:
2226 case VK_BLEND_FACTOR_ONE:
2227 return false;
2228 case VK_BLEND_FACTOR_SRC_COLOR:
2229 case VK_BLEND_FACTOR_SRC_ALPHA:
2230 // Source values have been clamped after fragment shader execution if the attachment format is normalized.
2231 return false;
2232 case VK_BLEND_FACTOR_DST_COLOR:
2233 case VK_BLEND_FACTOR_DST_ALPHA:
2234 // Dest values have a valid range due to being read from the attachment.
2235 return false;
2236 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
2237 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
2238 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
2239 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
2240 // For signed formats, negative values cause the result to exceed 1.0.
2241 return format.isSignedNormalized();
2242 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
2243 // min(As, 1 - Ad)
2244 return false;
2245 case VK_BLEND_FACTOR_CONSTANT_COLOR:
2246 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
2247 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
2248 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
2249 return false;
2250
2251 default:
2252 UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
2253 return false;
2254 }
2255 }
2256
alphaBlend(int index,const Pointer<Byte> & cBuffer,const Vector4f & sourceColor,const Int & x)2257 Vector4f PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x)
2258 {
2259 if(!state.blendState[index].alphaBlendEnable)
2260 {
2261 return sourceColor;
2262 }
2263
2264 vk::Format format = state.colorFormat[index];
2265 ASSERT(format.supportsColorAttachmentBlend());
2266
2267 Pointer<Byte> buffer = cBuffer;
2268 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2269
2270 // destColor holds four texel color values.
2271 // Note: Despite the type being Vector4f, the colors may be stored as
2272 // integers. Half-floats are stored as full 32-bit floats.
2273 // Non-float and non-fixed point formats are not alpha blended.
2274 Vector4f destColor;
2275
2276 switch(format)
2277 {
2278 case VK_FORMAT_R32_SINT:
2279 case VK_FORMAT_R32_UINT:
2280 case VK_FORMAT_R32_SFLOAT:
2281 // FIXME: movlps
2282 buffer += 4 * x;
2283 destColor.x.x = *Pointer<Float>(buffer + 0);
2284 destColor.x.y = *Pointer<Float>(buffer + 4);
2285 buffer += pitchB;
2286 // FIXME: movhps
2287 destColor.x.z = *Pointer<Float>(buffer + 0);
2288 destColor.x.w = *Pointer<Float>(buffer + 4);
2289 destColor.y = destColor.z = destColor.w = 1.0f;
2290 break;
2291 case VK_FORMAT_R32G32_SINT:
2292 case VK_FORMAT_R32G32_UINT:
2293 case VK_FORMAT_R32G32_SFLOAT:
2294 buffer += 8 * x;
2295 destColor.x = *Pointer<Float4>(buffer, 16);
2296 buffer += pitchB;
2297 destColor.y = *Pointer<Float4>(buffer, 16);
2298 destColor.z = destColor.x;
2299 destColor.x = ShuffleLowHigh(destColor.x, destColor.y, 0x0202);
2300 destColor.z = ShuffleLowHigh(destColor.z, destColor.y, 0x1313);
2301 destColor.y = destColor.z;
2302 destColor.z = destColor.w = 1.0f;
2303 break;
2304 case VK_FORMAT_R32G32B32A32_SFLOAT:
2305 case VK_FORMAT_R32G32B32A32_SINT:
2306 case VK_FORMAT_R32G32B32A32_UINT:
2307 buffer += 16 * x;
2308 destColor.x = *Pointer<Float4>(buffer + 0, 16);
2309 destColor.y = *Pointer<Float4>(buffer + 16, 16);
2310 buffer += pitchB;
2311 destColor.z = *Pointer<Float4>(buffer + 0, 16);
2312 destColor.w = *Pointer<Float4>(buffer + 16, 16);
2313 transpose4x4(destColor.x, destColor.y, destColor.z, destColor.w);
2314 break;
2315 case VK_FORMAT_R16_UNORM:
2316 buffer += 2 * x;
2317 destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2318 destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
2319 buffer += pitchB;
2320 destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2321 destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
2322 destColor.x *= (1.0f / 0xFFFF);
2323 destColor.y = destColor.z = destColor.w = 1.0f;
2324 break;
2325 case VK_FORMAT_R16_SFLOAT:
2326 buffer += 2 * x;
2327 destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2328 destColor.x.y = Float(*Pointer<Half>(buffer + 2));
2329 buffer += pitchB;
2330 destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2331 destColor.x.w = Float(*Pointer<Half>(buffer + 2));
2332 destColor.y = destColor.z = destColor.w = 1.0f;
2333 break;
2334 case VK_FORMAT_R16G16_UNORM:
2335 buffer += 4 * x;
2336 destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2337 destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
2338 destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
2339 destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
2340 buffer += pitchB;
2341 destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2342 destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
2343 destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
2344 destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
2345 destColor.x *= (1.0f / 0xFFFF);
2346 destColor.y *= (1.0f / 0xFFFF);
2347 destColor.z = destColor.w = 1.0f;
2348 break;
2349 case VK_FORMAT_R16G16_SFLOAT:
2350 buffer += 4 * x;
2351 destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2352 destColor.y.x = Float(*Pointer<Half>(buffer + 2));
2353 destColor.x.y = Float(*Pointer<Half>(buffer + 4));
2354 destColor.y.y = Float(*Pointer<Half>(buffer + 6));
2355 buffer += pitchB;
2356 destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2357 destColor.y.z = Float(*Pointer<Half>(buffer + 2));
2358 destColor.x.w = Float(*Pointer<Half>(buffer + 4));
2359 destColor.y.w = Float(*Pointer<Half>(buffer + 6));
2360 destColor.z = destColor.w = 1.0f;
2361 break;
2362 case VK_FORMAT_R16G16B16A16_UNORM:
2363 buffer += 8 * x;
2364 destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2365 destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2366 destColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2367 destColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2368 destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2369 destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2370 destColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2371 destColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2372 buffer += pitchB;
2373 destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2374 destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2375 destColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2376 destColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2377 destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2378 destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2379 destColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2380 destColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2381 destColor.x *= (1.0f / 0xFFFF);
2382 destColor.y *= (1.0f / 0xFFFF);
2383 destColor.z *= (1.0f / 0xFFFF);
2384 destColor.w *= (1.0f / 0xFFFF);
2385 break;
2386 case VK_FORMAT_R16G16B16A16_SFLOAT:
2387 buffer += 8 * x;
2388 destColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
2389 destColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
2390 destColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
2391 destColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
2392 destColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
2393 destColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
2394 destColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
2395 destColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
2396 buffer += pitchB;
2397 destColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
2398 destColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
2399 destColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
2400 destColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
2401 destColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
2402 destColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
2403 destColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
2404 destColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
2405 break;
2406 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2407 buffer += 4 * x;
2408 destColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2409 destColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2410 buffer += pitchB;
2411 destColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2412 destColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2413 transpose4x3(destColor.x, destColor.y, destColor.z, destColor.w);
2414 destColor.w = 1.0f;
2415 break;
2416 default:
2417 {
2418 // Attempt to read an integer based format and convert it to float
2419 Vector4s color;
2420 readPixel(index, cBuffer, x, color);
2421 destColor.x = convertFloat32(As<UShort4>(color.x));
2422 destColor.y = convertFloat32(As<UShort4>(color.y));
2423 destColor.z = convertFloat32(As<UShort4>(color.z));
2424 destColor.w = convertFloat32(As<UShort4>(color.w));
2425 }
2426 break;
2427 }
2428
2429 Vector4f sourceFactor;
2430 Vector4f destFactor;
2431
2432 blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
2433 blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
2434 blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
2435 blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
2436
2437 Vector4f blendedColor;
2438
2439 switch(state.blendState[index].blendOperation)
2440 {
2441 case VK_BLEND_OP_ADD:
2442 blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
2443 blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
2444 blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
2445 break;
2446 case VK_BLEND_OP_SUBTRACT:
2447 blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
2448 blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
2449 blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
2450 break;
2451 case VK_BLEND_OP_REVERSE_SUBTRACT:
2452 blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
2453 blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
2454 blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
2455 break;
2456 case VK_BLEND_OP_MIN:
2457 blendedColor.x = Min(sourceColor.x, destColor.x);
2458 blendedColor.y = Min(sourceColor.y, destColor.y);
2459 blendedColor.z = Min(sourceColor.z, destColor.z);
2460 break;
2461 case VK_BLEND_OP_MAX:
2462 blendedColor.x = Max(sourceColor.x, destColor.x);
2463 blendedColor.y = Max(sourceColor.y, destColor.y);
2464 blendedColor.z = Max(sourceColor.z, destColor.z);
2465 break;
2466 case VK_BLEND_OP_SRC_EXT:
2467 blendedColor.x = sourceColor.x;
2468 blendedColor.y = sourceColor.y;
2469 blendedColor.z = sourceColor.z;
2470 break;
2471 case VK_BLEND_OP_DST_EXT:
2472 blendedColor.x = destColor.x;
2473 blendedColor.y = destColor.y;
2474 blendedColor.z = destColor.z;
2475 break;
2476 case VK_BLEND_OP_ZERO_EXT:
2477 blendedColor.x = 0.0f;
2478 blendedColor.y = 0.0f;
2479 blendedColor.z = 0.0f;
2480 break;
2481 case VK_BLEND_OP_MULTIPLY_EXT:
2482 case VK_BLEND_OP_SCREEN_EXT:
2483 case VK_BLEND_OP_OVERLAY_EXT:
2484 case VK_BLEND_OP_DARKEN_EXT:
2485 case VK_BLEND_OP_LIGHTEN_EXT:
2486 case VK_BLEND_OP_COLORDODGE_EXT:
2487 case VK_BLEND_OP_COLORBURN_EXT:
2488 case VK_BLEND_OP_HARDLIGHT_EXT:
2489 case VK_BLEND_OP_SOFTLIGHT_EXT:
2490 case VK_BLEND_OP_DIFFERENCE_EXT:
2491 case VK_BLEND_OP_EXCLUSION_EXT:
2492 case VK_BLEND_OP_HSL_HUE_EXT:
2493 case VK_BLEND_OP_HSL_SATURATION_EXT:
2494 case VK_BLEND_OP_HSL_COLOR_EXT:
2495 case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2496 blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
2497 break;
2498 default:
2499 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
2500 }
2501
2502 switch(state.blendState[index].blendOperationAlpha)
2503 {
2504 case VK_BLEND_OP_ADD:
2505 blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
2506 break;
2507 case VK_BLEND_OP_SUBTRACT:
2508 blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
2509 break;
2510 case VK_BLEND_OP_REVERSE_SUBTRACT:
2511 blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
2512 break;
2513 case VK_BLEND_OP_MIN:
2514 blendedColor.w = Min(sourceColor.w, destColor.w);
2515 break;
2516 case VK_BLEND_OP_MAX:
2517 blendedColor.w = Max(sourceColor.w, destColor.w);
2518 break;
2519 case VK_BLEND_OP_SRC_EXT:
2520 blendedColor.w = sourceColor.w;
2521 break;
2522 case VK_BLEND_OP_DST_EXT:
2523 blendedColor.w = destColor.w;
2524 break;
2525 case VK_BLEND_OP_ZERO_EXT:
2526 blendedColor.w = 0.0f;
2527 break;
2528 case VK_BLEND_OP_MULTIPLY_EXT:
2529 case VK_BLEND_OP_SCREEN_EXT:
2530 case VK_BLEND_OP_OVERLAY_EXT:
2531 case VK_BLEND_OP_DARKEN_EXT:
2532 case VK_BLEND_OP_LIGHTEN_EXT:
2533 case VK_BLEND_OP_COLORDODGE_EXT:
2534 case VK_BLEND_OP_COLORBURN_EXT:
2535 case VK_BLEND_OP_HARDLIGHT_EXT:
2536 case VK_BLEND_OP_SOFTLIGHT_EXT:
2537 case VK_BLEND_OP_DIFFERENCE_EXT:
2538 case VK_BLEND_OP_EXCLUSION_EXT:
2539 case VK_BLEND_OP_HSL_HUE_EXT:
2540 case VK_BLEND_OP_HSL_SATURATION_EXT:
2541 case VK_BLEND_OP_HSL_COLOR_EXT:
2542 case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2543 // All of the currently supported 'advanced blend modes' compute the alpha the same way.
2544 blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
2545 break;
2546 default:
2547 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
2548 }
2549
2550 return blendedColor;
2551 }
2552
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)2553 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
2554 {
2555 vk::Format format = state.colorFormat[index];
2556 switch(format)
2557 {
2558 case VK_FORMAT_R16G16B16A16_UNORM:
2559 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2560 color.w = As<Float4>(RoundInt(color.w * 0xFFFF));
2561 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2562 color.z = As<Float4>(RoundInt(color.z * 0xFFFF));
2563 // [[fallthrough]]
2564 case VK_FORMAT_R16G16_UNORM:
2565 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2566 color.y = As<Float4>(RoundInt(color.y * 0xFFFF));
2567 //[[fallthrough]]
2568 case VK_FORMAT_R16_UNORM:
2569 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2570 color.x = As<Float4>(RoundInt(color.x * 0xFFFF));
2571 break;
2572 default:
2573 // TODO(b/204560089): Omit clamp if redundant
2574 if(format.isUnsignedNormalized())
2575 {
2576 color.x = Min(Max(color.x, 0.0f), 1.0f);
2577 color.y = Min(Max(color.y, 0.0f), 1.0f);
2578 color.z = Min(Max(color.z, 0.0f), 1.0f);
2579 color.w = Min(Max(color.w, 0.0f), 1.0f);
2580 }
2581 else if(format.isSignedNormalized())
2582 {
2583 color.x = Min(Max(color.x, -1.0f), 1.0f);
2584 color.y = Min(Max(color.y, -1.0f), 1.0f);
2585 color.z = Min(Max(color.z, -1.0f), 1.0f);
2586 color.w = Min(Max(color.w, -1.0f), 1.0f);
2587 }
2588 }
2589
2590 switch(format)
2591 {
2592 case VK_FORMAT_R16_SFLOAT:
2593 case VK_FORMAT_R32_SFLOAT:
2594 case VK_FORMAT_R32_SINT:
2595 case VK_FORMAT_R32_UINT:
2596 case VK_FORMAT_R16_UNORM:
2597 case VK_FORMAT_R16_SINT:
2598 case VK_FORMAT_R16_UINT:
2599 case VK_FORMAT_R8_SINT:
2600 case VK_FORMAT_R8_UINT:
2601 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2602 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2603 break;
2604 case VK_FORMAT_R16G16_SFLOAT:
2605 case VK_FORMAT_R32G32_SFLOAT:
2606 case VK_FORMAT_R32G32_SINT:
2607 case VK_FORMAT_R32G32_UINT:
2608 case VK_FORMAT_R16G16_UNORM:
2609 case VK_FORMAT_R16G16_SINT:
2610 case VK_FORMAT_R16G16_UINT:
2611 case VK_FORMAT_R8G8_SINT:
2612 case VK_FORMAT_R8G8_UINT:
2613 color.z = color.x;
2614 color.x = UnpackLow(color.x, color.y);
2615 color.z = UnpackHigh(color.z, color.y);
2616 color.y = color.z;
2617 break;
2618 case VK_FORMAT_R16G16B16A16_SFLOAT:
2619 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2620 case VK_FORMAT_R32G32B32A32_SFLOAT:
2621 case VK_FORMAT_R32G32B32A32_SINT:
2622 case VK_FORMAT_R32G32B32A32_UINT:
2623 case VK_FORMAT_R16G16B16A16_UNORM:
2624 case VK_FORMAT_R16G16B16A16_SINT:
2625 case VK_FORMAT_R16G16B16A16_UINT:
2626 case VK_FORMAT_R8G8B8A8_SINT:
2627 case VK_FORMAT_R8G8B8A8_UINT:
2628 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2629 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2630 transpose4x4(color.x, color.y, color.z, color.w);
2631 break;
2632 default:
2633 UNSUPPORTED("VkFormat: %d", int(format));
2634 }
2635
2636 int rgbaWriteMask = state.colorWriteActive(index);
2637 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
2638
2639 Int xMask; // Combination of all masks
2640
2641 if(state.depthTestActive)
2642 {
2643 xMask = zMask;
2644 }
2645 else
2646 {
2647 xMask = cMask;
2648 }
2649
2650 if(state.stencilActive)
2651 {
2652 xMask &= sMask;
2653 }
2654
2655 Pointer<Byte> buffer = cBuffer;
2656 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2657 Float4 value;
2658
2659 switch(format)
2660 {
2661 case VK_FORMAT_R32_SFLOAT:
2662 case VK_FORMAT_R32_SINT:
2663 case VK_FORMAT_R32_UINT:
2664 if(rgbaWriteMask & 0x00000001)
2665 {
2666 buffer += 4 * x;
2667
2668 // FIXME: movlps
2669 value.x = *Pointer<Float>(buffer + 0);
2670 value.y = *Pointer<Float>(buffer + 4);
2671
2672 buffer += pitchB;
2673
2674 // FIXME: movhps
2675 value.z = *Pointer<Float>(buffer + 0);
2676 value.w = *Pointer<Float>(buffer + 4);
2677
2678 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2679 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2680 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2681
2682 // FIXME: movhps
2683 *Pointer<Float>(buffer + 0) = color.x.z;
2684 *Pointer<Float>(buffer + 4) = color.x.w;
2685
2686 buffer -= pitchB;
2687
2688 // FIXME: movlps
2689 *Pointer<Float>(buffer + 0) = color.x.x;
2690 *Pointer<Float>(buffer + 4) = color.x.y;
2691 }
2692 break;
2693 case VK_FORMAT_R16_SFLOAT:
2694 if(rgbaWriteMask & 0x00000001)
2695 {
2696 buffer += 2 * x;
2697
2698 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2699 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2700
2701 buffer += pitchB;
2702
2703 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2704 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2705
2706 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2707 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2708 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2709
2710 *Pointer<Half>(buffer + 0) = Half(color.x.z);
2711 *Pointer<Half>(buffer + 2) = Half(color.x.w);
2712
2713 buffer -= pitchB;
2714
2715 *Pointer<Half>(buffer + 0) = Half(color.x.x);
2716 *Pointer<Half>(buffer + 2) = Half(color.x.y);
2717 }
2718 break;
2719 case VK_FORMAT_R16_UNORM:
2720 case VK_FORMAT_R16_SINT:
2721 case VK_FORMAT_R16_UINT:
2722 if(rgbaWriteMask & 0x00000001)
2723 {
2724 buffer += 2 * x;
2725
2726 UShort4 xyzw;
2727 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2728
2729 buffer += pitchB;
2730
2731 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2732 value = As<Float4>(Int4(xyzw));
2733
2734 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2735 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2736 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2737
2738 Float component = color.x.z;
2739 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2740 component = color.x.w;
2741 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2742
2743 buffer -= pitchB;
2744
2745 component = color.x.x;
2746 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2747 component = color.x.y;
2748 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2749 }
2750 break;
2751 case VK_FORMAT_R8_SINT:
2752 case VK_FORMAT_R8_UINT:
2753 if(rgbaWriteMask & 0x00000001)
2754 {
2755 buffer += x;
2756
2757 UInt xyzw, packedCol;
2758
2759 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2760 buffer += pitchB;
2761 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2762
2763 Short4 tmpCol = Short4(As<Int4>(color.x));
2764 if(format == VK_FORMAT_R8_SINT)
2765 {
2766 tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2767 }
2768 else
2769 {
2770 tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2771 }
2772 packedCol = Extract(As<Int2>(tmpCol), 0);
2773
2774 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2775 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2776
2777 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2778 buffer -= pitchB;
2779 *Pointer<UShort>(buffer) = UShort(packedCol);
2780 }
2781 break;
2782 case VK_FORMAT_R32G32_SFLOAT:
2783 case VK_FORMAT_R32G32_SINT:
2784 case VK_FORMAT_R32G32_UINT:
2785 buffer += 8 * x;
2786
2787 value = *Pointer<Float4>(buffer);
2788
2789 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2790 {
2791 Float4 masked = value;
2792 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2793 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2794 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2795 }
2796
2797 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2798 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2799 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2800 *Pointer<Float4>(buffer) = color.x;
2801
2802 buffer += pitchB;
2803
2804 value = *Pointer<Float4>(buffer);
2805
2806 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2807 {
2808 Float4 masked;
2809
2810 masked = value;
2811 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2812 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2813 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2814 }
2815
2816 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2817 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2818 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2819 *Pointer<Float4>(buffer) = color.y;
2820 break;
2821 case VK_FORMAT_R16G16_SFLOAT:
2822 if((rgbaWriteMask & 0x00000003) != 0x0)
2823 {
2824 buffer += 4 * x;
2825
2826 UInt2 rgbaMask;
2827 UInt2 packedCol;
2828 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2829 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2830
2831 UShort4 value = *Pointer<UShort4>(buffer);
2832 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2833 if((rgbaWriteMask & 0x3) != 0x3)
2834 {
2835 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2836 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2837 mergedMask &= rgbaMask;
2838 }
2839 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2840
2841 buffer += pitchB;
2842
2843 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2844 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2845 value = *Pointer<UShort4>(buffer);
2846 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2847 if((rgbaWriteMask & 0x3) != 0x3)
2848 {
2849 mergedMask &= rgbaMask;
2850 }
2851 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2852 }
2853 break;
2854 case VK_FORMAT_R16G16_UNORM:
2855 case VK_FORMAT_R16G16_SINT:
2856 case VK_FORMAT_R16G16_UINT:
2857 if((rgbaWriteMask & 0x00000003) != 0x0)
2858 {
2859 buffer += 4 * x;
2860
2861 UInt2 rgbaMask;
2862 UShort4 packedCol = UShort4(As<Int4>(color.x));
2863 UShort4 value = *Pointer<UShort4>(buffer);
2864 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2865 if((rgbaWriteMask & 0x3) != 0x3)
2866 {
2867 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2868 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2869 mergedMask &= rgbaMask;
2870 }
2871 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2872
2873 buffer += pitchB;
2874
2875 packedCol = UShort4(As<Int4>(color.y));
2876 value = *Pointer<UShort4>(buffer);
2877 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2878 if((rgbaWriteMask & 0x3) != 0x3)
2879 {
2880 mergedMask &= rgbaMask;
2881 }
2882 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2883 }
2884 break;
2885 case VK_FORMAT_R8G8_SINT:
2886 case VK_FORMAT_R8G8_UINT:
2887 if((rgbaWriteMask & 0x00000003) != 0x0)
2888 {
2889 buffer += 2 * x;
2890
2891 Int2 xyzw, packedCol;
2892
2893 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2894 buffer += pitchB;
2895 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2896
2897 if(format == VK_FORMAT_R8G8_SINT)
2898 {
2899 packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2900 }
2901 else
2902 {
2903 packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2904 }
2905
2906 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2907 if((rgbaWriteMask & 0x3) != 0x3)
2908 {
2909 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2910 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2911 mergedMask &= rgbaMask;
2912 }
2913
2914 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2915
2916 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2917 buffer -= pitchB;
2918 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2919 }
2920 break;
2921 case VK_FORMAT_R32G32B32A32_SFLOAT:
2922 case VK_FORMAT_R32G32B32A32_SINT:
2923 case VK_FORMAT_R32G32B32A32_UINT:
2924 buffer += 16 * x;
2925
2926 {
2927 value = *Pointer<Float4>(buffer, 16);
2928
2929 if(rgbaWriteMask != 0x0000000F)
2930 {
2931 Float4 masked = value;
2932 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2933 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2934 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2935 }
2936
2937 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2938 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2939 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2940 *Pointer<Float4>(buffer, 16) = color.x;
2941 }
2942
2943 {
2944 value = *Pointer<Float4>(buffer + 16, 16);
2945
2946 if(rgbaWriteMask != 0x0000000F)
2947 {
2948 Float4 masked = value;
2949 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2950 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2951 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2952 }
2953
2954 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2955 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2956 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2957 *Pointer<Float4>(buffer + 16, 16) = color.y;
2958 }
2959
2960 buffer += pitchB;
2961
2962 {
2963 value = *Pointer<Float4>(buffer, 16);
2964
2965 if(rgbaWriteMask != 0x0000000F)
2966 {
2967 Float4 masked = value;
2968 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2969 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2970 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2971 }
2972
2973 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2974 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2975 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2976 *Pointer<Float4>(buffer, 16) = color.z;
2977 }
2978
2979 {
2980 value = *Pointer<Float4>(buffer + 16, 16);
2981
2982 if(rgbaWriteMask != 0x0000000F)
2983 {
2984 Float4 masked = value;
2985 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2986 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2987 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2988 }
2989
2990 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2991 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2992 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2993 *Pointer<Float4>(buffer + 16, 16) = color.w;
2994 }
2995 break;
2996 case VK_FORMAT_R16G16B16A16_SFLOAT:
2997 if((rgbaWriteMask & 0x0000000F) != 0x0)
2998 {
2999 buffer += 8 * x;
3000
3001 UInt4 rgbaMask;
3002 UInt4 value = *Pointer<UInt4>(buffer);
3003 UInt4 packedCol;
3004 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
3005 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
3006 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
3007 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
3008 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3009 if((rgbaWriteMask & 0xF) != 0xF)
3010 {
3011 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3012 rgbaMask = UInt4(tmpMask, tmpMask);
3013 mergedMask &= rgbaMask;
3014 }
3015 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3016
3017 buffer += pitchB;
3018
3019 value = *Pointer<UInt4>(buffer);
3020 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
3021 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
3022 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
3023 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
3024 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3025 if((rgbaWriteMask & 0xF) != 0xF)
3026 {
3027 mergedMask &= rgbaMask;
3028 }
3029 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3030 }
3031 break;
3032 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
3033 if((rgbaWriteMask & 0x7) != 0x0)
3034 {
3035 buffer += 4 * x;
3036
3037 UInt4 packedCol;
3038 packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
3039 packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
3040 packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
3041 packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
3042
3043 UInt4 value;
3044 value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
3045 value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
3046 buffer += pitchB;
3047 value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
3048 value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
3049
3050 UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0][0]) + xMask * 16, 16);
3051 if((rgbaWriteMask & 0x7) != 0x7)
3052 {
3053 mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[rgbaWriteMask & 0x7][0]), 16);
3054 }
3055 value = (packedCol & mask) | (value & ~mask);
3056
3057 *Pointer<UInt>(buffer + 0) = value.z;
3058 *Pointer<UInt>(buffer + 4) = value.w;
3059 buffer -= pitchB;
3060 *Pointer<UInt>(buffer + 0) = value.x;
3061 *Pointer<UInt>(buffer + 4) = value.y;
3062 }
3063 break;
3064 case VK_FORMAT_R16G16B16A16_UNORM:
3065 case VK_FORMAT_R16G16B16A16_SINT:
3066 case VK_FORMAT_R16G16B16A16_UINT:
3067 if((rgbaWriteMask & 0x0000000F) != 0x0)
3068 {
3069 buffer += 8 * x;
3070
3071 UInt4 rgbaMask;
3072 UShort8 value = *Pointer<UShort8>(buffer);
3073 UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
3074 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3075 if((rgbaWriteMask & 0xF) != 0xF)
3076 {
3077 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3078 rgbaMask = UInt4(tmpMask, tmpMask);
3079 mergedMask &= rgbaMask;
3080 }
3081 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3082
3083 buffer += pitchB;
3084
3085 value = *Pointer<UShort8>(buffer);
3086 packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
3087 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3088 if((rgbaWriteMask & 0xF) != 0xF)
3089 {
3090 mergedMask &= rgbaMask;
3091 }
3092 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3093 }
3094 break;
3095 case VK_FORMAT_R8G8B8A8_SINT:
3096 case VK_FORMAT_R8G8B8A8_UINT:
3097 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
3098 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
3099 if((rgbaWriteMask & 0x0000000F) != 0x0)
3100 {
3101 UInt2 value, packedCol, mergedMask;
3102
3103 buffer += 4 * x;
3104
3105 bool isSigned = (format == VK_FORMAT_R8G8B8A8_SINT) || (format == VK_FORMAT_A8B8G8R8_SINT_PACK32);
3106
3107 if(isSigned)
3108 {
3109 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3110 }
3111 else
3112 {
3113 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3114 }
3115 value = *Pointer<UInt2>(buffer, 16);
3116 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3117 if(rgbaWriteMask != 0xF)
3118 {
3119 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3120 }
3121 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3122
3123 buffer += pitchB;
3124
3125 if(isSigned)
3126 {
3127 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3128 }
3129 else
3130 {
3131 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3132 }
3133 value = *Pointer<UInt2>(buffer, 16);
3134 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3135 if(rgbaWriteMask != 0xF)
3136 {
3137 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3138 }
3139 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3140 }
3141 break;
3142 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3143 if((rgbaWriteMask & 0x0000000F) != 0x0)
3144 {
3145 Int2 mergedMask, packedCol, value;
3146 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3147 ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
3148 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3149 ((As<Int4>(color.x) & Int4(0x3ff)));
3150
3151 buffer += 4 * x;
3152 value = *Pointer<Int2>(buffer, 16);
3153 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3154 if(rgbaWriteMask != 0xF)
3155 {
3156 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3157 }
3158 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3159
3160 buffer += pitchB;
3161
3162 value = *Pointer<Int2>(buffer, 16);
3163 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3164 if(rgbaWriteMask != 0xF)
3165 {
3166 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3167 }
3168 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3169 }
3170 break;
3171 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3172 if((bgraWriteMask & 0x0000000F) != 0x0)
3173 {
3174 Int2 mergedMask, packedCol, value;
3175 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3176 ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
3177 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3178 ((As<Int4>(color.z) & Int4(0x3ff)));
3179
3180 buffer += 4 * x;
3181 value = *Pointer<Int2>(buffer, 16);
3182 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3183 if(bgraWriteMask != 0xF)
3184 {
3185 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3186 }
3187 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3188
3189 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
3190
3191 value = *Pointer<Int2>(buffer, 16);
3192 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3193 if(bgraWriteMask != 0xF)
3194 {
3195 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3196 }
3197 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3198 }
3199 break;
3200 default:
3201 UNSUPPORTED("VkFormat: %d", int(format));
3202 }
3203 }
3204
convertFixed16(const Float4 & cf,bool saturate)3205 UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
3206 {
3207 return UShort4(cf * 0xFFFF, saturate);
3208 }
3209
convertFloat32(const UShort4 & cf)3210 Float4 PixelRoutine::convertFloat32(const UShort4 &cf)
3211 {
3212 return Float4(cf) * (1.0f / 65535.0f);
3213 }
3214
sRGBtoLinear16_12_16(Vector4s & c)3215 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
3216 {
3217 Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);
3218
3219 c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3220 c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3221 c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3222
3223 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3224 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3225 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3226 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3227
3228 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3229 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3230 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3231 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3232
3233 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3234 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3235 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3236 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3237 }
3238
linearToSRGB16_12_16(Vector4s & c)3239 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
3240 {
3241 c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3242 c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3243 c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3244
3245 linearToSRGB12_16(c);
3246 }
3247
linearToSRGB12_16(Vector4s & c)3248 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
3249 {
3250 Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);
3251
3252 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3253 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3254 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3255 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3256
3257 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3258 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3259 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3260 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3261
3262 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3263 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3264 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3265 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3266 }
3267
// Approximates x^2.2 with the polynomial 0.73 * x^2 + 0.27 * x^3 and clamps
// the result to the [0, 1] range.
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	Float4 squared = x * x;
	Float4 cubed = squared * x;

	Float4 approximation = squared * 0.73f + cubed * 0.27f;

	return Min(Max(approximation, 0.0f), 1.0f);
}
3275
3276 } // namespace sw
3277