1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "Vulkan/VkPipelineLayout.hpp"
24 #include "Vulkan/VkStringify.hpp"
25
26 namespace sw {
27
PixelRoutine(const PixelProcessor::State & state,vk::PipelineLayout const * pipelineLayout,SpirvShader const * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)28 PixelRoutine::PixelRoutine(
29 const PixelProcessor::State &state,
30 vk::PipelineLayout const *pipelineLayout,
31 SpirvShader const *spirvShader,
32 const vk::DescriptorSet::Bindings &descriptorSets)
33 : QuadRasterizer(state, spirvShader)
34 , routine(pipelineLayout)
35 , descriptorSets(descriptorSets)
36 , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
37 , shaderContainsSampleQualifier(spirvShader && spirvShader->getAnalysis().ContainsSampleQualifier)
38 , perSampleShading((state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f)) ||
39 shaderContainsSampleQualifier || shaderContainsInterpolation) // TODO(b/194714095)
40 , invocationCount(perSampleShading ? state.multiSampleCount : 1)
41 {
42 if(spirvShader)
43 {
44 spirvShader->emitProlog(&routine);
45
46 // Clearing inputs to 0 is not demanded by the spec,
47 // but it makes the undefined behavior deterministic.
48 // TODO(b/155148722): Remove to detect UB.
49 for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
50 {
51 routine.inputs[i] = Float4(0.0f);
52 }
53 }
54 }
55
~PixelRoutine()56 PixelRoutine::~PixelRoutine()
57 {
58 }
59
getSampleSet(int invocation) const60 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
61 {
62 unsigned int sampleBegin = perSampleShading ? invocation : 0;
63 unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
64
65 SampleSet samples;
66
67 for(unsigned int q = sampleBegin; q < sampleEnd; q++)
68 {
69 if(state.multiSampleMask & (1 << q))
70 {
71 samples.push_back(q);
72 }
73 }
74
75 return samples;
76 }
77
quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)78 void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
79 {
80 const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;
81
82 Int zMask[4]; // Depth mask
83 Int sMask[4]; // Stencil mask
84 Float4 unclampedZ[4];
85
86 for(int invocation = 0; invocation < invocationCount; invocation++)
87 {
88 SampleSet samples = getSampleSet(invocation);
89
90 if(samples.empty())
91 {
92 continue;
93 }
94
95 for(unsigned int q : samples)
96 {
97 zMask[q] = cMask[q];
98 sMask[q] = cMask[q];
99 }
100
101 stencilTest(sBuffer, x, sMask, samples);
102
103 Float4 f;
104 Float4 rhwCentroid;
105
106 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
107
108 if(interpolateZ())
109 {
110 for(unsigned int q : samples)
111 {
112 Float4 x = xxxx;
113
114 if(state.enableMultiSampling)
115 {
116 x -= *Pointer<Float4>(constants + OFFSET(Constants, X) + q * sizeof(float4));
117 }
118
119 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
120
121 if(state.depthBias)
122 {
123 z[q] += *Pointer<Float4>(primitive + OFFSET(Primitive, zBias), 16);
124 }
125
126 unclampedZ[q] = z[q];
127 }
128 }
129
130 Bool depthPass = false;
131
132 if(earlyFragmentTests)
133 {
134 for(unsigned int q : samples)
135 {
136 z[q] = clampDepth(z[q]);
137 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
138 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
139 }
140 }
141
142 If(depthPass || !earlyFragmentTests)
143 {
144 if(earlyFragmentTests)
145 {
146 writeDepth(zBuffer, x, zMask, samples);
147 }
148
149 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
150
151 // Centroid locations
152 Float4 XXXX = Float4(0.0f);
153 Float4 YYYY = Float4(0.0f);
154
155 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
156 {
157 Float4 WWWW(1.0e-9f);
158
159 for(unsigned int q : samples)
160 {
161 XXXX += *Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]);
162 YYYY += *Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]);
163 WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
164 }
165
166 WWWW = Rcp(WWWW, Precision::Relaxed);
167 XXXX *= WWWW;
168 YYYY *= WWWW;
169
170 XXXX += xxxx;
171 YYYY += yyyy;
172 }
173
174 if(interpolateW())
175 {
176 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
177 rhw = reciprocal(w, false, false, true);
178
179 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
180 {
181 rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, w), false, false));
182 }
183 }
184
185 if(spirvShader)
186 {
187 if(shaderContainsInterpolation) // TODO(b/194714095)
188 {
189 routine.interpolationData.primitive = primitive;
190
191 routine.interpolationData.x = xxxx;
192 routine.interpolationData.y = yyyy;
193 routine.interpolationData.rhw = rhw;
194
195 routine.interpolationData.xCentroid = XXXX;
196 routine.interpolationData.yCentroid = YYYY;
197 routine.interpolationData.rhwCentroid = rhwCentroid;
198 }
199
200 if(perSampleShading && (state.multiSampleCount > 1))
201 {
202 xxxx += Float4(Constants::SampleLocationsX[samples[0]]);
203 yyyy += Float4(Constants::SampleLocationsY[samples[0]]);
204 }
205
206 int packedInterpolant = 0;
207 for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
208 {
209 auto const &input = spirvShader->inputs[interfaceInterpolant];
210 if(input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
211 {
212 if(input.Centroid && state.enableMultiSampling)
213 {
214 routine.inputs[interfaceInterpolant] =
215 SpirvRoutine::interpolateAtXY(XXXX, YYYY, rhwCentroid,
216 primitive + OFFSET(Primitive, V[packedInterpolant]),
217 input.Flat, !input.NoPerspective);
218 }
219 else if(perSampleShading)
220 {
221 routine.inputs[interfaceInterpolant] =
222 SpirvRoutine::interpolateAtXY(xxxx, yyyy, rhw,
223 primitive + OFFSET(Primitive, V[packedInterpolant]),
224 input.Flat, !input.NoPerspective);
225 }
226 else
227 {
228 routine.inputs[interfaceInterpolant] =
229 interpolate(xxxx, Dv[interfaceInterpolant], rhw,
230 primitive + OFFSET(Primitive, V[packedInterpolant]),
231 input.Flat, !input.NoPerspective);
232 }
233 packedInterpolant++;
234 }
235 }
236
237 setBuiltins(x, y, unclampedZ, w, cMask, samples);
238
239 for(uint32_t i = 0; i < state.numClipDistances; i++)
240 {
241 auto distance = interpolate(xxxx, DclipDistance[i], rhw,
242 primitive + OFFSET(Primitive, clipDistance[i]),
243 false, true);
244
245 auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
246 for(unsigned int q : samples)
247 {
248 // FIXME(b/148105887): Fragments discarded by clipping do not exist at
249 // all -- they should not be counted in queries or have their Z/S effects
250 // performed when early fragment tests are enabled.
251 cMask[q] &= clipMask;
252 }
253
254 if(spirvShader->getUsedCapabilities().ClipDistance)
255 {
256 auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
257 if(it != spirvShader->inputBuiltins.end())
258 {
259 if(i < it->second.SizeInComponents)
260 {
261 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
262 }
263 }
264 }
265 }
266
267 if(spirvShader->getUsedCapabilities().CullDistance)
268 {
269 auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
270 if(it != spirvShader->inputBuiltins.end())
271 {
272 for(uint32_t i = 0; i < state.numCullDistances; i++)
273 {
274 if(i < it->second.SizeInComponents)
275 {
276 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
277 interpolate(xxxx, DcullDistance[i], rhw,
278 primitive + OFFSET(Primitive, cullDistance[i]),
279 false, true);
280 }
281 }
282 }
283 }
284 }
285
286 if(spirvShader)
287 {
288 executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
289 }
290
291 Bool alphaPass = alphaTest(cMask, samples);
292
293 if((spirvShader && spirvShader->getAnalysis().ContainsKill) || state.alphaToCoverage)
294 {
295 for(unsigned int q : samples)
296 {
297 zMask[q] &= cMask[q];
298 sMask[q] &= cMask[q];
299 }
300 }
301
302 If(alphaPass)
303 {
304 if(!earlyFragmentTests)
305 {
306 for(unsigned int q : samples)
307 {
308 z[q] = clampDepth(z[q]);
309 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
310 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
311 }
312 }
313
314 If(depthPass)
315 {
316 if(!earlyFragmentTests)
317 {
318 writeDepth(zBuffer, x, zMask, samples);
319 }
320
321 blendColor(cBuffer, x, sMask, zMask, cMask, samples);
322
323 occlusionSampleCount(zMask, sMask, samples);
324 }
325 }
326 }
327
328 writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
329 }
330 }
331
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)332 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
333 {
334 if(!state.stencilActive)
335 {
336 return;
337 }
338
339 for(unsigned int q : samples)
340 {
341 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
342
343 Pointer<Byte> buffer = sBuffer + x;
344
345 if(q > 0)
346 {
347 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
348 }
349
350 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
351 Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
352 value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
353 Byte8 valueBack = value;
354
355 if(state.frontStencil.compareMask != 0xff)
356 {
357 value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
358 }
359
360 stencilTest(value, state.frontStencil.compareOp, false);
361
362 if(state.backStencil.compareMask != 0xff)
363 {
364 valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
365 }
366
367 stencilTest(valueBack, state.backStencil.compareOp, true);
368
369 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
370 valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
371 value |= valueBack;
372
373 sMask[q] &= SignMask(value);
374 }
375 }
376
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)377 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
378 {
379 Byte8 equal;
380
381 switch(stencilCompareMode)
382 {
383 case VK_COMPARE_OP_ALWAYS:
384 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
385 break;
386 case VK_COMPARE_OP_NEVER:
387 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
388 break;
389 case VK_COMPARE_OP_LESS: // a < b ~ b > a
390 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
391 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
392 break;
393 case VK_COMPARE_OP_EQUAL:
394 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
395 break;
396 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b)
397 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
398 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
399 break;
400 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ (b > a) || (a == b)
401 equal = value;
402 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
403 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
404 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
405 value |= equal;
406 break;
407 case VK_COMPARE_OP_GREATER: // a > b
408 equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
409 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
410 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
411 value = equal;
412 break;
413 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ !(a < b) ~ !(b > a)
414 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
415 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
416 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
417 break;
418 default:
419 UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
420 }
421 }
422
depthTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)423 Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
424 {
425 Float4 Z = z;
426
427 Pointer<Byte> buffer = zBuffer + 4 * x;
428 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
429
430 if(q > 0)
431 {
432 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
433 }
434
435 Float4 zValue;
436
437 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
438 {
439 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
440 }
441
442 Int4 zTest;
443
444 switch(state.depthCompareMode)
445 {
446 case VK_COMPARE_OP_ALWAYS:
447 // Optimized
448 break;
449 case VK_COMPARE_OP_NEVER:
450 // Optimized
451 break;
452 case VK_COMPARE_OP_EQUAL:
453 zTest = CmpEQ(zValue, Z);
454 break;
455 case VK_COMPARE_OP_NOT_EQUAL:
456 zTest = CmpNEQ(zValue, Z);
457 break;
458 case VK_COMPARE_OP_LESS:
459 zTest = CmpNLE(zValue, Z);
460 break;
461 case VK_COMPARE_OP_GREATER_OR_EQUAL:
462 zTest = CmpLE(zValue, Z);
463 break;
464 case VK_COMPARE_OP_LESS_OR_EQUAL:
465 zTest = CmpNLT(zValue, Z);
466 break;
467 case VK_COMPARE_OP_GREATER:
468 zTest = CmpLT(zValue, Z);
469 break;
470 default:
471 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
472 }
473
474 switch(state.depthCompareMode)
475 {
476 case VK_COMPARE_OP_ALWAYS:
477 zMask = cMask;
478 break;
479 case VK_COMPARE_OP_NEVER:
480 zMask = 0x0;
481 break;
482 default:
483 zMask = SignMask(zTest) & cMask;
484 break;
485 }
486
487 if(state.stencilActive)
488 {
489 zMask &= sMask;
490 }
491
492 return zMask != 0;
493 }
494
depthTest16(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)495 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
496 {
497 Short4 Z = convertFixed16(z, true);
498
499 Pointer<Byte> buffer = zBuffer + 2 * x;
500 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
501
502 if(q > 0)
503 {
504 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
505 }
506
507 Short4 zValue;
508
509 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
510 {
511 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
512 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
513 }
514
515 Int4 zTest;
516
517 // Bias values to make unsigned compares out of Reactor's (due SSE's) signed compares only
518 zValue = zValue - Short4(0x8000u);
519 Z = Z - Short4(0x8000u);
520
521 switch(state.depthCompareMode)
522 {
523 case VK_COMPARE_OP_ALWAYS:
524 // Optimized
525 break;
526 case VK_COMPARE_OP_NEVER:
527 // Optimized
528 break;
529 case VK_COMPARE_OP_EQUAL:
530 zTest = Int4(CmpEQ(zValue, Z));
531 break;
532 case VK_COMPARE_OP_NOT_EQUAL:
533 zTest = ~Int4(CmpEQ(zValue, Z));
534 break;
535 case VK_COMPARE_OP_LESS:
536 zTest = Int4(CmpGT(zValue, Z));
537 break;
538 case VK_COMPARE_OP_GREATER_OR_EQUAL:
539 zTest = ~Int4(CmpGT(zValue, Z));
540 break;
541 case VK_COMPARE_OP_LESS_OR_EQUAL:
542 zTest = ~Int4(CmpGT(Z, zValue));
543 break;
544 case VK_COMPARE_OP_GREATER:
545 zTest = Int4(CmpGT(Z, zValue));
546 break;
547 default:
548 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
549 }
550
551 switch(state.depthCompareMode)
552 {
553 case VK_COMPARE_OP_ALWAYS:
554 zMask = cMask;
555 break;
556 case VK_COMPARE_OP_NEVER:
557 zMask = 0x0;
558 break;
559 default:
560 zMask = SignMask(zTest) & cMask;
561 break;
562 }
563
564 if(state.stencilActive)
565 {
566 zMask &= sMask;
567 }
568
569 return zMask != 0;
570 }
571
// Returns z limited to [state.minDepthClamp, state.maxDepthClamp] when depth clamping is
// enabled in the state, and z unchanged otherwise.
Float4 PixelRoutine::clampDepth(const Float4 &z)
{
	if(state.depthClamp)
	{
		Float4 lowerLimit(state.minDepthClamp);
		Float4 upperLimit(state.maxDepthClamp);

		return Min(Max(z, lowerLimit), upperLimit);
	}

	return z;
}
581
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)582 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
583 {
584 if(!state.depthTestActive)
585 {
586 return true;
587 }
588
589 switch(state.depthFormat)
590 {
591 case VK_FORMAT_D16_UNORM:
592 return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
593 case VK_FORMAT_D32_SFLOAT:
594 case VK_FORMAT_D32_SFLOAT_S8_UINT:
595 return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
596 default:
597 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
598 return false;
599 }
600 }
601
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)602 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
603 {
604 Pointer<Byte> buffer = zBuffer + 2 * x;
605 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
606
607 if(q > 0)
608 {
609 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
610 }
611
612 Float4 minDepthBound(state.minDepthBounds);
613 Float4 maxDepthBound(state.maxDepthBounds);
614
615 Int2 z;
616 z = Insert(z, *Pointer<Int>(buffer), 0);
617 z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
618
619 Float4 zValue = convertFloat32(As<UShort4>(z));
620 return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
621 }
622
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)623 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
624 {
625 Pointer<Byte> buffer = zBuffer + 4 * x;
626 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
627
628 if(q > 0)
629 {
630 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
631 }
632
633 Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
634 return Int4(CmpLE(Float4(state.minDepthBounds), zValue) & CmpLE(zValue, Float4(state.maxDepthBounds)));
635 }
636
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)637 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
638 {
639 if(!state.depthBoundsTestActive)
640 {
641 return;
642 }
643
644 Int4 zTest;
645 switch(state.depthFormat)
646 {
647 case VK_FORMAT_D16_UNORM:
648 zTest = depthBoundsTest16(zBuffer, q, x);
649 break;
650 case VK_FORMAT_D32_SFLOAT:
651 case VK_FORMAT_D32_SFLOAT_S8_UINT:
652 zTest = depthBoundsTest32F(zBuffer, q, x);
653 break;
654 default:
655 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
656 break;
657 }
658
659 if(!state.depthTestActive)
660 {
661 cMask &= zMask & SignMask(zTest);
662 }
663 else
664 {
665 zMask &= cMask & SignMask(zTest);
666 }
667 }
668
alphaToCoverage(Int cMask[4],const Float4 & alpha,const SampleSet & samples)669 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha, const SampleSet &samples)
670 {
671 static const int a2c[4] = {
672 OFFSET(DrawData, a2c0),
673 OFFSET(DrawData, a2c1),
674 OFFSET(DrawData, a2c2),
675 OFFSET(DrawData, a2c3),
676 };
677
678 for(unsigned int q : samples)
679 {
680 Int4 coverage = CmpNLT(alpha, *Pointer<Float4>(data + a2c[q]));
681 Int aMask = SignMask(coverage);
682 cMask[q] &= aMask;
683 }
684 }
685
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)686 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
687 {
688 Float4 Z = z;
689
690 Pointer<Byte> buffer = zBuffer + 4 * x;
691 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
692
693 if(q > 0)
694 {
695 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
696 }
697
698 Float4 zValue;
699
700 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
701 {
702 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
703 }
704
705 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
706 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
707 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
708
709 *Pointer<Float2>(buffer) = Float2(Z.xy);
710 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
711 }
712
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)713 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
714 {
715 Short4 Z = As<Short4>(convertFixed16(z, true));
716
717 Pointer<Byte> buffer = zBuffer + 2 * x;
718 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
719
720 if(q > 0)
721 {
722 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
723 }
724
725 Short4 zValue;
726
727 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
728 {
729 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
730 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
731 }
732
733 Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
734 zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
735 Z = Z | zValue;
736
737 *Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
738 *Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
739 }
740
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)741 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
742 {
743 if(!state.depthWriteEnable)
744 {
745 return;
746 }
747
748 for(unsigned int q : samples)
749 {
750 switch(state.depthFormat)
751 {
752 case VK_FORMAT_D16_UNORM:
753 writeDepth16(zBuffer, q, x, z[q], zMask[q]);
754 break;
755 case VK_FORMAT_D32_SFLOAT:
756 case VK_FORMAT_D32_SFLOAT_S8_UINT:
757 writeDepth32F(zBuffer, q, x, z[q], zMask[q]);
758 break;
759 default:
760 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
761 break;
762 }
763 }
764 }
765
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)766 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
767 {
768 if(!state.occlusionEnabled)
769 {
770 return;
771 }
772
773 for(unsigned int q : samples)
774 {
775 occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
776 }
777 }
778
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)779 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
780 {
781 if(!state.stencilActive)
782 {
783 return;
784 }
785
786 if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
787 {
788 if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
789 {
790 return;
791 }
792 }
793
794 if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
795 {
796 return;
797 }
798
799 for(unsigned int q : samples)
800 {
801 Pointer<Byte> buffer = sBuffer + x;
802
803 if(q > 0)
804 {
805 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
806 }
807
808 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
809 Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
810 bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
811 Byte8 newValue;
812 stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
813
814 if((state.frontStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
815 {
816 Byte8 maskedValue = bufferValue;
817 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
818 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
819 newValue |= maskedValue;
820 }
821
822 Byte8 newValueBack;
823
824 stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask[q], sMask[q]);
825
826 if((state.backStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
827 {
828 Byte8 maskedValue = bufferValue;
829 newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
830 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
831 newValueBack |= maskedValue;
832 }
833
834 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
835 newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
836 newValue |= newValueBack;
837
838 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
839 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
840 newValue |= bufferValue;
841
842 *Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
843 *Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
844 }
845 }
846
stencilOperation(Byte8 & newValue,const Byte8 & bufferValue,const PixelProcessor::States::StencilOpState & ops,bool isBack,const Int & zMask,const Int & sMask)847 void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
848 {
849 Byte8 &pass = newValue;
850 Byte8 fail;
851 Byte8 zFail;
852
853 stencilOperation(pass, bufferValue, ops.passOp, isBack);
854
855 if(ops.depthFailOp != ops.passOp)
856 {
857 stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
858 }
859
860 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
861 {
862 stencilOperation(fail, bufferValue, ops.failOp, isBack);
863 }
864
865 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
866 {
867 if(state.depthTestActive && ops.depthFailOp != ops.passOp) // zMask valid and values not the same
868 {
869 pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
870 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
871 pass |= zFail;
872 }
873
874 pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
875 fail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
876 pass |= fail;
877 }
878 }
879
stencilReplaceRef(bool isBack)880 Byte8 PixelRoutine::stencilReplaceRef(bool isBack)
881 {
882 if(spirvShader)
883 {
884 auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
885 if(it != spirvShader->outputBuiltins.end())
886 {
887 UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
888 // TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
889 // following line by either adding a rr::Shuffle() variant to do
890 // it explicitly or adding a Byte4(Int4) constructor would work.
891 sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
892
893 UInt2 sRefDuplicated;
894 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
895 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
896 return As<Byte8>(sRefDuplicated);
897 }
898 }
899
900 return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
901 }
902
stencilOperation(Byte8 & output,const Byte8 & bufferValue,VkStencilOp operation,bool isBack)903 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
904 {
905 switch(operation)
906 {
907 case VK_STENCIL_OP_KEEP:
908 output = bufferValue;
909 break;
910 case VK_STENCIL_OP_ZERO:
911 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
912 break;
913 case VK_STENCIL_OP_REPLACE:
914 output = stencilReplaceRef(isBack);
915 break;
916 case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
917 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
918 break;
919 case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
920 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
921 break;
922 case VK_STENCIL_OP_INVERT:
923 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
924 break;
925 case VK_STENCIL_OP_INCREMENT_AND_WRAP:
926 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
927 break;
928 case VK_STENCIL_OP_DECREMENT_AND_WRAP:
929 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
930 break;
931 default:
932 UNSUPPORTED("VkStencilOp: %d", int(operation));
933 }
934 }
935
isSRGB(int index) const936 bool PixelRoutine::isSRGB(int index) const
937 {
938 return vk::Format(state.colorFormat[index]).isSRGBformat();
939 }
940
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)941 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
942 {
943 Short4 c01;
944 Short4 c23;
945 Pointer<Byte> buffer = cBuffer;
946 Pointer<Byte> buffer2;
947
948 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
949
950 switch(state.colorFormat[index])
951 {
952 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
953 buffer += 2 * x;
954 buffer2 = buffer + pitchB;
955 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
956
957 pixel.x = (c01 & Short4(0xF000u));
958 pixel.y = (c01 & Short4(0x0F00u)) << 4;
959 pixel.z = (c01 & Short4(0x00F0u)) << 8;
960 pixel.w = (c01 & Short4(0x000Fu)) << 12;
961
962 // Expand to 16 bit range
963 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
964 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
965 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
966 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
967 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
968 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
969 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
970 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
971 break;
972 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
973 buffer += 2 * x;
974 buffer2 = buffer + pitchB;
975 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
976
977 pixel.z = (c01 & Short4(0xF000u));
978 pixel.y = (c01 & Short4(0x0F00u)) << 4;
979 pixel.x = (c01 & Short4(0x00F0u)) << 8;
980 pixel.w = (c01 & Short4(0x000Fu)) << 12;
981
982 // Expand to 16 bit range
983 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
984 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
985 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
986 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
987 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
988 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
989 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
990 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
991 break;
992 case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
993 buffer += 2 * x;
994 buffer2 = buffer + pitchB;
995 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
996
997 pixel.w = (c01 & Short4(0xF000u));
998 pixel.z = (c01 & Short4(0x0F00u)) << 4;
999 pixel.y = (c01 & Short4(0x00F0u)) << 8;
1000 pixel.x = (c01 & Short4(0x000Fu)) << 12;
1001
1002 // Expand to 16 bit range
1003 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1004 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1005 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1006 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1007 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1008 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1009 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1010 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1011 break;
1012 case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1013 buffer += 2 * x;
1014 buffer2 = buffer + pitchB;
1015 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1016
1017 pixel.w = (c01 & Short4(0xF000u));
1018 pixel.x = (c01 & Short4(0x0F00u)) << 4;
1019 pixel.y = (c01 & Short4(0x00F0u)) << 8;
1020 pixel.z = (c01 & Short4(0x000Fu)) << 12;
1021
1022 // Expand to 16 bit range
1023 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
1024 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
1025 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
1026 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
1027 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
1028 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
1029 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1030 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1031 break;
1032 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1033 buffer += 2 * x;
1034 buffer2 = buffer + pitchB;
1035 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1036
1037 pixel.x = (c01 & Short4(0xF800u));
1038 pixel.y = (c01 & Short4(0x07C0u)) << 5;
1039 pixel.z = (c01 & Short4(0x003Eu)) << 10;
1040 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1041
1042 // Expand to 16 bit range
1043 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1044 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1045 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1046 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1047 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1048 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1049 break;
1050 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1051 buffer += 2 * x;
1052 buffer2 = buffer + pitchB;
1053 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1054
1055 pixel.z = (c01 & Short4(0xF800u));
1056 pixel.y = (c01 & Short4(0x07C0u)) << 5;
1057 pixel.x = (c01 & Short4(0x003Eu)) << 10;
1058 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1059
1060 // Expand to 16 bit range
1061 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1062 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1063 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1064 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1065 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1066 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1067 break;
1068 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1069 buffer += 2 * x;
1070 buffer2 = buffer + pitchB;
1071 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1072
1073 pixel.x = (c01 & Short4(0x7C00u)) << 1;
1074 pixel.y = (c01 & Short4(0x03E0u)) << 6;
1075 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1076 pixel.w = (c01 & Short4(0x8000u)) >> 15;
1077
1078 // Expand to 16 bit range
1079 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1080 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1081 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1082 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1083 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1084 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1085 break;
1086 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1087 buffer += 2 * x;
1088 buffer2 = buffer + pitchB;
1089 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1090
1091 pixel.x = c01 & Short4(0xF800u);
1092 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1093 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1094 pixel.w = Short4(0xFFFFu);
1095
1096 // Expand to 16 bit range
1097 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1098 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1099 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1100 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1101 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1102 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1103 break;
1104 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1105 buffer += 2 * x;
1106 buffer2 = buffer + pitchB;
1107 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1108
1109 pixel.z = c01 & Short4(0xF800u);
1110 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1111 pixel.x = (c01 & Short4(0x001Fu)) << 11;
1112 pixel.w = Short4(0xFFFFu);
1113
1114 // Expand to 16 bit range
1115 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1116 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1117 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1118 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1119 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1120 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1121 break;
1122 case VK_FORMAT_B8G8R8A8_UNORM:
1123 case VK_FORMAT_B8G8R8A8_SRGB:
1124 buffer += 4 * x;
1125 c01 = *Pointer<Short4>(buffer);
1126 buffer += pitchB;
1127 c23 = *Pointer<Short4>(buffer);
1128 pixel.z = c01;
1129 pixel.y = c01;
1130 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1131 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1132 pixel.x = pixel.z;
1133 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1134 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1135 pixel.y = pixel.z;
1136 pixel.w = pixel.x;
1137 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1138 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1139 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1140 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1141 break;
1142 case VK_FORMAT_R8G8B8A8_UNORM:
1143 case VK_FORMAT_R8G8B8A8_SRGB:
1144 buffer += 4 * x;
1145 c01 = *Pointer<Short4>(buffer);
1146 buffer += pitchB;
1147 c23 = *Pointer<Short4>(buffer);
1148 pixel.z = c01;
1149 pixel.y = c01;
1150 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1151 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1152 pixel.x = pixel.z;
1153 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1154 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1155 pixel.y = pixel.z;
1156 pixel.w = pixel.x;
1157 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1158 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1159 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1160 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1161 break;
1162 case VK_FORMAT_R8_UNORM:
1163 buffer += 1 * x;
1164 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1165 buffer += pitchB;
1166 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1167 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1168 pixel.y = Short4(0x0000);
1169 pixel.z = Short4(0x0000);
1170 pixel.w = Short4(0xFFFFu);
1171 break;
1172 case VK_FORMAT_R8G8_UNORM:
1173 buffer += 2 * x;
1174 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1175 buffer += pitchB;
1176 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1177 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1178 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1179 pixel.z = Short4(0x0000u);
1180 pixel.w = Short4(0xFFFFu);
1181 break;
1182 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1183 {
1184 Int4 v = Int4(0);
1185 buffer += 4 * x;
1186 v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1187 v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1188 buffer += pitchB;
1189 v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1190 v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1191
1192 pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1193 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1194 pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1195 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1196
1197 // Expand to 16 bit range
1198 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1199 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1200 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1201 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1202 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1203 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1204 }
1205 break;
1206 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1207 {
1208 Int4 v = Int4(0);
1209 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1210 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1211 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1212 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1213 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1214
1215 pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1216 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1217 pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1218 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1219
1220 // Expand to 16 bit range
1221 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1222 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1223 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1224 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1225 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1226 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1227 }
1228 break;
1229 default:
1230 UNSUPPORTED("VkFormat %d", int(state.colorFormat[index]));
1231 }
1232
1233 if(isSRGB(index))
1234 {
1235 sRGBtoLinear16_12_16(pixel);
1236 }
1237 }
1238
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & current,const Int & sMask,const Int & zMask,const Int & cMask)1239 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s ¤t, const Int &sMask, const Int &zMask, const Int &cMask)
1240 {
1241 if(isSRGB(index))
1242 {
1243 linearToSRGB16_12_16(current);
1244 }
1245
1246 switch(state.colorFormat[index])
1247 {
1248 case VK_FORMAT_B8G8R8A8_UNORM:
1249 case VK_FORMAT_B8G8R8A8_SRGB:
1250 case VK_FORMAT_R8G8B8A8_UNORM:
1251 case VK_FORMAT_R8G8B8A8_SRGB:
1252 case VK_FORMAT_R8G8_UNORM:
1253 case VK_FORMAT_R8_UNORM:
1254 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1255 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1256 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
1257 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
1258 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
1259 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
1260 break;
1261 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1262 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1263 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
1264 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
1265 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
1266 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);
1267 break;
1268 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1269 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1270 case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1271 case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1272 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 4) + Short4(0x0800);
1273 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 4) + Short4(0x0800);
1274 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 4) + Short4(0x0800);
1275 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 4) + Short4(0x0800);
1276 break;
1277 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1278 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1279 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1280 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1281 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
1282 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1283 current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);
1284 break;
1285 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1286 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1287 current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
1288 current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);
1289 current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
1290 break;
1291 default:
1292 break;
1293 }
1294
1295 int rgbaWriteMask = state.colorWriteActive(index);
1296 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
1297
1298 switch(state.colorFormat[index])
1299 {
1300 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1301 {
1302 current.x = As<UShort4>(current.x & Short4(0xF000));
1303 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1304 current.z = As<UShort4>(current.z & Short4(0xF000)) >> 8;
1305 current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1306
1307 current.x = current.x | current.y | current.z | current.w;
1308 }
1309 break;
1310 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1311 {
1312 current.z = As<UShort4>(current.z & Short4(0xF000));
1313 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 4;
1314 current.x = As<UShort4>(current.x & Short4(0xF000)) >> 8;
1315 current.w = As<UShort4>(current.w & Short4(0xF000u)) >> 12;
1316
1317 current.x = current.x | current.y | current.z | current.w;
1318 }
1319 break;
1320 case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1321 {
1322 current.w = As<UShort4>(current.w & Short4(0xF000));
1323 current.x = As<UShort4>(current.x & Short4(0xF000)) >> 4;
1324 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1325 current.z = As<UShort4>(current.z & Short4(0xF000u)) >> 12;
1326
1327 current.x = current.x | current.y | current.z | current.w;
1328 }
1329 break;
1330 case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1331 {
1332 current.w = As<UShort4>(current.w & Short4(0xF000));
1333 current.z = As<UShort4>(current.z & Short4(0xF000)) >> 4;
1334 current.y = As<UShort4>(current.y & Short4(0xF000)) >> 8;
1335 current.x = As<UShort4>(current.x & Short4(0xF000u)) >> 12;
1336
1337 current.x = current.x | current.y | current.z | current.w;
1338 }
1339 break;
1340 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1341 {
1342 current.x = As<UShort4>(current.x & Short4(0xF800));
1343 current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1344 current.z = As<UShort4>(current.z & Short4(0xF800)) >> 10;
1345 current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1346
1347 current.x = current.x | current.y | current.z | current.w;
1348 }
1349 break;
1350 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1351 {
1352 current.z = As<UShort4>(current.z & Short4(0xF800));
1353 current.y = As<UShort4>(current.y & Short4(0xF800)) >> 5;
1354 current.x = As<UShort4>(current.x & Short4(0xF800)) >> 10;
1355 current.w = As<UShort4>(current.w & Short4(0x8000u)) >> 15;
1356
1357 current.x = current.x | current.y | current.z | current.w;
1358 }
1359 break;
1360 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1361 {
1362 current.w = current.w & Short4(0x8000u);
1363 current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
1364 current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
1365 current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;
1366
1367 current.x = current.x | current.y | current.z | current.w;
1368 }
1369 break;
1370 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1371 {
1372 current.x = current.x & Short4(0xF800u);
1373 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1374 current.z = As<UShort4>(current.z) >> 11;
1375
1376 current.x = current.x | current.y | current.z;
1377 }
1378 break;
1379 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1380 {
1381 current.z = current.z & Short4(0xF800u);
1382 current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
1383 current.x = As<UShort4>(current.x) >> 11;
1384
1385 current.x = current.x | current.y | current.z;
1386 }
1387 break;
1388 case VK_FORMAT_B8G8R8A8_UNORM:
1389 case VK_FORMAT_B8G8R8A8_SRGB:
1390 if(rgbaWriteMask == 0x7)
1391 {
1392 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1393 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1394 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1395
1396 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1397 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1398
1399 current.x = current.z;
1400 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1401 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1402 current.y = current.z;
1403 current.z = As<Short4>(UnpackLow(current.z, current.x));
1404 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1405 }
1406 else
1407 {
1408 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1409 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1410 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1411 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1412
1413 current.z = As<Short4>(PackUnsigned(current.z, current.x));
1414 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1415
1416 current.x = current.z;
1417 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1418 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1419 current.y = current.z;
1420 current.z = As<Short4>(UnpackLow(current.z, current.x));
1421 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1422 }
1423 break;
1424 case VK_FORMAT_R8G8B8A8_UNORM:
1425 case VK_FORMAT_R8G8B8A8_SRGB:
1426 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1427 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1428 if(rgbaWriteMask == 0x7)
1429 {
1430 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1431 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1432 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1433
1434 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1435 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1436
1437 current.x = current.z;
1438 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1439 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1440 current.y = current.z;
1441 current.z = As<Short4>(UnpackLow(current.z, current.x));
1442 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1443 }
1444 else
1445 {
1446 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1447 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1448 current.z = As<Short4>(As<UShort4>(current.z) >> 8);
1449 current.w = As<Short4>(As<UShort4>(current.w) >> 8);
1450
1451 current.z = As<Short4>(PackUnsigned(current.x, current.z));
1452 current.y = As<Short4>(PackUnsigned(current.y, current.w));
1453
1454 current.x = current.z;
1455 current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
1456 current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
1457 current.y = current.z;
1458 current.z = As<Short4>(UnpackLow(current.z, current.x));
1459 current.y = As<Short4>(UnpackHigh(current.y, current.x));
1460 }
1461 break;
1462 case VK_FORMAT_R8G8_UNORM:
1463 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1464 current.y = As<Short4>(As<UShort4>(current.y) >> 8);
1465 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1466 current.y = As<Short4>(PackUnsigned(current.y, current.y));
1467 current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
1468 break;
1469 case VK_FORMAT_R8_UNORM:
1470 current.x = As<Short4>(As<UShort4>(current.x) >> 8);
1471 current.x = As<Short4>(PackUnsigned(current.x, current.x));
1472 break;
1473 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1474 {
1475 auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1476 auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1477 auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1478 auto a = (Int4(current.w) >> 14) & Int4(0x3);
1479 Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
1480 auto c02 = As<Int2>(Int4(packed.xzzz)); // TODO: auto c02 = packed.xz;
1481 auto c13 = As<Int2>(Int4(packed.ywww)); // TODO: auto c13 = packed.yw;
1482 current.x = UnpackLow(c02, c13);
1483 current.y = UnpackHigh(c02, c13);
1484 }
1485 break;
1486 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1487 {
1488 auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
1489 auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
1490 auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
1491 auto a = (Int4(current.w) >> 14) & Int4(0x3);
1492 Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;
1493 auto c02 = As<Int2>(Int4(packed.xzzz)); // TODO: auto c02 = packed.xz;
1494 auto c13 = As<Int2>(Int4(packed.ywww)); // TODO: auto c13 = packed.yw;
1495 current.x = UnpackLow(c02, c13);
1496 current.y = UnpackHigh(c02, c13);
1497 }
1498 break;
1499 default:
1500 UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1501 }
1502
1503 Short4 c01 = current.z;
1504 Short4 c23 = current.y;
1505
1506 Int xMask; // Combination of all masks
1507
1508 if(state.depthTestActive)
1509 {
1510 xMask = zMask;
1511 }
1512 else
1513 {
1514 xMask = cMask;
1515 }
1516
1517 if(state.stencilActive)
1518 {
1519 xMask &= sMask;
1520 }
1521
1522 Pointer<Byte> buffer = cBuffer;
1523 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1524
1525 switch(state.colorFormat[index])
1526 {
1527 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1528 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1529 case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1530 case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1531 {
1532 buffer += 2 * x;
1533 Int value = *Pointer<Int>(buffer);
1534
1535 Int channelMask;
1536 switch(state.colorFormat[index])
1537 {
1538 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1539 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[bgraWriteMask & 0xF][0]));
1540 break;
1541 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1542 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4bgraQ[bgraWriteMask & 0xF][0]));
1543 break;
1544 case VK_FORMAT_A4R4G4B4_UNORM_PACK16_EXT:
1545 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[bgraWriteMask & 0xF][0]));
1546 break;
1547 case VK_FORMAT_A4B4G4R4_UNORM_PACK16_EXT:
1548 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4abgrQ[bgraWriteMask & 0xF][0]));
1549 break;
1550 default:
1551 UNREACHABLE("Format: %s", vk::Stringify(state.colorFormat[index]).c_str());
1552 }
1553
1554 Int c01 = Extract(As<Int2>(current.x), 0);
1555 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1556 if(bgraWriteMask != 0x0000000F)
1557 {
1558 mask01 &= channelMask;
1559 }
1560 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1561
1562 buffer += pitchB;
1563 value = *Pointer<Int>(buffer);
1564
1565 Int c23 = Extract(As<Int2>(current.x), 1);
1566 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1567 if(bgraWriteMask != 0x0000000F)
1568 {
1569 mask23 &= channelMask;
1570 }
1571 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1572 }
1573 break;
1574 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1575 {
1576 buffer += 2 * x;
1577 Int value = *Pointer<Int>(buffer);
1578
1579 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[bgraWriteMask & 0xF][0]));
1580
1581 Int c01 = Extract(As<Int2>(current.x), 0);
1582 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1583 if(bgraWriteMask != 0x0000000F)
1584 {
1585 mask01 &= channelMask;
1586 }
1587 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1588
1589 buffer += pitchB;
1590 value = *Pointer<Int>(buffer);
1591
1592 Int c23 = Extract(As<Int2>(current.x), 1);
1593 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1594 if(bgraWriteMask != 0x0000000F)
1595 {
1596 mask23 &= channelMask;
1597 }
1598 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1599 }
1600 break;
1601 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1602 {
1603 buffer += 2 * x;
1604 Int value = *Pointer<Int>(buffer);
1605
1606 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[bgraWriteMask & 0xF][0]));
1607
1608 Int c01 = Extract(As<Int2>(current.x), 0);
1609 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1610 if(bgraWriteMask != 0x0000000F)
1611 {
1612 mask01 &= channelMask;
1613 }
1614 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1615
1616 buffer += pitchB;
1617 value = *Pointer<Int>(buffer);
1618
1619 Int c23 = Extract(As<Int2>(current.x), 1);
1620 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1621 if(bgraWriteMask != 0x0000000F)
1622 {
1623 mask23 &= channelMask;
1624 }
1625 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1626 }
1627 break;
1628 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1629 {
1630 buffer += 2 * x;
1631 Int value = *Pointer<Int>(buffer);
1632
1633 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[bgraWriteMask & 0xF][0]));
1634
1635 Int c01 = Extract(As<Int2>(current.x), 0);
1636 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1637 if(bgraWriteMask != 0x0000000F)
1638 {
1639 mask01 &= channelMask;
1640 }
1641 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1642
1643 buffer += pitchB;
1644 value = *Pointer<Int>(buffer);
1645
1646 Int c23 = Extract(As<Int2>(current.x), 1);
1647 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1648 if(bgraWriteMask != 0x0000000F)
1649 {
1650 mask23 &= channelMask;
1651 }
1652 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1653 }
1654 break;
1655 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1656 {
1657 buffer += 2 * x;
1658 Int value = *Pointer<Int>(buffer);
1659
1660 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[bgraWriteMask & 0x7][0]));
1661
1662 Int c01 = Extract(As<Int2>(current.x), 0);
1663 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
1664 if((bgraWriteMask & 0x00000007) != 0x00000007)
1665 {
1666 mask01 &= channelMask;
1667 }
1668 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
1669
1670 buffer += pitchB;
1671 value = *Pointer<Int>(buffer);
1672
1673 Int c23 = Extract(As<Int2>(current.x), 1);
1674 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
1675 if((bgraWriteMask & 0x00000007) != 0x00000007)
1676 {
1677 mask23 &= channelMask;
1678 }
1679 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
1680 }
1681 break;
1682 case VK_FORMAT_B8G8R8A8_UNORM:
1683 case VK_FORMAT_B8G8R8A8_SRGB:
1684 {
1685 buffer += x * 4;
1686 Short4 value = *Pointer<Short4>(buffer);
1687 Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[bgraWriteMask][0]));
1688
1689 Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1690 if(bgraWriteMask != 0x0000000F)
1691 {
1692 mask01 &= channelMask;
1693 }
1694 *Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1695
1696 buffer += pitchB;
1697 value = *Pointer<Short4>(buffer);
1698
1699 Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1700 if(bgraWriteMask != 0x0000000F)
1701 {
1702 mask23 &= channelMask;
1703 }
1704 *Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1705 }
1706 break;
1707 case VK_FORMAT_R8G8B8A8_UNORM:
1708 case VK_FORMAT_R8G8B8A8_SRGB:
1709 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1710 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1711 {
1712 buffer += x * 4;
1713 Short4 value = *Pointer<Short4>(buffer);
1714 Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
1715
1716 Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1717 if(rgbaWriteMask != 0x0000000F)
1718 {
1719 mask01 &= channelMask;
1720 }
1721 *Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);
1722
1723 buffer += pitchB;
1724 value = *Pointer<Short4>(buffer);
1725
1726 Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1727 if(rgbaWriteMask != 0x0000000F)
1728 {
1729 mask23 &= channelMask;
1730 }
1731 *Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
1732 }
1733 break;
1734 case VK_FORMAT_R8G8_UNORM:
1735 if((rgbaWriteMask & 0x00000003) != 0x0)
1736 {
1737 buffer += 2 * x;
1738 Int2 value;
1739 value = Insert(value, *Pointer<Int>(buffer), 0);
1740 value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);
1741
1742 Int2 packedCol = As<Int2>(current.x);
1743
1744 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
1745 if((rgbaWriteMask & 0x3) != 0x3)
1746 {
1747 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
1748 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
1749 mergedMask &= rgbaMask;
1750 }
1751
1752 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));
1753
1754 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
1755 *Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
1756 }
1757 break;
1758 case VK_FORMAT_R8_UNORM:
1759 if(rgbaWriteMask & 0x00000001)
1760 {
1761 buffer += 1 * x;
1762 Short4 value;
1763 value = Insert(value, *Pointer<Short>(buffer), 0);
1764 value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);
1765
1766 current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
1767 value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
1768 current.x |= value;
1769
1770 *Pointer<Short>(buffer) = Extract(current.x, 0);
1771 *Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
1772 }
1773 break;
1774 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1775 rgbaWriteMask = bgraWriteMask;
1776 // [[fallthrough]]
1777 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1778 {
1779 buffer += 4 * x;
1780
1781 Int2 value = *Pointer<Int2>(buffer, 16);
1782 Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
1783 if(rgbaWriteMask != 0xF)
1784 {
1785 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1786 }
1787 *Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);
1788
1789 buffer += pitchB;
1790
1791 value = *Pointer<Int2>(buffer, 16);
1792 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
1793 if(rgbaWriteMask != 0xF)
1794 {
1795 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
1796 }
1797 *Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
1798 }
1799 break;
1800 default:
1801 UNSUPPORTED("VkFormat: %d", int(state.colorFormat[index]));
1802 }
1803 }
1804
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1805 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1806 {
1807 bool inverse = (modifier == OneMinus);
1808
1809 if(format.isUnsignedNormalized())
1810 {
1811 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU[component]))
1812 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU[component]));
1813 }
1814 else if(format.isSignedNormalized())
1815 {
1816 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS[component]))
1817 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS[component]));
1818 }
1819 else // Floating-point format
1820 {
1821 ASSERT(format.isFloatFormat());
1822 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF[component]))
1823 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF[component]));
1824 }
1825 }
1826
blendFactorRGB(Vector4f & blendFactor,const Vector4f & sourceColor,const Vector4f & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1827 void PixelRoutine::blendFactorRGB(Vector4f &blendFactor, const Vector4f &sourceColor, const Vector4f &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1828 {
1829 switch(colorBlendFactor)
1830 {
1831 case VK_BLEND_FACTOR_ZERO:
1832 blendFactor.x = Float4(0);
1833 blendFactor.y = Float4(0);
1834 blendFactor.z = Float4(0);
1835 break;
1836 case VK_BLEND_FACTOR_ONE:
1837 blendFactor.x = Float4(1);
1838 blendFactor.y = Float4(1);
1839 blendFactor.z = Float4(1);
1840 break;
1841 case VK_BLEND_FACTOR_SRC_COLOR:
1842 blendFactor.x = sourceColor.x;
1843 blendFactor.y = sourceColor.y;
1844 blendFactor.z = sourceColor.z;
1845 break;
1846 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1847 blendFactor.x = Float4(1.0f) - sourceColor.x;
1848 blendFactor.y = Float4(1.0f) - sourceColor.y;
1849 blendFactor.z = Float4(1.0f) - sourceColor.z;
1850 break;
1851 case VK_BLEND_FACTOR_DST_COLOR:
1852 blendFactor.x = destColor.x;
1853 blendFactor.y = destColor.y;
1854 blendFactor.z = destColor.z;
1855 break;
1856 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1857 blendFactor.x = Float4(1.0f) - destColor.x;
1858 blendFactor.y = Float4(1.0f) - destColor.y;
1859 blendFactor.z = Float4(1.0f) - destColor.z;
1860 break;
1861 case VK_BLEND_FACTOR_SRC_ALPHA:
1862 blendFactor.x = sourceColor.w;
1863 blendFactor.y = sourceColor.w;
1864 blendFactor.z = sourceColor.w;
1865 break;
1866 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1867 blendFactor.x = Float4(1.0f) - sourceColor.w;
1868 blendFactor.y = Float4(1.0f) - sourceColor.w;
1869 blendFactor.z = Float4(1.0f) - sourceColor.w;
1870 break;
1871 case VK_BLEND_FACTOR_DST_ALPHA:
1872 blendFactor.x = destColor.w;
1873 blendFactor.y = destColor.w;
1874 blendFactor.z = destColor.w;
1875 break;
1876 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1877 blendFactor.x = Float4(1.0f) - destColor.w;
1878 blendFactor.y = Float4(1.0f) - destColor.w;
1879 blendFactor.z = Float4(1.0f) - destColor.w;
1880 break;
1881 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1882 blendFactor.x = Float4(1.0f) - destColor.w;
1883 blendFactor.x = Min(blendFactor.x, sourceColor.w);
1884 blendFactor.y = blendFactor.x;
1885 blendFactor.z = blendFactor.x;
1886 break;
1887 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1888 blendFactor.x = Float4(blendConstant(format, 0));
1889 blendFactor.y = Float4(blendConstant(format, 1));
1890 blendFactor.z = Float4(blendConstant(format, 2));
1891 break;
1892 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1893 blendFactor.x = Float4(blendConstant(format, 3));
1894 blendFactor.y = Float4(blendConstant(format, 3));
1895 blendFactor.z = Float4(blendConstant(format, 3));
1896 break;
1897 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1898 blendFactor.x = Float4(blendConstant(format, 0, OneMinus));
1899 blendFactor.y = Float4(blendConstant(format, 1, OneMinus));
1900 blendFactor.z = Float4(blendConstant(format, 2, OneMinus));
1901 break;
1902 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1903 blendFactor.x = Float4(blendConstant(format, 3, OneMinus));
1904 blendFactor.y = Float4(blendConstant(format, 3, OneMinus));
1905 blendFactor.z = Float4(blendConstant(format, 3, OneMinus));
1906 break;
1907
1908 default:
1909 UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1910 }
1911
1912 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1913 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1914 // operations. If the color attachment is floating-point, no clamping occurs."
1915 if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1916 {
1917 if(format.isUnsignedNormalized())
1918 {
1919 blendFactor.x = Min(Max(blendFactor.x, Float4(0.0f)), Float4(1.0f));
1920 blendFactor.y = Min(Max(blendFactor.y, Float4(0.0f)), Float4(1.0f));
1921 blendFactor.z = Min(Max(blendFactor.z, Float4(0.0f)), Float4(1.0f));
1922 }
1923 else if(format.isSignedNormalized())
1924 {
1925 blendFactor.x = Min(Max(blendFactor.x, Float4(-1.0f)), Float4(1.0f));
1926 blendFactor.y = Min(Max(blendFactor.y, Float4(-1.0f)), Float4(1.0f));
1927 blendFactor.z = Min(Max(blendFactor.z, Float4(-1.0f)), Float4(1.0f));
1928 }
1929 }
1930 }
1931
blendFactorAlpha(Float4 & blendFactorAlpha,const Float4 & sourceAlpha,const Float4 & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1932 void PixelRoutine::blendFactorAlpha(Float4 &blendFactorAlpha, const Float4 &sourceAlpha, const Float4 &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1933 {
1934 switch(alphaBlendFactor)
1935 {
1936 case VK_BLEND_FACTOR_ZERO:
1937 blendFactorAlpha = Float4(0);
1938 break;
1939 case VK_BLEND_FACTOR_ONE:
1940 blendFactorAlpha = Float4(1);
1941 break;
1942 case VK_BLEND_FACTOR_SRC_COLOR:
1943 blendFactorAlpha = sourceAlpha;
1944 break;
1945 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1946 blendFactorAlpha = Float4(1.0f) - sourceAlpha;
1947 break;
1948 case VK_BLEND_FACTOR_DST_COLOR:
1949 blendFactorAlpha = destAlpha;
1950 break;
1951 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1952 blendFactorAlpha = Float4(1.0f) - destAlpha;
1953 break;
1954 case VK_BLEND_FACTOR_SRC_ALPHA:
1955 blendFactorAlpha = sourceAlpha;
1956 break;
1957 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1958 blendFactorAlpha = Float4(1.0f) - sourceAlpha;
1959 break;
1960 case VK_BLEND_FACTOR_DST_ALPHA:
1961 blendFactorAlpha = destAlpha;
1962 break;
1963 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1964 blendFactorAlpha = Float4(1.0f) - destAlpha;
1965 break;
1966 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1967 blendFactorAlpha = Float4(1.0f);
1968 break;
1969 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1970 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1971 blendFactorAlpha = Float4(blendConstant(format, 3));
1972 break;
1973 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1974 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1975 blendFactorAlpha = Float4(blendConstant(format, 3, OneMinus));
1976 break;
1977 default:
1978 UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1979 }
1980
1981 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1982 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1983 // operations. If the color attachment is floating-point, no clamping occurs."
1984 if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1985 {
1986 if(format.isUnsignedNormalized())
1987 {
1988 blendFactorAlpha = Min(Max(blendFactorAlpha, Float4(0.0f)), Float4(1.0f));
1989 }
1990 else if(format.isSignedNormalized())
1991 {
1992 blendFactorAlpha = Min(Max(blendFactorAlpha, Float4(-1.0f)), Float4(1.0f));
1993 }
1994 }
1995 }
1996
blendOpOverlay(Float4 & src,Float4 & dst)1997 Float4 PixelRoutine::blendOpOverlay(Float4 &src, Float4 &dst)
1998 {
1999 Int4 largeDst = CmpGT(dst, Float4(0.5f));
2000 return As<Float4>(
2001 (~largeDst &
2002 As<Int4>(Float4(2.0f) * src * dst)) |
2003 (largeDst &
2004 As<Int4>(Float4(1.0f) - (Float4(2.0f) * (Float4(1.0f) - src) * (Float4(1.0f) - dst)))));
2005 }
2006
blendOpColorDodge(Float4 & src,Float4 & dst)2007 Float4 PixelRoutine::blendOpColorDodge(Float4 &src, Float4 &dst)
2008 {
2009 Int4 srcBelowOne = CmpLT(src, Float4(1.0f));
2010 Int4 positiveDst = CmpGT(dst, Float4(0.0f));
2011 return As<Float4>(positiveDst & ((~srcBelowOne &
2012 As<Int4>(Float4(1.0f))) |
2013 (srcBelowOne &
2014 As<Int4>(Min(Float4(1.0f), (dst / (Float4(1.0f) - src)))))));
2015 }
2016
blendOpColorBurn(Float4 & src,Float4 & dst)2017 Float4 PixelRoutine::blendOpColorBurn(Float4 &src, Float4 &dst)
2018 {
2019 Int4 dstBelowOne = CmpLT(dst, Float4(1.0f));
2020 Int4 positiveSrc = CmpGT(src, Float4(0.0f));
2021 return As<Float4>(
2022 (~dstBelowOne &
2023 As<Int4>(Float4(1.0f))) |
2024 (dstBelowOne & positiveSrc &
2025 As<Int4>(Float4(1.0f) - Min(Float4(1.0f), (Float4(1.0f) - dst) / src))));
2026 }
2027
blendOpHardlight(Float4 & src,Float4 & dst)2028 Float4 PixelRoutine::blendOpHardlight(Float4 &src, Float4 &dst)
2029 {
2030 Int4 largeSrc = CmpGT(src, Float4(0.5f));
2031 return As<Float4>(
2032 (~largeSrc &
2033 As<Int4>(Float4(2.0f) * src * dst)) |
2034 (largeSrc &
2035 As<Int4>(Float4(1.0f) - (Float4(2.0f) * (Float4(1.0f) - src) * (Float4(1.0f) - dst)))));
2036 }
2037
blendOpSoftlight(Float4 & src,Float4 & dst)2038 Float4 PixelRoutine::blendOpSoftlight(Float4 &src, Float4 &dst)
2039 {
2040 Int4 largeSrc = CmpGT(src, Float4(0.5f));
2041 Int4 largeDst = CmpGT(dst, Float4(0.25f));
2042
2043 return As<Float4>(
2044 (~largeSrc &
2045 As<Int4>(dst - ((Float4(1.0f) - (Float4(2.0f) * src)) * dst * (Float4(1.0f) - dst)))) |
2046 (largeSrc & ((~largeDst &
2047 As<Int4>(dst + (((Float4(2.0f) * src) - Float4(1.0f)) * dst * ((((Float4(16.0f) * dst) - Float4(12.0f)) * dst) + Float4(3.0f))))) |
2048 (largeDst &
2049 As<Int4>(dst + (((Float4(2.0f) * src) - Float4(1.0f)) * (Sqrt(dst) - dst)))))));
2050 }
2051
maxRGB(Vector4f & c)2052 Float4 PixelRoutine::maxRGB(Vector4f &c)
2053 {
2054 return Max(Max(c.x, c.y), c.z);
2055 }
2056
minRGB(Vector4f & c)2057 Float4 PixelRoutine::minRGB(Vector4f &c)
2058 {
2059 return Min(Min(c.x, c.y), c.z);
2060 }
2061
setLumSat(Vector4f & cbase,Vector4f & csat,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2062 void PixelRoutine::setLumSat(Vector4f &cbase, Vector4f &csat, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2063 {
2064 Float4 minbase = minRGB(cbase);
2065 Float4 sbase = maxRGB(cbase) - minbase;
2066 Float4 ssat = maxRGB(csat) - minRGB(csat);
2067 Int4 isNonZero = CmpGT(sbase, Float4(0.0f));
2068 Vector4f color;
2069 color.x = As<Float4>(isNonZero & As<Int4>((cbase.x - minbase) * ssat / sbase));
2070 color.y = As<Float4>(isNonZero & As<Int4>((cbase.y - minbase) * ssat / sbase));
2071 color.z = As<Float4>(isNonZero & As<Int4>((cbase.z - minbase) * ssat / sbase));
2072 setLum(color, clum, x, y, z);
2073 }
2074
lumRGB(Vector4f & c)2075 Float4 PixelRoutine::lumRGB(Vector4f &c)
2076 {
2077 return c.x * Float4(0.3f) + c.y * Float4(0.59f) + c.z * Float4(0.11f);
2078 }
2079
computeLum(Float4 & color,Float4 & lum,Float4 & mincol,Float4 & maxcol,Int4 & negative,Int4 & aboveOne)2080 Float4 PixelRoutine::computeLum(Float4 &color, Float4 &lum, Float4 &mincol, Float4 &maxcol, Int4 &negative, Int4 &aboveOne)
2081 {
2082 return As<Float4>(
2083 (negative &
2084 As<Int4>(lum + ((color - lum) * lum) / (lum - mincol))) |
2085 (~negative &
2086 ((aboveOne &
2087 As<Int4>(lum + ((color - lum) * (Float4(1.0f) - lum)) / (Float4(maxcol) - lum))) |
2088 (~aboveOne &
2089 As<Int4>(color)))));
2090 }
2091
setLum(Vector4f & cbase,Vector4f & clum,Float4 & x,Float4 & y,Float4 & z)2092 void PixelRoutine::setLum(Vector4f &cbase, Vector4f &clum, Float4 &x, Float4 &y, Float4 &z)
2093 {
2094 Float4 lbase = lumRGB(cbase);
2095 Float4 llum = lumRGB(clum);
2096 Float4 ldiff = llum - lbase;
2097
2098 Vector4f color;
2099 color.x = cbase.x + ldiff;
2100 color.y = cbase.y + ldiff;
2101 color.z = cbase.z + ldiff;
2102
2103 Float4 lum = lumRGB(color);
2104 Float4 mincol = minRGB(color);
2105 Float4 maxcol = maxRGB(color);
2106
2107 Int4 negative = CmpLT(mincol, Float4(0.0f));
2108 Int4 aboveOne = CmpGT(maxcol, Float4(1.0f));
2109
2110 x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
2111 y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
2112 z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
2113 }
2114
premultiply(Vector4f & c)2115 void PixelRoutine::premultiply(Vector4f &c)
2116 {
2117 Int4 nonZeroAlpha = CmpNEQ(c.w, Float4(0.0f));
2118 c.x = As<Float4>(nonZeroAlpha & As<Int4>(c.x / c.w));
2119 c.y = As<Float4>(nonZeroAlpha & As<Int4>(c.y / c.w));
2120 c.z = As<Float4>(nonZeroAlpha & As<Int4>(c.z / c.w));
2121 }
2122
// Evaluates the advanced blend operation configured for color attachment 'index'.
//   src / dst             - source and destination colors.
//   srcFactor / dstFactor - per-component factors multiplied into src / dst first.
// Returns a color whose x, y and z hold the blended result; w is not written here.
//
// Steps: scale both colors by their factors, divide their RGB by alpha (premultiply()),
// evaluate the per-channel blend function f(src, dst), then combine:
//   result = f * (As * Ad) + src * (As * (1 - Ad)) + dst * (Ad * (1 - As))
Vector4f PixelRoutine::computeAdvancedBlendMode(int index, const Vector4f &src, const Vector4f &dst, const Vector4f &srcFactor, const Vector4f &dstFactor)
{
	auto applyFactor = [](Vector4f &color, const Vector4f &factor) {
		color.x *= factor.x;
		color.y *= factor.y;
		color.z *= factor.z;
		color.w *= factor.w;
	};

	Vector4f srcColor = src;
	applyFactor(srcColor, srcFactor);

	Vector4f dstColor = dst;
	applyFactor(dstColor, dstFactor);

	premultiply(srcColor);
	premultiply(dstColor);

	Vector4f blendedColor;

	// Applies a two-operand blend function to each color channel in turn.
	auto perChannel = [&](auto blendFunction) {
		blendedColor.x = blendFunction(srcColor.x, dstColor.x);
		blendedColor.y = blendFunction(srcColor.y, dstColor.y);
		blendedColor.z = blendFunction(srcColor.z, dstColor.z);
	};

	switch(state.blendState[index].blendOperation)
	{
	case VK_BLEND_OP_MULTIPLY_EXT:
		perChannel([](Float4 &s, Float4 &d) { return (s * d); });
		break;
	case VK_BLEND_OP_SCREEN_EXT:
		perChannel([](Float4 &s, Float4 &d) { return s + d - (s * d); });
		break;
	case VK_BLEND_OP_OVERLAY_EXT:
		perChannel([this](Float4 &s, Float4 &d) { return blendOpOverlay(s, d); });
		break;
	case VK_BLEND_OP_DARKEN_EXT:
		perChannel([](Float4 &s, Float4 &d) { return Min(s, d); });
		break;
	case VK_BLEND_OP_LIGHTEN_EXT:
		perChannel([](Float4 &s, Float4 &d) { return Max(s, d); });
		break;
	case VK_BLEND_OP_COLORDODGE_EXT:
		perChannel([this](Float4 &s, Float4 &d) { return blendOpColorDodge(s, d); });
		break;
	case VK_BLEND_OP_COLORBURN_EXT:
		perChannel([this](Float4 &s, Float4 &d) { return blendOpColorBurn(s, d); });
		break;
	case VK_BLEND_OP_HARDLIGHT_EXT:
		perChannel([this](Float4 &s, Float4 &d) { return blendOpHardlight(s, d); });
		break;
	case VK_BLEND_OP_SOFTLIGHT_EXT:
		perChannel([this](Float4 &s, Float4 &d) { return blendOpSoftlight(s, d); });
		break;
	case VK_BLEND_OP_DIFFERENCE_EXT:
		perChannel([](Float4 &s, Float4 &d) { return Abs(s - d); });
		break;
	case VK_BLEND_OP_EXCLUSION_EXT:
		perChannel([](Float4 &s, Float4 &d) { return s + d - (s * d * Float4(2.0f)); });
		break;
	// The HSL operations work on whole colors rather than on individual channels.
	case VK_BLEND_OP_HSL_HUE_EXT:
		setLumSat(srcColor, dstColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	case VK_BLEND_OP_HSL_SATURATION_EXT:
		setLumSat(dstColor, srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	case VK_BLEND_OP_HSL_COLOR_EXT:
		setLum(srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
		setLum(dstColor, srcColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	default:
		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
		break;
	}

	// Weights of the three regions: covered by both, by source only, by destination only.
	Float4 bothWeight = srcColor.w * dstColor.w;
	Float4 srcOnlyWeight = srcColor.w * (Float4(1.0f) - dstColor.w);
	Float4 dstOnlyWeight = dstColor.w * (Float4(1.0f) - srcColor.w);

	auto combine = [&](Float4 &blended, const Float4 &s, const Float4 &d) {
		blended *= bothWeight;
		blended += s * srcOnlyWeight;
		blended += d * dstOnlyWeight;
	};

	combine(blendedColor.x, srcColor.x, dstColor.x);
	combine(blendedColor.y, srcColor.y, dstColor.y);
	combine(blendedColor.z, srcColor.z, dstColor.z);

	return blendedColor;
}
2233
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)2234 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
2235 {
2236 switch(blendFactor)
2237 {
2238 case VK_BLEND_FACTOR_ZERO:
2239 case VK_BLEND_FACTOR_ONE:
2240 return false;
2241 case VK_BLEND_FACTOR_SRC_COLOR:
2242 case VK_BLEND_FACTOR_SRC_ALPHA:
2243 // Source values have been clamped after fragment shader execution if the attachment format is normalized.
2244 return false;
2245 case VK_BLEND_FACTOR_DST_COLOR:
2246 case VK_BLEND_FACTOR_DST_ALPHA:
2247 // Dest values have a valid range due to being read from the attachment.
2248 return false;
2249 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
2250 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
2251 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
2252 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
2253 // For signed formats, negative values cause the result to exceed 1.0.
2254 return format.isSignedNormalized();
2255 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
2256 // min(As, 1 - Ad)
2257 return false;
2258 case VK_BLEND_FACTOR_CONSTANT_COLOR:
2259 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
2260 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
2261 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
2262 return false;
2263
2264 default:
2265 UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
2266 return false;
2267 }
2268 }
2269
alphaBlend(int index,const Pointer<Byte> & cBuffer,const Vector4f & sourceColor,const Int & x)2270 Vector4f PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const Vector4f &sourceColor, const Int &x)
2271 {
2272 if(!state.blendState[index].alphaBlendEnable)
2273 {
2274 return sourceColor;
2275 }
2276
2277 vk::Format format = state.colorFormat[index];
2278 ASSERT(format.supportsColorAttachmentBlend());
2279
2280 Pointer<Byte> buffer = cBuffer;
2281 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2282
2283 // destColor holds four texel color values.
2284 // Note: Despite the type being Vector4f, the colors may be stored as
2285 // integers. Half-floats are stored as full 32-bit floats.
2286 // Non-float and non-fixed point formats are not alpha blended.
2287 Vector4f destColor;
2288
2289 switch(format)
2290 {
2291 case VK_FORMAT_R32_SINT:
2292 case VK_FORMAT_R32_UINT:
2293 case VK_FORMAT_R32_SFLOAT:
2294 // FIXME: movlps
2295 buffer += 4 * x;
2296 destColor.x.x = *Pointer<Float>(buffer + 0);
2297 destColor.x.y = *Pointer<Float>(buffer + 4);
2298 buffer += pitchB;
2299 // FIXME: movhps
2300 destColor.x.z = *Pointer<Float>(buffer + 0);
2301 destColor.x.w = *Pointer<Float>(buffer + 4);
2302 destColor.y = destColor.z = destColor.w = Float4(1.0f);
2303 break;
2304 case VK_FORMAT_R32G32_SINT:
2305 case VK_FORMAT_R32G32_UINT:
2306 case VK_FORMAT_R32G32_SFLOAT:
2307 buffer += 8 * x;
2308 destColor.x = *Pointer<Float4>(buffer, 16);
2309 buffer += pitchB;
2310 destColor.y = *Pointer<Float4>(buffer, 16);
2311 destColor.z = destColor.x;
2312 destColor.x = ShuffleLowHigh(destColor.x, destColor.y, 0x0202);
2313 destColor.z = ShuffleLowHigh(destColor.z, destColor.y, 0x1313);
2314 destColor.y = destColor.z;
2315 destColor.z = destColor.w = Float4(1.0f);
2316 break;
2317 case VK_FORMAT_R32G32B32A32_SFLOAT:
2318 case VK_FORMAT_R32G32B32A32_SINT:
2319 case VK_FORMAT_R32G32B32A32_UINT:
2320 buffer += 16 * x;
2321 destColor.x = *Pointer<Float4>(buffer + 0, 16);
2322 destColor.y = *Pointer<Float4>(buffer + 16, 16);
2323 buffer += pitchB;
2324 destColor.z = *Pointer<Float4>(buffer + 0, 16);
2325 destColor.w = *Pointer<Float4>(buffer + 16, 16);
2326 transpose4x4(destColor.x, destColor.y, destColor.z, destColor.w);
2327 break;
2328 case VK_FORMAT_R16_UNORM:
2329 buffer += 2 * x;
2330 destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2331 destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
2332 buffer += pitchB;
2333 destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2334 destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
2335 destColor.x *= Float4(1.0f / 0xFFFF);
2336 destColor.y = destColor.z = destColor.w = Float4(1.0f);
2337 break;
2338 case VK_FORMAT_R16_SFLOAT:
2339 buffer += 2 * x;
2340 destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2341 destColor.x.y = Float(*Pointer<Half>(buffer + 2));
2342 buffer += pitchB;
2343 destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2344 destColor.x.w = Float(*Pointer<Half>(buffer + 2));
2345 destColor.y = destColor.z = destColor.w = Float4(1.0f);
2346 break;
2347 case VK_FORMAT_R16G16_UNORM:
2348 buffer += 4 * x;
2349 destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
2350 destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
2351 destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
2352 destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
2353 buffer += pitchB;
2354 destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
2355 destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
2356 destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
2357 destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
2358 destColor.x *= Float4(1.0f / 0xFFFF);
2359 destColor.y *= Float4(1.0f / 0xFFFF);
2360 destColor.z = destColor.w = Float4(1.0f);
2361 break;
2362 case VK_FORMAT_R16G16_SFLOAT:
2363 buffer += 4 * x;
2364 destColor.x.x = Float(*Pointer<Half>(buffer + 0));
2365 destColor.y.x = Float(*Pointer<Half>(buffer + 2));
2366 destColor.x.y = Float(*Pointer<Half>(buffer + 4));
2367 destColor.y.y = Float(*Pointer<Half>(buffer + 6));
2368 buffer += pitchB;
2369 destColor.x.z = Float(*Pointer<Half>(buffer + 0));
2370 destColor.y.z = Float(*Pointer<Half>(buffer + 2));
2371 destColor.x.w = Float(*Pointer<Half>(buffer + 4));
2372 destColor.y.w = Float(*Pointer<Half>(buffer + 6));
2373 destColor.z = destColor.w = Float4(1.0f);
2374 break;
2375 case VK_FORMAT_R16G16B16A16_UNORM:
2376 buffer += 8 * x;
2377 destColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2378 destColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2379 destColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2380 destColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2381 destColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2382 destColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2383 destColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2384 destColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2385 buffer += pitchB;
2386 destColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
2387 destColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
2388 destColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
2389 destColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
2390 destColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
2391 destColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
2392 destColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
2393 destColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
2394 destColor.x *= Float4(1.0f / 0xFFFF);
2395 destColor.y *= Float4(1.0f / 0xFFFF);
2396 destColor.z *= Float4(1.0f / 0xFFFF);
2397 destColor.w *= Float4(1.0f / 0xFFFF);
2398 break;
2399 case VK_FORMAT_R16G16B16A16_SFLOAT:
2400 buffer += 8 * x;
2401 destColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
2402 destColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
2403 destColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
2404 destColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
2405 destColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
2406 destColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
2407 destColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
2408 destColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
2409 buffer += pitchB;
2410 destColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
2411 destColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
2412 destColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
2413 destColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
2414 destColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
2415 destColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
2416 destColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
2417 destColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
2418 break;
2419 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2420 buffer += 4 * x;
2421 destColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2422 destColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2423 buffer += pitchB;
2424 destColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
2425 destColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
2426 transpose4x3(destColor.x, destColor.y, destColor.z, destColor.w);
2427 destColor.w = Float4(1.0f);
2428 break;
2429 default:
2430 {
2431 // Attempt to read an integer based format and convert it to float
2432 Vector4s color;
2433 readPixel(index, cBuffer, x, color);
2434 destColor.x = convertFloat32(As<UShort4>(color.x));
2435 destColor.y = convertFloat32(As<UShort4>(color.y));
2436 destColor.z = convertFloat32(As<UShort4>(color.z));
2437 destColor.w = convertFloat32(As<UShort4>(color.w));
2438 }
2439 break;
2440 }
2441
2442 Vector4f sourceFactor;
2443 Vector4f destFactor;
2444
2445 blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
2446 blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
2447 blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
2448 blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
2449
2450 Vector4f blendedColor;
2451
2452 switch(state.blendState[index].blendOperation)
2453 {
2454 case VK_BLEND_OP_ADD:
2455 blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
2456 blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
2457 blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
2458 break;
2459 case VK_BLEND_OP_SUBTRACT:
2460 blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
2461 blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
2462 blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
2463 break;
2464 case VK_BLEND_OP_REVERSE_SUBTRACT:
2465 blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
2466 blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
2467 blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
2468 break;
2469 case VK_BLEND_OP_MIN:
2470 blendedColor.x = Min(sourceColor.x, destColor.x);
2471 blendedColor.y = Min(sourceColor.y, destColor.y);
2472 blendedColor.z = Min(sourceColor.z, destColor.z);
2473 break;
2474 case VK_BLEND_OP_MAX:
2475 blendedColor.x = Max(sourceColor.x, destColor.x);
2476 blendedColor.y = Max(sourceColor.y, destColor.y);
2477 blendedColor.z = Max(sourceColor.z, destColor.z);
2478 break;
2479 case VK_BLEND_OP_SRC_EXT:
2480 blendedColor.x = sourceColor.x * sourceFactor.x; // TODO(b/204583457)
2481 blendedColor.y = sourceColor.y * sourceFactor.y; // TODO(b/204583457)
2482 blendedColor.z = sourceColor.z * sourceFactor.z; // TODO(b/204583457)
2483 break;
2484 case VK_BLEND_OP_DST_EXT:
2485 blendedColor.x = destColor.x * destFactor.x; // TODO(b/204583457)
2486 blendedColor.y = destColor.y * destFactor.y; // TODO(b/204583457)
2487 blendedColor.z = destColor.z * destFactor.z; // TODO(b/204583457)
2488 break;
2489 case VK_BLEND_OP_ZERO_EXT:
2490 blendedColor.x = Float4(0.0f);
2491 blendedColor.y = Float4(0.0f);
2492 blendedColor.z = Float4(0.0f);
2493 break;
2494 case VK_BLEND_OP_MULTIPLY_EXT:
2495 case VK_BLEND_OP_SCREEN_EXT:
2496 case VK_BLEND_OP_OVERLAY_EXT:
2497 case VK_BLEND_OP_DARKEN_EXT:
2498 case VK_BLEND_OP_LIGHTEN_EXT:
2499 case VK_BLEND_OP_COLORDODGE_EXT:
2500 case VK_BLEND_OP_COLORBURN_EXT:
2501 case VK_BLEND_OP_HARDLIGHT_EXT:
2502 case VK_BLEND_OP_SOFTLIGHT_EXT:
2503 case VK_BLEND_OP_DIFFERENCE_EXT:
2504 case VK_BLEND_OP_EXCLUSION_EXT:
2505 case VK_BLEND_OP_HSL_HUE_EXT:
2506 case VK_BLEND_OP_HSL_SATURATION_EXT:
2507 case VK_BLEND_OP_HSL_COLOR_EXT:
2508 case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
2509 blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
2510 break;
2511 default:
2512 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
2513 }
2514
2515 switch(state.blendState[index].blendOperationAlpha)
2516 {
2517 case VK_BLEND_OP_ADD:
2518 blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
2519 break;
2520 case VK_BLEND_OP_SUBTRACT:
2521 blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
2522 break;
2523 case VK_BLEND_OP_REVERSE_SUBTRACT:
2524 blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
2525 break;
2526 case VK_BLEND_OP_MIN:
2527 blendedColor.w = Min(sourceColor.w, destColor.w);
2528 break;
2529 case VK_BLEND_OP_MAX:
2530 blendedColor.w = Max(sourceColor.w, destColor.w);
2531 break;
2532 case VK_BLEND_OP_SRC_EXT:
2533 blendedColor.w = sourceColor.w * sourceFactor.w; // TODO(b/204583457)
2534 break;
2535 case VK_BLEND_OP_DST_EXT:
2536 blendedColor.w = destColor.w * destFactor.w; // TODO(b/204583457)
2537 break;
2538 case VK_BLEND_OP_ZERO_EXT:
2539 blendedColor.w = Float4(0.0f);
2540 break;
2541 case VK_BLEND_OP_MULTIPLY_EXT:
2542 // All of the currently supported advanced blend modes compute the alpha the same way
2543 // Use VK_BLEND_OP_MULTIPLY_EXT as a placeholder
2544 blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
2545 break;
2546 default:
2547 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
2548 }
2549
2550 return blendedColor;
2551 }
2552
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)2553 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
2554 {
2555 vk::Format format = state.colorFormat[index];
2556 switch(format)
2557 {
2558 case VK_FORMAT_R16G16B16A16_UNORM:
2559 color.w = Min(Max(color.w, Float4(0.0f)), Float4(1.0f)); // TODO(b/204560089): Omit clamp if redundant
2560 color.w = As<Float4>(RoundInt(color.w * Float4(0xFFFF)));
2561 color.z = Min(Max(color.z, Float4(0.0f)), Float4(1.0f)); // TODO(b/204560089): Omit clamp if redundant
2562 color.z = As<Float4>(RoundInt(color.z * Float4(0xFFFF)));
2563 // [[fallthrough]]
2564 case VK_FORMAT_R16G16_UNORM:
2565 color.y = Min(Max(color.y, Float4(0.0f)), Float4(1.0f)); // TODO(b/204560089): Omit clamp if redundant
2566 color.y = As<Float4>(RoundInt(color.y * Float4(0xFFFF)));
2567 //[[fallthrough]]
2568 case VK_FORMAT_R16_UNORM:
2569 color.x = Min(Max(color.x, Float4(0.0f)), Float4(1.0f)); // TODO(b/204560089): Omit clamp if redundant
2570 color.x = As<Float4>(RoundInt(color.x * Float4(0xFFFF)));
2571 break;
2572 default:
2573 // TODO(b/204560089): Omit clamp if redundant
2574 if(format.isUnsignedNormalized())
2575 {
2576 color.x = Min(Max(color.x, Float4(0.0f)), Float4(1.0f));
2577 color.y = Min(Max(color.y, Float4(0.0f)), Float4(1.0f));
2578 color.z = Min(Max(color.z, Float4(0.0f)), Float4(1.0f));
2579 color.w = Min(Max(color.w, Float4(0.0f)), Float4(1.0f));
2580 }
2581 else if(format.isSignedNormalized())
2582 {
2583 color.x = Min(Max(color.x, Float4(-1.0f)), Float4(1.0f));
2584 color.y = Min(Max(color.y, Float4(-1.0f)), Float4(1.0f));
2585 color.z = Min(Max(color.z, Float4(-1.0f)), Float4(1.0f));
2586 color.w = Min(Max(color.w, Float4(-1.0f)), Float4(1.0f));
2587 }
2588 }
2589
2590 switch(format)
2591 {
2592 case VK_FORMAT_R16_SFLOAT:
2593 case VK_FORMAT_R32_SFLOAT:
2594 case VK_FORMAT_R32_SINT:
2595 case VK_FORMAT_R32_UINT:
2596 case VK_FORMAT_R16_UNORM:
2597 case VK_FORMAT_R16_SINT:
2598 case VK_FORMAT_R16_UINT:
2599 case VK_FORMAT_R8_SINT:
2600 case VK_FORMAT_R8_UINT:
2601 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2602 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2603 break;
2604 case VK_FORMAT_R16G16_SFLOAT:
2605 case VK_FORMAT_R32G32_SFLOAT:
2606 case VK_FORMAT_R32G32_SINT:
2607 case VK_FORMAT_R32G32_UINT:
2608 case VK_FORMAT_R16G16_UNORM:
2609 case VK_FORMAT_R16G16_SINT:
2610 case VK_FORMAT_R16G16_UINT:
2611 case VK_FORMAT_R8G8_SINT:
2612 case VK_FORMAT_R8G8_UINT:
2613 color.z = color.x;
2614 color.x = UnpackLow(color.x, color.y);
2615 color.z = UnpackHigh(color.z, color.y);
2616 color.y = color.z;
2617 break;
2618 case VK_FORMAT_R16G16B16A16_SFLOAT:
2619 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2620 case VK_FORMAT_R32G32B32A32_SFLOAT:
2621 case VK_FORMAT_R32G32B32A32_SINT:
2622 case VK_FORMAT_R32G32B32A32_UINT:
2623 case VK_FORMAT_R16G16B16A16_UNORM:
2624 case VK_FORMAT_R16G16B16A16_SINT:
2625 case VK_FORMAT_R16G16B16A16_UINT:
2626 case VK_FORMAT_R8G8B8A8_SINT:
2627 case VK_FORMAT_R8G8B8A8_UINT:
2628 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2629 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2630 transpose4x4(color.x, color.y, color.z, color.w);
2631 break;
2632 default:
2633 UNSUPPORTED("VkFormat: %d", int(format));
2634 }
2635
2636 int rgbaWriteMask = state.colorWriteActive(index);
2637 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
2638
2639 Int xMask; // Combination of all masks
2640
2641 if(state.depthTestActive)
2642 {
2643 xMask = zMask;
2644 }
2645 else
2646 {
2647 xMask = cMask;
2648 }
2649
2650 if(state.stencilActive)
2651 {
2652 xMask &= sMask;
2653 }
2654
2655 Pointer<Byte> buffer = cBuffer;
2656 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2657 Float4 value;
2658
2659 switch(format)
2660 {
2661 case VK_FORMAT_R32_SFLOAT:
2662 case VK_FORMAT_R32_SINT:
2663 case VK_FORMAT_R32_UINT:
2664 if(rgbaWriteMask & 0x00000001)
2665 {
2666 buffer += 4 * x;
2667
2668 // FIXME: movlps
2669 value.x = *Pointer<Float>(buffer + 0);
2670 value.y = *Pointer<Float>(buffer + 4);
2671
2672 buffer += pitchB;
2673
2674 // FIXME: movhps
2675 value.z = *Pointer<Float>(buffer + 0);
2676 value.w = *Pointer<Float>(buffer + 4);
2677
2678 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2679 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2680 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2681
2682 // FIXME: movhps
2683 *Pointer<Float>(buffer + 0) = color.x.z;
2684 *Pointer<Float>(buffer + 4) = color.x.w;
2685
2686 buffer -= pitchB;
2687
2688 // FIXME: movlps
2689 *Pointer<Float>(buffer + 0) = color.x.x;
2690 *Pointer<Float>(buffer + 4) = color.x.y;
2691 }
2692 break;
2693 case VK_FORMAT_R16_SFLOAT:
2694 if(rgbaWriteMask & 0x00000001)
2695 {
2696 buffer += 2 * x;
2697
2698 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2699 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2700
2701 buffer += pitchB;
2702
2703 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2704 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2705
2706 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2707 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2708 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2709
2710 *Pointer<Half>(buffer + 0) = Half(color.x.z);
2711 *Pointer<Half>(buffer + 2) = Half(color.x.w);
2712
2713 buffer -= pitchB;
2714
2715 *Pointer<Half>(buffer + 0) = Half(color.x.x);
2716 *Pointer<Half>(buffer + 2) = Half(color.x.y);
2717 }
2718 break;
2719 case VK_FORMAT_R16_UNORM:
2720 case VK_FORMAT_R16_SINT:
2721 case VK_FORMAT_R16_UINT:
2722 if(rgbaWriteMask & 0x00000001)
2723 {
2724 buffer += 2 * x;
2725
2726 UShort4 xyzw;
2727 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2728
2729 buffer += pitchB;
2730
2731 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2732 value = As<Float4>(Int4(xyzw));
2733
2734 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2735 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2736 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2737
2738 Float component = color.x.z;
2739 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2740 component = color.x.w;
2741 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2742
2743 buffer -= pitchB;
2744
2745 component = color.x.x;
2746 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2747 component = color.x.y;
2748 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2749 }
2750 break;
2751 case VK_FORMAT_R8_SINT:
2752 case VK_FORMAT_R8_UINT:
2753 if(rgbaWriteMask & 0x00000001)
2754 {
2755 buffer += x;
2756
2757 UInt xyzw, packedCol;
2758
2759 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2760 buffer += pitchB;
2761 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2762
2763 Short4 tmpCol = Short4(As<Int4>(color.x));
2764 if(format == VK_FORMAT_R8_SINT)
2765 {
2766 tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2767 }
2768 else
2769 {
2770 tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2771 }
2772 packedCol = Extract(As<Int2>(tmpCol), 0);
2773
2774 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2775 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2776
2777 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2778 buffer -= pitchB;
2779 *Pointer<UShort>(buffer) = UShort(packedCol);
2780 }
2781 break;
2782 case VK_FORMAT_R32G32_SFLOAT:
2783 case VK_FORMAT_R32G32_SINT:
2784 case VK_FORMAT_R32G32_UINT:
2785 buffer += 8 * x;
2786
2787 value = *Pointer<Float4>(buffer);
2788
2789 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2790 {
2791 Float4 masked = value;
2792 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2793 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2794 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2795 }
2796
2797 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2798 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2799 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2800 *Pointer<Float4>(buffer) = color.x;
2801
2802 buffer += pitchB;
2803
2804 value = *Pointer<Float4>(buffer);
2805
2806 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2807 {
2808 Float4 masked;
2809
2810 masked = value;
2811 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2812 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2813 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2814 }
2815
2816 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2817 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2818 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2819 *Pointer<Float4>(buffer) = color.y;
2820 break;
2821 case VK_FORMAT_R16G16_SFLOAT:
2822 if((rgbaWriteMask & 0x00000003) != 0x0)
2823 {
2824 buffer += 4 * x;
2825
2826 UInt2 rgbaMask;
2827 UInt2 packedCol;
2828 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2829 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2830
2831 UShort4 value = *Pointer<UShort4>(buffer);
2832 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2833 if((rgbaWriteMask & 0x3) != 0x3)
2834 {
2835 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2836 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2837 mergedMask &= rgbaMask;
2838 }
2839 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2840
2841 buffer += pitchB;
2842
2843 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2844 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2845 value = *Pointer<UShort4>(buffer);
2846 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2847 if((rgbaWriteMask & 0x3) != 0x3)
2848 {
2849 mergedMask &= rgbaMask;
2850 }
2851 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2852 }
2853 break;
2854 case VK_FORMAT_R16G16_UNORM:
2855 case VK_FORMAT_R16G16_SINT:
2856 case VK_FORMAT_R16G16_UINT:
2857 if((rgbaWriteMask & 0x00000003) != 0x0)
2858 {
2859 buffer += 4 * x;
2860
2861 UInt2 rgbaMask;
2862 UShort4 packedCol = UShort4(As<Int4>(color.x));
2863 UShort4 value = *Pointer<UShort4>(buffer);
2864 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2865 if((rgbaWriteMask & 0x3) != 0x3)
2866 {
2867 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2868 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2869 mergedMask &= rgbaMask;
2870 }
2871 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2872
2873 buffer += pitchB;
2874
2875 packedCol = UShort4(As<Int4>(color.y));
2876 value = *Pointer<UShort4>(buffer);
2877 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2878 if((rgbaWriteMask & 0x3) != 0x3)
2879 {
2880 mergedMask &= rgbaMask;
2881 }
2882 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2883 }
2884 break;
2885 case VK_FORMAT_R8G8_SINT:
2886 case VK_FORMAT_R8G8_UINT:
2887 if((rgbaWriteMask & 0x00000003) != 0x0)
2888 {
2889 buffer += 2 * x;
2890
2891 Int2 xyzw, packedCol;
2892
2893 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2894 buffer += pitchB;
2895 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2896
2897 if(format == VK_FORMAT_R8G8_SINT)
2898 {
2899 packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2900 }
2901 else
2902 {
2903 packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2904 }
2905
2906 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2907 if((rgbaWriteMask & 0x3) != 0x3)
2908 {
2909 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2910 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2911 mergedMask &= rgbaMask;
2912 }
2913
2914 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2915
2916 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2917 buffer -= pitchB;
2918 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2919 }
2920 break;
2921 case VK_FORMAT_R32G32B32A32_SFLOAT:
2922 case VK_FORMAT_R32G32B32A32_SINT:
2923 case VK_FORMAT_R32G32B32A32_UINT:
2924 buffer += 16 * x;
2925
2926 {
2927 value = *Pointer<Float4>(buffer, 16);
2928
2929 if(rgbaWriteMask != 0x0000000F)
2930 {
2931 Float4 masked = value;
2932 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2933 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2934 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2935 }
2936
2937 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2938 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2939 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2940 *Pointer<Float4>(buffer, 16) = color.x;
2941 }
2942
2943 {
2944 value = *Pointer<Float4>(buffer + 16, 16);
2945
2946 if(rgbaWriteMask != 0x0000000F)
2947 {
2948 Float4 masked = value;
2949 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2950 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2951 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2952 }
2953
2954 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2955 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2956 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2957 *Pointer<Float4>(buffer + 16, 16) = color.y;
2958 }
2959
2960 buffer += pitchB;
2961
2962 {
2963 value = *Pointer<Float4>(buffer, 16);
2964
2965 if(rgbaWriteMask != 0x0000000F)
2966 {
2967 Float4 masked = value;
2968 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2969 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2970 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2971 }
2972
2973 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2974 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2975 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2976 *Pointer<Float4>(buffer, 16) = color.z;
2977 }
2978
2979 {
2980 value = *Pointer<Float4>(buffer + 16, 16);
2981
2982 if(rgbaWriteMask != 0x0000000F)
2983 {
2984 Float4 masked = value;
2985 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2986 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2987 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2988 }
2989
2990 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2991 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2992 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2993 *Pointer<Float4>(buffer + 16, 16) = color.w;
2994 }
2995 break;
2996 case VK_FORMAT_R16G16B16A16_SFLOAT:
2997 if((rgbaWriteMask & 0x0000000F) != 0x0)
2998 {
2999 buffer += 8 * x;
3000
3001 UInt4 rgbaMask;
3002 UInt4 value = *Pointer<UInt4>(buffer);
3003 UInt4 packedCol;
3004 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
3005 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
3006 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
3007 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
3008 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3009 if((rgbaWriteMask & 0xF) != 0xF)
3010 {
3011 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3012 rgbaMask = UInt4(tmpMask, tmpMask);
3013 mergedMask &= rgbaMask;
3014 }
3015 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3016
3017 buffer += pitchB;
3018
3019 value = *Pointer<UInt4>(buffer);
3020 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
3021 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
3022 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
3023 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
3024 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3025 if((rgbaWriteMask & 0xF) != 0xF)
3026 {
3027 mergedMask &= rgbaMask;
3028 }
3029 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3030 }
3031 break;
3032 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
3033 if((rgbaWriteMask & 0x7) != 0x0)
3034 {
3035 buffer += 4 * x;
3036
3037 UInt4 packedCol;
3038 packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
3039 packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
3040 packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
3041 packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
3042
3043 UInt4 value;
3044 value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
3045 value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
3046 buffer += pitchB;
3047 value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
3048 value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
3049
3050 UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0][0]) + xMask * 16, 16);
3051 if((rgbaWriteMask & 0x7) != 0x7)
3052 {
3053 mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[rgbaWriteMask & 0x7][0]), 16);
3054 }
3055 value = (packedCol & mask) | (value & ~mask);
3056
3057 *Pointer<UInt>(buffer + 0) = value.z;
3058 *Pointer<UInt>(buffer + 4) = value.w;
3059 buffer -= pitchB;
3060 *Pointer<UInt>(buffer + 0) = value.x;
3061 *Pointer<UInt>(buffer + 4) = value.y;
3062 }
3063 break;
3064 case VK_FORMAT_R16G16B16A16_UNORM:
3065 case VK_FORMAT_R16G16B16A16_SINT:
3066 case VK_FORMAT_R16G16B16A16_UINT:
3067 if((rgbaWriteMask & 0x0000000F) != 0x0)
3068 {
3069 buffer += 8 * x;
3070
3071 UInt4 rgbaMask;
3072 UShort8 value = *Pointer<UShort8>(buffer);
3073 UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
3074 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
3075 if((rgbaWriteMask & 0xF) != 0xF)
3076 {
3077 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
3078 rgbaMask = UInt4(tmpMask, tmpMask);
3079 mergedMask &= rgbaMask;
3080 }
3081 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3082
3083 buffer += pitchB;
3084
3085 value = *Pointer<UShort8>(buffer);
3086 packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
3087 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
3088 if((rgbaWriteMask & 0xF) != 0xF)
3089 {
3090 mergedMask &= rgbaMask;
3091 }
3092 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
3093 }
3094 break;
3095 case VK_FORMAT_R8G8B8A8_SINT:
3096 case VK_FORMAT_R8G8B8A8_UINT:
3097 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
3098 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
3099 if((rgbaWriteMask & 0x0000000F) != 0x0)
3100 {
3101 UInt2 value, packedCol, mergedMask;
3102
3103 buffer += 4 * x;
3104
3105 bool isSigned = (format == VK_FORMAT_R8G8B8A8_SINT) || (format == VK_FORMAT_A8B8G8R8_SINT_PACK32);
3106
3107 if(isSigned)
3108 {
3109 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3110 }
3111 else
3112 {
3113 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
3114 }
3115 value = *Pointer<UInt2>(buffer, 16);
3116 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3117 if(rgbaWriteMask != 0xF)
3118 {
3119 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3120 }
3121 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3122
3123 buffer += pitchB;
3124
3125 if(isSigned)
3126 {
3127 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3128 }
3129 else
3130 {
3131 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
3132 }
3133 value = *Pointer<UInt2>(buffer, 16);
3134 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3135 if(rgbaWriteMask != 0xF)
3136 {
3137 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
3138 }
3139 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
3140 }
3141 break;
3142 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
3143 if((rgbaWriteMask & 0x0000000F) != 0x0)
3144 {
3145 Int2 mergedMask, packedCol, value;
3146 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3147 ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
3148 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3149 ((As<Int4>(color.x) & Int4(0x3ff)));
3150
3151 buffer += 4 * x;
3152 value = *Pointer<Int2>(buffer, 16);
3153 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3154 if(rgbaWriteMask != 0xF)
3155 {
3156 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3157 }
3158 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3159
3160 buffer += pitchB;
3161
3162 value = *Pointer<Int2>(buffer, 16);
3163 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3164 if(rgbaWriteMask != 0xF)
3165 {
3166 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
3167 }
3168 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3169 }
3170 break;
3171 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
3172 if((bgraWriteMask & 0x0000000F) != 0x0)
3173 {
3174 Int2 mergedMask, packedCol, value;
3175 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
3176 ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
3177 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
3178 ((As<Int4>(color.z) & Int4(0x3ff)));
3179
3180 buffer += 4 * x;
3181 value = *Pointer<Int2>(buffer, 16);
3182 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
3183 if(bgraWriteMask != 0xF)
3184 {
3185 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3186 }
3187 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
3188
3189 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
3190
3191 value = *Pointer<Int2>(buffer, 16);
3192 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
3193 if(bgraWriteMask != 0xF)
3194 {
3195 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
3196 }
3197 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
3198 }
3199 break;
3200 default:
3201 UNSUPPORTED("VkFormat: %d", int(format));
3202 }
3203 }
3204
convertFixed16(const Float4 & cf,bool saturate)3205 UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
3206 {
3207 return UShort4(cf * Float4(0xFFFF), saturate);
3208 }
3209
convertFloat32(const UShort4 & cf)3210 Float4 PixelRoutine::convertFloat32(const UShort4 &cf)
3211 {
3212 return Float4(cf) * Float4(1.0f / 65535.0f);
3213 }
3214
sRGBtoLinear16_12_16(Vector4s & c)3215 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
3216 {
3217 Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);
3218
3219 c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3220 c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3221 c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3222
3223 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3224 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3225 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3226 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3227
3228 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3229 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3230 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3231 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3232
3233 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3234 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3235 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3236 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3237 }
3238
linearToSRGB16_12_16(Vector4s & c)3239 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
3240 {
3241 c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
3242 c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
3243 c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
3244
3245 linearToSRGB12_16(c);
3246 }
3247
linearToSRGB12_16(Vector4s & c)3248 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
3249 {
3250 Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);
3251
3252 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
3253 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
3254 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
3255 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
3256
3257 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
3258 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
3259 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
3260 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
3261
3262 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
3263 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
3264 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
3265 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
3266 }
3267
// Approximates x^2.2 as a weighted sum of x^2 and x^3, clamped to [0, 1].
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	const Float4 squared = x * x;
	const Float4 blend = squared * Float4(0.73f) + squared * x * Float4(0.27f);

	const Float4 zero(0.0f);
	const Float4 one(1.0f);

	return Min(Max(blend, zero), one);
}
3275
3276 } // namespace sw
3277