1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "Vulkan/VkPipelineLayout.hpp"
24
25 namespace sw {
26
PixelRoutine(const PixelProcessor::State & state,vk::PipelineLayout const * pipelineLayout,SpirvShader const * spirvShader,const vk::DescriptorSet::Bindings & descriptorSets)27 PixelRoutine::PixelRoutine(
28 const PixelProcessor::State &state,
29 vk::PipelineLayout const *pipelineLayout,
30 SpirvShader const *spirvShader,
31 const vk::DescriptorSet::Bindings &descriptorSets)
32 : QuadRasterizer(state, spirvShader)
33 , routine(pipelineLayout)
34 , descriptorSets(descriptorSets)
35 {
36 if(spirvShader)
37 {
38 spirvShader->emitProlog(&routine);
39
40 // Clearing inputs to 0 is not demanded by the spec,
41 // but it makes the undefined behavior deterministic.
42 for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
43 {
44 routine.inputs[i] = Float4(0.0f);
45 }
46 }
47 }
48
~PixelRoutine()49 PixelRoutine::~PixelRoutine()
50 {
51 }
52
quad(Pointer<Byte> cBuffer[RENDERTARGETS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)53 void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
54 {
55 // TODO: consider shader which modifies sample mask in general
56 const bool earlyDepthTest = !spirvShader || (spirvShader->getModes().EarlyFragmentTests && !spirvShader->getModes().DepthReplacing && !state.alphaToCoverage);
57
58 Int zMask[4]; // Depth mask
59 Int sMask[4]; // Stencil mask
60
61 for(unsigned int q = 0; q < state.multiSampleCount; q++)
62 {
63 zMask[q] = cMask[q];
64 sMask[q] = cMask[q];
65 }
66
67 for(unsigned int q = 0; q < state.multiSampleCount; q++)
68 {
69 stencilTest(sBuffer, q, x, sMask[q], cMask[q]);
70 }
71
72 Float4 f;
73 Float4 rhwCentroid;
74
75 Float4 xxxx = Float4(Float(x)) + *Pointer<Float4>(primitive + OFFSET(Primitive, xQuad), 16);
76
77 if(interpolateZ())
78 {
79 for(unsigned int q = 0; q < state.multiSampleCount; q++)
80 {
81 Float4 x = xxxx;
82
83 if(state.enableMultiSampling)
84 {
85 x -= *Pointer<Float4>(constants + OFFSET(Constants, X) + q * sizeof(float4));
86 }
87
88 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false, state.depthClamp);
89 }
90 }
91
92 Bool depthPass = false;
93
94 if(earlyDepthTest)
95 {
96 for(unsigned int q = 0; q < state.multiSampleCount; q++)
97 {
98 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
99 }
100 }
101
102 If(depthPass || Bool(!earlyDepthTest))
103 {
104 Float4 yyyy = Float4(Float(y)) + *Pointer<Float4>(primitive + OFFSET(Primitive, yQuad), 16);
105
106 // Centroid locations
107 Float4 XXXX = Float4(0.0f);
108 Float4 YYYY = Float4(0.0f);
109
110 if(state.centroid)
111 {
112 Float4 WWWW(1.0e-9f);
113
114 for(unsigned int q = 0; q < state.multiSampleCount; q++)
115 {
116 XXXX += *Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]);
117 YYYY += *Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]);
118 WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
119 }
120
121 WWWW = Rcp_pp(WWWW);
122 XXXX *= WWWW;
123 YYYY *= WWWW;
124
125 XXXX += xxxx;
126 YYYY += yyyy;
127 }
128
129 if(interpolateW())
130 {
131 w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive, w), false, false, false);
132 rhw = reciprocal(w, false, false, true);
133
134 if(state.centroid)
135 {
136 rhwCentroid = reciprocal(interpolateCentroid(XXXX, YYYY, rhwCentroid, primitive + OFFSET(Primitive, w), false, false));
137 }
138 }
139
140 if(spirvShader)
141 {
142 for(int interpolant = 0; interpolant < MAX_INTERFACE_COMPONENTS; interpolant++)
143 {
144 auto const &input = spirvShader->inputs[interpolant];
145 if(input.Type != SpirvShader::ATTRIBTYPE_UNUSED)
146 {
147 if(input.Centroid && state.enableMultiSampling)
148 {
149 routine.inputs[interpolant] =
150 interpolateCentroid(XXXX, YYYY, rhwCentroid,
151 primitive + OFFSET(Primitive, V[interpolant]),
152 input.Flat, !input.NoPerspective);
153 }
154 else
155 {
156 routine.inputs[interpolant] =
157 interpolate(xxxx, Dv[interpolant], rhw,
158 primitive + OFFSET(Primitive, V[interpolant]),
159 input.Flat, !input.NoPerspective, false);
160 }
161 }
162 }
163
164 setBuiltins(x, y, z, w, cMask);
165
166 for(uint32_t i = 0; i < state.numClipDistances; i++)
167 {
168 auto distance = interpolate(xxxx, DclipDistance[i], rhw,
169 primitive + OFFSET(Primitive, clipDistance[i]),
170 false, true, false);
171
172 auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
173 for(auto ms = 0u; ms < state.multiSampleCount; ms++)
174 {
175 // FIXME(b/148105887): Fragments discarded by clipping do not exist at
176 // all -- they should not be counted in queries or have their Z/S effects
177 // performed when early fragment tests are enabled.
178 cMask[ms] &= clipMask;
179 }
180
181 if(spirvShader->getUsedCapabilities().ClipDistance)
182 {
183 auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
184 if(it != spirvShader->inputBuiltins.end())
185 {
186 if(i < it->second.SizeInComponents)
187 {
188 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
189 }
190 }
191 }
192 }
193
194 if(spirvShader->getUsedCapabilities().CullDistance)
195 {
196 auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
197 if(it != spirvShader->inputBuiltins.end())
198 {
199 for(uint32_t i = 0; i < state.numCullDistances; i++)
200 {
201 if(i < it->second.SizeInComponents)
202 {
203 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
204 interpolate(xxxx, DcullDistance[i], rhw,
205 primitive + OFFSET(Primitive, cullDistance[i]),
206 false, true, false);
207 }
208 }
209 }
210 }
211 }
212
213 Bool alphaPass = true;
214
215 if(spirvShader)
216 {
217 bool earlyFragTests = (spirvShader && spirvShader->getModes().EarlyFragmentTests);
218 applyShader(cMask, earlyFragTests ? sMask : cMask, earlyDepthTest ? zMask : cMask);
219 }
220
221 alphaPass = alphaTest(cMask);
222
223 if((spirvShader && spirvShader->getModes().ContainsKill) || state.alphaToCoverage)
224 {
225 for(unsigned int q = 0; q < state.multiSampleCount; q++)
226 {
227 zMask[q] &= cMask[q];
228 sMask[q] &= cMask[q];
229 }
230 }
231
232 If(alphaPass)
233 {
234 if(!earlyDepthTest)
235 {
236 for(unsigned int q = 0; q < state.multiSampleCount; q++)
237 {
238 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
239 }
240 }
241
242 If(depthPass || Bool(earlyDepthTest))
243 {
244 for(unsigned int q = 0; q < state.multiSampleCount; q++)
245 {
246 if(state.multiSampleMask & (1 << q))
247 {
248 writeDepth(zBuffer, q, x, z[q], zMask[q]);
249
250 if(state.occlusionEnabled)
251 {
252 occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
253 }
254 }
255 }
256
257 rasterOperation(cBuffer, x, sMask, zMask, cMask);
258 }
259 }
260 }
261
262 for(unsigned int q = 0; q < state.multiSampleCount; q++)
263 {
264 if(state.multiSampleMask & (1 << q))
265 {
266 writeStencil(sBuffer, q, x, sMask[q], zMask[q], cMask[q]);
267 }
268 }
269 }
270
// Evaluates a plane equation at the centroid position (x, y).
//
// Returns C when `flat` is set. Otherwise returns C + x * A + y * B,
// additionally multiplied by `rhw` when `perspective` is set. `rhw` is not
// read in any other case.
Float4 PixelRoutine::interpolateCentroid(const Float4 &x, const Float4 &y, const Float4 &rhw, Pointer<Byte> planeEquation, bool flat, bool perspective)
{
	Float4 result = *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, C), 16);

	// Flat attributes take the constant term only.
	if(flat)
	{
		return result;
	}

	result += x * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, A), 16) +
	          y * *Pointer<Float4>(planeEquation + OFFSET(PlaneEquation, B), 16);

	if(perspective)
	{
		result *= rhw;
	}

	return result;
}
288
stencilTest(const Pointer<Byte> & sBuffer,int q,const Int & x,Int & sMask,const Int & cMask)289 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, int q, const Int &x, Int &sMask, const Int &cMask)
290 {
291 if(!state.stencilActive)
292 {
293 return;
294 }
295
296 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
297
298 Pointer<Byte> buffer = sBuffer + x;
299
300 if(q > 0)
301 {
302 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
303 }
304
305 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
306 Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
307 value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
308 Byte8 valueBack = value;
309
310 if(state.frontStencil.compareMask != 0xff)
311 {
312 value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
313 }
314
315 stencilTest(value, state.frontStencil.compareOp, false);
316
317 if(state.backStencil.compareMask != 0xff)
318 {
319 valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
320 }
321
322 stencilTest(valueBack, state.backStencil.compareOp, true);
323
324 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
325 valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
326 value |= valueBack;
327
328 sMask = SignMask(value) & cMask;
329 }
330
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)331 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
332 {
333 Byte8 equal;
334
335 switch(stencilCompareMode)
336 {
337 case VK_COMPARE_OP_ALWAYS:
338 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
339 break;
340 case VK_COMPARE_OP_NEVER:
341 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
342 break;
343 case VK_COMPARE_OP_LESS: // a < b ~ b > a
344 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
345 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
346 break;
347 case VK_COMPARE_OP_EQUAL:
348 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
349 break;
350 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b)
351 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
352 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
353 break;
354 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ (b > a) || (a == b)
355 equal = value;
356 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
357 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
358 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
359 value |= equal;
360 break;
361 case VK_COMPARE_OP_GREATER: // a > b
362 equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
363 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
364 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
365 value = equal;
366 break;
367 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ !(a < b) ~ !(b > a)
368 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
369 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
370 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
371 break;
372 default:
373 UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
374 }
375 }
376
depthTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)377 Bool PixelRoutine::depthTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
378 {
379 Float4 Z = z;
380
381 if(spirvShader && spirvShader->getModes().DepthReplacing)
382 {
383 Z = oDepth;
384 }
385
386 Pointer<Byte> buffer = zBuffer + 4 * x;
387 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
388
389 if(q > 0)
390 {
391 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
392 }
393
394 Float4 zValue;
395
396 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
397 {
398 // FIXME: Properly optimizes?
399 zValue.xy = *Pointer<Float4>(buffer);
400 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
401 }
402
403 Int4 zTest;
404
405 switch(state.depthCompareMode)
406 {
407 case VK_COMPARE_OP_ALWAYS:
408 // Optimized
409 break;
410 case VK_COMPARE_OP_NEVER:
411 // Optimized
412 break;
413 case VK_COMPARE_OP_EQUAL:
414 zTest = CmpEQ(zValue, Z);
415 break;
416 case VK_COMPARE_OP_NOT_EQUAL:
417 zTest = CmpNEQ(zValue, Z);
418 break;
419 case VK_COMPARE_OP_LESS:
420 zTest = CmpNLE(zValue, Z);
421 break;
422 case VK_COMPARE_OP_GREATER_OR_EQUAL:
423 zTest = CmpLE(zValue, Z);
424 break;
425 case VK_COMPARE_OP_LESS_OR_EQUAL:
426 zTest = CmpNLT(zValue, Z);
427 break;
428 case VK_COMPARE_OP_GREATER:
429 zTest = CmpLT(zValue, Z);
430 break;
431 default:
432 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
433 }
434
435 switch(state.depthCompareMode)
436 {
437 case VK_COMPARE_OP_ALWAYS:
438 zMask = cMask;
439 break;
440 case VK_COMPARE_OP_NEVER:
441 zMask = 0x0;
442 break;
443 default:
444 zMask = SignMask(zTest) & cMask;
445 break;
446 }
447
448 if(state.stencilActive)
449 {
450 zMask &= sMask;
451 }
452
453 return zMask != 0;
454 }
455
depthTest16(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)456 Bool PixelRoutine::depthTest16(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
457 {
458 Short4 Z = convertFixed16(z, true);
459
460 if(spirvShader && spirvShader->getModes().DepthReplacing)
461 {
462 Z = convertFixed16(oDepth, true);
463 }
464
465 Pointer<Byte> buffer = zBuffer + 2 * x;
466 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
467
468 if(q > 0)
469 {
470 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
471 }
472
473 Short4 zValue;
474
475 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
476 {
477 // FIXME: Properly optimizes?
478 zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
479 zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
480 }
481
482 Int4 zTest;
483
484 // Bias values to make unsigned compares out of Reactor's (due SSE's) signed compares only
485 zValue = zValue - Short4(0x8000u);
486 Z = Z - Short4(0x8000u);
487
488 switch(state.depthCompareMode)
489 {
490 case VK_COMPARE_OP_ALWAYS:
491 // Optimized
492 break;
493 case VK_COMPARE_OP_NEVER:
494 // Optimized
495 break;
496 case VK_COMPARE_OP_EQUAL:
497 zTest = Int4(CmpEQ(zValue, Z));
498 break;
499 case VK_COMPARE_OP_NOT_EQUAL:
500 zTest = ~Int4(CmpEQ(zValue, Z));
501 break;
502 case VK_COMPARE_OP_LESS:
503 zTest = Int4(CmpGT(zValue, Z));
504 break;
505 case VK_COMPARE_OP_GREATER_OR_EQUAL:
506 zTest = ~Int4(CmpGT(zValue, Z));
507 break;
508 case VK_COMPARE_OP_LESS_OR_EQUAL:
509 zTest = ~Int4(CmpGT(Z, zValue));
510 break;
511 case VK_COMPARE_OP_GREATER:
512 zTest = Int4(CmpGT(Z, zValue));
513 break;
514 default:
515 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
516 }
517
518 switch(state.depthCompareMode)
519 {
520 case VK_COMPARE_OP_ALWAYS:
521 zMask = cMask;
522 break;
523 case VK_COMPARE_OP_NEVER:
524 zMask = 0x0;
525 break;
526 default:
527 zMask = SignMask(zTest) & cMask;
528 break;
529 }
530
531 if(state.stencilActive)
532 {
533 zMask &= sMask;
534 }
535
536 return zMask != 0;
537 }
538
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & sMask,Int & zMask,const Int & cMask)539 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &sMask, Int &zMask, const Int &cMask)
540 {
541 if(!state.depthTestActive)
542 {
543 return true;
544 }
545
546 if(state.depthFormat == VK_FORMAT_D16_UNORM)
547 return depthTest16(zBuffer, q, x, z, sMask, zMask, cMask);
548 else
549 return depthTest32F(zBuffer, q, x, z, sMask, zMask, cMask);
550 }
551
alphaToCoverage(Int cMask[4],const Float4 & alpha)552 void PixelRoutine::alphaToCoverage(Int cMask[4], const Float4 &alpha)
553 {
554 Int4 coverage0 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c0)));
555 Int4 coverage1 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c1)));
556 Int4 coverage2 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c2)));
557 Int4 coverage3 = CmpNLT(alpha, *Pointer<Float4>(data + OFFSET(DrawData, a2c3)));
558
559 Int aMask0 = SignMask(coverage0);
560 Int aMask1 = SignMask(coverage1);
561 Int aMask2 = SignMask(coverage2);
562 Int aMask3 = SignMask(coverage3);
563
564 cMask[0] &= aMask0;
565 cMask[1] &= aMask1;
566 cMask[2] &= aMask2;
567 cMask[3] &= aMask3;
568 }
569
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)570 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
571 {
572 Float4 Z = z;
573
574 if(spirvShader && spirvShader->getModes().DepthReplacing)
575 {
576 Z = oDepth;
577 }
578
579 Pointer<Byte> buffer = zBuffer + 4 * x;
580 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
581
582 if(q > 0)
583 {
584 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
585 }
586
587 Float4 zValue;
588
589 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
590 {
591 // FIXME: Properly optimizes?
592 zValue.xy = *Pointer<Float4>(buffer);
593 zValue.zw = *Pointer<Float4>(buffer + pitch - 8);
594 }
595
596 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
597 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
598 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
599
600 // FIXME: Properly optimizes?
601 *Pointer<Float2>(buffer) = Float2(Z.xy);
602 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
603 }
604
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)605 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
606 {
607 Short4 Z = As<Short4>(convertFixed16(z, true));
608
609 if(spirvShader && spirvShader->getModes().DepthReplacing)
610 {
611 Z = As<Short4>(convertFixed16(oDepth, true));
612 }
613
614 Pointer<Byte> buffer = zBuffer + 2 * x;
615 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
616
617 if(q > 0)
618 {
619 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
620 }
621
622 Short4 zValue;
623
624 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
625 {
626 // FIXME: Properly optimizes?
627 zValue = *Pointer<Short4>(buffer) & Short4(-1, -1, 0, 0);
628 zValue = zValue | (*Pointer<Short4>(buffer + pitch - 4) & Short4(0, 0, -1, -1));
629 }
630
631 Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
632 zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
633 Z = Z | zValue;
634
635 // FIXME: Properly optimizes?
636 *Pointer<Short>(buffer) = Extract(Z, 0);
637 *Pointer<Short>(buffer + 2) = Extract(Z, 1);
638 *Pointer<Short>(buffer + pitch) = Extract(Z, 2);
639 *Pointer<Short>(buffer + pitch + 2) = Extract(Z, 3);
640 }
641
writeDepth(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)642 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
643 {
644 if(!state.depthWriteEnable)
645 {
646 return;
647 }
648
649 if(state.depthFormat == VK_FORMAT_D16_UNORM)
650 writeDepth16(zBuffer, q, x, z, zMask);
651 else
652 writeDepth32F(zBuffer, q, x, z, zMask);
653 }
654
writeStencil(Pointer<Byte> & sBuffer,int q,const Int & x,const Int & sMask,const Int & zMask,const Int & cMask)655 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, int q, const Int &x, const Int &sMask, const Int &zMask, const Int &cMask)
656 {
657 if(!state.stencilActive)
658 {
659 return;
660 }
661
662 if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
663 {
664 if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
665 {
666 return;
667 }
668 }
669
670 if((state.frontStencil.writeMask == 0) && (state.backStencil.writeMask == 0))
671 {
672 return;
673 }
674
675 Pointer<Byte> buffer = sBuffer + x;
676
677 if(q > 0)
678 {
679 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
680 }
681
682 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
683 Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
684 bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
685 Byte8 newValue;
686 stencilOperation(newValue, bufferValue, state.frontStencil, false, zMask, sMask);
687
688 if((state.frontStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
689 {
690 Byte8 maskedValue = bufferValue;
691 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
692 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
693 newValue |= maskedValue;
694 }
695
696 Byte8 newValueBack;
697
698 stencilOperation(newValueBack, bufferValue, state.backStencil, true, zMask, sMask);
699
700 if((state.backStencil.writeMask & 0xFF) != 0xFF) // Assume 8-bit stencil buffer
701 {
702 Byte8 maskedValue = bufferValue;
703 newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
704 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
705 newValueBack |= maskedValue;
706 }
707
708 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
709 newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
710 newValue |= newValueBack;
711
712 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask);
713 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask);
714 newValue |= bufferValue;
715
716 *Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
717 *Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
718 }
719
stencilOperation(Byte8 & newValue,const Byte8 & bufferValue,const PixelProcessor::States::StencilOpState & ops,bool isBack,const Int & zMask,const Int & sMask)720 void PixelRoutine::stencilOperation(Byte8 &newValue, const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
721 {
722 Byte8 &pass = newValue;
723 Byte8 fail;
724 Byte8 zFail;
725
726 stencilOperation(pass, bufferValue, ops.passOp, isBack);
727
728 if(ops.depthFailOp != ops.passOp)
729 {
730 stencilOperation(zFail, bufferValue, ops.depthFailOp, isBack);
731 }
732
733 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
734 {
735 stencilOperation(fail, bufferValue, ops.failOp, isBack);
736 }
737
738 if(ops.failOp != ops.passOp || ops.failOp != ops.depthFailOp)
739 {
740 if(state.depthTestActive && ops.depthFailOp != ops.passOp) // zMask valid and values not the same
741 {
742 pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
743 zFail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
744 pass |= zFail;
745 }
746
747 pass &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
748 fail &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
749 pass |= fail;
750 }
751 }
752
stencilReplaceRef(bool isBack)753 Byte8 PixelRoutine::stencilReplaceRef(bool isBack)
754 {
755 if(spirvShader)
756 {
757 auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
758 if(it != spirvShader->outputBuiltins.end())
759 {
760 UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
761 // TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
762 // following line by either adding a rr::Shuffle() variant to do
763 // it explicitly or adding a Byte4(Int4) constructor would work.
764 sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
765
766 UInt2 sRefDuplicated;
767 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
768 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
769 return As<Byte8>(sRefDuplicated);
770 }
771 }
772
773 return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
774 }
775
stencilOperation(Byte8 & output,const Byte8 & bufferValue,VkStencilOp operation,bool isBack)776 void PixelRoutine::stencilOperation(Byte8 &output, const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
777 {
778 switch(operation)
779 {
780 case VK_STENCIL_OP_KEEP:
781 output = bufferValue;
782 break;
783 case VK_STENCIL_OP_ZERO:
784 output = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
785 break;
786 case VK_STENCIL_OP_REPLACE:
787 output = stencilReplaceRef(isBack);
788 break;
789 case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
790 output = AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
791 break;
792 case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
793 output = SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
794 break;
795 case VK_STENCIL_OP_INVERT:
796 output = bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
797 break;
798 case VK_STENCIL_OP_INCREMENT_AND_WRAP:
799 output = bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
800 break;
801 case VK_STENCIL_OP_DECREMENT_AND_WRAP:
802 output = bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
803 break;
804 default:
805 UNSUPPORTED("VkStencilOp: %d", int(operation));
806 }
807 }
808
blendFactor(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,VkBlendFactor blendFactorActive)809 void PixelRoutine::blendFactor(Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, VkBlendFactor blendFactorActive)
810 {
811 switch(blendFactorActive)
812 {
813 case VK_BLEND_FACTOR_ZERO:
814 // Optimized
815 break;
816 case VK_BLEND_FACTOR_ONE:
817 // Optimized
818 break;
819 case VK_BLEND_FACTOR_SRC_COLOR:
820 blendFactor.x = current.x;
821 blendFactor.y = current.y;
822 blendFactor.z = current.z;
823 break;
824 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
825 blendFactor.x = Short4(0xFFFFu) - current.x;
826 blendFactor.y = Short4(0xFFFFu) - current.y;
827 blendFactor.z = Short4(0xFFFFu) - current.z;
828 break;
829 case VK_BLEND_FACTOR_DST_COLOR:
830 blendFactor.x = pixel.x;
831 blendFactor.y = pixel.y;
832 blendFactor.z = pixel.z;
833 break;
834 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
835 blendFactor.x = Short4(0xFFFFu) - pixel.x;
836 blendFactor.y = Short4(0xFFFFu) - pixel.y;
837 blendFactor.z = Short4(0xFFFFu) - pixel.z;
838 break;
839 case VK_BLEND_FACTOR_SRC_ALPHA:
840 blendFactor.x = current.w;
841 blendFactor.y = current.w;
842 blendFactor.z = current.w;
843 break;
844 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
845 blendFactor.x = Short4(0xFFFFu) - current.w;
846 blendFactor.y = Short4(0xFFFFu) - current.w;
847 blendFactor.z = Short4(0xFFFFu) - current.w;
848 break;
849 case VK_BLEND_FACTOR_DST_ALPHA:
850 blendFactor.x = pixel.w;
851 blendFactor.y = pixel.w;
852 blendFactor.z = pixel.w;
853 break;
854 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
855 blendFactor.x = Short4(0xFFFFu) - pixel.w;
856 blendFactor.y = Short4(0xFFFFu) - pixel.w;
857 blendFactor.z = Short4(0xFFFFu) - pixel.w;
858 break;
859 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
860 blendFactor.x = Short4(0xFFFFu) - pixel.w;
861 blendFactor.x = Min(As<UShort4>(blendFactor.x), As<UShort4>(current.w));
862 blendFactor.y = blendFactor.x;
863 blendFactor.z = blendFactor.x;
864 break;
865 case VK_BLEND_FACTOR_CONSTANT_COLOR:
866 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[0]));
867 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[1]));
868 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[2]));
869 break;
870 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
871 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[0]));
872 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[1]));
873 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[2]));
874 break;
875 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
876 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
877 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
878 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
879 break;
880 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
881 blendFactor.x = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
882 blendFactor.y = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
883 blendFactor.z = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
884 break;
885 default:
886 UNSUPPORTED("VkBlendFactor: %d", int(blendFactorActive));
887 }
888 }
889
blendFactorAlpha(Vector4s & blendFactor,const Vector4s & current,const Vector4s & pixel,VkBlendFactor blendFactorAlphaActive)890 void PixelRoutine::blendFactorAlpha(Vector4s &blendFactor, const Vector4s ¤t, const Vector4s &pixel, VkBlendFactor blendFactorAlphaActive)
891 {
892 switch(blendFactorAlphaActive)
893 {
894 case VK_BLEND_FACTOR_ZERO:
895 // Optimized
896 break;
897 case VK_BLEND_FACTOR_ONE:
898 // Optimized
899 break;
900 case VK_BLEND_FACTOR_SRC_COLOR:
901 blendFactor.w = current.w;
902 break;
903 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
904 blendFactor.w = Short4(0xFFFFu) - current.w;
905 break;
906 case VK_BLEND_FACTOR_DST_COLOR:
907 blendFactor.w = pixel.w;
908 break;
909 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
910 blendFactor.w = Short4(0xFFFFu) - pixel.w;
911 break;
912 case VK_BLEND_FACTOR_SRC_ALPHA:
913 blendFactor.w = current.w;
914 break;
915 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
916 blendFactor.w = Short4(0xFFFFu) - current.w;
917 break;
918 case VK_BLEND_FACTOR_DST_ALPHA:
919 blendFactor.w = pixel.w;
920 break;
921 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
922 blendFactor.w = Short4(0xFFFFu) - pixel.w;
923 break;
924 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
925 blendFactor.w = Short4(0xFFFFu);
926 break;
927 case VK_BLEND_FACTOR_CONSTANT_COLOR:
928 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
929 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.blendConstant4W[3]));
930 break;
931 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
932 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
933 blendFactor.w = *Pointer<Short4>(data + OFFSET(DrawData, factor.invBlendConstant4W[3]));
934 break;
935 default:
936 UNSUPPORTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
937 }
938 }
939
isSRGB(int index) const940 bool PixelRoutine::isSRGB(int index) const
941 {
942 return vk::Format(state.targetFormat[index]).isSRGBformat();
943 }
944
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)945 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
946 {
947 Short4 c01;
948 Short4 c23;
949 Pointer<Byte> buffer = cBuffer;
950 Pointer<Byte> buffer2;
951
952 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
953
954 switch(state.targetFormat[index])
955 {
956 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
957 buffer += 2 * x;
958 buffer2 = buffer + pitchB;
959 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
960
961 pixel.x = (c01 & Short4(0x7C00u)) << 1;
962 pixel.y = (c01 & Short4(0x03E0u)) << 6;
963 pixel.z = (c01 & Short4(0x001Fu)) << 11;
964 pixel.w = (c01 & Short4(0x8000u)) >> 15;
965
966 // Expand to 16 bit range
967 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
968 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
969 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
970 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
971 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
972 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
973 break;
974 case VK_FORMAT_R5G6B5_UNORM_PACK16:
975 buffer += 2 * x;
976 buffer2 = buffer + pitchB;
977 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
978
979 pixel.x = c01 & Short4(0xF800u);
980 pixel.y = (c01 & Short4(0x07E0u)) << 5;
981 pixel.z = (c01 & Short4(0x001Fu)) << 11;
982 pixel.w = Short4(0xFFFFu);
983
984 // Expand to 16 bit range
985 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
986 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
987 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
988 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
989 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
990 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
991 break;
992 case VK_FORMAT_B8G8R8A8_UNORM:
993 case VK_FORMAT_B8G8R8A8_SRGB:
994 buffer += 4 * x;
995 c01 = *Pointer<Short4>(buffer);
996 buffer += pitchB;
997 c23 = *Pointer<Short4>(buffer);
998 pixel.z = c01;
999 pixel.y = c01;
1000 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1001 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1002 pixel.x = pixel.z;
1003 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1004 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1005 pixel.y = pixel.z;
1006 pixel.w = pixel.x;
1007 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1008 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1009 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1010 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1011 break;
1012 case VK_FORMAT_R8G8B8A8_UNORM:
1013 case VK_FORMAT_R8G8B8A8_SRGB:
1014 buffer += 4 * x;
1015 c01 = *Pointer<Short4>(buffer);
1016 buffer += pitchB;
1017 c23 = *Pointer<Short4>(buffer);
1018 pixel.z = c01;
1019 pixel.y = c01;
1020 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1021 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1022 pixel.x = pixel.z;
1023 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1024 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1025 pixel.y = pixel.z;
1026 pixel.w = pixel.x;
1027 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1028 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1029 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1030 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1031 break;
1032 case VK_FORMAT_R8_UNORM:
1033 buffer += 1 * x;
1034 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1035 buffer += pitchB;
1036 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1037 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1038 pixel.y = Short4(0x0000);
1039 pixel.z = Short4(0x0000);
1040 pixel.w = Short4(0xFFFFu);
1041 break;
1042 case VK_FORMAT_R8G8_UNORM:
1043 buffer += 2 * x;
1044 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1045 buffer += pitchB;
1046 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1047 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1048 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1049 pixel.z = Short4(0x0000u);
1050 pixel.w = Short4(0xFFFFu);
1051 break;
1052 case VK_FORMAT_R16G16B16A16_UNORM:
1053 buffer += 8 * x;
1054 pixel.x = *Pointer<Short4>(buffer + 0);
1055 pixel.y = *Pointer<Short4>(buffer + 8);
1056 buffer += pitchB;
1057 pixel.z = *Pointer<Short4>(buffer + 0);
1058 pixel.w = *Pointer<Short4>(buffer + 8);
1059 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1060 break;
1061 case VK_FORMAT_R16G16_UNORM:
1062 buffer += 4 * x;
1063 pixel.x = *Pointer<Short4>(buffer);
1064 buffer += pitchB;
1065 pixel.y = *Pointer<Short4>(buffer);
1066 pixel.z = pixel.x;
1067 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.y));
1068 pixel.z = As<Short4>(UnpackHigh(pixel.z, pixel.y));
1069 pixel.y = pixel.z;
1070 pixel.x = As<Short4>(UnpackLow(pixel.x, pixel.z));
1071 pixel.y = As<Short4>(UnpackHigh(pixel.y, pixel.z));
1072 pixel.z = Short4(0xFFFFu);
1073 pixel.w = Short4(0xFFFFu);
1074 break;
1075 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1076 {
1077 Int4 v = Int4(0);
1078 buffer += 4 * x;
1079 v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1080 v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1081 buffer += pitchB;
1082 v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1083 v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1084
1085 pixel = a2b10g10r10Unpack(v);
1086 }
1087 break;
1088 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1089 {
1090 Int4 v = Int4(0);
1091 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1092 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1093 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1094 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1095 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1096
1097 pixel = a2r10g10b10Unpack(v);
1098 }
1099 break;
1100 default:
1101 UNSUPPORTED("VkFormat %d", state.targetFormat[index]);
1102 }
1103
1104 if(isSRGB(index))
1105 {
1106 sRGBtoLinear16_12_16(pixel);
1107 }
1108 }
1109
alphaBlend(int index,const Pointer<Byte> & cBuffer,Vector4s & current,const Int & x)1110 void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4s ¤t, const Int &x)
1111 {
1112 if(!state.blendState[index].alphaBlendEnable)
1113 {
1114 return;
1115 }
1116
1117 Vector4s pixel;
1118 readPixel(index, cBuffer, x, pixel);
1119
1120 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1121 Vector4s sourceFactor;
1122 Vector4s destFactor;
1123
1124 blendFactor(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactor);
1125 blendFactor(destFactor, current, pixel, state.blendState[index].destBlendFactor);
1126
1127 if(state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactor != VK_BLEND_FACTOR_ZERO)
1128 {
1129 current.x = MulHigh(As<UShort4>(current.x), As<UShort4>(sourceFactor.x));
1130 current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
1131 current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
1132 }
1133
1134 if(state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactor != VK_BLEND_FACTOR_ZERO)
1135 {
1136 pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
1137 pixel.y = MulHigh(As<UShort4>(pixel.y), As<UShort4>(destFactor.y));
1138 pixel.z = MulHigh(As<UShort4>(pixel.z), As<UShort4>(destFactor.z));
1139 }
1140
1141 switch(state.blendState[index].blendOperation)
1142 {
1143 case VK_BLEND_OP_ADD:
1144 current.x = AddSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1145 current.y = AddSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1146 current.z = AddSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1147 break;
1148 case VK_BLEND_OP_SUBTRACT:
1149 current.x = SubSat(As<UShort4>(current.x), As<UShort4>(pixel.x));
1150 current.y = SubSat(As<UShort4>(current.y), As<UShort4>(pixel.y));
1151 current.z = SubSat(As<UShort4>(current.z), As<UShort4>(pixel.z));
1152 break;
1153 case VK_BLEND_OP_REVERSE_SUBTRACT:
1154 current.x = SubSat(As<UShort4>(pixel.x), As<UShort4>(current.x));
1155 current.y = SubSat(As<UShort4>(pixel.y), As<UShort4>(current.y));
1156 current.z = SubSat(As<UShort4>(pixel.z), As<UShort4>(current.z));
1157 break;
1158 case VK_BLEND_OP_MIN:
1159 current.x = Min(As<UShort4>(current.x), As<UShort4>(pixel.x));
1160 current.y = Min(As<UShort4>(current.y), As<UShort4>(pixel.y));
1161 current.z = Min(As<UShort4>(current.z), As<UShort4>(pixel.z));
1162 break;
1163 case VK_BLEND_OP_MAX:
1164 current.x = Max(As<UShort4>(current.x), As<UShort4>(pixel.x));
1165 current.y = Max(As<UShort4>(current.y), As<UShort4>(pixel.y));
1166 current.z = Max(As<UShort4>(current.z), As<UShort4>(pixel.z));
1167 break;
1168 case VK_BLEND_OP_SRC_EXT:
1169 // No operation
1170 break;
1171 case VK_BLEND_OP_DST_EXT:
1172 current.x = pixel.x;
1173 current.y = pixel.y;
1174 current.z = pixel.z;
1175 break;
1176 case VK_BLEND_OP_ZERO_EXT:
1177 current.x = Short4(0x0000);
1178 current.y = Short4(0x0000);
1179 current.z = Short4(0x0000);
1180 break;
1181 default:
1182 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
1183 }
1184
1185 blendFactorAlpha(sourceFactor, current, pixel, state.blendState[index].sourceBlendFactorAlpha);
1186 blendFactorAlpha(destFactor, current, pixel, state.blendState[index].destBlendFactorAlpha);
1187
1188 if(state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].sourceBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1189 {
1190 current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
1191 }
1192
1193 if(state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ONE && state.blendState[index].destBlendFactorAlpha != VK_BLEND_FACTOR_ZERO)
1194 {
1195 pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
1196 }
1197
1198 switch(state.blendState[index].blendOperationAlpha)
1199 {
1200 case VK_BLEND_OP_ADD:
1201 current.w = AddSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1202 break;
1203 case VK_BLEND_OP_SUBTRACT:
1204 current.w = SubSat(As<UShort4>(current.w), As<UShort4>(pixel.w));
1205 break;
1206 case VK_BLEND_OP_REVERSE_SUBTRACT:
1207 current.w = SubSat(As<UShort4>(pixel.w), As<UShort4>(current.w));
1208 break;
1209 case VK_BLEND_OP_MIN:
1210 current.w = Min(As<UShort4>(current.w), As<UShort4>(pixel.w));
1211 break;
1212 case VK_BLEND_OP_MAX:
1213 current.w = Max(As<UShort4>(current.w), As<UShort4>(pixel.w));
1214 break;
1215 case VK_BLEND_OP_SRC_EXT:
1216 // No operation
1217 break;
1218 case VK_BLEND_OP_DST_EXT:
1219 current.w = pixel.w;
1220 break;
1221 case VK_BLEND_OP_ZERO_EXT:
1222 current.w = Short4(0x0000);
1223 break;
1224 default:
1225 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
1226 }
1227 }
1228
// Converts the 16-bit-per-channel colour in 'current' to the storage format of
// render target 'index' and merges it into memory at column x. Two rows are
// touched: the one at cBuffer and the one a pitch (colorPitchB[index]) below.
//   index   - render target whose format, write mask and pitch are used
//   cBuffer - address of the first of the two rows
//   x       - pixel column of the left pixel pair
//   current - colour to write; used as scratch space and clobbered
//   sMask   - stencil mask, applied only when state.stencilActive
//   zMask   - depth mask, used when state.depthTestActive
//   cMask   - coverage mask, used when depth testing is not active
// The function works in four steps: optional linear->sRGB conversion,
// a per-format bias ahead of bit-depth reduction, packing into the memory
// layout, and a masked read-modify-write.
void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &current, const Int &sMask, const Int &zMask, const Int &cMask)
{
	if(isSRGB(index))
	{
		linearToSRGB16_12_16(current);
	}

	// Step 1: for formats narrower than 16 bits per channel, compute
	// c - (c >> bits) + half, where 'bits' is the channel width and 'half' is
	// half of the value of the lowest bit that is kept. Presumably this makes
	// the truncating shifts/masks in the next step round to nearest.
	switch(state.targetFormat[index])
	{
		case VK_FORMAT_B8G8R8A8_UNORM:
		case VK_FORMAT_B8G8R8A8_SRGB:
		case VK_FORMAT_R8G8B8A8_UNORM:
		case VK_FORMAT_R8G8B8A8_SRGB:
		case VK_FORMAT_R8G8_UNORM:
		case VK_FORMAT_R8_UNORM:
		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 8) + Short4(0x0080);
			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 8) + Short4(0x0080);
			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 8) + Short4(0x0080);
			current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 8) + Short4(0x0080);
			break;
		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
			// 10-bit colour channels, 2-bit alpha.
			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 10) + Short4(0x0020);
			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 10) + Short4(0x0020);
			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 10) + Short4(0x0020);
			current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 2) + Short4(0x2000);
			break;
		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
			// 5-bit colour channels, 1-bit alpha.
			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 5) + Short4(0x0400);
			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
			current.w = current.w - As<Short4>(As<UShort4>(current.w) >> 1) + Short4(0x4000);
			break;
		case VK_FORMAT_R5G6B5_UNORM_PACK16:
			// 5/6/5-bit channels, no alpha.
			current.x = current.x - As<Short4>(As<UShort4>(current.x) >> 5) + Short4(0x0400);
			current.y = current.y - As<Short4>(As<UShort4>(current.y) >> 6) + Short4(0x0200);
			current.z = current.z - As<Short4>(As<UShort4>(current.z) >> 5) + Short4(0x0400);
			break;
		default:
			// 16-bit-per-channel formats are stored without adjustment.
			break;
	}

	int rgbaWriteMask = state.colorWriteActive(index);
	// Same mask with bit 0 and bit 2 exchanged (bits 1 and 3 kept), for
	// formats that store blue before red.
	int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;

	// Step 2: pack the channels into the format's memory layout. The register
	// that ends up holding the packed data differs per format: 16-bit packed
	// formats, R8, R8G8 use current.x; the 8-bit four-channel formats use
	// current.z / current.y (copied to c01 / c23 below); R16G16 and the
	// 10-10-10-2 formats use current.x / current.y; R16G16B16A16 uses all four.
	switch(state.targetFormat[index])
	{
		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
		{
			// Alpha -> bit 15, x -> bits 14..10, y -> bits 9..5, z -> bits 4..0.
			current.w = current.w & Short4(0x8000u);
			current.x = As<UShort4>(current.x & Short4(0xF800)) >> 1;
			current.y = As<UShort4>(current.y & Short4(0xF800)) >> 6;
			current.z = As<UShort4>(current.z & Short4(0xF800)) >> 11;

			current.x = current.x | current.y | current.z | current.w;
		}
		break;
		case VK_FORMAT_R5G6B5_UNORM_PACK16:
		{
			// x -> bits 15..11, y -> bits 10..5, z -> bits 4..0.
			current.x = current.x & Short4(0xF800u);
			current.y = As<UShort4>(current.y & Short4(0xFC00u)) >> 5;
			current.z = As<UShort4>(current.z) >> 11;

			current.x = current.x | current.y | current.z;
		}
		break;
		case VK_FORMAT_B8G8R8A8_UNORM:
		case VK_FORMAT_B8G8R8A8_SRGB:
			if(rgbaWriteMask == 0x7)
			{
				// Alpha is not written: current.w is left untouched and the
				// alpha byte positions are filled with a second copy of
				// current.y (presumably discarded by the channel mask at
				// store time).
				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
				current.z = As<Short4>(As<UShort4>(current.z) >> 8);

				current.z = As<Short4>(PackUnsigned(current.z, current.x));
				current.y = As<Short4>(PackUnsigned(current.y, current.y));

				current.x = current.z;
				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
				current.y = current.z;
				current.z = As<Short4>(UnpackLow(current.z, current.x));
				current.y = As<Short4>(UnpackHigh(current.y, current.x));
			}
			else
			{
				// Reduce to 8 bits, then interleave so that each pixel's
				// bytes are stored in z, y, x, w channel order.
				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
				current.w = As<Short4>(As<UShort4>(current.w) >> 8);

				current.z = As<Short4>(PackUnsigned(current.z, current.x));
				current.y = As<Short4>(PackUnsigned(current.y, current.w));

				current.x = current.z;
				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
				current.y = current.z;
				current.z = As<Short4>(UnpackLow(current.z, current.x));
				current.y = As<Short4>(UnpackHigh(current.y, current.x));
			}
			break;
		case VK_FORMAT_R8G8B8A8_UNORM:
		case VK_FORMAT_R8G8B8A8_SRGB:
		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
			// Identical to the BGRA path except that x and z swap places in
			// the first PackUnsigned, giving x, y, z, w byte order.
			if(rgbaWriteMask == 0x7)
			{
				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
				current.z = As<Short4>(As<UShort4>(current.z) >> 8);

				current.z = As<Short4>(PackUnsigned(current.x, current.z));
				current.y = As<Short4>(PackUnsigned(current.y, current.y));

				current.x = current.z;
				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
				current.y = current.z;
				current.z = As<Short4>(UnpackLow(current.z, current.x));
				current.y = As<Short4>(UnpackHigh(current.y, current.x));
			}
			else
			{
				current.x = As<Short4>(As<UShort4>(current.x) >> 8);
				current.y = As<Short4>(As<UShort4>(current.y) >> 8);
				current.z = As<Short4>(As<UShort4>(current.z) >> 8);
				current.w = As<Short4>(As<UShort4>(current.w) >> 8);

				current.z = As<Short4>(PackUnsigned(current.x, current.z));
				current.y = As<Short4>(PackUnsigned(current.y, current.w));

				current.x = current.z;
				current.z = UnpackLow(As<Byte8>(current.z), As<Byte8>(current.y));
				current.x = UnpackHigh(As<Byte8>(current.x), As<Byte8>(current.y));
				current.y = current.z;
				current.z = As<Short4>(UnpackLow(current.z, current.x));
				current.y = As<Short4>(UnpackHigh(current.y, current.x));
			}
			break;
		case VK_FORMAT_R8G8_UNORM:
			// Interleave the 8-bit x and y values: x, y per pixel.
			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
			current.y = As<Short4>(As<UShort4>(current.y) >> 8);
			current.x = As<Short4>(PackUnsigned(current.x, current.x));
			current.y = As<Short4>(PackUnsigned(current.y, current.y));
			current.x = UnpackLow(As<Byte8>(current.x), As<Byte8>(current.y));
			break;
		case VK_FORMAT_R8_UNORM:
			current.x = As<Short4>(As<UShort4>(current.x) >> 8);
			current.x = As<Short4>(PackUnsigned(current.x, current.x));
			break;
		case VK_FORMAT_R16G16_UNORM:
			// Interleave x and y; first row pair ends up in current.x,
			// second row pair in current.y.
			current.z = current.x;
			current.x = As<Short4>(UnpackLow(current.x, current.y));
			current.z = As<Short4>(UnpackHigh(current.z, current.y));
			current.y = current.z;
			break;
		case VK_FORMAT_R16G16B16A16_UNORM:
			// One channel per register -> one pixel per register.
			transpose4x4(current.x, current.y, current.z, current.w);
			break;
		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
		{
			// Keep the top 10 bits of each colour channel and the top 2 bits
			// of alpha, then assemble a << 30 | b << 20 | g << 10 | r.
			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
			auto a = (Int4(current.w) >> 14) & Int4(0x3);
			Int4 packed = (a << 30) | (b << 20) | (g << 10) | r;
			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
			current.x = UnpackLow(c02, c13);
			current.y = UnpackHigh(c02, c13);
			break;
		}
		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
		{
			// As above with red and blue exchanged:
			// a << 30 | r << 20 | g << 10 | b.
			auto r = (Int4(current.x) >> 6) & Int4(0x3ff);
			auto g = (Int4(current.y) >> 6) & Int4(0x3ff);
			auto b = (Int4(current.z) >> 6) & Int4(0x3ff);
			auto a = (Int4(current.w) >> 14) & Int4(0x3);
			Int4 packed = (a << 30) | (r << 20) | (g << 10) | b;
			auto c02 = As<Int2>(Int4(packed.xzzz));  // TODO: auto c02 = packed.xz;
			auto c13 = As<Int2>(Int4(packed.ywww));  // TODO: auto c13 = packed.yw;
			current.x = UnpackLow(c02, c13);
			current.y = UnpackHigh(c02, c13);
			break;
		}
		default:
			UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
	}

	// Packed rows for the 8-bit four-channel formats (first row / second row).
	Short4 c01 = current.z;
	Short4 c23 = current.y;

	Int xMask;  // Combination of all masks

	if(state.depthTestActive)
	{
		xMask = zMask;
	}
	else
	{
		xMask = cMask;
	}

	if(state.stencilActive)
	{
		xMask &= sMask;
	}

	Pointer<Byte> buffer = cBuffer;
	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));

	// Step 3: masked read-modify-write. Every path loads the existing memory,
	// selects the new bits with a mask fetched from a Constants table at
	// offset xMask * 8 (optionally narrowed by a per-channel write mask),
	// keeps the old bits elsewhere, and stores the result.
	// NOTE(review): the table contents are defined outside this function;
	// presumably they expand the per-pixel xMask into per-lane bit masks.
	switch(state.targetFormat[index])
	{
		case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
		{
			buffer += 2 * x;
			Int value = *Pointer<Int>(buffer);

			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[bgraWriteMask & 0xF][0]));

			// First row: two 16-bit pixels in one 32-bit word.
			Int c01 = Extract(As<Int2>(current.x), 0);
			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
			if(bgraWriteMask != 0x0000000F)
			{
				mask01 &= channelMask;
			}
			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);

			// Second row.
			buffer += pitchB;
			value = *Pointer<Int>(buffer);

			Int c23 = Extract(As<Int2>(current.x), 1);
			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
			if(bgraWriteMask != 0x0000000F)
			{
				mask23 &= channelMask;
			}
			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
		}
		break;
		case VK_FORMAT_R5G6B5_UNORM_PACK16:
		{
			buffer += 2 * x;
			Int value = *Pointer<Int>(buffer);

			// Only the three colour bits of the write mask are relevant.
			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[bgraWriteMask & 0x7][0]));

			Int c01 = Extract(As<Int2>(current.x), 0);
			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
			if((bgraWriteMask & 0x00000007) != 0x00000007)
			{
				mask01 &= channelMask;
			}
			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);

			buffer += pitchB;
			value = *Pointer<Int>(buffer);

			Int c23 = Extract(As<Int2>(current.x), 1);
			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
			if((bgraWriteMask & 0x00000007) != 0x00000007)
			{
				mask23 &= channelMask;
			}
			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
		}
		break;
		case VK_FORMAT_B8G8R8A8_UNORM:
		case VK_FORMAT_B8G8R8A8_SRGB:
		{
			buffer += x * 4;
			Short4 value = *Pointer<Short4>(buffer);
			// Channel mask is looked up with the red/blue-swapped write mask.
			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[bgraWriteMask][0]));

			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			if(bgraWriteMask != 0x0000000F)
			{
				mask01 &= channelMask;
			}
			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);

			buffer += pitchB;
			value = *Pointer<Short4>(buffer);

			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			if(bgraWriteMask != 0x0000000F)
			{
				mask23 &= channelMask;
			}
			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
		}
		break;
		case VK_FORMAT_R8G8B8A8_UNORM:
		case VK_FORMAT_R8G8B8A8_SRGB:
		case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
		case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
		{
			buffer += x * 4;
			Short4 value = *Pointer<Short4>(buffer);
			Short4 channelMask = *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));

			Short4 mask01 = *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			if(rgbaWriteMask != 0x0000000F)
			{
				mask01 &= channelMask;
			}
			*Pointer<Short4>(buffer) = (c01 & mask01) | (value & ~mask01);

			buffer += pitchB;
			value = *Pointer<Short4>(buffer);

			Short4 mask23 = *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			if(rgbaWriteMask != 0x0000000F)
			{
				mask23 &= channelMask;
			}
			*Pointer<Short4>(buffer) = (c23 & mask23) | (value & ~mask23);
		}
		break;
		case VK_FORMAT_R8G8_UNORM:
			// Nothing to store when neither of the two channels is enabled.
			if((rgbaWriteMask & 0x00000003) != 0x0)
			{
				buffer += 2 * x;
				// Existing contents: element 0 = first row, element 1 = second row.
				Int2 value;
				value = Insert(value, *Pointer<Int>(buffer), 0);
				value = Insert(value, *Pointer<Int>(buffer + pitchB), 1);

				Int2 packedCol = As<Int2>(current.x);

				UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
				if((rgbaWriteMask & 0x3) != 0x3)
				{
					// Exactly one channel is enabled here, so the index is 5
					// (binary 0101) or 10 (binary 1010) into maskB4Q.
					Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
					UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
					mergedMask &= rgbaMask;
				}

				packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask));

				*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
				*Pointer<UInt>(buffer + pitchB) = As<UInt>(Extract(packedCol, 1));
			}
			break;
		case VK_FORMAT_R8_UNORM:
			// Only bit 0 of the write mask applies to this single-channel format.
			if(rgbaWriteMask & 0x00000001)
			{
				buffer += 1 * x;
				// Existing contents: lane 0 = first row, lane 1 = second row.
				Short4 value;
				value = Insert(value, *Pointer<Short>(buffer), 0);
				value = Insert(value, *Pointer<Short>(buffer + pitchB), 1);

				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask);
				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask);
				current.x |= value;

				*Pointer<Short>(buffer) = Extract(current.x, 0);
				*Pointer<Short>(buffer + pitchB) = Extract(current.x, 1);
			}
			break;
		case VK_FORMAT_R16G16_UNORM:
		{
			buffer += 4 * x;

			Short4 value = *Pointer<Short4>(buffer);

			// First apply the channel write mask (new bits from current,
			// remaining bits from memory), then the pixel mask.
			if((rgbaWriteMask & 0x00000003) != 0x00000003)
			{
				Short4 masked = value;
				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[rgbaWriteMask & 0x3][0]));
				masked &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[~rgbaWriteMask & 0x3][0]));
				current.x |= masked;
			}

			current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskD01Q) + xMask * 8);
			current.x |= value;
			*Pointer<Short4>(buffer) = current.x;

			// Second row, same procedure with current.y.
			buffer += pitchB;

			value = *Pointer<Short4>(buffer);

			if((rgbaWriteMask & 0x00000003) != 0x00000003)
			{
				Short4 masked = value;
				current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[rgbaWriteMask & 0x3][0]));
				masked &= *Pointer<Short4>(constants + OFFSET(Constants, maskW01Q[~rgbaWriteMask & 0x3][0]));
				current.y |= masked;
			}

			current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskD23Q) + xMask * 8);
			current.y |= value;
			*Pointer<Short4>(buffer) = current.y;
		}
		break;
		case VK_FORMAT_R16G16B16A16_UNORM:
		{
			// One 8-byte pixel per register: current.x and current.y go to
			// the first row, current.z and current.w to the second row.
			buffer += 8 * x;

			{
				Short4 value = *Pointer<Short4>(buffer);

				if(rgbaWriteMask != 0x0000000F)
				{
					Short4 masked = value;
					current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
					current.x |= masked;
				}

				current.x &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ0Q) + xMask * 8);
				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ0Q) + xMask * 8);
				current.x |= value;
				*Pointer<Short4>(buffer) = current.x;
			}

			{
				Short4 value = *Pointer<Short4>(buffer + 8);

				if(rgbaWriteMask != 0x0000000F)
				{
					Short4 masked = value;
					current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
					current.y |= masked;
				}

				current.y &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ1Q) + xMask * 8);
				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ1Q) + xMask * 8);
				current.y |= value;
				*Pointer<Short4>(buffer + 8) = current.y;
			}

			buffer += pitchB;

			{
				Short4 value = *Pointer<Short4>(buffer);

				if(rgbaWriteMask != 0x0000000F)
				{
					Short4 masked = value;
					current.z &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
					current.z |= masked;
				}

				current.z &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ2Q) + xMask * 8);
				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ2Q) + xMask * 8);
				current.z |= value;
				*Pointer<Short4>(buffer) = current.z;
			}

			{
				Short4 value = *Pointer<Short4>(buffer + 8);

				if(rgbaWriteMask != 0x0000000F)
				{
					Short4 masked = value;
					current.w &= *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
					masked &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q[rgbaWriteMask][0]));
					current.w |= masked;
				}

				current.w &= *Pointer<Short4>(constants + OFFSET(Constants, maskQ3Q) + xMask * 8);
				value &= *Pointer<Short4>(constants + OFFSET(Constants, invMaskQ3Q) + xMask * 8);
				current.w |= value;
				*Pointer<Short4>(buffer + 8) = current.w;
			}
		}
		break;
		case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
			// Red and blue are exchanged relative to A2B10G10R10, so use the
			// swapped write mask and share the code below.
			rgbaWriteMask = bgraWriteMask;
			// [[fallthrough]]
		case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
		{
			buffer += 4 * x;

			Int2 value = *Pointer<Int2>(buffer, 16);
			Int2 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
			if(rgbaWriteMask != 0xF)
			{
				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
			}
			*Pointer<Int2>(buffer) = (As<Int2>(current.x) & mergedMask) | (value & ~mergedMask);

			// Second row, same procedure with current.y.
			buffer += pitchB;

			value = *Pointer<Int2>(buffer, 16);
			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
			if(rgbaWriteMask != 0xF)
			{
				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
			}
			*Pointer<Int2>(buffer) = (As<Int2>(current.y) & mergedMask) | (value & ~mergedMask);
		}
		break;
		default:
			UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
	}
}
1733
blendFactor(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,VkBlendFactor blendFactorActive)1734 void PixelRoutine::blendFactor(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorActive)
1735 {
1736 switch(blendFactorActive)
1737 {
1738 case VK_BLEND_FACTOR_ZERO:
1739 blendFactor.x = Float4(0);
1740 blendFactor.y = Float4(0);
1741 blendFactor.z = Float4(0);
1742 break;
1743 case VK_BLEND_FACTOR_ONE:
1744 blendFactor.x = Float4(1);
1745 blendFactor.y = Float4(1);
1746 blendFactor.z = Float4(1);
1747 break;
1748 case VK_BLEND_FACTOR_SRC_COLOR:
1749 blendFactor.x = oC.x;
1750 blendFactor.y = oC.y;
1751 blendFactor.z = oC.z;
1752 break;
1753 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1754 blendFactor.x = Float4(1.0f) - oC.x;
1755 blendFactor.y = Float4(1.0f) - oC.y;
1756 blendFactor.z = Float4(1.0f) - oC.z;
1757 break;
1758 case VK_BLEND_FACTOR_DST_COLOR:
1759 blendFactor.x = pixel.x;
1760 blendFactor.y = pixel.y;
1761 blendFactor.z = pixel.z;
1762 break;
1763 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1764 blendFactor.x = Float4(1.0f) - pixel.x;
1765 blendFactor.y = Float4(1.0f) - pixel.y;
1766 blendFactor.z = Float4(1.0f) - pixel.z;
1767 break;
1768 case VK_BLEND_FACTOR_SRC_ALPHA:
1769 blendFactor.x = oC.w;
1770 blendFactor.y = oC.w;
1771 blendFactor.z = oC.w;
1772 break;
1773 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1774 blendFactor.x = Float4(1.0f) - oC.w;
1775 blendFactor.y = Float4(1.0f) - oC.w;
1776 blendFactor.z = Float4(1.0f) - oC.w;
1777 break;
1778 case VK_BLEND_FACTOR_DST_ALPHA:
1779 blendFactor.x = pixel.w;
1780 blendFactor.y = pixel.w;
1781 blendFactor.z = pixel.w;
1782 break;
1783 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1784 blendFactor.x = Float4(1.0f) - pixel.w;
1785 blendFactor.y = Float4(1.0f) - pixel.w;
1786 blendFactor.z = Float4(1.0f) - pixel.w;
1787 break;
1788 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1789 blendFactor.x = Float4(1.0f) - pixel.w;
1790 blendFactor.x = Min(blendFactor.x, oC.w);
1791 blendFactor.y = blendFactor.x;
1792 blendFactor.z = blendFactor.x;
1793 break;
1794 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1795 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[0]));
1796 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[1]));
1797 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[2]));
1798 break;
1799 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1800 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1801 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1802 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1803 break;
1804 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1805 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[0]));
1806 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[1]));
1807 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[2]));
1808 break;
1809 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1810 blendFactor.x = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1811 blendFactor.y = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1812 blendFactor.z = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1813 break;
1814
1815 default:
1816 UNSUPPORTED("VkBlendFactor: %d", int(blendFactorActive));
1817 }
1818 }
1819
blendFactorAlpha(Vector4f & blendFactor,const Vector4f & oC,const Vector4f & pixel,VkBlendFactor blendFactorAlphaActive)1820 void PixelRoutine::blendFactorAlpha(Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, VkBlendFactor blendFactorAlphaActive)
1821 {
1822 switch(blendFactorAlphaActive)
1823 {
1824 case VK_BLEND_FACTOR_ZERO:
1825 blendFactor.w = Float4(0);
1826 break;
1827 case VK_BLEND_FACTOR_ONE:
1828 blendFactor.w = Float4(1);
1829 break;
1830 case VK_BLEND_FACTOR_SRC_COLOR:
1831 blendFactor.w = oC.w;
1832 break;
1833 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1834 blendFactor.w = Float4(1.0f) - oC.w;
1835 break;
1836 case VK_BLEND_FACTOR_DST_COLOR:
1837 blendFactor.w = pixel.w;
1838 break;
1839 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1840 blendFactor.w = Float4(1.0f) - pixel.w;
1841 break;
1842 case VK_BLEND_FACTOR_SRC_ALPHA:
1843 blendFactor.w = oC.w;
1844 break;
1845 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1846 blendFactor.w = Float4(1.0f) - oC.w;
1847 break;
1848 case VK_BLEND_FACTOR_DST_ALPHA:
1849 blendFactor.w = pixel.w;
1850 break;
1851 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1852 blendFactor.w = Float4(1.0f) - pixel.w;
1853 break;
1854 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1855 blendFactor.w = Float4(1.0f);
1856 break;
1857 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1858 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1859 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData, factor.blendConstant4F[3]));
1860 break;
1861 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1862 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1863 blendFactor.w = *Pointer<Float4>(data + OFFSET(DrawData, factor.invBlendConstant4F[3]));
1864 break;
1865 default:
1866 UNSUPPORTED("VkBlendFactor: %d", int(blendFactorAlphaActive));
1867 }
1868 }
1869
alphaBlend(int index,const Pointer<Byte> & cBuffer,Vector4f & oC,const Int & x)1870 void PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, Vector4f &oC, const Int &x)
1871 {
1872 if(!state.blendState[index].alphaBlendEnable)
1873 {
1874 return;
1875 }
1876
1877 Pointer<Byte> buffer = cBuffer;
1878 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1879
1880 // pixel holds four texel color values.
1881 // Note: Despite the type being Vector4f, the colors may be stored as
1882 // integers. Half-floats are stored as full 32-bit floats.
1883 // Non-float and non-fixed point formats are not alpha blended.
1884 Vector4f pixel;
1885
1886 Vector4s color;
1887 Short4 c01;
1888 Short4 c23;
1889
1890 Float4 one;
1891 vk::Format format(state.targetFormat[index]);
1892 if(format.isFloatFormat())
1893 {
1894 one = Float4(1.0f);
1895 }
1896 else if(format.isUnnormalizedInteger())
1897 {
1898 one = As<Float4>(format.isUnsignedComponent(0) ? Int4(0xFFFFFFFF) : Int4(0x7FFFFFFF));
1899 }
1900
1901 switch(state.targetFormat[index])
1902 {
1903 case VK_FORMAT_R32_SINT:
1904 case VK_FORMAT_R32_UINT:
1905 case VK_FORMAT_R32_SFLOAT:
1906 // FIXME: movlps
1907 buffer += 4 * x;
1908 pixel.x.x = *Pointer<Float>(buffer + 0);
1909 pixel.x.y = *Pointer<Float>(buffer + 4);
1910 buffer += pitchB;
1911 // FIXME: movhps
1912 pixel.x.z = *Pointer<Float>(buffer + 0);
1913 pixel.x.w = *Pointer<Float>(buffer + 4);
1914 pixel.y = pixel.z = pixel.w = one;
1915 break;
1916 case VK_FORMAT_R32G32_SINT:
1917 case VK_FORMAT_R32G32_UINT:
1918 case VK_FORMAT_R32G32_SFLOAT:
1919 buffer += 8 * x;
1920 pixel.x = *Pointer<Float4>(buffer, 16);
1921 buffer += pitchB;
1922 pixel.y = *Pointer<Float4>(buffer, 16);
1923 pixel.z = pixel.x;
1924 pixel.x = ShuffleLowHigh(pixel.x, pixel.y, 0x0202);
1925 pixel.z = ShuffleLowHigh(pixel.z, pixel.y, 0x1313);
1926 pixel.y = pixel.z;
1927 pixel.z = pixel.w = one;
1928 break;
1929 case VK_FORMAT_R32G32B32A32_SFLOAT:
1930 case VK_FORMAT_R32G32B32A32_SINT:
1931 case VK_FORMAT_R32G32B32A32_UINT:
1932 buffer += 16 * x;
1933 pixel.x = *Pointer<Float4>(buffer + 0, 16);
1934 pixel.y = *Pointer<Float4>(buffer + 16, 16);
1935 buffer += pitchB;
1936 pixel.z = *Pointer<Float4>(buffer + 0, 16);
1937 pixel.w = *Pointer<Float4>(buffer + 16, 16);
1938 transpose4x4(pixel.x, pixel.y, pixel.z, pixel.w);
1939 break;
1940 case VK_FORMAT_R16_SFLOAT:
1941 buffer += 2 * x;
1942 pixel.x.x = Float(*Pointer<Half>(buffer + 0));
1943 pixel.x.y = Float(*Pointer<Half>(buffer + 2));
1944 buffer += pitchB;
1945 pixel.x.z = Float(*Pointer<Half>(buffer + 0));
1946 pixel.x.w = Float(*Pointer<Half>(buffer + 2));
1947 pixel.y = pixel.z = pixel.w = one;
1948 break;
1949 case VK_FORMAT_R16G16_SFLOAT:
1950 buffer += 4 * x;
1951 pixel.x.x = Float(*Pointer<Half>(buffer + 0));
1952 pixel.y.x = Float(*Pointer<Half>(buffer + 2));
1953 pixel.x.y = Float(*Pointer<Half>(buffer + 4));
1954 pixel.y.y = Float(*Pointer<Half>(buffer + 6));
1955 buffer += pitchB;
1956 pixel.x.z = Float(*Pointer<Half>(buffer + 0));
1957 pixel.y.z = Float(*Pointer<Half>(buffer + 2));
1958 pixel.x.w = Float(*Pointer<Half>(buffer + 4));
1959 pixel.y.w = Float(*Pointer<Half>(buffer + 6));
1960 pixel.z = pixel.w = one;
1961 break;
1962 case VK_FORMAT_R16G16B16A16_SFLOAT:
1963 buffer += 8 * x;
1964 pixel.x.x = Float(*Pointer<Half>(buffer + 0x0));
1965 pixel.y.x = Float(*Pointer<Half>(buffer + 0x2));
1966 pixel.z.x = Float(*Pointer<Half>(buffer + 0x4));
1967 pixel.w.x = Float(*Pointer<Half>(buffer + 0x6));
1968 pixel.x.y = Float(*Pointer<Half>(buffer + 0x8));
1969 pixel.y.y = Float(*Pointer<Half>(buffer + 0xa));
1970 pixel.z.y = Float(*Pointer<Half>(buffer + 0xc));
1971 pixel.w.y = Float(*Pointer<Half>(buffer + 0xe));
1972 buffer += pitchB;
1973 pixel.x.z = Float(*Pointer<Half>(buffer + 0x0));
1974 pixel.y.z = Float(*Pointer<Half>(buffer + 0x2));
1975 pixel.z.z = Float(*Pointer<Half>(buffer + 0x4));
1976 pixel.w.z = Float(*Pointer<Half>(buffer + 0x6));
1977 pixel.x.w = Float(*Pointer<Half>(buffer + 0x8));
1978 pixel.y.w = Float(*Pointer<Half>(buffer + 0xa));
1979 pixel.z.w = Float(*Pointer<Half>(buffer + 0xc));
1980 pixel.w.w = Float(*Pointer<Half>(buffer + 0xe));
1981 break;
1982 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
1983 buffer += 4 * x;
1984 pixel.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1985 pixel.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1986 buffer += pitchB;
1987 pixel.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1988 pixel.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1989 transpose4x3(pixel.x, pixel.y, pixel.z, pixel.w);
1990 pixel.w = one;
1991 break;
1992 default:
1993 UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
1994 }
1995
1996 // Final Color = ObjectColor * SourceBlendFactor + PixelColor * DestinationBlendFactor
1997 Vector4f sourceFactor;
1998 Vector4f destFactor;
1999
2000 blendFactor(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactor);
2001 blendFactor(destFactor, oC, pixel, state.blendState[index].destBlendFactor);
2002
2003 oC.x *= sourceFactor.x;
2004 oC.y *= sourceFactor.y;
2005 oC.z *= sourceFactor.z;
2006
2007 pixel.x *= destFactor.x;
2008 pixel.y *= destFactor.y;
2009 pixel.z *= destFactor.z;
2010
2011 switch(state.blendState[index].blendOperation)
2012 {
2013 case VK_BLEND_OP_ADD:
2014 oC.x += pixel.x;
2015 oC.y += pixel.y;
2016 oC.z += pixel.z;
2017 break;
2018 case VK_BLEND_OP_SUBTRACT:
2019 oC.x -= pixel.x;
2020 oC.y -= pixel.y;
2021 oC.z -= pixel.z;
2022 break;
2023 case VK_BLEND_OP_REVERSE_SUBTRACT:
2024 oC.x = pixel.x - oC.x;
2025 oC.y = pixel.y - oC.y;
2026 oC.z = pixel.z - oC.z;
2027 break;
2028 case VK_BLEND_OP_MIN:
2029 oC.x = Min(oC.x, pixel.x);
2030 oC.y = Min(oC.y, pixel.y);
2031 oC.z = Min(oC.z, pixel.z);
2032 break;
2033 case VK_BLEND_OP_MAX:
2034 oC.x = Max(oC.x, pixel.x);
2035 oC.y = Max(oC.y, pixel.y);
2036 oC.z = Max(oC.z, pixel.z);
2037 break;
2038 case VK_BLEND_OP_SRC_EXT:
2039 // No operation
2040 break;
2041 case VK_BLEND_OP_DST_EXT:
2042 oC.x = pixel.x;
2043 oC.y = pixel.y;
2044 oC.z = pixel.z;
2045 break;
2046 case VK_BLEND_OP_ZERO_EXT:
2047 oC.x = Float4(0.0f);
2048 oC.y = Float4(0.0f);
2049 oC.z = Float4(0.0f);
2050 break;
2051 default:
2052 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
2053 }
2054
2055 blendFactorAlpha(sourceFactor, oC, pixel, state.blendState[index].sourceBlendFactorAlpha);
2056 blendFactorAlpha(destFactor, oC, pixel, state.blendState[index].destBlendFactorAlpha);
2057
2058 oC.w *= sourceFactor.w;
2059 pixel.w *= destFactor.w;
2060
2061 switch(state.blendState[index].blendOperationAlpha)
2062 {
2063 case VK_BLEND_OP_ADD:
2064 oC.w += pixel.w;
2065 break;
2066 case VK_BLEND_OP_SUBTRACT:
2067 oC.w -= pixel.w;
2068 break;
2069 case VK_BLEND_OP_REVERSE_SUBTRACT:
2070 pixel.w -= oC.w;
2071 oC.w = pixel.w;
2072 break;
2073 case VK_BLEND_OP_MIN:
2074 oC.w = Min(oC.w, pixel.w);
2075 break;
2076 case VK_BLEND_OP_MAX:
2077 oC.w = Max(oC.w, pixel.w);
2078 break;
2079 case VK_BLEND_OP_SRC_EXT:
2080 // No operation
2081 break;
2082 case VK_BLEND_OP_DST_EXT:
2083 oC.w = pixel.w;
2084 break;
2085 case VK_BLEND_OP_ZERO_EXT:
2086 oC.w = Float4(0.0f);
2087 break;
2088 default:
2089 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
2090 }
2091
2092 if(format.isUnsignedComponent(0)) { oC.x = Max(oC.x, Float4(0.0f)); }
2093 if(format.isUnsignedComponent(1)) { oC.y = Max(oC.y, Float4(0.0f)); }
2094 if(format.isUnsignedComponent(2)) { oC.z = Max(oC.z, Float4(0.0f)); }
2095 if(format.isUnsignedComponent(3)) { oC.w = Max(oC.w, Float4(0.0f)); }
2096 }
2097
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & oC,const Int & sMask,const Int & zMask,const Int & cMask)2098 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &oC, const Int &sMask, const Int &zMask, const Int &cMask)
2099 {
2100 switch(state.targetFormat[index])
2101 {
2102 case VK_FORMAT_R16_SFLOAT:
2103 case VK_FORMAT_R32_SFLOAT:
2104 case VK_FORMAT_R32_SINT:
2105 case VK_FORMAT_R32_UINT:
2106 case VK_FORMAT_R16_SINT:
2107 case VK_FORMAT_R16_UINT:
2108 case VK_FORMAT_R8_SINT:
2109 case VK_FORMAT_R8_UINT:
2110 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2111 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2112 break;
2113 case VK_FORMAT_R16G16_SFLOAT:
2114 case VK_FORMAT_R32G32_SFLOAT:
2115 case VK_FORMAT_R32G32_SINT:
2116 case VK_FORMAT_R32G32_UINT:
2117 case VK_FORMAT_R16G16_SINT:
2118 case VK_FORMAT_R16G16_UINT:
2119 case VK_FORMAT_R8G8_SINT:
2120 case VK_FORMAT_R8G8_UINT:
2121 oC.z = oC.x;
2122 oC.x = UnpackLow(oC.x, oC.y);
2123 oC.z = UnpackHigh(oC.z, oC.y);
2124 oC.y = oC.z;
2125 break;
2126 case VK_FORMAT_R16G16B16A16_SFLOAT:
2127 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2128 case VK_FORMAT_R32G32B32A32_SFLOAT:
2129 case VK_FORMAT_R32G32B32A32_SINT:
2130 case VK_FORMAT_R32G32B32A32_UINT:
2131 case VK_FORMAT_R16G16B16A16_SINT:
2132 case VK_FORMAT_R16G16B16A16_UINT:
2133 case VK_FORMAT_R8G8B8A8_SINT:
2134 case VK_FORMAT_R8G8B8A8_UINT:
2135 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2136 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2137 transpose4x4(oC.x, oC.y, oC.z, oC.w);
2138 break;
2139 default:
2140 UNSUPPORTED("VkFormat: %d", int(state.targetFormat[index]));
2141 }
2142
2143 int rgbaWriteMask = state.colorWriteActive(index);
2144 int bgraWriteMask = (rgbaWriteMask & 0x0000000A) | (rgbaWriteMask & 0x00000001) << 2 | (rgbaWriteMask & 0x00000004) >> 2;
2145
2146 Int xMask; // Combination of all masks
2147
2148 if(state.depthTestActive)
2149 {
2150 xMask = zMask;
2151 }
2152 else
2153 {
2154 xMask = cMask;
2155 }
2156
2157 if(state.stencilActive)
2158 {
2159 xMask &= sMask;
2160 }
2161
2162 auto targetFormat = state.targetFormat[index];
2163
2164 Pointer<Byte> buffer = cBuffer;
2165 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2166 Float4 value;
2167
2168 switch(targetFormat)
2169 {
2170 case VK_FORMAT_R32_SFLOAT:
2171 case VK_FORMAT_R32_SINT:
2172 case VK_FORMAT_R32_UINT:
2173 if(rgbaWriteMask & 0x00000001)
2174 {
2175 buffer += 4 * x;
2176
2177 // FIXME: movlps
2178 value.x = *Pointer<Float>(buffer + 0);
2179 value.y = *Pointer<Float>(buffer + 4);
2180
2181 buffer += pitchB;
2182
2183 // FIXME: movhps
2184 value.z = *Pointer<Float>(buffer + 0);
2185 value.w = *Pointer<Float>(buffer + 4);
2186
2187 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2188 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2189 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2190
2191 // FIXME: movhps
2192 *Pointer<Float>(buffer + 0) = oC.x.z;
2193 *Pointer<Float>(buffer + 4) = oC.x.w;
2194
2195 buffer -= pitchB;
2196
2197 // FIXME: movlps
2198 *Pointer<Float>(buffer + 0) = oC.x.x;
2199 *Pointer<Float>(buffer + 4) = oC.x.y;
2200 }
2201 break;
2202 case VK_FORMAT_R16_SFLOAT:
2203 if(rgbaWriteMask & 0x00000001)
2204 {
2205 buffer += 2 * x;
2206
2207 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2208 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2209
2210 buffer += pitchB;
2211
2212 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2213 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2214
2215 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2216 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2217 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2218
2219 *Pointer<Half>(buffer + 0) = Half(oC.x.z);
2220 *Pointer<Half>(buffer + 2) = Half(oC.x.w);
2221
2222 buffer -= pitchB;
2223
2224 *Pointer<Half>(buffer + 0) = Half(oC.x.x);
2225 *Pointer<Half>(buffer + 2) = Half(oC.x.y);
2226 }
2227 break;
2228 case VK_FORMAT_R16_SINT:
2229 case VK_FORMAT_R16_UINT:
2230 if(rgbaWriteMask & 0x00000001)
2231 {
2232 buffer += 2 * x;
2233
2234 UShort4 xyzw;
2235 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2236
2237 buffer += pitchB;
2238
2239 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2240 value = As<Float4>(Int4(xyzw));
2241
2242 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2243 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2244 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2245
2246 if(targetFormat == VK_FORMAT_R16_SINT)
2247 {
2248 Float component = oC.x.z;
2249 *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2250 component = oC.x.w;
2251 *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2252
2253 buffer -= pitchB;
2254
2255 component = oC.x.x;
2256 *Pointer<Short>(buffer + 0) = Short(As<Int>(component));
2257 component = oC.x.y;
2258 *Pointer<Short>(buffer + 2) = Short(As<Int>(component));
2259 }
2260 else // VK_FORMAT_R16_UINT
2261 {
2262 Float component = oC.x.z;
2263 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2264 component = oC.x.w;
2265 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2266
2267 buffer -= pitchB;
2268
2269 component = oC.x.x;
2270 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2271 component = oC.x.y;
2272 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2273 }
2274 }
2275 break;
2276 case VK_FORMAT_R8_SINT:
2277 case VK_FORMAT_R8_UINT:
2278 if(rgbaWriteMask & 0x00000001)
2279 {
2280 buffer += x;
2281
2282 UInt xyzw, packedCol;
2283
2284 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFF;
2285 buffer += pitchB;
2286 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2287
2288 Short4 tmpCol = Short4(As<Int4>(oC.x));
2289 if(targetFormat == VK_FORMAT_R8_SINT)
2290 {
2291 tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2292 }
2293 else
2294 {
2295 tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2296 }
2297 packedCol = Extract(As<Int2>(tmpCol), 0);
2298
2299 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2300 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2301
2302 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2303 buffer -= pitchB;
2304 *Pointer<UShort>(buffer) = UShort(packedCol);
2305 }
2306 break;
2307 case VK_FORMAT_R32G32_SFLOAT:
2308 case VK_FORMAT_R32G32_SINT:
2309 case VK_FORMAT_R32G32_UINT:
2310 buffer += 8 * x;
2311
2312 value = *Pointer<Float4>(buffer);
2313
2314 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2315 {
2316 Float4 masked = value;
2317 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2318 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2319 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2320 }
2321
2322 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2323 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2324 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2325 *Pointer<Float4>(buffer) = oC.x;
2326
2327 buffer += pitchB;
2328
2329 value = *Pointer<Float4>(buffer);
2330
2331 if((rgbaWriteMask & 0x00000003) != 0x00000003)
2332 {
2333 Float4 masked;
2334
2335 masked = value;
2336 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[rgbaWriteMask & 0x3][0])));
2337 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~rgbaWriteMask & 0x3][0])));
2338 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2339 }
2340
2341 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2342 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2343 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2344 *Pointer<Float4>(buffer) = oC.y;
2345 break;
2346 case VK_FORMAT_R16G16_SFLOAT:
2347 if((rgbaWriteMask & 0x00000003) != 0x0)
2348 {
2349 buffer += 4 * x;
2350
2351 UInt2 rgbaMask;
2352 UInt2 packedCol;
2353 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
2354 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
2355
2356 UShort4 value = *Pointer<UShort4>(buffer);
2357 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2358 if((rgbaWriteMask & 0x3) != 0x3)
2359 {
2360 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2361 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2362 mergedMask &= rgbaMask;
2363 }
2364 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2365
2366 buffer += pitchB;
2367
2368 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 0);
2369 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 1);
2370 value = *Pointer<UShort4>(buffer);
2371 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2372 if((rgbaWriteMask & 0x3) != 0x3)
2373 {
2374 mergedMask &= rgbaMask;
2375 }
2376 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2377 }
2378 break;
2379 case VK_FORMAT_R16G16_SINT:
2380 case VK_FORMAT_R16G16_UINT:
2381 if((rgbaWriteMask & 0x00000003) != 0x0)
2382 {
2383 buffer += 4 * x;
2384
2385 UInt2 rgbaMask;
2386 UShort4 packedCol = UShort4(As<Int4>(oC.x));
2387 UShort4 value = *Pointer<UShort4>(buffer);
2388 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2389 if((rgbaWriteMask & 0x3) != 0x3)
2390 {
2391 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask & 0x3][0]));
2392 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2393 mergedMask &= rgbaMask;
2394 }
2395 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2396
2397 buffer += pitchB;
2398
2399 packedCol = UShort4(As<Int4>(oC.y));
2400 value = *Pointer<UShort4>(buffer);
2401 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2402 if((rgbaWriteMask & 0x3) != 0x3)
2403 {
2404 mergedMask &= rgbaMask;
2405 }
2406 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2407 }
2408 break;
2409 case VK_FORMAT_R8G8_SINT:
2410 case VK_FORMAT_R8G8_UINT:
2411 if((rgbaWriteMask & 0x00000003) != 0x0)
2412 {
2413 buffer += 2 * x;
2414
2415 Int2 xyzw, packedCol;
2416
2417 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2418 buffer += pitchB;
2419 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2420
2421 if(targetFormat == VK_FORMAT_R8G8_SINT)
2422 {
2423 packedCol = As<Int2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2424 }
2425 else
2426 {
2427 packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2428 }
2429
2430 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2431 if((rgbaWriteMask & 0x3) != 0x3)
2432 {
2433 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (rgbaWriteMask & 0x3)][0]));
2434 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2435 mergedMask &= rgbaMask;
2436 }
2437
2438 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2439
2440 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2441 buffer -= pitchB;
2442 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2443 }
2444 break;
2445 case VK_FORMAT_R32G32B32A32_SFLOAT:
2446 case VK_FORMAT_R32G32B32A32_SINT:
2447 case VK_FORMAT_R32G32B32A32_UINT:
2448 buffer += 16 * x;
2449
2450 {
2451 value = *Pointer<Float4>(buffer, 16);
2452
2453 if(rgbaWriteMask != 0x0000000F)
2454 {
2455 Float4 masked = value;
2456 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2457 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2458 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
2459 }
2460
2461 oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2462 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2463 oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
2464 *Pointer<Float4>(buffer, 16) = oC.x;
2465 }
2466
2467 {
2468 value = *Pointer<Float4>(buffer + 16, 16);
2469
2470 if(rgbaWriteMask != 0x0000000F)
2471 {
2472 Float4 masked = value;
2473 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2474 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2475 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(masked));
2476 }
2477
2478 oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2479 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2480 oC.y = As<Float4>(As<Int4>(oC.y) | As<Int4>(value));
2481 *Pointer<Float4>(buffer + 16, 16) = oC.y;
2482 }
2483
2484 buffer += pitchB;
2485
2486 {
2487 value = *Pointer<Float4>(buffer, 16);
2488
2489 if(rgbaWriteMask != 0x0000000F)
2490 {
2491 Float4 masked = value;
2492 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2493 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2494 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(masked));
2495 }
2496
2497 oC.z = As<Float4>(As<Int4>(oC.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2498 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2499 oC.z = As<Float4>(As<Int4>(oC.z) | As<Int4>(value));
2500 *Pointer<Float4>(buffer, 16) = oC.z;
2501 }
2502
2503 {
2504 value = *Pointer<Float4>(buffer + 16, 16);
2505
2506 if(rgbaWriteMask != 0x0000000F)
2507 {
2508 Float4 masked = value;
2509 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[rgbaWriteMask][0])));
2510 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[rgbaWriteMask][0])));
2511 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(masked));
2512 }
2513
2514 oC.w = As<Float4>(As<Int4>(oC.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2515 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2516 oC.w = As<Float4>(As<Int4>(oC.w) | As<Int4>(value));
2517 *Pointer<Float4>(buffer + 16, 16) = oC.w;
2518 }
2519 break;
2520 case VK_FORMAT_R16G16B16A16_SFLOAT:
2521 if((rgbaWriteMask & 0x0000000F) != 0x0)
2522 {
2523 buffer += 8 * x;
2524
2525 UInt4 rgbaMask;
2526 UInt4 value = *Pointer<UInt4>(buffer);
2527 UInt4 packedCol;
2528 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.y))) << 16) | UInt(As<UShort>(Half(oC.x.x))), 0);
2529 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.x.w))) << 16) | UInt(As<UShort>(Half(oC.x.z))), 1);
2530 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.y))) << 16) | UInt(As<UShort>(Half(oC.y.x))), 2);
2531 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.y.w))) << 16) | UInt(As<UShort>(Half(oC.y.z))), 3);
2532 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2533 if((rgbaWriteMask & 0xF) != 0xF)
2534 {
2535 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2536 rgbaMask = UInt4(tmpMask, tmpMask);
2537 mergedMask &= rgbaMask;
2538 }
2539 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2540
2541 buffer += pitchB;
2542
2543 value = *Pointer<UInt4>(buffer);
2544 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.y))) << 16) | UInt(As<UShort>(Half(oC.z.x))), 0);
2545 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.z.w))) << 16) | UInt(As<UShort>(Half(oC.z.z))), 1);
2546 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.y))) << 16) | UInt(As<UShort>(Half(oC.w.x))), 2);
2547 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(oC.w.w))) << 16) | UInt(As<UShort>(Half(oC.w.z))), 3);
2548 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2549 if((rgbaWriteMask & 0xF) != 0xF)
2550 {
2551 mergedMask &= rgbaMask;
2552 }
2553 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2554 }
2555 break;
2556 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2557 if((rgbaWriteMask & 0x7) != 0x0)
2558 {
2559 buffer += 4 * x;
2560
2561 UInt4 packedCol;
2562 packedCol = Insert(packedCol, r11g11b10Pack(oC.x), 0);
2563 packedCol = Insert(packedCol, r11g11b10Pack(oC.y), 1);
2564 packedCol = Insert(packedCol, r11g11b10Pack(oC.z), 2);
2565 packedCol = Insert(packedCol, r11g11b10Pack(oC.w), 3);
2566
2567 UInt4 value;
2568 value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
2569 value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
2570 buffer += pitchB;
2571 value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
2572 value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
2573
2574 UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0][0]) + xMask * 16, 16);
2575 if((rgbaWriteMask & 0x7) != 0x7)
2576 {
2577 mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[rgbaWriteMask & 0x7][0]), 16);
2578 }
2579 value = (packedCol & mask) | (value & ~mask);
2580
2581 *Pointer<UInt>(buffer + 0) = value.z;
2582 *Pointer<UInt>(buffer + 4) = value.w;
2583 buffer -= pitchB;
2584 *Pointer<UInt>(buffer + 0) = value.x;
2585 *Pointer<UInt>(buffer + 4) = value.y;
2586 }
2587 break;
2588 case VK_FORMAT_R16G16B16A16_SINT:
2589 case VK_FORMAT_R16G16B16A16_UINT:
2590 if((rgbaWriteMask & 0x0000000F) != 0x0)
2591 {
2592 buffer += 8 * x;
2593
2594 UInt4 rgbaMask;
2595 UShort8 value = *Pointer<UShort8>(buffer);
2596 UShort8 packedCol = UShort8(UShort4(As<Int4>(oC.x)), UShort4(As<Int4>(oC.y)));
2597 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2598 if((rgbaWriteMask & 0xF) != 0xF)
2599 {
2600 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[rgbaWriteMask][0]));
2601 rgbaMask = UInt4(tmpMask, tmpMask);
2602 mergedMask &= rgbaMask;
2603 }
2604 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2605
2606 buffer += pitchB;
2607
2608 value = *Pointer<UShort8>(buffer);
2609 packedCol = UShort8(UShort4(As<Int4>(oC.z)), UShort4(As<Int4>(oC.w)));
2610 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2611 if((rgbaWriteMask & 0xF) != 0xF)
2612 {
2613 mergedMask &= rgbaMask;
2614 }
2615 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2616 }
2617 break;
2618 case VK_FORMAT_R8G8B8A8_SINT:
2619 case VK_FORMAT_R8G8B8A8_UINT:
2620 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2621 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2622 if((rgbaWriteMask & 0x0000000F) != 0x0)
2623 {
2624 UInt2 value, packedCol, mergedMask;
2625
2626 buffer += 4 * x;
2627
2628 bool isSigned = targetFormat == VK_FORMAT_R8G8B8A8_SINT || targetFormat == VK_FORMAT_A8B8G8R8_SINT_PACK32;
2629
2630 if(isSigned)
2631 {
2632 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2633 }
2634 else
2635 {
2636 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.x)), Short4(As<Int4>(oC.y))));
2637 }
2638 value = *Pointer<UInt2>(buffer, 16);
2639 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2640 if(rgbaWriteMask != 0xF)
2641 {
2642 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2643 }
2644 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2645
2646 buffer += pitchB;
2647
2648 if(isSigned)
2649 {
2650 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2651 }
2652 else
2653 {
2654 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(oC.z)), Short4(As<Int4>(oC.w))));
2655 }
2656 value = *Pointer<UInt2>(buffer, 16);
2657 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2658 if(rgbaWriteMask != 0xF)
2659 {
2660 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[rgbaWriteMask][0]));
2661 }
2662 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2663 }
2664 break;
2665 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2666 if((rgbaWriteMask & 0x0000000F) != 0x0)
2667 {
2668 Int2 mergedMask, packedCol, value;
2669 Int4 packed = ((As<Int4>(oC.w) & Int4(0x3)) << 30) |
2670 ((As<Int4>(oC.z) & Int4(0x3ff)) << 20) |
2671 ((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
2672 ((As<Int4>(oC.x) & Int4(0x3ff)));
2673
2674 buffer += 4 * x;
2675 value = *Pointer<Int2>(buffer, 16);
2676 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2677 if(rgbaWriteMask != 0xF)
2678 {
2679 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
2680 }
2681 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2682
2683 buffer += pitchB;
2684
2685 value = *Pointer<Int2>(buffer, 16);
2686 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2687 if(rgbaWriteMask != 0xF)
2688 {
2689 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[rgbaWriteMask][0]));
2690 }
2691 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2692 }
2693 break;
2694 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2695 if((bgraWriteMask & 0x0000000F) != 0x0)
2696 {
2697 Int2 mergedMask, packedCol, value;
2698 Int4 packed = ((As<Int4>(oC.w) & Int4(0x3)) << 30) |
2699 ((As<Int4>(oC.x) & Int4(0x3ff)) << 20) |
2700 ((As<Int4>(oC.y) & Int4(0x3ff)) << 10) |
2701 ((As<Int4>(oC.z) & Int4(0x3ff)));
2702
2703 buffer += 4 * x;
2704 value = *Pointer<Int2>(buffer, 16);
2705 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2706 if(bgraWriteMask != 0xF)
2707 {
2708 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
2709 }
2710 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2711
2712 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2713
2714 value = *Pointer<Int2>(buffer, 16);
2715 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2716 if(bgraWriteMask != 0xF)
2717 {
2718 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[bgraWriteMask][0]));
2719 }
2720 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2721 }
2722 break;
2723 default:
2724 UNSUPPORTED("VkFormat: %d", int(targetFormat));
2725 }
2726 }
2727
convertFixed16(const Float4 & cf,bool saturate)2728 UShort4 PixelRoutine::convertFixed16(const Float4 &cf, bool saturate)
2729 {
2730 return UShort4(cf * Float4(0xFFFF), saturate);
2731 }
2732
sRGBtoLinear16_12_16(Vector4s & c)2733 void PixelRoutine::sRGBtoLinear16_12_16(Vector4s &c)
2734 {
2735 Pointer<Byte> LUT = constants + OFFSET(Constants, sRGBtoLinear12_16);
2736
2737 c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
2738 c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
2739 c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
2740
2741 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2742 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2743 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2744 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2745
2746 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2747 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2748 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2749 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2750
2751 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2752 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2753 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2754 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2755 }
2756
linearToSRGB16_12_16(Vector4s & c)2757 void PixelRoutine::linearToSRGB16_12_16(Vector4s &c)
2758 {
2759 c.x = AddSat(As<UShort4>(c.x), UShort4(0x0007)) >> 4;
2760 c.y = AddSat(As<UShort4>(c.y), UShort4(0x0007)) >> 4;
2761 c.z = AddSat(As<UShort4>(c.z), UShort4(0x0007)) >> 4;
2762
2763 linearToSRGB12_16(c);
2764 }
2765
linearToSRGB12_16(Vector4s & c)2766 void PixelRoutine::linearToSRGB12_16(Vector4s &c)
2767 {
2768 Pointer<Byte> LUT = constants + OFFSET(Constants, linearToSRGB12_16);
2769
2770 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 0))), 0);
2771 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 1))), 1);
2772 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 2))), 2);
2773 c.x = Insert(c.x, *Pointer<Short>(LUT + 2 * Int(Extract(c.x, 3))), 3);
2774
2775 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 0))), 0);
2776 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 1))), 1);
2777 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 2))), 2);
2778 c.y = Insert(c.y, *Pointer<Short>(LUT + 2 * Int(Extract(c.y, 3))), 3);
2779
2780 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 0))), 0);
2781 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 1))), 1);
2782 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 2))), 2);
2783 c.z = Insert(c.z, *Pointer<Short>(LUT + 2 * Int(Extract(c.z, 3))), 3);
2784 }
2785
// Converts sRGB-encoded values to linear ones using a cheap polynomial
// approximation of x^2.2, namely 0.73 * x^2 + 0.27 * x^3.
// The result is clamped to the [0, 1] range.
Float4 PixelRoutine::sRGBtoLinear(const Float4 &x)
{
	const Float4 squared = x * x;
	const Float4 cubed = squared * x;
	const Float4 approximation = squared * Float4(0.73f) + cubed * Float4(0.27f);

	const Float4 nonNegative = Max(approximation, Float4(0.0f));

	return Min(nonNegative, Float4(1.0f));
}
2793
2794 } // namespace sw
2795