1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file blend_jit.cpp
24 *
25 * @brief Implementation of the blend jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "blend_jit.h"
34 #include "gen_state_llvm.h"
35
// Components whose bit-width is <= QUANTIZE_THRESHOLD are quantized (see BlendJit::Quantize)
37 #define QUANTIZE_THRESHOLD 2
38
39 using namespace llvm;
40 using namespace SwrJit;
41
42 //////////////////////////////////////////////////////////////////////////
43 /// Interface to Jitting a blend shader
44 //////////////////////////////////////////////////////////////////////////
45 struct BlendJit : public Builder
46 {
BlendJitBlendJit47 BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
48
49 template<bool Color, bool Alpha>
GenerateBlendFactorBlendJit50 void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
51 {
52 Value* out[4];
53
54 switch (factor)
55 {
56 case BLENDFACTOR_ONE:
57 out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
58 break;
59 case BLENDFACTOR_SRC_COLOR:
60 out[0] = src[0];
61 out[1] = src[1];
62 out[2] = src[2];
63 out[3] = src[3];
64 break;
65 case BLENDFACTOR_SRC_ALPHA:
66 out[0] = out[1] = out[2] = out[3] = src[3];
67 break;
68 case BLENDFACTOR_DST_ALPHA:
69 out[0] = out[1] = out[2] = out[3] = dst[3];
70 break;
71 case BLENDFACTOR_DST_COLOR:
72 out[0] = dst[0];
73 out[1] = dst[1];
74 out[2] = dst[2];
75 out[3] = dst[3];
76 break;
77 case BLENDFACTOR_SRC_ALPHA_SATURATE:
78 out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
79 out[3] = VIMMED1(1.0f);
80 break;
81 case BLENDFACTOR_CONST_COLOR:
82 out[0] = constColor[0];
83 out[1] = constColor[1];
84 out[2] = constColor[2];
85 out[3] = constColor[3];
86 break;
87 case BLENDFACTOR_CONST_ALPHA:
88 out[0] = out[1] = out[2] = out[3] = constColor[3];
89 break;
90 case BLENDFACTOR_SRC1_COLOR:
91 out[0] = src1[0];
92 out[1] = src1[1];
93 out[2] = src1[2];
94 out[3] = src1[3];
95 break;
96 case BLENDFACTOR_SRC1_ALPHA:
97 out[0] = out[1] = out[2] = out[3] = src1[3];
98 break;
99 case BLENDFACTOR_ZERO:
100 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
101 break;
102 case BLENDFACTOR_INV_SRC_COLOR:
103 out[0] = FSUB(VIMMED1(1.0f), src[0]);
104 out[1] = FSUB(VIMMED1(1.0f), src[1]);
105 out[2] = FSUB(VIMMED1(1.0f), src[2]);
106 out[3] = FSUB(VIMMED1(1.0f), src[3]);
107 break;
108 case BLENDFACTOR_INV_SRC_ALPHA:
109 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
110 break;
111 case BLENDFACTOR_INV_DST_ALPHA:
112 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
113 break;
114 case BLENDFACTOR_INV_DST_COLOR:
115 out[0] = FSUB(VIMMED1(1.0f), dst[0]);
116 out[1] = FSUB(VIMMED1(1.0f), dst[1]);
117 out[2] = FSUB(VIMMED1(1.0f), dst[2]);
118 out[3] = FSUB(VIMMED1(1.0f), dst[3]);
119 break;
120 case BLENDFACTOR_INV_CONST_COLOR:
121 out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
122 out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
123 out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
124 out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
125 break;
126 case BLENDFACTOR_INV_CONST_ALPHA:
127 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
128 break;
129 case BLENDFACTOR_INV_SRC1_COLOR:
130 out[0] = FSUB(VIMMED1(1.0f), src1[0]);
131 out[1] = FSUB(VIMMED1(1.0f), src1[1]);
132 out[2] = FSUB(VIMMED1(1.0f), src1[2]);
133 out[3] = FSUB(VIMMED1(1.0f), src1[3]);
134 break;
135 case BLENDFACTOR_INV_SRC1_ALPHA:
136 out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
137 break;
138 default:
139 SWR_INVALID("Unsupported blend factor: %d", factor);
140 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
141 break;
142 }
143
144 if (Color)
145 {
146 result[0] = out[0];
147 result[1] = out[1];
148 result[2] = out[2];
149 }
150
151 if (Alpha)
152 {
153 result[3] = out[3];
154 }
155 }
156
ClampBlendJit157 void Clamp(SWR_FORMAT format, Value* src[4])
158 {
159 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
160 SWR_TYPE type = info.type[0];
161
162 switch (type)
163 {
164 default:
165 break;
166
167 case SWR_TYPE_UNORM:
168 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
169 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
170 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
171 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
172 break;
173
174 case SWR_TYPE_SNORM:
175 src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
176 src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
177 src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
178 src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
179 break;
180
181 case SWR_TYPE_UNKNOWN: SWR_INVALID("Unsupport format type: %d", type);
182 }
183 }
184
ApplyDefaultsBlendJit185 void ApplyDefaults(SWR_FORMAT format, Value* src[4])
186 {
187 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
188
189 bool valid[] = { false, false, false, false };
190 for (uint32_t c = 0; c < info.numComps; ++c)
191 {
192 valid[info.swizzle[c]] = true;
193 }
194
195 for (uint32_t c = 0; c < 4; ++c)
196 {
197 if (!valid[c])
198 {
199 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
200 }
201 }
202 }
203
ApplyUnusedDefaultsBlendJit204 void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
205 {
206 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
207
208 for (uint32_t c = 0; c < info.numComps; ++c)
209 {
210 if (info.type[c] == SWR_TYPE_UNUSED)
211 {
212 src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
213 }
214 }
215 }
216
QuantizeBlendJit217 void Quantize(SWR_FORMAT format, Value* src[4])
218 {
219 const SWR_FORMAT_INFO& info = GetFormatInfo(format);
220 for (uint32_t c = 0; c < info.numComps; ++c)
221 {
222 if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
223 {
224 uint32_t swizComp = info.swizzle[c];
225 float factor = (float)((1 << info.bpc[c]) - 1);
226 switch (info.type[c])
227 {
228 case SWR_TYPE_UNORM:
229 src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
230 src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
231 src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
232 break;
233 default: SWR_INVALID("Unsupported format type: %d", info.type[c]);
234 }
235 }
236 }
237 }
238
239 template<bool Color, bool Alpha>
BlendFuncBlendJit240 void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
241 {
242 Value* out[4];
243 Value* srcBlend[4];
244 Value* dstBlend[4];
245 for (uint32_t i = 0; i < 4; ++i)
246 {
247 srcBlend[i] = FMUL(src[i], srcFactor[i]);
248 dstBlend[i] = FMUL(dst[i], dstFactor[i]);
249 }
250
251 switch (blendOp)
252 {
253 case BLENDOP_ADD:
254 out[0] = FADD(srcBlend[0], dstBlend[0]);
255 out[1] = FADD(srcBlend[1], dstBlend[1]);
256 out[2] = FADD(srcBlend[2], dstBlend[2]);
257 out[3] = FADD(srcBlend[3], dstBlend[3]);
258 break;
259
260 case BLENDOP_SUBTRACT:
261 out[0] = FSUB(srcBlend[0], dstBlend[0]);
262 out[1] = FSUB(srcBlend[1], dstBlend[1]);
263 out[2] = FSUB(srcBlend[2], dstBlend[2]);
264 out[3] = FSUB(srcBlend[3], dstBlend[3]);
265 break;
266
267 case BLENDOP_REVSUBTRACT:
268 out[0] = FSUB(dstBlend[0], srcBlend[0]);
269 out[1] = FSUB(dstBlend[1], srcBlend[1]);
270 out[2] = FSUB(dstBlend[2], srcBlend[2]);
271 out[3] = FSUB(dstBlend[3], srcBlend[3]);
272 break;
273
274 case BLENDOP_MIN:
275 out[0] = VMINPS(src[0], dst[0]);
276 out[1] = VMINPS(src[1], dst[1]);
277 out[2] = VMINPS(src[2], dst[2]);
278 out[3] = VMINPS(src[3], dst[3]);
279 break;
280
281 case BLENDOP_MAX:
282 out[0] = VMAXPS(src[0], dst[0]);
283 out[1] = VMAXPS(src[1], dst[1]);
284 out[2] = VMAXPS(src[2], dst[2]);
285 out[3] = VMAXPS(src[3], dst[3]);
286 break;
287
288 default:
289 SWR_INVALID("Unsupported blend operation: %d", blendOp);
290 out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
291 break;
292 }
293
294 if (Color)
295 {
296 result[0] = out[0];
297 result[1] = out[1];
298 result[2] = out[2];
299 }
300
301 if (Alpha)
302 {
303 result[3] = out[3];
304 }
305 }
306
LogicOpFuncBlendJit307 void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
308 {
309 // Op: (s == PS output, d = RT contents)
310 switch(logicOp)
311 {
312 case LOGICOP_CLEAR:
313 result[0] = VIMMED1(0);
314 result[1] = VIMMED1(0);
315 result[2] = VIMMED1(0);
316 result[3] = VIMMED1(0);
317 break;
318
319 case LOGICOP_NOR:
320 // ~(s | d)
321 result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
322 result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
323 result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
324 result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
325 break;
326
327 case LOGICOP_AND_INVERTED:
328 // ~s & d
329 // todo: use avx andnot instr when I can find the intrinsic to call
330 result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
331 result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
332 result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
333 result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
334 break;
335
336 case LOGICOP_COPY_INVERTED:
337 // ~s
338 result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
339 result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
340 result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
341 result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
342 break;
343
344 case LOGICOP_AND_REVERSE:
345 // s & ~d
346 // todo: use avx andnot instr when I can find the intrinsic to call
347 result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
348 result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
349 result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
350 result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
351 break;
352
353 case LOGICOP_INVERT:
354 // ~d
355 result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
356 result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
357 result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
358 result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
359 break;
360
361 case LOGICOP_XOR:
362 // s ^ d
363 result[0] = XOR(src[0], dst[0]);
364 result[1] = XOR(src[1], dst[1]);
365 result[2] = XOR(src[2], dst[2]);
366 result[3] = XOR(src[3], dst[3]);
367 break;
368
369 case LOGICOP_NAND:
370 // ~(s & d)
371 result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
372 result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
373 result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
374 result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
375 break;
376
377 case LOGICOP_AND:
378 // s & d
379 result[0] = AND(src[0], dst[0]);
380 result[1] = AND(src[1], dst[1]);
381 result[2] = AND(src[2], dst[2]);
382 result[3] = AND(src[3], dst[3]);
383 break;
384
385 case LOGICOP_EQUIV:
386 // ~(s ^ d)
387 result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
388 result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
389 result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
390 result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
391 break;
392
393 case LOGICOP_NOOP:
394 result[0] = dst[0];
395 result[1] = dst[1];
396 result[2] = dst[2];
397 result[3] = dst[3];
398 break;
399
400 case LOGICOP_OR_INVERTED:
401 // ~s | d
402 result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
403 result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
404 result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
405 result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
406 break;
407
408 case LOGICOP_COPY:
409 result[0] = src[0];
410 result[1] = src[1];
411 result[2] = src[2];
412 result[3] = src[3];
413 break;
414
415 case LOGICOP_OR_REVERSE:
416 // s | ~d
417 result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
418 result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
419 result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
420 result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
421 break;
422
423 case LOGICOP_OR:
424 // s | d
425 result[0] = OR(src[0], dst[0]);
426 result[1] = OR(src[1], dst[1]);
427 result[2] = OR(src[2], dst[2]);
428 result[3] = OR(src[3], dst[3]);
429 break;
430
431 case LOGICOP_SET:
432 result[0] = VIMMED1(0xFFFFFFFF);
433 result[1] = VIMMED1(0xFFFFFFFF);
434 result[2] = VIMMED1(0xFFFFFFFF);
435 result[3] = VIMMED1(0xFFFFFFFF);
436 break;
437
438 default:
439 SWR_INVALID("Unsupported logic operation: %d", logicOp);
440 result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
441 break;
442 }
443 }
444
AlphaTestBlendJit445 void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
446 {
447 // load uint32_t reference
448 Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
449
450 // load alpha
451 Value* pAlpha = LOAD(ppAlpha);
452
453 Value* pTest = nullptr;
454 if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
455 {
456 // convert float alpha to unorm8
457 Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
458 pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
459
460 // compare
461 switch (state.alphaTestFunction)
462 {
463 case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
464 case ZFUNC_NEVER: pTest = VIMMED1(false); break;
465 case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break;
466 case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break;
467 case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break;
468 case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break;
469 case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break;
470 case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break;
471 default:
472 SWR_INVALID("Invalid alpha test function");
473 break;
474 }
475 }
476 else
477 {
478 // cast ref to float
479 pRef = BITCAST(pRef, mSimdFP32Ty);
480
481 // compare
482 switch (state.alphaTestFunction)
483 {
484 case ZFUNC_ALWAYS: pTest = VIMMED1(true); break;
485 case ZFUNC_NEVER: pTest = VIMMED1(false); break;
486 case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break;
487 case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break;
488 case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break;
489 case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break;
490 case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break;
491 case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break;
492 default:
493 SWR_INVALID("Invalid alpha test function");
494 break;
495 }
496 }
497
498 // load current mask
499 Value* pMask = LOAD(ppMask);
500
501 // convert to int1 mask
502 pMask = MASK(pMask);
503
504 // and with alpha test result
505 pMask = AND(pMask, pTest);
506
507 // convert back to vector mask
508 pMask = VMASK(pMask);
509
510 // store new mask
511 STORE(pMask, ppMask);
512 }
513
CreateBlendJit514 Function* Create(const BLEND_COMPILE_STATE& state)
515 {
516 std::stringstream fnName("BLND_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
517 fnName << ComputeCRC(0, &state, sizeof(state));
518
519 // blend function signature
520 //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
521
522 std::vector<Type*> args{
523 PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
524 PointerType::get(mSimdFP32Ty, 0), // simdvector& src
525 PointerType::get(mSimdFP32Ty, 0), // simdvector& src1
526 PointerType::get(mSimdFP32Ty, 0), // src0alpha
527 Type::getInt32Ty(JM()->mContext), // sampleNum
528 PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst
529 PointerType::get(mSimdFP32Ty, 0), // simdvector& result
530 PointerType::get(mSimdInt32Ty, 0), // simdscalari* oMask
531 PointerType::get(mSimdInt32Ty, 0), // simdscalari* pMask
532 };
533
534 FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
535 Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
536 blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
537
538 BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
539
540 IRB()->SetInsertPoint(entry);
541
542 // arguments
543 auto argitr = blendFunc->arg_begin();
544 Value* pBlendState = &*argitr++;
545 pBlendState->setName("pBlendState");
546 Value* pSrc = &*argitr++;
547 pSrc->setName("src");
548 Value* pSrc1 = &*argitr++;
549 pSrc1->setName("src1");
550 Value* pSrc0Alpha = &*argitr++;
551 pSrc0Alpha->setName("src0alpha");
552 Value* sampleNum = &*argitr++;
553 sampleNum->setName("sampleNum");
554 Value* pDst = &*argitr++;
555 pDst->setName("pDst");
556 Value* pResult = &*argitr++;
557 pResult->setName("result");
558 Value* ppoMask = &*argitr++;
559 ppoMask->setName("ppoMask");
560 Value* ppMask = &*argitr++;
561 ppMask->setName("pMask");
562
563 static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
564 Value* dst[4];
565 Value* constantColor[4];
566 Value* src[4];
567 Value* src1[4];
568 Value* result[4];
569 for (uint32_t i = 0; i < 4; ++i)
570 {
571 // load hot tile
572 dst[i] = LOAD(pDst, { i });
573
574 // load constant color
575 constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
576
577 // load src
578 src[i] = LOAD(pSrc, { i });
579
580 // load src1
581 src1[i] = LOAD(pSrc1, { i });
582 }
583 Value* currentSampleMask = VIMMED1(-1);
584 if (state.desc.alphaToCoverageEnable)
585 {
586 Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
587 uint32_t bits = (1 << state.desc.numSamples) - 1;
588 currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
589 currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
590 }
591
592 // alpha test
593 if (state.desc.alphaTestEnable)
594 {
595 AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
596 }
597
598 // color blend
599 if (state.blendState.blendEnable)
600 {
601 // clamp sources
602 Clamp(state.format, src);
603 Clamp(state.format, src1);
604 Clamp(state.format, dst);
605 Clamp(state.format, constantColor);
606
607 // apply defaults to hottile contents to take into account missing components
608 ApplyDefaults(state.format, dst);
609
610 // Force defaults for unused 'X' components
611 ApplyUnusedDefaults(state.format, dst);
612
613 // Quantize low precision components
614 Quantize(state.format, dst);
615
616 // special case clamping for R11G11B10_float which has no sign bit
617 if (state.format == R11G11B10_FLOAT)
618 {
619 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
620 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
621 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
622 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
623 }
624
625 Value* srcFactor[4];
626 Value* dstFactor[4];
627 if (state.desc.independentAlphaBlendEnable)
628 {
629 GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
630 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
631
632 GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
633 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
634
635 BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
636 BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
637 }
638 else
639 {
640 GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
641 GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
642
643 BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
644 }
645
646 // store results out
647 for (uint32_t i = 0; i < 4; ++i)
648 {
649 STORE(result[i], pResult, { i });
650 }
651 }
652
653 if(state.blendState.logicOpEnable)
654 {
655 const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
656 Value* vMask[4];
657 float scale[4];
658
659 if (!state.blendState.blendEnable)
660 {
661 Clamp(state.format, src);
662 Clamp(state.format, dst);
663 }
664
665 for(uint32_t i = 0; i < 4; i++)
666 {
667 if (info.type[i] == SWR_TYPE_UNUSED)
668 {
669 continue;
670 }
671
672 if (info.bpc[i] >= 32)
673 {
674 vMask[i] = VIMMED1(0xFFFFFFFF);
675 scale[i] = 0xFFFFFFFF;
676 }
677 else
678 {
679 vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
680 if (info.type[i] == SWR_TYPE_SNORM)
681 scale[i] = (1 << (info.bpc[i] - 1)) - 1;
682 else
683 scale[i] = (1 << info.bpc[i]) - 1;
684 }
685
686 switch (info.type[i])
687 {
688 default:
689 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
690 break;
691
692 case SWR_TYPE_UNKNOWN:
693 case SWR_TYPE_UNUSED:
694 // fallthrough
695
696 case SWR_TYPE_UINT:
697 case SWR_TYPE_SINT:
698 src[i] = BITCAST(src[i], mSimdInt32Ty);
699 dst[i] = BITCAST(dst[i], mSimdInt32Ty);
700 break;
701 case SWR_TYPE_SNORM:
702 src[i] = FP_TO_SI(
703 FMUL(src[i], VIMMED1(scale[i])),
704 mSimdInt32Ty);
705 dst[i] = FP_TO_SI(
706 FMUL(dst[i], VIMMED1(scale[i])),
707 mSimdInt32Ty);
708 break;
709 case SWR_TYPE_UNORM:
710 src[i] = FP_TO_UI(
711 FMUL(src[i], VIMMED1(scale[i])),
712 mSimdInt32Ty);
713 dst[i] = FP_TO_UI(
714 FMUL(dst[i], VIMMED1(scale[i])),
715 mSimdInt32Ty);
716 break;
717 }
718 }
719
720 LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
721
722 // store results out
723 for(uint32_t i = 0; i < 4; ++i)
724 {
725 if (info.type[i] == SWR_TYPE_UNUSED)
726 {
727 continue;
728 }
729
730 // clear upper bits from PS output not in RT format after doing logic op
731 result[i] = AND(result[i], vMask[i]);
732
733 switch (info.type[i])
734 {
735 default:
736 SWR_INVALID("Unsupported type for logic op: %d", info.type[i]);
737 break;
738
739 case SWR_TYPE_UNKNOWN:
740 case SWR_TYPE_UNUSED:
741 // fallthrough
742
743 case SWR_TYPE_UINT:
744 case SWR_TYPE_SINT:
745 result[i] = BITCAST(result[i], mSimdFP32Ty);
746 break;
747 case SWR_TYPE_SNORM:
748 result[i] = SHL(result[i], C(32 - info.bpc[i]));
749 result[i] = ASHR(result[i], C(32 - info.bpc[i]));
750 result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty),
751 VIMMED1(1.0f / scale[i]));
752 break;
753 case SWR_TYPE_UNORM:
754 result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
755 VIMMED1(1.0f / scale[i]));
756 break;
757 }
758
759 STORE(result[i], pResult, {i});
760 }
761 }
762
763 if(state.desc.oMaskEnable)
764 {
765 assert(!(state.desc.alphaToCoverageEnable));
766 // load current mask
767 Value* oMask = LOAD(ppoMask);
768 currentSampleMask = AND(oMask, currentSampleMask);
769 }
770
771 if(state.desc.sampleMaskEnable)
772 {
773 Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
774 currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
775 }
776
777 if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
778 state.desc.oMaskEnable)
779 {
780 // load coverage mask and mask off any lanes with no samples
781 Value* pMask = LOAD(ppMask);
782 Value* sampleMasked = SHL(C(1), sampleNum);
783 currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked));
784 currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
785 Value* outputMask = AND(pMask, currentSampleMask);
786 // store new mask
787 STORE(outputMask, GEP(ppMask, C(0)));
788 }
789
790 RET_VOID();
791
792 JitManager::DumpToFile(blendFunc, "");
793
794 ::FunctionPassManager passes(JM()->mpCurrentModule);
795
796 passes.add(createBreakCriticalEdgesPass());
797 passes.add(createCFGSimplificationPass());
798 passes.add(createEarlyCSEPass());
799 passes.add(createPromoteMemoryToRegisterPass());
800 passes.add(createCFGSimplificationPass());
801 passes.add(createEarlyCSEPass());
802 passes.add(createInstructionCombiningPass());
803 passes.add(createInstructionSimplifierPass());
804 passes.add(createConstantPropagationPass());
805 passes.add(createSCCPPass());
806 passes.add(createAggressiveDCEPass());
807
808 passes.run(*blendFunc);
809
810 JitManager::DumpToFile(blendFunc, "optimized");
811
812 return blendFunc;
813 }
814 };
815
816 //////////////////////////////////////////////////////////////////////////
817 /// @brief JITs from fetch shader IR
818 /// @param hJitMgr - JitManager handle
819 /// @param func - LLVM function IR
820 /// @return PFN_FETCH_FUNC - pointer to fetch code
JitBlendFunc(HANDLE hJitMgr,const HANDLE hFunc)821 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
822 {
823 const llvm::Function *func = (const llvm::Function*)hFunc;
824 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
825 PFN_BLEND_JIT_FUNC pfnBlend;
826 pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
827 // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
828 pJitMgr->mIsModuleFinalized = true;
829
830 return pfnBlend;
831 }
832
833 //////////////////////////////////////////////////////////////////////////
834 /// @brief JIT compiles blend shader
835 /// @param hJitMgr - JitManager handle
836 /// @param state - blend state to build function from
JitCompileBlend(HANDLE hJitMgr,const BLEND_COMPILE_STATE & state)837 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
838 {
839 JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
840
841 pJitMgr->SetupNewModule();
842
843 BlendJit theJit(pJitMgr);
844 HANDLE hFunc = theJit.Create(state);
845
846 return JitBlendFunc(hJitMgr, hFunc);
847 }
848