• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file blend_jit.cpp
24 *
25 * @brief Implementation of the blend jitter
26 *
27 * Notes:
28 *
29 ******************************************************************************/
30 #include "jit_api.h"
31 #include "blend_jit.h"
32 #include "builder.h"
33 #include "state_llvm.h"
34 
35 #include <sstream>
36 
37 // components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
38 #define QUANTIZE_THRESHOLD 2
39 
40 using namespace llvm;
41 using namespace SwrJit;
42 
43 //////////////////////////////////////////////////////////////////////////
44 /// Interface to Jitting a blend shader
45 //////////////////////////////////////////////////////////////////////////
46 struct BlendJit : public Builder
47 {
BlendJitBlendJit48     BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
49 
50     template<bool Color, bool Alpha>
GenerateBlendFactorBlendJit51     void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
52     {
53         Value* out[4];
54 
55         switch (factor)
56         {
57         case BLENDFACTOR_ONE:
58             out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
59             break;
60         case BLENDFACTOR_SRC_COLOR:
61             out[0] = src[0];
62             out[1] = src[1];
63             out[2] = src[2];
64             out[3] = src[3];
65             break;
66         case BLENDFACTOR_SRC_ALPHA:
67             out[0] = out[1] = out[2] = out[3] = src[3];
68             break;
69         case BLENDFACTOR_DST_ALPHA:
70             out[0] = out[1] = out[2] = out[3] = dst[3];
71             break;
72         case BLENDFACTOR_DST_COLOR:
73             out[0] = dst[0];
74             out[1] = dst[1];
75             out[2] = dst[2];
76             out[3] = dst[3];
77             break;
78         case BLENDFACTOR_SRC_ALPHA_SATURATE:
79             out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
80             out[3] = VIMMED1(1.0f);
81             break;
82         case BLENDFACTOR_CONST_COLOR:
83             out[0] = constColor[0];
84             out[1] = constColor[1];
85             out[2] = constColor[2];
86             out[3] = constColor[3];
87             break;
88         case BLENDFACTOR_CONST_ALPHA:
89             out[0] = out[1] = out[2] = out[3] = constColor[3];
90             break;
91         case BLENDFACTOR_SRC1_COLOR:
92             out[0] = src1[0];
93             out[1] = src1[1];
94             out[2] = src1[2];
95             out[3] = src1[3];
96             break;
97         case BLENDFACTOR_SRC1_ALPHA:
98             out[0] = out[1] = out[2] = out[3] = src1[3];
99             break;
100         case BLENDFACTOR_ZERO:
101             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
102             break;
103         case BLENDFACTOR_INV_SRC_COLOR:
104             out[0] = FSUB(VIMMED1(1.0f), src[0]);
105             out[1] = FSUB(VIMMED1(1.0f), src[1]);
106             out[2] = FSUB(VIMMED1(1.0f), src[2]);
107             out[3] = FSUB(VIMMED1(1.0f), src[3]);
108             break;
109         case BLENDFACTOR_INV_SRC_ALPHA:
110             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
111             break;
112         case BLENDFACTOR_INV_DST_ALPHA:
113             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
114             break;
115         case BLENDFACTOR_INV_DST_COLOR:
116             out[0] = FSUB(VIMMED1(1.0f), dst[0]);
117             out[1] = FSUB(VIMMED1(1.0f), dst[1]);
118             out[2] = FSUB(VIMMED1(1.0f), dst[2]);
119             out[3] = FSUB(VIMMED1(1.0f), dst[3]);
120             break;
121         case BLENDFACTOR_INV_CONST_COLOR:
122             out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
123             out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
124             out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
125             out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
126             break;
127         case BLENDFACTOR_INV_CONST_ALPHA:
128             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
129             break;
130         case BLENDFACTOR_INV_SRC1_COLOR:
131             out[0] = FSUB(VIMMED1(1.0f), src1[0]);
132             out[1] = FSUB(VIMMED1(1.0f), src1[1]);
133             out[2] = FSUB(VIMMED1(1.0f), src1[2]);
134             out[3] = FSUB(VIMMED1(1.0f), src1[3]);
135             break;
136         case BLENDFACTOR_INV_SRC1_ALPHA:
137             out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
138             break;
139         default:
140             SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
141             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
142             break;
143         }
144 
145         if (Color)
146         {
147             result[0] = out[0];
148             result[1] = out[1];
149             result[2] = out[2];
150         }
151 
152         if (Alpha)
153         {
154             result[3] = out[3];
155         }
156     }
157 
ClampBlendJit158     void Clamp(SWR_FORMAT format, Value* src[4])
159     {
160         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
161         SWR_TYPE type = info.type[0];
162 
163         switch (type)
164         {
165         case SWR_TYPE_FLOAT:
166             break;
167 
168         case SWR_TYPE_UNORM:
169             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
170             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
171             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
172             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
173             break;
174 
175         case SWR_TYPE_SNORM:
176             src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
177             src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
178             src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
179             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
180             break;
181 
182         default: SWR_ASSERT(false, "Unsupport format type: %d", type);
183         }
184     }
185 
ApplyDefaultsBlendJit186     void ApplyDefaults(SWR_FORMAT format, Value* src[4])
187     {
188         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
189 
190         bool valid[] = { false, false, false, false };
191         for (uint32_t c = 0; c < info.numComps; ++c)
192         {
193             valid[info.swizzle[c]] = true;
194         }
195 
196         for (uint32_t c = 0; c < 4; ++c)
197         {
198             if (!valid[c])
199             {
200                 src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
201             }
202         }
203     }
204 
ApplyUnusedDefaultsBlendJit205     void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
206     {
207         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
208 
209         for (uint32_t c = 0; c < info.numComps; ++c)
210         {
211             if (info.type[c] == SWR_TYPE_UNUSED)
212             {
213                 src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
214             }
215         }
216     }
217 
QuantizeBlendJit218     void Quantize(SWR_FORMAT format, Value* src[4])
219     {
220         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
221         for (uint32_t c = 0; c < info.numComps; ++c)
222         {
223             if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
224             {
225                 uint32_t swizComp = info.swizzle[c];
226                 float factor = (float)((1 << info.bpc[c]) - 1);
227                 switch (info.type[c])
228                 {
229                 case SWR_TYPE_UNORM:
230                     src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
231                     src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
232                     src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
233                     break;
234                 default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
235                 }
236             }
237         }
238     }
239 
240     template<bool Color, bool Alpha>
BlendFuncBlendJit241     void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
242     {
243         Value* out[4];
244         Value* srcBlend[4];
245         Value* dstBlend[4];
246         for (uint32_t i = 0; i < 4; ++i)
247         {
248             srcBlend[i] = FMUL(src[i], srcFactor[i]);
249             dstBlend[i] = FMUL(dst[i], dstFactor[i]);
250         }
251 
252         switch (blendOp)
253         {
254         case BLENDOP_ADD:
255             out[0] = FADD(srcBlend[0], dstBlend[0]);
256             out[1] = FADD(srcBlend[1], dstBlend[1]);
257             out[2] = FADD(srcBlend[2], dstBlend[2]);
258             out[3] = FADD(srcBlend[3], dstBlend[3]);
259             break;
260 
261         case BLENDOP_SUBTRACT:
262             out[0] = FSUB(srcBlend[0], dstBlend[0]);
263             out[1] = FSUB(srcBlend[1], dstBlend[1]);
264             out[2] = FSUB(srcBlend[2], dstBlend[2]);
265             out[3] = FSUB(srcBlend[3], dstBlend[3]);
266             break;
267 
268         case BLENDOP_REVSUBTRACT:
269             out[0] = FSUB(dstBlend[0], srcBlend[0]);
270             out[1] = FSUB(dstBlend[1], srcBlend[1]);
271             out[2] = FSUB(dstBlend[2], srcBlend[2]);
272             out[3] = FSUB(dstBlend[3], srcBlend[3]);
273             break;
274 
275         case BLENDOP_MIN:
276             out[0] = VMINPS(src[0], dst[0]);
277             out[1] = VMINPS(src[1], dst[1]);
278             out[2] = VMINPS(src[2], dst[2]);
279             out[3] = VMINPS(src[3], dst[3]);
280             break;
281 
282         case BLENDOP_MAX:
283             out[0] = VMAXPS(src[0], dst[0]);
284             out[1] = VMAXPS(src[1], dst[1]);
285             out[2] = VMAXPS(src[2], dst[2]);
286             out[3] = VMAXPS(src[3], dst[3]);
287             break;
288 
289         default:
290             SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
291             out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
292             break;
293         }
294 
295         if (Color)
296         {
297             result[0] = out[0];
298             result[1] = out[1];
299             result[2] = out[2];
300         }
301 
302         if (Alpha)
303         {
304             result[3] = out[3];
305         }
306     }
307 
LogicOpFuncBlendJit308     void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
309     {
310         // Op: (s == PS output, d = RT contents)
311         switch(logicOp)
312         {
313         case LOGICOP_CLEAR:
314             result[0] = VIMMED1(0);
315             result[1] = VIMMED1(0);
316             result[2] = VIMMED1(0);
317             result[3] = VIMMED1(0);
318             break;
319 
320         case LOGICOP_NOR:
321             // ~(s | d)
322             result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
323             result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
324             result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
325             result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
326             break;
327 
328         case LOGICOP_AND_INVERTED:
329             // ~s & d
330             // todo: use avx andnot instr when I can find the intrinsic to call
331             result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
332             result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
333             result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
334             result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
335             break;
336 
337         case LOGICOP_COPY_INVERTED:
338             // ~s
339             result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
340             result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
341             result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
342             result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
343             break;
344 
345         case LOGICOP_AND_REVERSE:
346             // s & ~d
347             // todo: use avx andnot instr when I can find the intrinsic to call
348             result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
349             result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
350             result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
351             result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
352             break;
353 
354         case LOGICOP_INVERT:
355             // ~d
356             result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
357             result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
358             result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
359             result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
360             break;
361 
362         case LOGICOP_XOR:
363             // s ^ d
364             result[0] = XOR(src[0], dst[0]);
365             result[1] = XOR(src[1], dst[1]);
366             result[2] = XOR(src[2], dst[2]);
367             result[3] = XOR(src[3], dst[3]);
368             break;
369 
370         case LOGICOP_NAND:
371             // ~(s & d)
372             result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
373             result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
374             result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
375             result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
376             break;
377 
378         case LOGICOP_AND:
379             // s & d
380             result[0] = AND(src[0], dst[0]);
381             result[1] = AND(src[1], dst[1]);
382             result[2] = AND(src[2], dst[2]);
383             result[3] = AND(src[3], dst[3]);
384             break;
385 
386         case LOGICOP_EQUIV:
387             // ~(s ^ d)
388             result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
389             result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
390             result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
391             result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
392             break;
393 
394         case LOGICOP_NOOP:
395             result[0] = dst[0];
396             result[1] = dst[1];
397             result[2] = dst[2];
398             result[3] = dst[3];
399             break;
400 
401         case LOGICOP_OR_INVERTED:
402             // ~s | d
403             result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
404             result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
405             result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
406             result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
407             break;
408 
409         case LOGICOP_COPY:
410             result[0] = src[0];
411             result[1] = src[1];
412             result[2] = src[2];
413             result[3] = src[3];
414             break;
415 
416         case LOGICOP_OR_REVERSE:
417             // s | ~d
418             result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
419             result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
420             result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
421             result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
422             break;
423 
424         case LOGICOP_OR:
425             // s | d
426             result[0] = OR(src[0], dst[0]);
427             result[1] = OR(src[1], dst[1]);
428             result[2] = OR(src[2], dst[2]);
429             result[3] = OR(src[3], dst[3]);
430             break;
431 
432         case LOGICOP_SET:
433             result[0] = VIMMED1(0xFFFFFFFF);
434             result[1] = VIMMED1(0xFFFFFFFF);
435             result[2] = VIMMED1(0xFFFFFFFF);
436             result[3] = VIMMED1(0xFFFFFFFF);
437             break;
438 
439         default:
440             SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
441             result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
442             break;
443         }
444     }
445 
AlphaTestBlendJit446     void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
447     {
448         // load uint32_t reference
449         Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
450 
451         // load alpha
452         Value* pAlpha = LOAD(ppAlpha);
453 
454         Value* pTest = nullptr;
455         if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
456         {
457             // convert float alpha to unorm8
458             Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
459             pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
460 
461             // compare
462             switch (state.alphaTestFunction)
463             {
464             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
465             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
466             case ZFUNC_LT:      pTest = ICMP_ULT(pAlphaU8, pRef); break;
467             case ZFUNC_EQ:      pTest = ICMP_EQ(pAlphaU8, pRef); break;
468             case ZFUNC_LE:      pTest = ICMP_ULE(pAlphaU8, pRef); break;
469             case ZFUNC_GT:      pTest = ICMP_UGT(pAlphaU8, pRef); break;
470             case ZFUNC_NE:      pTest = ICMP_NE(pAlphaU8, pRef); break;
471             case ZFUNC_GE:      pTest = ICMP_UGE(pAlphaU8, pRef); break;
472             default:
473                 SWR_ASSERT(false, "Invalid alpha test function");
474                 break;
475             }
476         }
477         else
478         {
479             // cast ref to float
480             pRef = BITCAST(pRef, mSimdFP32Ty);
481 
482             // compare
483             switch (state.alphaTestFunction)
484             {
485             case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
486             case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
487             case ZFUNC_LT:      pTest = FCMP_OLT(pAlpha, pRef); break;
488             case ZFUNC_EQ:      pTest = FCMP_OEQ(pAlpha, pRef); break;
489             case ZFUNC_LE:      pTest = FCMP_OLE(pAlpha, pRef); break;
490             case ZFUNC_GT:      pTest = FCMP_OGT(pAlpha, pRef); break;
491             case ZFUNC_NE:      pTest = FCMP_ONE(pAlpha, pRef); break;
492             case ZFUNC_GE:      pTest = FCMP_OGE(pAlpha, pRef); break;
493             default:
494                 SWR_ASSERT(false, "Invalid alpha test function");
495                 break;
496             }
497         }
498 
499         // load current mask
500         Value* pMask = LOAD(ppMask);
501 
502         // convert to int1 mask
503         pMask = MASK(pMask);
504 
505         // and with alpha test result
506         pMask = AND(pMask, pTest);
507 
508         // convert back to vector mask
509         pMask = VMASK(pMask);
510 
511         // store new mask
512         STORE(pMask, ppMask);
513     }
514 
CreateBlendJit515     Function* Create(const BLEND_COMPILE_STATE& state)
516     {
517         static std::size_t jitNum = 0;
518 
519         std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
520         fnName << jitNum++;
521 
522         // blend function signature
523         //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
524 
525         std::vector<Type*> args{
526             PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
527             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
528             PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
529             PointerType::get(mSimdFP32Ty, 0),               // src0alpha
530             Type::getInt32Ty(JM()->mContext),               // sampleNum
531             PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
532             PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
533             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* oMask
534             PointerType::get(mSimdInt32Ty, 0),              // simdscalari* pMask
535         };
536 
537         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
538         Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
539 
540         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
541 
542         IRB()->SetInsertPoint(entry);
543 
544         // arguments
545         auto argitr = blendFunc->arg_begin();
546         Value* pBlendState = &*argitr++;
547         pBlendState->setName("pBlendState");
548         Value* pSrc = &*argitr++;
549         pSrc->setName("src");
550         Value* pSrc1 = &*argitr++;
551         pSrc1->setName("src1");
552         Value* pSrc0Alpha = &*argitr++;
553         pSrc0Alpha->setName("src0alpha");
554         Value* sampleNum = &*argitr++;
555         sampleNum->setName("sampleNum");
556         Value* pDst = &*argitr++;
557         pDst->setName("pDst");
558         Value* pResult = &*argitr++;
559         pResult->setName("result");
560         Value* ppoMask = &*argitr++;
561         ppoMask->setName("ppoMask");
562         Value* ppMask = &*argitr++;
563         ppMask->setName("pMask");
564 
565         static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
566         Value* dst[4];
567         Value* constantColor[4];
568         Value* src[4];
569         Value* src1[4];
570         Value* result[4];
571         for (uint32_t i = 0; i < 4; ++i)
572         {
573             // load hot tile
574             dst[i] = LOAD(pDst, { i });
575 
576             // load constant color
577             constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
578 
579             // load src
580             src[i] = LOAD(pSrc, { i });
581 
582             // load src1
583             src1[i] = LOAD(pSrc1, { i });
584         }
585         Value* currentMask = VIMMED1(-1);
586         if (state.desc.alphaToCoverageEnable)
587         {
588             Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
589             uint32_t bits = (1 << state.desc.numSamples) - 1;
590             currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
591             currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
592         }
593 
594         // alpha test
595         if (state.desc.alphaTestEnable)
596         {
597             AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
598         }
599 
600         // color blend
601         if (state.blendState.blendEnable)
602         {
603             // clamp sources
604             Clamp(state.format, src);
605             Clamp(state.format, src1);
606             Clamp(state.format, dst);
607             Clamp(state.format, constantColor);
608 
609             // apply defaults to hottile contents to take into account missing components
610             ApplyDefaults(state.format, dst);
611 
612             // Force defaults for unused 'X' components
613             ApplyUnusedDefaults(state.format, dst);
614 
615             // Quantize low precision components
616             Quantize(state.format, dst);
617 
618             // special case clamping for R11G11B10_float which has no sign bit
619             if (state.format == R11G11B10_FLOAT)
620             {
621                 dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
622                 dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
623                 dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
624                 dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
625             }
626 
627             Value* srcFactor[4];
628             Value* dstFactor[4];
629             if (state.desc.independentAlphaBlendEnable)
630             {
631                 GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
632                 GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
633 
634                 GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
635                 GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
636 
637                 BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
638                 BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
639             }
640             else
641             {
642                 GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
643                 GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
644 
645                 BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
646             }
647 
648             // store results out
649             for (uint32_t i = 0; i < 4; ++i)
650             {
651                 STORE(result[i], pResult, { i });
652             }
653         }
654 
655         if(state.blendState.logicOpEnable)
656         {
657             const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
658             Value* vMask[4];
659             float scale[4];
660 
661             if (!state.blendState.blendEnable)
662             {
663                 Clamp(state.format, src);
664                 Clamp(state.format, dst);
665             }
666 
667             for(uint32_t i = 0; i < 4; i++)
668             {
669                 if (info.type[i] == SWR_TYPE_UNUSED)
670                 {
671                     continue;
672                 }
673 
674                 if (info.bpc[i] >= 32) {
675                     vMask[i] = VIMMED1(0xFFFFFFFF);
676                     scale[i] = 0xFFFFFFFF;
677                 } else {
678                     vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
679                     if (info.type[i] == SWR_TYPE_SNORM)
680                         scale[i] = (1 << (info.bpc[i] - 1)) - 1;
681                     else
682                         scale[i] = (1 << info.bpc[i]) - 1;
683                 }
684 
685                 switch (info.type[i]) {
686                 default:
687                     SWR_ASSERT(0, "Unsupported type for logic op\n");
688                     /* fallthrough */
689                 case SWR_TYPE_UINT:
690                 case SWR_TYPE_SINT:
691                     src[i] = BITCAST(src[i], mSimdInt32Ty);
692                     dst[i] = BITCAST(dst[i], mSimdInt32Ty);
693                     break;
694                 case SWR_TYPE_SNORM:
695                     src[i] = FP_TO_SI(
696                         FMUL(src[i], VIMMED1(scale[i])),
697                         mSimdInt32Ty);
698                     dst[i] = FP_TO_SI(
699                         FMUL(dst[i], VIMMED1(scale[i])),
700                         mSimdInt32Ty);
701                     break;
702                 case SWR_TYPE_UNORM:
703                     src[i] = FP_TO_UI(
704                         FMUL(src[i], VIMMED1(scale[i])),
705                         mSimdInt32Ty);
706                     dst[i] = FP_TO_UI(
707                         FMUL(dst[i], VIMMED1(scale[i])),
708                         mSimdInt32Ty);
709                     break;
710                 }
711             }
712 
713             LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
714 
715             // store results out
716             for(uint32_t i = 0; i < 4; ++i)
717             {
718                 if (info.type[i] == SWR_TYPE_UNUSED)
719                 {
720                     continue;
721                 }
722 
723                 // clear upper bits from PS output not in RT format after doing logic op
724                 result[i] = AND(result[i], vMask[i]);
725 
726                 switch (info.type[i]) {
727                 default:
728                     SWR_ASSERT(0, "Unsupported type for logic op\n");
729                     /* fallthrough */
730                 case SWR_TYPE_UINT:
731                 case SWR_TYPE_SINT:
732                     result[i] = BITCAST(result[i], mSimdFP32Ty);
733                     break;
734                 case SWR_TYPE_SNORM:
735                     result[i] = SHL(result[i], C(32 - info.bpc[i]));
736                     result[i] = ASHR(result[i], C(32 - info.bpc[i]));
737                     result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty),
738                                      VIMMED1(1.0f / scale[i]));
739                     break;
740                 case SWR_TYPE_UNORM:
741                     result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
742                                      VIMMED1(1.0f / scale[i]));
743                     break;
744                 }
745 
746                 STORE(result[i], pResult, {i});
747             }
748         }
749 
750         if(state.desc.oMaskEnable)
751         {
752             assert(!(state.desc.alphaToCoverageEnable));
753             // load current mask
754             Value* oMask = LOAD(ppoMask);
755             Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
756             oMask = AND(oMask, sampleMasked);
757             currentMask = AND(oMask, currentMask);
758         }
759 
760         if(state.desc.sampleMaskEnable)
761         {
762             Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
763             Value* sampleMasked = SHL(C(1), sampleNum);
764             sampleMask = AND(sampleMask, sampleMasked);
765             sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
766             sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
767             currentMask = AND(sampleMask, currentMask);
768         }
769 
770         if (state.desc.alphaToCoverageEnable)
771         {
772             Value* sampleMasked = SHL(C(1), sampleNum);
773             currentMask = AND(currentMask, VBROADCAST(sampleMasked));
774         }
775 
776         if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
777            state.desc.oMaskEnable)
778         {
779             // load current mask
780             Value* pMask = LOAD(ppMask);
781             currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
782             Value* outputMask = AND(pMask, currentMask);
783             // store new mask
784             STORE(outputMask, GEP(ppMask, C(0)));
785         }
786 
787         RET_VOID();
788 
789         JitManager::DumpToFile(blendFunc, "");
790 
791         ::FunctionPassManager passes(JM()->mpCurrentModule);
792 
793         passes.add(createBreakCriticalEdgesPass());
794         passes.add(createCFGSimplificationPass());
795         passes.add(createEarlyCSEPass());
796         passes.add(createPromoteMemoryToRegisterPass());
797         passes.add(createCFGSimplificationPass());
798         passes.add(createEarlyCSEPass());
799         passes.add(createInstructionCombiningPass());
800         passes.add(createInstructionSimplifierPass());
801         passes.add(createConstantPropagationPass());
802         passes.add(createSCCPPass());
803         passes.add(createAggressiveDCEPass());
804 
805         passes.run(*blendFunc);
806 
807         JitManager::DumpToFile(blendFunc, "optimized");
808 
809         return blendFunc;
810     }
811 };
812 
813 //////////////////////////////////////////////////////////////////////////
814 /// @brief JITs from fetch shader IR
815 /// @param hJitMgr - JitManager handle
816 /// @param func   - LLVM function IR
817 /// @return PFN_FETCH_FUNC - pointer to fetch code
JitBlendFunc(HANDLE hJitMgr,const HANDLE hFunc)818 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
819 {
820     const llvm::Function *func = (const llvm::Function*)hFunc;
821     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
822     PFN_BLEND_JIT_FUNC pfnBlend;
823     pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
824     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
825     pJitMgr->mIsModuleFinalized = true;
826 
827     return pfnBlend;
828 }
829 
830 //////////////////////////////////////////////////////////////////////////
831 /// @brief JIT compiles blend shader
832 /// @param hJitMgr - JitManager handle
833 /// @param state   - blend state to build function from
JitCompileBlend(HANDLE hJitMgr,const BLEND_COMPILE_STATE & state)834 extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
835 {
836     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
837 
838     pJitMgr->SetupNewModule();
839 
840     BlendJit theJit(pJitMgr);
841     HANDLE hFunc = theJit.Create(state);
842 
843     return JitBlendFunc(hJitMgr, hFunc);
844 }
845