• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file streamout_jit.cpp
*
* @brief Implementation of the streamout jitter
*
* Notes:
*
******************************************************************************/
30 #include "jit_pch.hpp"
31 #include "builder.h"
32 #include "jit_api.h"
33 #include "streamout_jit.h"
34 #include "gen_state_llvm.h"
35 
36 using namespace llvm;
37 using namespace SwrJit;
38 
39 //////////////////////////////////////////////////////////////////////////
40 /// Interface to Jitting a fetch shader
41 //////////////////////////////////////////////////////////////////////////
42 struct StreamOutJit : public Builder
43 {
StreamOutJitStreamOutJit44     StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
45 
46     // returns pointer to SWR_STREAMOUT_BUFFER
getSOBufferStreamOutJit47     Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
48     {
49         return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
50     }
51 
52 
53     //////////////////////////////////////////////////////////////////////////
54     // @brief checks if streamout buffer is oob
55     // @return <i1> true/false
oobStreamOutJit56     Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
57     {
58         Value* returnMask = C(false);
59 
60         Value* pBuf = getSOBuffer(pSoCtx, buffer);
61 
62         // load enable
63         // @todo bool data types should generate <i1> llvm type
64         Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
65 
66         // load buffer size
67         Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
68 
69         // load current streamOffset
70         Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
71 
72         // load buffer pitch
73         Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
74 
75         // buffer is considered oob if in use in a decl but not enabled
76         returnMask = OR(returnMask, NOT(enabled));
77 
78         // buffer is oob if cannot fit a prims worth of verts
79         Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
80         returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
81 
82         return returnMask;
83     }
84 
85 
86     //////////////////////////////////////////////////////////////////////////
87     // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
88     //        packing the active mask bits
89     //        ex. bitmask 0011 -> (0, 1, 0, 0)
90     //            bitmask 1000 -> (3, 0, 0, 0)
91     //            bitmask 1100 -> (2, 3, 0, 0)
PackMaskStreamOutJit92     Value* PackMask(uint32_t bitmask)
93     {
94         std::vector<Constant*> indices(4, C(0));
95         DWORD index;
96         uint32_t elem = 0;
97         while (_BitScanForward(&index, bitmask))
98         {
99             indices[elem++] = C((int)index);
100             bitmask &= ~(1 << index);
101         }
102 
103         return ConstantVector::get(indices);
104     }
105 
106     //////////////////////////////////////////////////////////////////////////
107     // @brief convert scalar bitmask to <4xfloat> bitmask
ToMaskStreamOutJit108     Value* ToMask(uint32_t bitmask)
109     {
110         std::vector<Constant*> indices;
111         for (uint32_t i = 0; i < 4; ++i)
112         {
113             if (bitmask & (1 << i))
114             {
115                 indices.push_back(C(-1.0f));
116             }
117             else
118             {
119                 indices.push_back(C(0.0f));
120             }
121         }
122         return ConstantVector::get(indices);
123     }
124 
125     //////////////////////////////////////////////////////////////////////////
126     // @brief processes a single decl from the streamout stream. Reads 4 components from the input
127     //        stream and writes N components to the output buffer given the componentMask or if
128     //        a hole, just increments the buffer pointer
129     // @param pStream - pointer to current attribute
130     // @param pOutBuffers - pointers to the current location of each output buffer
131     // @param decl - input decl
buildDeclStreamOutJit132     void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
133     {
134         // @todo add this to x86 macros
135         Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
136 
137         uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
138         uint32_t packedMask = (1 << numComponents) - 1;
139         if (!decl.hole)
140         {
141             // increment stream pointer to correct slot
142             Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
143 
144             // load 4 components from stream
145             Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
146             Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
147             pAttrib = BITCAST(pAttrib, simd4PtrTy);
148             Value *vattrib = LOAD(pAttrib);
149 
150             // shuffle/pack enabled components
151             Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
152 
153             // store to output buffer
154             // cast SO buffer to i8*, needed by maskstore
155             Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
156 
157             // cast input to <4xfloat>
158             Value* src = BITCAST(vpackedAttrib, simd4Ty);
159 
160             // cast mask to <4xint>
161             Value* mask = ToMask(packedMask);
162             mask = BITCAST(mask, VectorType::get(IRB()->getInt32Ty(), 4));
163             CALL(maskStore, {pOut, mask, src});
164         }
165 
166         // increment SO buffer
167         pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents));
168     }
169 
170     //////////////////////////////////////////////////////////////////////////
171     // @brief builds a single vertex worth of data for the given stream
172     // @param streamState - state for this stream
173     // @param pCurVertex - pointer to src stream vertex data
174     // @param pOutBuffer - pointers to up to 4 SO buffers
buildVertexStreamOutJit175     void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4])
176     {
177         for (uint32_t d = 0; d < streamState.numDecls; ++d)
178         {
179             const STREAMOUT_DECL& decl = streamState.decl[d];
180             buildDecl(pCurVertex, pOutBuffer, decl);
181         }
182     }
183 
buildStreamStreamOutJit184     void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
185     {
186         // get list of active SO buffers
187         std::unordered_set<uint32_t> activeSOBuffers;
188         for (uint32_t d = 0; d < streamState.numDecls; ++d)
189         {
190             const STREAMOUT_DECL& decl = streamState.decl[d];
191             activeSOBuffers.insert(decl.bufferIndex);
192         }
193 
194         // always increment numPrimStorageNeeded
195         Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
196         numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
197         STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
198 
199         // check OOB on active SO buffers.  If any buffer is out of bound, don't write
200         // the primitive to any buffer
201         Value* oobMask = C(false);
202         for (uint32_t buffer : activeSOBuffers)
203         {
204             oobMask = OR(oobMask, oob(state, pSoCtx, buffer));
205         }
206 
207         BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc);
208 
209         // early out if OOB
210         COND_BR(oobMask, returnBB, validBB);
211 
212         IRB()->SetInsertPoint(validBB);
213 
214         Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
215         numPrimsWritten = ADD(numPrimsWritten, C(1));
216         STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
217 
218         // compute start pointer for each output buffer
219         Value* pOutBuffer[4];
220         Value* pOutBufferStartVertex[4];
221         Value* outBufferPitch[4];
222         for (uint32_t b: activeSOBuffers)
223         {
224             Value* pBuf = getSOBuffer(pSoCtx, b);
225             Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
226             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
227             pOutBuffer[b] = GEP(pData, streamOffset);
228             pOutBufferStartVertex[b] = pOutBuffer[b];
229 
230             outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
231         }
232 
233         // loop over the vertices of the prim
234         Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
235         for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
236         {
237             buildVertex(streamState, pStreamData, pOutBuffer);
238 
239             // increment stream and output buffer pointers
240             // stream verts are always 32*4 dwords apart
241             pStreamData = GEP(pStreamData, C(SWR_VTX_NUM_SLOTS * 4));
242 
243             // output buffers offset using pitch in buffer state
244             for (uint32_t b : activeSOBuffers)
245             {
246                 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
247                 pOutBuffer[b] = pOutBufferStartVertex[b];
248             }
249         }
250 
251         // update each active buffer's streamOffset
252         for (uint32_t b : activeSOBuffers)
253         {
254             Value* pBuf = getSOBuffer(pSoCtx, b);
255             Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
256             streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
257             STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
258         }
259     }
260 
CreateStreamOutJit261     Function* Create(const STREAMOUT_COMPILE_STATE& state)
262     {
263         std::stringstream fnName("SO_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
264         fnName << ComputeCRC(0, &state, sizeof(state));
265 
266         // SO function signature
267         // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*)
268 
269         std::vector<Type*> args{
270             PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
271         };
272 
273         FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
274         Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
275 
276         soFunc->getParent()->setModuleIdentifier(soFunc->getName());
277 
278         // create return basic block
279         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
280         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
281 
282         IRB()->SetInsertPoint(entry);
283 
284         // arguments
285         auto argitr = soFunc->arg_begin();
286         Value* pSoCtx = &*argitr++;
287         pSoCtx->setName("pSoCtx");
288 
289         const STREAMOUT_STREAM& streamState = state.stream;
290         buildStream(state, streamState, pSoCtx, returnBB, soFunc);
291 
292         BR(returnBB);
293 
294         IRB()->SetInsertPoint(returnBB);
295         RET_VOID();
296 
297         JitManager::DumpToFile(soFunc, "SoFunc");
298 
299         ::FunctionPassManager passes(JM()->mpCurrentModule);
300 
301         passes.add(createBreakCriticalEdgesPass());
302         passes.add(createCFGSimplificationPass());
303         passes.add(createEarlyCSEPass());
304         passes.add(createPromoteMemoryToRegisterPass());
305         passes.add(createCFGSimplificationPass());
306         passes.add(createEarlyCSEPass());
307         passes.add(createInstructionCombiningPass());
308         passes.add(createInstructionSimplifierPass());
309         passes.add(createConstantPropagationPass());
310         passes.add(createSCCPPass());
311         passes.add(createAggressiveDCEPass());
312 
313         passes.run(*soFunc);
314 
315         JitManager::DumpToFile(soFunc, "SoFunc_optimized");
316 
317         return soFunc;
318     }
319 };
320 
321 //////////////////////////////////////////////////////////////////////////
322 /// @brief JITs from streamout shader IR
323 /// @param hJitMgr - JitManager handle
324 /// @param func   - LLVM function IR
325 /// @return PFN_SO_FUNC - pointer to SOS function
JitStreamoutFunc(HANDLE hJitMgr,const HANDLE hFunc)326 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
327 {
328     const llvm::Function *func = (const llvm::Function*)hFunc;
329     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
330     PFN_SO_FUNC pfnStreamOut;
331     pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
332     // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
333     pJitMgr->mIsModuleFinalized = true;
334 
335     return pfnStreamOut;
336 }
337 
338 //////////////////////////////////////////////////////////////////////////
339 /// @brief JIT compiles streamout shader
340 /// @param hJitMgr - JitManager handle
341 /// @param state   - SO state to build function from
JitCompileStreamout(HANDLE hJitMgr,const STREAMOUT_COMPILE_STATE & state)342 extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
343 {
344     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
345 
346     STREAMOUT_COMPILE_STATE soState = state;
347     if (soState.offsetAttribs)
348     {
349         for (uint32_t i = 0; i < soState.stream.numDecls; ++i)
350         {
351             soState.stream.decl[i].attribSlot -= soState.offsetAttribs;
352         }
353     }
354 
355     pJitMgr->SetupNewModule();
356 
357     StreamOutJit theJit(pJitMgr);
358     HANDLE hFunc = theJit.Create(soState);
359 
360     return JitStreamoutFunc(hJitMgr, hFunc);
361 }
362