1 /*
2 * Copyright 2012, The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "Assert.h"
18 #include "Log.h"
19 #include "RSTransforms.h"
20 #include "RSUtils.h"
21
22 #include "bcc/Config.h"
23 #include "bcinfo/MetadataExtractor.h"
24
25 #include "slang_version.h"
26
27 #include <cstdlib>
28 #include <functional>
29 #include <unordered_set>
30
31 #include <llvm/IR/DerivedTypes.h>
32 #include <llvm/IR/Function.h>
33 #include <llvm/IR/Instructions.h>
34 #include <llvm/IR/IRBuilder.h>
35 #include <llvm/IR/MDBuilder.h>
36 #include <llvm/IR/Module.h>
37 #include <llvm/Pass.h>
38 #include <llvm/Support/raw_ostream.h>
39 #include <llvm/IR/DataLayout.h>
40 #include <llvm/IR/Function.h>
41 #include <llvm/IR/Type.h>
42 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
43
44 #ifndef __DISABLE_ASSERTS
45 // Only used in bccAssert()
46 const int kNumExpandedForeachParams = 4;
47 const int kNumExpandedReduceAccumulatorParams = 4;
48 #endif
49
50 const char kRenderScriptTBAARootName[] = "RenderScript Distinct TBAA";
51 const char kRenderScriptTBAANodeName[] = "RenderScript TBAA";
52
53 using namespace bcc;
54
55 namespace {
56
57 static const bool gEnableRsTbaa = true;
58
59 /* RSKernelExpandPass
60 *
61 * This pass generates functions used to implement calls via
62 * rsForEach(), "foreach_<NAME>", or "reduce_<NAME>". We create an
63 * inner loop for the function to be invoked over the appropriate data
64 * cells of the input/output allocations (adjusting other relevant
65 * parameters as we go). We support doing this for any forEach or
66 * reduce style compute kernels.
67 *
68 * In the case of a foreach kernel or a simple reduction kernel, the
69 * new function name is the original function name "<NAME>" followed
70 * by ".expand" -- "<NAME>.expand".
71 *
72 * In the case of a general reduction kernel, the kernel's accumulator
73 * function is the one transformed, and the new function name is the
74 * original accumulator function name "<ACCUMFN>" followed by
75 * ".expand" -- "<ACCUMFN>.expand". Using the name "<ACCUMFN>.expand"
76 * for the function generated from the accumulator should not
77 * introduce any possibility for name clashes today: The accumulator
78 * function <ACCUMFN> must be static, so it cannot also serve as a
79 * foreach kernel; and the code for <ACCUMFN>.expand depends only on
80 * <ACCUMFN>, not on any other properties of the reduction kernel, so
81 * any reduction kernels that share the accumulator <ACCUMFN> can
82 * share <ACCUMFN>.expand also.
83 *
84 * Note that this pass does not delete the original function <NAME> or
85 * <ACCUMFN>. However, if it is inlined into the newly-generated
86 * function and not otherwise referenced, then a subsequent pass may
87 * delete it.
88 */
89 class RSKernelExpandPass : public llvm::ModulePass {
90 public:
91 static char ID;
92
93 private:
94 static const size_t RS_KERNEL_INPUT_LIMIT = 8; // see frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h
95
96 typedef std::unordered_set<llvm::Function *> FunctionSet;
97
98 enum RsLaunchDimensionsField {
99 RsLaunchDimensionsFieldX,
100 RsLaunchDimensionsFieldY,
101 RsLaunchDimensionsFieldZ,
102 RsLaunchDimensionsFieldLod,
103 RsLaunchDimensionsFieldFace,
104 RsLaunchDimensionsFieldArray,
105
106 RsLaunchDimensionsFieldCount
107 };
108
109 enum RsExpandKernelDriverInfoPfxField {
110 RsExpandKernelDriverInfoPfxFieldInPtr,
111 RsExpandKernelDriverInfoPfxFieldInStride,
112 RsExpandKernelDriverInfoPfxFieldInLen,
113 RsExpandKernelDriverInfoPfxFieldOutPtr,
114 RsExpandKernelDriverInfoPfxFieldOutStride,
115 RsExpandKernelDriverInfoPfxFieldOutLen,
116 RsExpandKernelDriverInfoPfxFieldDim,
117 RsExpandKernelDriverInfoPfxFieldCurrent,
118 RsExpandKernelDriverInfoPfxFieldUsr,
119 RsExpandKernelDriverInfoPfxFieldUsLenr,
120
121 RsExpandKernelDriverInfoPfxFieldCount
122 };
123
124 llvm::Module *Module;
125 llvm::LLVMContext *Context;
126
  /*
   * Pointers to LLVM type information for the function signatures of
   * expanded functions. These must be re-calculated for each module
   * the pass is run on.
   */
132 llvm::FunctionType *ExpandedForEachType;
133 llvm::Type *RsExpandKernelDriverInfoPfxTy;
134
135 // Initialized when we begin to process each Module
136 bool mStructExplicitlyPaddedBySlang;
137 uint32_t mExportForEachCount;
138 const char **mExportForEachNameList;
139 const uint32_t *mExportForEachSignatureList;
140
141 // Turns on optimization of allocation stride values.
142 bool mEnableStepOpt;
143
getRootSignature(llvm::Function * Function)144 uint32_t getRootSignature(llvm::Function *Function) {
145 const llvm::NamedMDNode *ExportForEachMetadata =
146 Module->getNamedMetadata("#rs_export_foreach");
147
148 if (!ExportForEachMetadata) {
149 llvm::SmallVector<llvm::Type*, 8> RootArgTys;
150 for (llvm::Function::arg_iterator B = Function->arg_begin(),
151 E = Function->arg_end();
152 B != E;
153 ++B) {
154 RootArgTys.push_back(B->getType());
155 }
156
157 // For pre-ICS bitcode, we may not have signature information. In that
158 // case, we use the size of the RootArgTys to select the number of
159 // arguments.
160 return (1 << RootArgTys.size()) - 1;
161 }
162
163 if (ExportForEachMetadata->getNumOperands() == 0) {
164 return 0;
165 }
166
167 bccAssert(ExportForEachMetadata->getNumOperands() > 0);
168
169 // We only handle the case for legacy root() functions here, so this is
170 // hard-coded to look at only the first such function.
171 llvm::MDNode *SigNode = ExportForEachMetadata->getOperand(0);
172 if (SigNode != nullptr && SigNode->getNumOperands() == 1) {
173 llvm::Metadata *SigMD = SigNode->getOperand(0);
174 if (llvm::MDString *SigS = llvm::dyn_cast<llvm::MDString>(SigMD)) {
175 llvm::StringRef SigString = SigS->getString();
176 uint32_t Signature = 0;
177 if (SigString.getAsInteger(10, Signature)) {
178 ALOGE("Non-integer signature value '%s'", SigString.str().c_str());
179 return 0;
180 }
181 return Signature;
182 }
183 }
184
185 return 0;
186 }
187
isStepOptSupported(llvm::Type * AllocType)188 bool isStepOptSupported(llvm::Type *AllocType) {
189
190 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
191 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
192
193 if (mEnableStepOpt) {
194 return false;
195 }
196
197 if (AllocType == VoidPtrTy) {
198 return false;
199 }
200
201 if (!PT) {
202 return false;
203 }
204
205 // remaining conditions are 64-bit only
206 if (VoidPtrTy->getPrimitiveSizeInBits() == 32) {
207 return true;
208 }
209
210 // coerce suggests an upconverted struct type, which we can't support
211 if (AllocType->getStructName().find("coerce") != llvm::StringRef::npos) {
212 return false;
213 }
214
215 // 2xi64 and i128 suggest an upconverted struct type, which are also unsupported
216 llvm::Type *V2xi64Ty = llvm::VectorType::get(llvm::Type::getInt64Ty(*Context), 2);
217 llvm::Type *Int128Ty = llvm::Type::getIntNTy(*Context, 128);
218 if (AllocType == V2xi64Ty || AllocType == Int128Ty) {
219 return false;
220 }
221
222 return true;
223 }
224
225 // Get the actual value we should use to step through an allocation.
226 //
227 // Normally the value we use to step through an allocation is given to us by
228 // the driver. However, for certain primitive data types, we can derive an
229 // integer constant for the step value. We use this integer constant whenever
230 // possible to allow further compiler optimizations to take place.
231 //
232 // DL - Target Data size/layout information.
233 // T - Type of allocation (should be a pointer).
234 // OrigStep - Original step increment (root.expand() input from driver).
getStepValue(llvm::DataLayout * DL,llvm::Type * AllocType,llvm::Value * OrigStep)235 llvm::Value *getStepValue(llvm::DataLayout *DL, llvm::Type *AllocType,
236 llvm::Value *OrigStep) {
237 bccAssert(DL);
238 bccAssert(AllocType);
239 bccAssert(OrigStep);
240 llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(AllocType);
241 if (isStepOptSupported(AllocType)) {
242 llvm::Type *ET = PT->getElementType();
243 uint64_t ETSize = DL->getTypeAllocSize(ET);
244 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
245 return llvm::ConstantInt::get(Int32Ty, ETSize);
246 } else {
247 return OrigStep;
248 }
249 }
250
251 /// Builds the types required by the pass for the given context.
buildTypes(void)252 void buildTypes(void) {
253 // Create the RsLaunchDimensionsTy and RsExpandKernelDriverInfoPfxTy structs.
254
255 llvm::Type *Int8Ty = llvm::Type::getInt8Ty(*Context);
256 llvm::Type *Int8PtrTy = Int8Ty->getPointerTo();
257 llvm::Type *Int8PtrArrayInputLimitTy = llvm::ArrayType::get(Int8PtrTy, RS_KERNEL_INPUT_LIMIT);
258 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
259 llvm::Type *Int32ArrayInputLimitTy = llvm::ArrayType::get(Int32Ty, RS_KERNEL_INPUT_LIMIT);
260 llvm::Type *VoidPtrTy = llvm::Type::getInt8PtrTy(*Context);
261 llvm::Type *Int32Array4Ty = llvm::ArrayType::get(Int32Ty, 4);
262
263 /* Defined in frameworks/base/libs/rs/cpu_ref/rsCpuCore.h:
264 *
265 * struct RsLaunchDimensions {
266 * uint32_t x;
267 * uint32_t y;
268 * uint32_t z;
269 * uint32_t lod;
270 * uint32_t face;
271 * uint32_t array[4];
272 * };
273 */
274 llvm::SmallVector<llvm::Type*, RsLaunchDimensionsFieldCount> RsLaunchDimensionsTypes;
275 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t x
276 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t y
277 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t z
278 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t lod
279 RsLaunchDimensionsTypes.push_back(Int32Ty); // uint32_t face
280 RsLaunchDimensionsTypes.push_back(Int32Array4Ty); // uint32_t array[4]
281 llvm::StructType *RsLaunchDimensionsTy =
282 llvm::StructType::create(RsLaunchDimensionsTypes, "RsLaunchDimensions");
283
284 /* Defined as the beginning of RsExpandKernelDriverInfo in frameworks/base/libs/rs/cpu_ref/rsCpuCoreRuntime.h:
285 *
286 * struct RsExpandKernelDriverInfoPfx {
287 * const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT];
288 * uint32_t inStride[RS_KERNEL_INPUT_LIMIT];
289 * uint32_t inLen;
290 *
291 * uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT];
292 * uint32_t outStride[RS_KERNEL_INPUT_LIMIT];
293 * uint32_t outLen;
294 *
295 * // Dimension of the launch
296 * RsLaunchDimensions dim;
297 *
298 * // The walking iterator of the launch
299 * RsLaunchDimensions current;
300 *
301 * const void *usr;
302 * uint32_t usrLen;
303 *
304 * // Items below this line are not used by the compiler and can be change in the driver.
305 * // So the compiler must assume there are an unknown number of fields of unknown type
306 * // beginning here.
307 * };
308 *
309 * The name "RsExpandKernelDriverInfoPfx" is known to RSInvariantPass (RSInvariant.cpp).
310 */
311 llvm::SmallVector<llvm::Type*, RsExpandKernelDriverInfoPfxFieldCount> RsExpandKernelDriverInfoPfxTypes;
312 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // const uint8_t *inPtr[RS_KERNEL_INPUT_LIMIT]
313 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t inStride[RS_KERNEL_INPUT_LIMIT]
314 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t inLen
315 RsExpandKernelDriverInfoPfxTypes.push_back(Int8PtrArrayInputLimitTy); // uint8_t *outPtr[RS_KERNEL_INPUT_LIMIT]
316 RsExpandKernelDriverInfoPfxTypes.push_back(Int32ArrayInputLimitTy); // uint32_t outStride[RS_KERNEL_INPUT_LIMIT]
317 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t outLen
318 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions dim
319 RsExpandKernelDriverInfoPfxTypes.push_back(RsLaunchDimensionsTy); // RsLaunchDimensions current
320 RsExpandKernelDriverInfoPfxTypes.push_back(VoidPtrTy); // const void *usr
321 RsExpandKernelDriverInfoPfxTypes.push_back(Int32Ty); // uint32_t usrLen
322 RsExpandKernelDriverInfoPfxTy =
323 llvm::StructType::create(RsExpandKernelDriverInfoPfxTypes, "RsExpandKernelDriverInfoPfx");
324
325 // Create the function type for expanded kernels.
326 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
327
328 llvm::Type *RsExpandKernelDriverInfoPfxPtrTy = RsExpandKernelDriverInfoPfxTy->getPointerTo();
329 // void (const RsExpandKernelDriverInfoPfxTy *p, uint32_t x1, uint32_t x2, uint32_t outstep)
330 ExpandedForEachType = llvm::FunctionType::get(VoidTy,
331 {RsExpandKernelDriverInfoPfxPtrTy, Int32Ty, Int32Ty, Int32Ty}, false);
332 }
333
334 /// @brief Create skeleton of the expanded foreach kernel.
335 ///
336 /// This creates a function with the following signature:
337 ///
338 /// void (const RsForEachStubParamStruct *p, uint32_t x1, uint32_t x2,
339 /// uint32_t outstep)
340 ///
createEmptyExpandedForEachKernel(llvm::StringRef OldName)341 llvm::Function *createEmptyExpandedForEachKernel(llvm::StringRef OldName) {
342 llvm::Function *ExpandedFunction =
343 llvm::Function::Create(ExpandedForEachType,
344 llvm::GlobalValue::ExternalLinkage,
345 OldName + ".expand", Module);
346 bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);
347 llvm::Function::arg_iterator AI = ExpandedFunction->arg_begin();
348 (AI++)->setName("p");
349 (AI++)->setName("x1");
350 (AI++)->setName("x2");
351 (AI++)->setName("arg_outstep");
352 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
353 ExpandedFunction);
354 llvm::IRBuilder<> Builder(Begin);
355 Builder.CreateRetVoid();
356 return ExpandedFunction;
357 }
358
359 // Create skeleton of a general reduce kernel's expanded accumulator.
360 //
361 // This creates a function with the following signature:
362 //
363 // void @func.expand(%RsExpandKernelDriverInfoPfx* nocapture %p,
364 // i32 %x1, i32 %x2, accumType* nocapture %accum)
365 //
createEmptyExpandedReduceAccumulator(llvm::StringRef OldName,llvm::Type * AccumArgTy)366 llvm::Function *createEmptyExpandedReduceAccumulator(llvm::StringRef OldName,
367 llvm::Type *AccumArgTy) {
368 llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);
369 llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
370 llvm::FunctionType *ExpandedReduceAccumulatorType =
371 llvm::FunctionType::get(VoidTy,
372 {RsExpandKernelDriverInfoPfxTy->getPointerTo(),
373 Int32Ty, Int32Ty, AccumArgTy}, false);
374 llvm::Function *FnExpandedAccumulator =
375 llvm::Function::Create(ExpandedReduceAccumulatorType,
376 llvm::GlobalValue::ExternalLinkage,
377 OldName + ".expand", Module);
378 bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);
379
380 llvm::Function::arg_iterator AI = FnExpandedAccumulator->arg_begin();
381
382 using llvm::Attribute;
383
384 llvm::Argument *Arg_p = &(*AI++);
385 Arg_p->setName("p");
386 Arg_p->addAttr(llvm::AttributeSet::get(*Context, Arg_p->getArgNo() + 1,
387 llvm::makeArrayRef(Attribute::NoCapture)));
388
389 llvm::Argument *Arg_x1 = &(*AI++);
390 Arg_x1->setName("x1");
391
392 llvm::Argument *Arg_x2 = &(*AI++);
393 Arg_x2->setName("x2");
394
395 llvm::Argument *Arg_accum = &(*AI++);
396 Arg_accum->setName("accum");
397 Arg_accum->addAttr(llvm::AttributeSet::get(*Context, Arg_accum->getArgNo() + 1,
398 llvm::makeArrayRef(Attribute::NoCapture)));
399
400 llvm::BasicBlock *Begin = llvm::BasicBlock::Create(*Context, "Begin",
401 FnExpandedAccumulator);
402 llvm::IRBuilder<> Builder(Begin);
403 Builder.CreateRetVoid();
404
405 return FnExpandedAccumulator;
406 }
407
408 /// @brief Create an empty loop
409 ///
410 /// Create a loop of the form:
411 ///
412 /// for (i = LowerBound; i < UpperBound; i++)
413 /// ;
414 ///
415 /// After the loop has been created, the builder is set such that
416 /// instructions can be added to the loop body.
417 ///
418 /// @param Builder The builder to use to build this loop. The current
419 /// position of the builder is the position the loop
420 /// will be inserted.
421 /// @param LowerBound The first value of the loop iterator
422 /// @param UpperBound The maximal value of the loop iterator
423 /// @param LoopIV A reference that will be set to the loop iterator.
424 /// @return The BasicBlock that will be executed after the loop.
  llvm::BasicBlock *createLoop(llvm::IRBuilder<> &Builder,
                               llvm::Value *LowerBound,
                               llvm::Value *UpperBound,
                               llvm::Value **LoopIV) {
    bccAssert(LowerBound->getType() == UpperBound->getType());

    llvm::BasicBlock *CondBB, *AfterBB, *HeaderBB;
    llvm::Value *Cond, *IVNext, *IV, *IVVar;

    // Split the current block at the insertion point: everything after it
    // becomes AfterBB (the loop exit); the loop is stitched in between.
    CondBB = Builder.GetInsertBlock();
    AfterBB = llvm::SplitBlock(CondBB, &*Builder.GetInsertPoint(), nullptr, nullptr);
    HeaderBB = llvm::BasicBlock::Create(*Context, "Loop", CondBB->getParent());

    // SplitBlock() left CondBB ending in an unconditional branch to AfterBB;
    // remove it so the conditional branch built below becomes the terminator.
    CondBB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(CondBB);

    // decltype(LowerBound) *ivvar = alloca(sizeof(int))
    // *ivvar = LowerBound
    IVVar = Builder.CreateAlloca(LowerBound->getType(), nullptr, BCC_INDEX_VAR_NAME);
    Builder.CreateStore(LowerBound, IVVar);

    // Guard against an empty iteration range:
    // if (LowerBound < Upperbound)
    //   goto LoopHeader
    // else
    //   goto AfterBB
    Cond = Builder.CreateICmpULT(LowerBound, UpperBound);
    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);

    // LoopHeader:
    //   iv = *ivvar
    //   <insertion point here>
    //   iv.next = iv + 1
    //   *ivvar = iv.next
    //   if (iv.next < Upperbound)
    //     goto LoopHeader
    //   else
    //     goto AfterBB
    // AfterBB:
    Builder.SetInsertPoint(HeaderBB);
    IV = Builder.CreateLoad(IVVar, "X");
    // NOTE(review): the increment is a hard-coded i32 constant
    // (Builder.getInt32(1)), so this assumes LowerBound/UpperBound are
    // 32-bit — confirm all callers pass i32 bounds.
    IVNext = Builder.CreateNUWAdd(IV, Builder.getInt32(1));
    Builder.CreateStore(IVNext, IVVar);
    Cond = Builder.CreateICmpULT(IVNext, UpperBound);
    Builder.CreateCondBr(Cond, HeaderBB, AfterBB);
    AfterBB->setName("Exit");
    // Leave the builder positioned just before the increment, so the caller
    // emits the loop body between the IV load and its increment.
    Builder.SetInsertPoint(llvm::cast<llvm::Instruction>(IVNext));

    // Record information about this loop.
    *LoopIV = IV;
    return AfterBB;
  }
476
477 // Finish building the outgoing argument list for calling a ForEach-able function.
478 //
479 // ArgVector - on input, the non-special arguments
480 // on output, the non-special arguments combined with the special arguments
481 // from SpecialArgVector
482 // SpecialArgVector - special arguments (from ExpandSpecialArguments())
483 // SpecialArgContextIdx - return value of ExpandSpecialArguments()
484 // (position of context argument in SpecialArgVector)
485 // CalleeFunction - the ForEach-able function being called
486 // Builder - for inserting code into the caller function
487 template<unsigned int ArgVectorLen, unsigned int SpecialArgVectorLen>
finishArgList(llvm::SmallVector<llvm::Value *,ArgVectorLen> & ArgVector,const llvm::SmallVector<llvm::Value *,SpecialArgVectorLen> & SpecialArgVector,const int SpecialArgContextIdx,const llvm::Function & CalleeFunction,llvm::IRBuilder<> & CallerBuilder)488 void finishArgList( llvm::SmallVector<llvm::Value *, ArgVectorLen> &ArgVector,
489 const llvm::SmallVector<llvm::Value *, SpecialArgVectorLen> &SpecialArgVector,
490 const int SpecialArgContextIdx,
491 const llvm::Function &CalleeFunction,
492 llvm::IRBuilder<> &CallerBuilder) {
493 /* The context argument (if any) is a pointer to an opaque user-visible type that differs from
494 * the RsExpandKernelDriverInfoPfx type used in the function we are generating (although the
495 * two types represent the same thing). Therefore, we must introduce a pointer cast when
496 * generating a call to the kernel function.
497 */
498 const int ArgContextIdx =
499 SpecialArgContextIdx >= 0 ? (ArgVector.size() + SpecialArgContextIdx) : SpecialArgContextIdx;
500 ArgVector.append(SpecialArgVector.begin(), SpecialArgVector.end());
501 if (ArgContextIdx >= 0) {
502 llvm::Type *ContextArgType = nullptr;
503 int ArgIdx = ArgContextIdx;
504 for (const auto &Arg : CalleeFunction.getArgumentList()) {
505 if (!ArgIdx--) {
506 ContextArgType = Arg.getType();
507 break;
508 }
509 }
510 bccAssert(ContextArgType);
511 ArgVector[ArgContextIdx] = CallerBuilder.CreatePointerCast(ArgVector[ArgContextIdx], ContextArgType);
512 }
513 }
514
515 // GEPHelper() returns a SmallVector of values suitable for passing
516 // to IRBuilder::CreateGEP(), and SmallGEPIndices is a typedef for
517 // the returned data type. It is sized so that the SmallVector
518 // returned by GEPHelper() never needs to do a heap allocation for
519 // any list of GEP indices it encounters in the code.
520 typedef llvm::SmallVector<llvm::Value *, 3> SmallGEPIndices;
521
522 // Helper for turning a list of constant integer GEP indices into a
523 // SmallVector of llvm::Value*. The return value is suitable for
524 // passing to a GetElementPtrInst constructor or IRBuilder::CreateGEP().
525 //
526 // Inputs:
527 // I32Args should be integers which represent the index arguments
528 // to a GEP instruction.
529 //
530 // Returns:
531 // Returns a SmallVector of ConstantInts.
GEPHelper(const std::initializer_list<int32_t> I32Args)532 SmallGEPIndices GEPHelper(const std::initializer_list<int32_t> I32Args) {
533 SmallGEPIndices Out(I32Args.size());
534 llvm::IntegerType *I32Ty = llvm::Type::getInt32Ty(*Context);
535 std::transform(I32Args.begin(), I32Args.end(), Out.begin(),
536 [I32Ty](int32_t Arg) { return llvm::ConstantInt::get(I32Ty, Arg); });
537 return Out;
538 }
539
540 public:
  // pEnableStepOpt - when true, allow allocation stride ("step") values to be
  //                  replaced by compile-time constants (see getStepValue()).
  // Module/Context are filled in when the pass begins to run on a module.
  explicit RSKernelExpandPass(bool pEnableStepOpt = true)
      : ModulePass(ID), Module(nullptr), Context(nullptr),
        mEnableStepOpt(pEnableStepOpt) {

  }
546
  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    // This pass does not use any other analysis passes, but it does
    // add/wrap the existing functions in the module (thus altering the CFG),
    // so nothing is declared required or preserved here.
  }
551
552 // Build contribution to outgoing argument list for calling a
553 // ForEach-able function or a general reduction accumulator
554 // function, based on the special parameters of that function.
555 //
556 // Signature - metadata bits for the signature of the callee
557 // X, Arg_p - values derived directly from expanded function,
558 // suitable for computing arguments for the callee
559 // CalleeArgs - contribution is accumulated here
560 // Bump - invoked once for each contributed outgoing argument
561 // LoopHeaderInsertionPoint - an Instruction in the loop header, before which
562 // this function can insert loop-invariant loads
563 //
564 // Return value is the (zero-based) position of the context (Arg_p)
565 // argument in the CalleeArgs vector, or a negative value if the
566 // context argument is not placed in the CalleeArgs vector.
ExpandSpecialArguments(uint32_t Signature,llvm::Value * X,llvm::Value * Arg_p,llvm::IRBuilder<> & Builder,llvm::SmallVector<llvm::Value *,8> & CalleeArgs,const std::function<void ()> & Bump,llvm::Instruction * LoopHeaderInsertionPoint)567 int ExpandSpecialArguments(uint32_t Signature,
568 llvm::Value *X,
569 llvm::Value *Arg_p,
570 llvm::IRBuilder<> &Builder,
571 llvm::SmallVector<llvm::Value*, 8> &CalleeArgs,
572 const std::function<void ()> &Bump,
573 llvm::Instruction *LoopHeaderInsertionPoint) {
574
575 bccAssert(CalleeArgs.empty());
576
577 int Return = -1;
578 if (bcinfo::MetadataExtractor::hasForEachSignatureCtxt(Signature)) {
579 CalleeArgs.push_back(Arg_p);
580 Bump();
581 Return = CalleeArgs.size() - 1;
582 }
583
584 if (bcinfo::MetadataExtractor::hasForEachSignatureX(Signature)) {
585 CalleeArgs.push_back(X);
586 Bump();
587 }
588
589 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature) ||
590 bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
591 bccAssert(LoopHeaderInsertionPoint);
592
593 // Y and Z are loop invariant, so they can be hoisted out of the
594 // loop. Set the IRBuilder insertion point to the loop header.
595 auto OldInsertionPoint = Builder.saveIP();
596 Builder.SetInsertPoint(LoopHeaderInsertionPoint);
597
598 if (bcinfo::MetadataExtractor::hasForEachSignatureY(Signature)) {
599 SmallGEPIndices YValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
600 RsLaunchDimensionsFieldY}));
601 llvm::Value *YAddr = Builder.CreateInBoundsGEP(Arg_p, YValueGEP, "Y.gep");
602 CalleeArgs.push_back(Builder.CreateLoad(YAddr, "Y"));
603 Bump();
604 }
605
606 if (bcinfo::MetadataExtractor::hasForEachSignatureZ(Signature)) {
607 SmallGEPIndices ZValueGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldCurrent,
608 RsLaunchDimensionsFieldZ}));
609 llvm::Value *ZAddr = Builder.CreateInBoundsGEP(Arg_p, ZValueGEP, "Z.gep");
610 CalleeArgs.push_back(Builder.CreateLoad(ZAddr, "Z"));
611 Bump();
612 }
613
614 Builder.restoreIP(OldInsertionPoint);
615 }
616
617 return Return;
618 }
619
620 // Generate loop-invariant input processing setup code for an expanded
621 // ForEach-able function or an expanded general reduction accumulator
622 // function.
623 //
624 // LoopHeader - block at the end of which the setup code will be inserted
625 // Arg_p - RSKernelDriverInfo pointer passed to the expanded function
626 // TBAAPointer - metadata for marking loads of pointer values out of RSKernelDriverInfo
627 // ArgIter - iterator pointing to first input of the UNexpanded function
628 // NumInputs - number of inputs (NOT number of ARGUMENTS)
629 //
630 // InTypes[] - this function saves input type, they will be used in ExpandInputsBody().
631 // InBufPtrs[] - this function sets each array element to point to the first cell / byte
632 // (byte for x86, cell for other platforms) of the corresponding input allocation
633 // InStructTempSlots[] - this function sets each array element either to nullptr
634 // or to the result of an alloca (for the case where the
635 // calling convention dictates that a value must be passed
636 // by reference, and so we need a stacked temporary to hold
637 // a copy of that value)
  void ExpandInputsLoopInvariant(llvm::IRBuilder<> &Builder, llvm::BasicBlock *LoopHeader,
                                 llvm::Value *Arg_p,
                                 llvm::MDNode *TBAAPointer,
                                 llvm::Function::arg_iterator ArgIter,
                                 const size_t NumInputs,
                                 llvm::SmallVectorImpl<llvm::Type *> &InTypes,
                                 llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
                                 llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots) {
    bccAssert(NumInputs <= RS_KERNEL_INPUT_LIMIT);

    // Extract information about input slots. The work done
    // here is loop-invariant, so we can hoist the operations out of the loop.
    auto OldInsertionPoint = Builder.saveIP();
    Builder.SetInsertPoint(LoopHeader->getTerminator());

    // One iteration per input: record its type, its base buffer pointer, and
    // (for struct-by-pointer inputs) a stack slot for a defensive copy.
    for (size_t InputIndex = 0; InputIndex < NumInputs; ++InputIndex, ArgIter++) {
      llvm::Type *InType = ArgIter->getType();

      /*
       * AArch64 calling conventions dictate that structs of sufficient size
       * get passed by pointer instead of passed by value. This, combined
       * with the fact that we don't allow kernels to operate on pointer
       * data means that if we see a kernel with a pointer parameter we know
       * that it is a struct input that has been promoted. As such we don't
       * need to convert its type to a pointer. Later we will need to know
       * to create a temporary copy on the stack, so we save this information
       * in InStructTempSlots.
       */
      if (auto PtrType = llvm::dyn_cast<llvm::PointerType>(InType)) {
        llvm::Type *ElementType = PtrType->getElementType();
        InStructTempSlots.push_back(Builder.CreateAlloca(ElementType, nullptr,
                                                         "input_struct_slot"));
      } else {
        // By-value input: InType becomes the pointer type used to address
        // cells of the input buffer.
        InType = InType->getPointerTo();
        InStructTempSlots.push_back(nullptr);
      }

      // Load the base pointer of this input allocation out of
      // p->inPtr[InputIndex].
      SmallGEPIndices InBufPtrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr,
                                             static_cast<int32_t>(InputIndex)}));
      llvm::Value *InBufPtrAddr = Builder.CreateInBoundsGEP(Arg_p, InBufPtrGEP, "input_buf.gep");
      llvm::LoadInst *InBufPtr = Builder.CreateLoad(InBufPtrAddr, "input_buf");

      llvm::Value *CastInBufPtr = nullptr;
      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
        CastInBufPtr = Builder.CreatePointerCast(InBufPtr, InType, "casted_in");
      } else {
        // The disagreement between module and x86 target machine datalayout
        // causes mismatched input/output data offset between slang reflected
        // code and bcc codegen for GetElementPtr. To solve this issue, skip the
        // cast to InType and leave CastInBufPtr as an int8_t*. The buffer is
        // later indexed with an explicit byte offset computed based on
        // X86_CUSTOM_DL_STRING and then bitcast to actual input type.
        CastInBufPtr = InBufPtr;
      }

      if (gEnableRsTbaa) {
        // Tag the load of the buffer pointer with the "pointer" TBAA node so
        // alias analysis can distinguish it from loads of cell data.
        InBufPtr->setMetadata("tbaa", TBAAPointer);
      }

      InTypes.push_back(InType);
      InBufPtrs.push_back(CastInBufPtr);
    }

    Builder.restoreIP(OldInsertionPoint);
  }
703
704 // Generate loop-varying input processing code for an expanded ForEach-able function
705 // or an expanded general reduction accumulator function. Also, for the call to the
706 // UNexpanded function, collect the portion of the argument list corresponding to the
707 // inputs.
708 //
709 // Arg_x1 - first X coordinate to be processed by the expanded function
710 // TBAAAllocation - metadata for marking loads of input values out of allocations
711 // NumInputs -- number of inputs (NOT number of ARGUMENTS)
712 // InTypes[] - this function uses the saved input types in ExpandInputsLoopInvariant()
713 // to convert the pointer of byte InPtr to its real type.
714 // InBufPtrs[] - this function consumes the information produced by ExpandInputsLoopInvariant()
715 // InStructTempSlots[] - this function consumes the information produced by ExpandInputsLoopInvariant()
716 // IndVar - value of loop induction variable (X coordinate) for a given loop iteration
717 //
718 // RootArgs - this function sets this to the list of outgoing argument values corresponding
719 // to the inputs
  void ExpandInputsBody(llvm::IRBuilder<> &Builder,
                        llvm::Value *Arg_x1,
                        llvm::MDNode *TBAAAllocation,
                        const size_t NumInputs,
                        const llvm::SmallVectorImpl<llvm::Type *> &InTypes,
                        const llvm::SmallVectorImpl<llvm::Value *> &InBufPtrs,
                        const llvm::SmallVectorImpl<llvm::Value *> &InStructTempSlots,
                        llvm::Value *IndVar,
                        llvm::SmallVectorImpl<llvm::Value *> &RootArgs) {
    // Cell index relative to the start of this slice of the launch:
    // the buffers in InBufPtrs point at cell Arg_x1.
    llvm::Value *Offset = Builder.CreateSub(IndVar, Arg_x1);
    llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);

    for (size_t Index = 0; Index < NumInputs; ++Index) {

      llvm::Value *InPtr = nullptr;
      if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
        // InBufPtrs[Index] is already typed as a pointer to the input cell
        // type, so a plain GEP by cell offset suffices.
        InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], Offset);
      } else {
        // Treat x86 input buffer as byte[], get indexed pointer with explicit
        // byte offset computed using a datalayout based on
        // X86_CUSTOM_DL_STRING, then bitcast it to actual input type.
        llvm::DataLayout DL(X86_CUSTOM_DL_STRING);
        llvm::Type *InTy = InTypes[Index];
        uint64_t InStep = DL.getTypeAllocSize(InTy->getPointerElementType());
        llvm::Value *OffsetInBytes = Builder.CreateMul(Offset, llvm::ConstantInt::get(Int32Ty, InStep));
        InPtr = Builder.CreateInBoundsGEP(InBufPtrs[Index], OffsetInBytes);
        InPtr = Builder.CreatePointerCast(InPtr, InTy);
      }

      llvm::Value *Input;
      llvm::LoadInst *InputLoad = Builder.CreateLoad(InPtr, "input");

      if (gEnableRsTbaa) {
        // Mark the cell load with the "allocation" TBAA node.
        InputLoad->setMetadata("tbaa", TBAAAllocation);
      }

      if (llvm::Value *TemporarySlot = InStructTempSlots[Index]) {
        // Pass a pointer to a temporary on the stack, rather than
        // passing a pointer to the original value. We do not want
        // the kernel to potentially modify the input data.

        // Note: don't annotate with TBAA, since the kernel might
        // have its own TBAA annotations for the pointer argument.
        Builder.CreateStore(InputLoad, TemporarySlot);
        Input = TemporarySlot;
      } else {
        Input = InputLoad;
      }

      RootArgs.push_back(Input);
    }
  }
772
/* Performs the actual optimization on a selected function. On success, the
 * Module will contain a new function of the name "<NAME>.expand" that
 * invokes <NAME>() in a loop with the appropriate parameters.
 *
 * Returns true if the expanded function was created, false if the signature
 * could not be determined.
 */
bool ExpandOldStyleForEach(llvm::Function *Function, uint32_t Signature) {
  ALOGV("Expanding ForEach-able Function %s",
        Function->getName().str().c_str());

  if (!Signature) {
    // A zero signature means none was recorded in metadata; try to infer it
    // from the root function itself.
    Signature = getRootSignature(Function);
    if (!Signature) {
      // We couldn't determine how to expand this function based on its
      // function signature.
      return false;
    }
  }

  llvm::DataLayout DL(Module);
  if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
    // Older slang did not explicitly pad structs; on x86 use the custom
    // datalayout so step sizes match the reflected code's layout.
    DL.reset(X86_CUSTOM_DL_STRING);
  }

  llvm::Function *ExpandedFunction =
    createEmptyExpandedForEachKernel(Function->getName());

  /*
   * Extract the expanded function's parameters.  It is guaranteed by
   * createEmptyExpandedForEachKernel that there will be four parameters.
   */

  bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

  llvm::Function::arg_iterator ExpandedFunctionArgIter =
    ExpandedFunction->arg_begin();

  llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_outstep = &*(ExpandedFunctionArgIter);

  llvm::Value *InStep = nullptr;
  llvm::Value *OutStep = nullptr;

  // Construct the actual function body.
  llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());

  // Collect and construct the arguments for the kernel().
  // Note that we load any loop-invariant arguments before entering the Loop.
  llvm::Function::arg_iterator FunctionArgIter = Function->arg_begin();

  llvm::Type *InTy = nullptr;
  llvm::Value *InBufPtr = nullptr;
  if (bcinfo::MetadataExtractor::hasForEachSignatureIn(Signature)) {
    // Load the input stride and input buffer base from the driver-info
    // structure (first input slot only; old-style kernels have one input).
    SmallGEPIndices InStepGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInStride, 0}));
    llvm::LoadInst *InStepArg = Builder.CreateLoad(
      Builder.CreateInBoundsGEP(Arg_p, InStepGEP, "instep_addr.gep"), "instep_addr");

    InTy = (FunctionArgIter++)->getType();
    InStep = getStepValue(&DL, InTy, InStepArg);

    InStep->setName("instep");

    SmallGEPIndices InputAddrGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldInPtr, 0}));
    InBufPtr = Builder.CreateLoad(
      Builder.CreateInBoundsGEP(Arg_p, InputAddrGEP, "input_buf.gep"), "input_buf");
  }

  llvm::Type *OutTy = nullptr;
  llvm::Value *OutBasePtr = nullptr;
  if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
    // Output pointer type comes from the kernel's next parameter; the stride
    // comes from the expanded function's outstep argument.
    OutTy = (FunctionArgIter++)->getType();
    OutStep = getStepValue(&DL, OutTy, Arg_outstep);
    OutStep->setName("outstep");
    SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
    OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));
  }

  llvm::Value *UsrData = nullptr;
  if (bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature)) {
    // usrData is loop-invariant: load it once and cast to the kernel's
    // expected pointer type.
    llvm::Type *UsrDataTy = (FunctionArgIter++)->getType();
    llvm::Value *UsrDataPointerAddr = Builder.CreateStructGEP(nullptr, Arg_p, RsExpandKernelDriverInfoPfxFieldUsr);
    UsrData = Builder.CreatePointerCast(Builder.CreateLoad(UsrDataPointerAddr), UsrDataTy);
    UsrData->setName("UsrData");
  }

  // Build the [x1, x2) loop; IV is the induction variable (X coordinate),
  // and the Builder is left positioned inside the loop body.
  llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
  llvm::Value *IV;
  createLoop(Builder, Arg_x1, Arg_x2, &IV);

  llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
  const int CalleeArgsContextIdx = ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                                                          [&FunctionArgIter]() { FunctionArgIter++; },
                                                          LoopHeader->getTerminator());

  // All of the kernel's parameters must have been consumed by now.
  bccAssert(FunctionArgIter == Function->arg_end());

  // Populate the actual call to kernel().
  llvm::SmallVector<llvm::Value*, 8> RootArgs;

  llvm::Value *InPtr = nullptr;
  llvm::Value *OutPtr = nullptr;

  // Calculate the current input and output pointers
  //
  // We always calculate the input/output pointers with a GEP operating on i8
  // values and only cast at the very end to OutTy. This is because the step
  // between two values is given in bytes.
  //
  // TODO: We could further optimize the output by using a GEP operation of
  // type 'OutTy' in cases where the element type of the allocation allows.
  if (OutBasePtr) {
    llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);
    OutOffset = Builder.CreateMul(OutOffset, OutStep);
    OutPtr = Builder.CreateInBoundsGEP(OutBasePtr, OutOffset);
    OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
  }

  if (InBufPtr) {
    llvm::Value *InOffset = Builder.CreateSub(IV, Arg_x1);
    InOffset = Builder.CreateMul(InOffset, InStep);
    InPtr = Builder.CreateInBoundsGEP(InBufPtr, InOffset);
    InPtr = Builder.CreatePointerCast(InPtr, InTy);
  }

  // Argument order expected by the kernel: input, output, usrData, then any
  // special arguments appended by finishArgList.
  if (InPtr) {
    RootArgs.push_back(InPtr);
  }

  if (OutPtr) {
    RootArgs.push_back(OutPtr);
  }

  if (UsrData) {
    RootArgs.push_back(UsrData);
  }

  finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

  Builder.CreateCall(Function, RootArgs);

  return true;
}
915
/* Expand a pass-by-value foreach kernel.
 *
 * Creates "<NAME>.expand", which loops over [x1, x2) calling <NAME> per cell.
 * Unlike old-style kernels, inputs/outputs are passed by value (or via a
 * hidden out-parameter when the kernel returns void with an out allocation).
 * Always returns true.
 */
bool ExpandForEach(llvm::Function *Function, uint32_t Signature) {
  bccAssert(bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature));
  ALOGV("Expanding kernel Function %s", Function->getName().str().c_str());

  // TODO: Refactor this to share functionality with ExpandOldStyleForEach.
  llvm::DataLayout DL(Module);
  if (!mStructExplicitlyPaddedBySlang && (Module->getTargetTriple() == DEFAULT_X86_TRIPLE_STRING)) {
    // Older slang did not explicitly pad structs; on x86 use the custom
    // datalayout so offsets match the reflected code's layout.
    DL.reset(X86_CUSTOM_DL_STRING);
  }
  llvm::Type *Int32Ty = llvm::Type::getInt32Ty(*Context);

  llvm::Function *ExpandedFunction =
    createEmptyExpandedForEachKernel(Function->getName());

  /*
   * Extract the expanded function's parameters.  It is guaranteed by
   * createEmptyExpandedForEachKernel that there will be four parameters.
   */

  bccAssert(ExpandedFunction->arg_size() == kNumExpandedForeachParams);

  llvm::Function::arg_iterator ExpandedFunctionArgIter =
    ExpandedFunction->arg_begin();

  llvm::Value *Arg_p = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x1 = &*(ExpandedFunctionArgIter++);
  llvm::Value *Arg_x2 = &*(ExpandedFunctionArgIter++);
  // Arg_outstep is not used by expanded new-style forEach kernels.

  // Construct the actual function body.
  llvm::IRBuilder<> Builder(&*ExpandedFunction->getEntryBlock().begin());

  // Create TBAA meta-data.
  // "allocation" tags loads/stores of allocation cells; "pointer" tags loads
  // of buffer base pointers from the driver-info structure.
  llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
               *TBAAAllocation, *TBAAPointer;
  llvm::MDBuilder MDHelper(*Context);

  TBAARenderScriptDistinct =
    MDHelper.createTBAARoot(kRenderScriptTBAARootName);
  TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
                                             TBAARenderScriptDistinct);
  TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                     TBAARenderScript);
  TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                    TBAAAllocation, 0);
  TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
                                                  TBAARenderScript);
  TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);

  /*
   * Collect and construct the arguments for the kernel().
   *
   * Note that we load any loop-invariant arguments before entering the Loop.
   */
  size_t NumRemainingInputs = Function->arg_size();

  // No usrData parameter on kernels.
  bccAssert(
    !bcinfo::MetadataExtractor::hasForEachSignatureUsrData(Signature));

  llvm::Function::arg_iterator ArgIter = Function->arg_begin();

  // Check the return type
  llvm::Type     *OutTy            = nullptr;
  llvm::LoadInst *OutBasePtr       = nullptr;
  llvm::Value    *CastedOutBasePtr = nullptr;

  bool PassOutByPointer = false;

  if (bcinfo::MetadataExtractor::hasForEachSignatureOut(Signature)) {
    llvm::Type *OutBaseTy = Function->getReturnType();

    if (OutBaseTy->isVoidTy()) {
      // void return + out signature bit: the output is written through the
      // kernel's first (pointer) parameter instead of the return value.
      PassOutByPointer = true;
      OutTy = ArgIter->getType();

      ArgIter++;
      --NumRemainingInputs;
    } else {
      // We don't increment Args, since we are using the actual return type.
      OutTy = OutBaseTy->getPointerTo();
    }

    SmallGEPIndices OutBaseGEP(GEPHelper({0, RsExpandKernelDriverInfoPfxFieldOutPtr, 0}));
    OutBasePtr = Builder.CreateLoad(Builder.CreateInBoundsGEP(Arg_p, OutBaseGEP, "out_buf.gep"));

    if (gEnableRsTbaa) {
      OutBasePtr->setMetadata("tbaa", TBAAPointer);
    }

    if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
      CastedOutBasePtr = Builder.CreatePointerCast(OutBasePtr, OutTy, "casted_out");
    } else {
      // The disagreement between module and x86 target machine datalayout
      // causes mismatched input/output data offset between slang reflected
      // code and bcc codegen for GetElementPtr. To solve this issue, skip the
      // cast to OutTy and leave CastedOutBasePtr as an int8_t*. The buffer
      // is later indexed with an explicit byte offset computed based on
      // X86_CUSTOM_DL_STRING and then bitcast to actual output type.
      CastedOutBasePtr = OutBasePtr;
    }
  }

  llvm::SmallVector<llvm::Type*, 8> InTypes;
  llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
  llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;

  bccAssert(NumRemainingInputs <= RS_KERNEL_INPUT_LIMIT);

  // Create the loop structure.
  llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
  llvm::Value *IV;
  createLoop(Builder, Arg_x1, Arg_x2, &IV);

  llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
  const int CalleeArgsContextIdx =
    ExpandSpecialArguments(Signature, IV, Arg_p, Builder, CalleeArgs,
                           [&NumRemainingInputs]() { --NumRemainingInputs; },
                           LoopHeader->getTerminator());

  // After ExpandSpecialArguments() gets called, NumRemainingInputs
  // counts the number of arguments to the kernel that correspond to
  // an array entry from the InPtr field of the DriverInfo
  // structure.
  const size_t NumInPtrArguments = NumRemainingInputs;

  if (NumInPtrArguments > 0) {
    // Load input buffer bases/types once, in the loop header (loop-invariant).
    ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, ArgIter, NumInPtrArguments,
                              InTypes, InBufPtrs, InStructTempSlots);
  }

  // Populate the actual call to kernel().
  llvm::SmallVector<llvm::Value*, 8> RootArgs;

  // Calculate the current input and output pointers.

  // Output

  llvm::Value *OutPtr = nullptr;
  if (CastedOutBasePtr) {
    llvm::Value *OutOffset = Builder.CreateSub(IV, Arg_x1);

    if (mStructExplicitlyPaddedBySlang || (Module->getTargetTriple() != DEFAULT_X86_TRIPLE_STRING)) {
      OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffset);
    } else {
      // Treat x86 output buffer as byte[], get indexed pointer with explicit
      // byte offset computed using a datalayout based on
      // X86_CUSTOM_DL_STRING, then bitcast it to actual output type.
      uint64_t OutStep = DL.getTypeAllocSize(OutTy->getPointerElementType());
      llvm::Value *OutOffsetInBytes = Builder.CreateMul(OutOffset, llvm::ConstantInt::get(Int32Ty, OutStep));
      OutPtr = Builder.CreateInBoundsGEP(CastedOutBasePtr, OutOffsetInBytes);
      OutPtr = Builder.CreatePointerCast(OutPtr, OutTy);
    }

    if (PassOutByPointer) {
      // Hidden out-parameter goes first in the argument list.
      RootArgs.push_back(OutPtr);
    }
  }

  // Inputs

  if (NumInPtrArguments > 0) {
    ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInPtrArguments,
                     InTypes, InBufPtrs, InStructTempSlots, IV, RootArgs);
  }

  finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *Function, Builder);

  llvm::Value *RetVal = Builder.CreateCall(Function, RootArgs);

  if (OutPtr && !PassOutByPointer) {
    // Kernel returned its output by value; store it into the output cell.
    RetVal->setName("call.result");
    llvm::StoreInst *Store = Builder.CreateStore(RetVal, OutPtr);
    if (gEnableRsTbaa) {
      Store->setMetadata("tbaa", TBAAAllocation);
    }
  }

  return true;
}
1098
1099 // Certain categories of functions that make up a general
1100 // reduce-style kernel are called directly from the driver with no
1101 // expansion needed. For a function in such a category, we need to
1102 // promote linkage from static to external, to ensure that the
1103 // function is visible to the driver in the dynamic symbol table.
1104 // This promotion is safe because we don't have any kind of cross
1105 // translation unit linkage model (except for linking against
1106 // RenderScript libraries), so we do not risk name clashes.
PromoteReduceFunction(const char * Name,FunctionSet & PromotedFunctions)1107 bool PromoteReduceFunction(const char *Name, FunctionSet &PromotedFunctions) {
1108 if (!Name) // a presumably-optional function that is not present
1109 return false;
1110
1111 llvm::Function *Fn = Module->getFunction(Name);
1112 bccAssert(Fn != nullptr);
1113 if (PromotedFunctions.insert(Fn).second) {
1114 bccAssert(Fn->getLinkage() == llvm::GlobalValue::InternalLinkage);
1115 Fn->setLinkage(llvm::GlobalValue::ExternalLinkage);
1116 return true;
1117 }
1118
1119 return false;
1120 }
1121
1122 // Expand the accumulator function for a general reduce-style kernel.
1123 //
1124 // The input is a function of the form
1125 //
1126 // define void @func(accumType* %accum, foo1 in1[, ... fooN inN] [, special arguments])
1127 //
1128 // where all arguments except the first are the same as for a foreach kernel.
1129 //
1130 // The input accumulator function gets expanded into a function of the form
1131 //
1132 // define void @func.expand(%RsExpandKernelDriverInfoPfx* %p, i32 %x1, i32 %x2, accumType* %accum)
1133 //
// which performs a serial accumulation of elements [x1, x2) into *%accum.
1135 //
1136 // In pseudocode, @func.expand does:
1137 //
1138 // for (i = %x1; i < %x2; ++i) {
1139 // func(%accum,
1140 // *((foo1 *)p->inPtr[0] + i)[, ... *((fooN *)p->inPtr[N-1] + i)
1141 // [, p] [, i] [, p->current.y] [, p->current.z]);
1142 // }
1143 //
1144 // This is very similar to foreach kernel expansion with no output.
bool ExpandReduceAccumulator(llvm::Function *FnAccumulator, uint32_t Signature, size_t NumInputs) {
  ALOGV("Expanding accumulator %s for general reduce kernel",
        FnAccumulator->getName().str().c_str());

  // Create TBAA meta-data.
  // Same tree as in ExpandForEach: "allocation" for cell loads, "pointer"
  // for loads of buffer base pointers from the driver-info structure.
  llvm::MDNode *TBAARenderScriptDistinct, *TBAARenderScript,
               *TBAAAllocation, *TBAAPointer;
  llvm::MDBuilder MDHelper(*Context);
  TBAARenderScriptDistinct =
    MDHelper.createTBAARoot(kRenderScriptTBAARootName);
  TBAARenderScript = MDHelper.createTBAANode(kRenderScriptTBAANodeName,
                                             TBAARenderScriptDistinct);
  TBAAAllocation = MDHelper.createTBAAScalarTypeNode("allocation",
                                                     TBAARenderScript);
  TBAAAllocation = MDHelper.createTBAAStructTagNode(TBAAAllocation,
                                                    TBAAAllocation, 0);
  TBAAPointer = MDHelper.createTBAAScalarTypeNode("pointer",
                                                  TBAARenderScript);
  TBAAPointer = MDHelper.createTBAAStructTagNode(TBAAPointer, TBAAPointer, 0);

  auto AccumulatorArgIter = FnAccumulator->arg_begin();

  // Create empty accumulator function.
  // The accumulator's first parameter is the accumulator pointer itself;
  // its type becomes the expanded function's %accum parameter type.
  llvm::Function *FnExpandedAccumulator =
      createEmptyExpandedReduceAccumulator(FnAccumulator->getName(),
                                           (AccumulatorArgIter++)->getType());

  // Extract the expanded accumulator's parameters.  It is
  // guaranteed by createEmptyExpandedReduceAccumulator that
  // there will be 4 parameters.
  bccAssert(FnExpandedAccumulator->arg_size() == kNumExpandedReduceAccumulatorParams);
  auto ExpandedAccumulatorArgIter = FnExpandedAccumulator->arg_begin();
  llvm::Value *Arg_p = &*(ExpandedAccumulatorArgIter++);
  llvm::Value *Arg_x1 = &*(ExpandedAccumulatorArgIter++);
  llvm::Value *Arg_x2 = &*(ExpandedAccumulatorArgIter++);
  llvm::Value *Arg_accum = &*(ExpandedAccumulatorArgIter++);

  // Construct the actual function body.
  llvm::IRBuilder<> Builder(&*FnExpandedAccumulator->getEntryBlock().begin());

  // Create the loop structure.
  llvm::BasicBlock *LoopHeader = Builder.GetInsertBlock();
  llvm::Value *IndVar;
  createLoop(Builder, Arg_x1, Arg_x2, &IndVar);

  llvm::SmallVector<llvm::Value*, 8> CalleeArgs;
  const int CalleeArgsContextIdx =
    ExpandSpecialArguments(Signature, IndVar, Arg_p, Builder, CalleeArgs,
                           [](){}, LoopHeader->getTerminator());

  // Load input buffer bases/types once in the loop header (loop-invariant),
  // then marshal per-cell inputs in the loop body via ExpandInputsBody.
  llvm::SmallVector<llvm::Type*, 8> InTypes;
  llvm::SmallVector<llvm::Value*, 8> InBufPtrs;
  llvm::SmallVector<llvm::Value*, 8> InStructTempSlots;
  ExpandInputsLoopInvariant(Builder, LoopHeader, Arg_p, TBAAPointer, AccumulatorArgIter, NumInputs,
                            InTypes, InBufPtrs, InStructTempSlots);

  // Populate the actual call to the original accumulator.
  // The accumulator pointer is always the first argument.
  llvm::SmallVector<llvm::Value*, 8> RootArgs;
  RootArgs.push_back(Arg_accum);
  ExpandInputsBody(Builder, Arg_x1, TBAAAllocation, NumInputs, InTypes, InBufPtrs, InStructTempSlots,
                   IndVar, RootArgs);
  finishArgList(RootArgs, CalleeArgs, CalleeArgsContextIdx, *FnAccumulator, Builder);
  Builder.CreateCall(FnAccumulator, RootArgs);

  return true;
}
1211
1212 // Create a combiner function for a general reduce-style kernel that lacks one,
1213 // by calling the accumulator function.
1214 //
1215 // The accumulator function must be of the form
1216 //
1217 // define void @accumFn(accumType* %accum, accumType %in)
1218 //
1219 // A combiner function will be generated of the form
1220 //
1221 // define void @accumFn.combiner(accumType* %accum, accumType* %other) {
1222 // %1 = load accumType, accumType* %other
1223 // call void @accumFn(accumType* %accum, accumType %1);
1224 // }
bool CreateReduceCombinerFromAccumulator(llvm::Function *FnAccumulator) {
  ALOGV("Creating combiner from accumulator %s for general reduce kernel",
        FnAccumulator->getName().str().c_str());

  using llvm::Attribute;

  // The accumulator must take exactly (accumType* accum, accumType in).
  bccAssert(FnAccumulator->arg_size() == 2);
  auto AccumulatorArgIter = FnAccumulator->arg_begin();
  llvm::Value *AccumulatorArg_accum = &*(AccumulatorArgIter++);
  llvm::Value *AccumulatorArg_in = &*(AccumulatorArgIter++);
  llvm::Type *AccumulatorArgType = AccumulatorArg_accum->getType();
  bccAssert(AccumulatorArgType->isPointerTy());

  // Combiner signature: void(accumType* accum, accumType* other).
  llvm::Type *VoidTy = llvm::Type::getVoidTy(*Context);
  llvm::FunctionType *CombinerType =
    llvm::FunctionType::get(VoidTy, { AccumulatorArgType, AccumulatorArgType }, false);
  llvm::Function *FnCombiner =
    llvm::Function::Create(CombinerType, llvm::GlobalValue::ExternalLinkage,
                           nameReduceCombinerFromAccumulator(FnAccumulator->getName()),
                           Module);

  auto CombinerArgIter = FnCombiner->arg_begin();

  llvm::Argument *CombinerArg_accum = &(*CombinerArgIter++);
  CombinerArg_accum->setName("accum");
  // NOTE: attribute indices are 1-based (index 0 is the return value).
  CombinerArg_accum->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_accum->getArgNo() + 1,
                                                     llvm::makeArrayRef(Attribute::NoCapture)));

  llvm::Argument *CombinerArg_other = &(*CombinerArgIter++);
  CombinerArg_other->setName("other");
  CombinerArg_other->addAttr(llvm::AttributeSet::get(*Context, CombinerArg_other->getArgNo() + 1,
                                                     llvm::makeArrayRef(Attribute::NoCapture)));

  llvm::BasicBlock *BB = llvm::BasicBlock::Create(*Context, "BB", FnCombiner);
  llvm::IRBuilder<> Builder(BB);

  if (AccumulatorArg_in->getType()->isPointerTy()) {
    // Types of sufficient size get passed by pointer-to-copy rather
    // than passed by value.  An accumulator cannot take a pointer
    // at the user level; so if we see a pointer here, we know that
    // we have a pass-by-pointer-to-copy case.
    llvm::Type *ElementType = AccumulatorArg_in->getType()->getPointerElementType();
    llvm::Value *TempMem = Builder.CreateAlloca(ElementType, nullptr, "caller_copy");
    Builder.CreateStore(Builder.CreateLoad(CombinerArg_other), TempMem);
    Builder.CreateCall(FnAccumulator, { CombinerArg_accum, TempMem });
  } else {
    llvm::Value *TypeAdjustedOther = CombinerArg_other;
    if (AccumulatorArgType->getPointerElementType() != AccumulatorArg_in->getType()) {
      // Call lowering by frontend has done some type coercion
      TypeAdjustedOther = Builder.CreatePointerCast(CombinerArg_other,
                                                    AccumulatorArg_in->getType()->getPointerTo(),
                                                    "cast");
    }
    // Load *other and pass it by value to the accumulator.
    llvm::Value *DerefOther = Builder.CreateLoad(TypeAdjustedOther);
    Builder.CreateCall(FnAccumulator, { CombinerArg_accum, DerefOther });
  }
  Builder.CreateRetVoid();

  return true;
}
1285
1286 /// @brief Checks if pointers to allocation internals are exposed
1287 ///
1288 /// This function verifies if through the parameters passed to the kernel
1289 /// or through calls to the runtime library the script gains access to
1290 /// pointers pointing to data within a RenderScript Allocation.
1291 /// If we know we control all loads from and stores to data within
1292 /// RenderScript allocations and if we know the run-time internal accesses
1293 /// are all annotated with RenderScript TBAA metadata, only then we
1294 /// can safely use TBAA to distinguish between generic and from-allocation
1295 /// pointers.
allocPointersExposed(llvm::Module & Module)1296 bool allocPointersExposed(llvm::Module &Module) {
1297 // Old style kernel function can expose pointers to elements within
1298 // allocations.
1299 // TODO: Extend analysis to allow simple cases of old-style kernels.
1300 for (size_t i = 0; i < mExportForEachCount; ++i) {
1301 const char *Name = mExportForEachNameList[i];
1302 uint32_t Signature = mExportForEachSignatureList[i];
1303 if (Module.getFunction(Name) &&
1304 !bcinfo::MetadataExtractor::hasForEachSignatureKernel(Signature)) {
1305 return true;
1306 }
1307 }
1308
1309 // Check for library functions that expose a pointer to an Allocation or
1310 // that are not yet annotated with RenderScript-specific tbaa information.
1311 static const std::vector<const char *> Funcs{
1312 // rsGetElementAt(...)
1313 "_Z14rsGetElementAt13rs_allocationj",
1314 "_Z14rsGetElementAt13rs_allocationjj",
1315 "_Z14rsGetElementAt13rs_allocationjjj",
1316
1317 // rsSetElementAt()
1318 "_Z14rsSetElementAt13rs_allocationPvj",
1319 "_Z14rsSetElementAt13rs_allocationPvjj",
1320 "_Z14rsSetElementAt13rs_allocationPvjjj",
1321
1322 // rsGetElementAtYuv_uchar_Y()
1323 "_Z25rsGetElementAtYuv_uchar_Y13rs_allocationjj",
1324
1325 // rsGetElementAtYuv_uchar_U()
1326 "_Z25rsGetElementAtYuv_uchar_U13rs_allocationjj",
1327
1328 // rsGetElementAtYuv_uchar_V()
1329 "_Z25rsGetElementAtYuv_uchar_V13rs_allocationjj",
1330 };
1331
1332 for (auto FI : Funcs) {
1333 llvm::Function *Function = Module.getFunction(FI);
1334
1335 if (!Function) {
1336 ALOGE("Missing run-time function '%s'", FI);
1337 return true;
1338 }
1339
1340 if (Function->getNumUses() > 0) {
1341 return true;
1342 }
1343 }
1344
1345 return false;
1346 }
1347
1348 /// @brief Connect RenderScript TBAA metadata to C/C++ metadata
1349 ///
1350 /// The TBAA metadata used to annotate loads/stores from RenderScript
1351 /// Allocations is generated in a separate TBAA tree with a
1352 /// "RenderScript Distinct TBAA" root node. LLVM does assume may-alias for
1353 /// all nodes in unrelated alias analysis trees. This function makes the
1354 /// "RenderScript TBAA" node (which is parented by the Distinct TBAA root),
1355 /// a subtree of the normal C/C++ TBAA tree aside of normal C/C++ types. With
1356 /// the connected trees every access to an Allocation is resolved to
1357 /// must-alias if compared to a normal C/C++ access.
connectRenderScriptTBAAMetadata(llvm::Module & Module)1358 void connectRenderScriptTBAAMetadata(llvm::Module &Module) {
1359 llvm::MDBuilder MDHelper(*Context);
1360 llvm::MDNode *TBAARenderScriptDistinct =
1361 MDHelper.createTBAARoot("RenderScript Distinct TBAA");
1362 llvm::MDNode *TBAARenderScript = MDHelper.createTBAANode(
1363 "RenderScript TBAA", TBAARenderScriptDistinct);
1364 llvm::MDNode *TBAARoot = MDHelper.createTBAARoot("Simple C/C++ TBAA");
1365 TBAARenderScript->replaceOperandWith(1, TBAARoot);
1366 }
1367
// Pass entry point: expands forEach_* kernels, expands/promotes reduce_*
// component functions, and (when safe) connects the RenderScript TBAA tree
// to the C/C++ one. Returns true if the module was modified.
virtual bool runOnModule(llvm::Module &Module) {
  bool Changed = false;
  this->Module = &Module;
  Context = &Module.getContext();

  buildTypes();

  bcinfo::MetadataExtractor me(&Module);
  if (!me.extract()) {
    ALOGE("Could not extract metadata from module!");
    return false;
  }

  // Slang at or above N_STRUCT_EXPLICIT_PADDING pads structs explicitly;
  // the expansion routines use this flag to pick the x86 indexing strategy.
  mStructExplicitlyPaddedBySlang = (me.getCompilerVersion() >= SlangVersion::N_STRUCT_EXPLICIT_PADDING);

  // Expand forEach_* style kernels.
  mExportForEachCount = me.getExportForEachSignatureCount();
  mExportForEachNameList = me.getExportForEachNameList();
  mExportForEachSignatureList = me.getExportForEachSignatureList();

  for (size_t i = 0; i < mExportForEachCount; ++i) {
    const char *name = mExportForEachNameList[i];
    uint32_t signature = mExportForEachSignatureList[i];
    llvm::Function *kernel = Module.getFunction(name);
    if (kernel) {
      if (bcinfo::MetadataExtractor::hasForEachSignatureKernel(signature)) {
        // New-style (pass-by-value) kernel.
        Changed |= ExpandForEach(kernel, signature);
        kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
      } else if (kernel->getReturnType()->isVoidTy()) {
        // Old-style kernel (pointer in/out parameters).
        Changed |= ExpandOldStyleForEach(kernel, signature);
        kernel->setLinkage(llvm::GlobalValue::InternalLinkage);
      } else {
        // There are some graphics root functions that are not
        // expanded, but that will be called directly. For those
        // functions, we can not set the linkage to internal.
      }
    }
  }

  // Process general reduce_* style functions.
  const size_t ExportReduceCount = me.getExportReduceCount();
  const bcinfo::MetadataExtractor::Reduce *ExportReduceList = me.getExportReduceList();
  // Note that functions can be shared between kernels
  FunctionSet PromotedFunctions, ExpandedAccumulators, AccumulatorsForCombiners;

  for (size_t i = 0; i < ExportReduceCount; ++i) {
    // Initializer/combiner/outconverter are called directly by the driver;
    // they only need linkage promotion, not expansion.
    Changed |= PromoteReduceFunction(ExportReduceList[i].mInitializerName, PromotedFunctions);
    Changed |= PromoteReduceFunction(ExportReduceList[i].mCombinerName, PromotedFunctions);
    Changed |= PromoteReduceFunction(ExportReduceList[i].mOutConverterName, PromotedFunctions);

    // Accumulator
    llvm::Function *accumulator = Module.getFunction(ExportReduceList[i].mAccumulatorName);
    bccAssert(accumulator != nullptr);
    if (ExpandedAccumulators.insert(accumulator).second)
      Changed |= ExpandReduceAccumulator(accumulator,
                                         ExportReduceList[i].mSignature,
                                         ExportReduceList[i].mInputCount);
    if (!ExportReduceList[i].mCombinerName) {
      // No user-written combiner: synthesize one from the accumulator.
      if (AccumulatorsForCombiners.insert(accumulator).second)
        Changed |= CreateReduceCombinerFromAccumulator(accumulator);
    }
  }

  // Only merge the RenderScript TBAA tree into the C/C++ tree when no raw
  // allocation pointers escape to the script (see allocPointersExposed).
  if (gEnableRsTbaa && !allocPointersExposed(Module)) {
    connectRenderScriptTBAAMetadata(Module);
  }

  return Changed;
}
1437
// Human-readable pass name reported by the LLVM pass infrastructure
// (e.g. in -debug-pass output).
virtual const char *getPassName() const {
  return "forEach_* and reduce_* function expansion";
}
1441
1442 }; // end RSKernelExpandPass
1443
1444 } // end anonymous namespace
1445
// The address of ID serves as the unique identifier for this pass type.
char RSKernelExpandPass::ID = 0;
// Register the pass with LLVM under the command-line name "kernelexp".
static llvm::RegisterPass<RSKernelExpandPass> X("kernelexp", "Kernel Expand Pass");
1448
1449 namespace bcc {
1450
1451 const char BCC_INDEX_VAR_NAME[] = "rsIndex";
1452
// Factory for the kernel-expansion pass; pEnableStepOpt is forwarded to the
// RSKernelExpandPass constructor (constructor not visible in this chunk).
llvm::ModulePass *
createRSKernelExpandPass(bool pEnableStepOpt) {
  return new RSKernelExpandPass(pEnableStepOpt);
}
1457
1458 } // end namespace bcc
1459