/*
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include <llvm-c/Core.h>
#include <llvm/Analysis/TargetLibraryInfo.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Verifier.h>
#include <llvm/Target/TargetMachine.h>
#include <llvm/MC/MCSubtargetInfo.h>
#include <llvm/Support/CommandLine.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/CodeGen/Passes.h>
#include <llvm/Passes/PassBuilder.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/SCCP.h>
#include <llvm/Transforms/Scalar/EarlyCSE.h>
#include <llvm/Transforms/Scalar/LICM.h>
#include <llvm/Transforms/Scalar/SROA.h>
#include <llvm/Transforms/Scalar/SimplifyCFG.h>
#include <llvm/CodeGen/SelectionDAGNodes.h>

#include <cstring>

/* DO NOT REORDER THE HEADERS
 * The LLVM headers need to all be included before any Mesa header,
 * as they use the `restrict` keyword in ways that are incompatible
 * with our #define in include/c99_compat.h
 */

#include "ac_binary.h"
#include "ac_llvm_util.h"
#include "ac_llvm_build.h"
#include "util/macros.h"

using namespace llvm;

class RunAtExitForStaticDestructors : public SDNode
{
public:
   /* getSDVTList (protected) calls getValueTypeList (private), which contains static variables. */
   RunAtExitForStaticDestructors() : SDNode(0, 0, DebugLoc(), getSDVTList(MVT::Other))
   {
   }
};
void ac_llvm_run_atexit_for_destructors(void)
{
   /* LLVM >= 16 registers static variable destructors on the first compile, which gcc
    * implements by calling atexit there. Before that, u_queue registers its atexit
    * handler to kill all threads. Since exit() runs atexit handlers in the reverse order,
    * the LLVM destructors are called first while shader compiler threads may still be
    * running, which crashes in LLVM in SelectionDAG.cpp.
    *
    * The solution is to run the code that declares the LLVM static variables first,
    * so that atexit for LLVM is registered first and u_queue is registered after that,
    * which ensures that all u_queue threads are terminated before LLVM destructors are
    * called.
    *
    * This just executes the code that declares static variables.
    */
   RunAtExitForStaticDestructors();
}
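
/* Illustrative usage (not from this file): a driver calls this once during
 * initialization, before u_queue creates any compiler threads, e.g.:
 *
 *    ac_llvm_run_atexit_for_destructors();
 *    util_queue_init(&compiler_queue, ...);
 *
 * util_queue_init is shown only as an assumed example of a u_queue user.
 */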

bool ac_is_llvm_processor_supported(LLVMTargetMachineRef tm, const char *processor)
{
   TargetMachine *TM = reinterpret_cast<TargetMachine *>(tm);
   return TM->getMCSubtargetInfo()->isCPUStringValid(processor);
}

void ac_reset_llvm_all_options_occurrences()
{
   cl::ResetAllOptionOccurrences();
}

void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes)
{
   Argument *A = unwrap<Argument>(val);
   A->addAttr(Attribute::getWithDereferenceableBytes(A->getContext(), bytes));
}

void ac_add_attr_alignment(LLVMValueRef val, uint64_t bytes)
{
   Argument *A = unwrap<Argument>(val);
   A->addAttr(Attribute::getWithAlignment(A->getContext(), Align(bytes)));
}

LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx)
{
   TargetMachine *TM = reinterpret_cast<TargetMachine *>(tm);
   LLVMModuleRef module = LLVMModuleCreateWithNameInContext("mesa-shader", ctx);

   unwrap(module)->setTargetTriple(TM->getTargetTriple().getTriple());
   unwrap(module)->setDataLayout(TM->createDataLayout());
   return module;
}

LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, enum ac_float_mode float_mode)
{
   LLVMBuilderRef builder = LLVMCreateBuilderInContext(ctx);

   FastMathFlags flags;

   switch (float_mode) {
   case AC_FLOAT_MODE_DEFAULT:
   case AC_FLOAT_MODE_DENORM_FLUSH_TO_ZERO:
      break;

   case AC_FLOAT_MODE_DEFAULT_OPENGL:
      /* Allow optimizations to treat the sign of a zero argument or
       * result as insignificant.
       */
      flags.setNoSignedZeros(); /* nsz */

      /* Allow optimizations to use the reciprocal of an argument
       * rather than perform division.
       */
      flags.setAllowReciprocal(); /* arcp */

      unwrap(builder)->setFastMathFlags(flags);
      break;
   }

   return builder;
}
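
/* With these flags, LLVM may e.g. rewrite (x / y) into x * (1.0 / y) (arcp),
 * and fold (x + 0.0) into x (nsz), which would otherwise be invalid because
 * -0.0 + 0.0 is +0.0.
 */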

void ac_enable_signed_zeros(struct ac_llvm_context *ctx)
{
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
      auto *b = unwrap(ctx->builder);
      FastMathFlags flags = b->getFastMathFlags();

      /* This disables the optimization of (x + 0), which is used
       * to convert negative zero to positive zero.
       */
      flags.setNoSignedZeros(false);
      b->setFastMathFlags(flags);
   }
}

void ac_disable_signed_zeros(struct ac_llvm_context *ctx)
{
   if (ctx->float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL) {
      auto *b = unwrap(ctx->builder);
      FastMathFlags flags = b->getFastMathFlags();

      /* Re-allow the (x + 0) optimization by setting nsz again. */
      flags.setNoSignedZeros();
      b->setFastMathFlags(flags);
   }
}
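
/* Illustrative usage of the pair above: temporarily honor the sign of zero
 * around an operation that must preserve -0.0 (builder call from llvm-c):
 *
 *    ac_enable_signed_zeros(ctx);
 *    LLVMValueRef sum = LLVMBuildFAdd(ctx->builder, a, b, "");
 *    ac_disable_signed_zeros(ctx);
 */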

/* Implementation of raw_pwrite_stream that works on malloc()ed memory for
 * better compatibility with C code. */
struct raw_memory_ostream : public raw_pwrite_stream {
   char *buffer;
   size_t written;
   size_t bufsize;

   raw_memory_ostream()
   {
      buffer = NULL;
      written = 0;
      bufsize = 0;
      SetUnbuffered();
   }

   ~raw_memory_ostream()
   {
      free(buffer);
   }

   void take(char *&out_buffer, size_t &out_size)
   {
      out_buffer = buffer;
      out_size = written;
      buffer = NULL;
      written = 0;
      bufsize = 0;
   }

   void flush() = delete;

   void write_impl(const char *ptr, size_t size) override
   {
      /* Abort on overflow of the write position. */
      if (unlikely(written + size < written))
         abort();
      if (written + size > bufsize) {
         /* Grow by at least 4/3 to keep the number of reallocations
          * logarithmic in the output size.
          */
         bufsize = MAX3(1024, written + size, bufsize / 3 * 4);
         buffer = (char *)realloc(buffer, bufsize);
         if (!buffer) {
            fprintf(stderr, "amd: out of memory allocating ELF buffer\n");
            abort();
         }
      }
      memcpy(buffer + written, ptr, size);
      written += size;
   }

   void pwrite_impl(const char *ptr, size_t size, uint64_t offset) override
   {
      assert(offset == (size_t)offset && offset + size >= offset && offset + size <= written);
      memcpy(buffer + offset, ptr, size);
   }

   uint64_t current_pos() const override
   {
      return written;
   }
};
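
/* Illustrative usage: write_impl() grows the malloc()ed buffer as data is
 * streamed in, and take() hands ownership to C code, which must free() it:
 *
 *    raw_memory_ostream os;
 *    os << data;
 *    char *buf;
 *    size_t size;
 *    os.take(buf, size);
 *    ...
 *    free(buf);
 */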

/* The middle-end optimization passes are run using
 * LLVM's new pass manager infrastructure.
 */
struct ac_midend_optimizer
{
   TargetMachine *target_machine;
   PassBuilder pass_builder;
   TargetLibraryInfoImpl target_library_info;

   /* Should be declared in this order only,
    * so that they are destroyed in the correct order
    * due to inter-analysis-manager references.
    */
   LoopAnalysisManager loop_am;
   FunctionAnalysisManager function_am;
   CGSCCAnalysisManager cgscc_am;
   ModuleAnalysisManager module_am;

   /* Pass Managers */
   LoopPassManager loop_pm;
   FunctionPassManager function_pm;
   ModulePassManager module_pm;

   ac_midend_optimizer(TargetMachine *arg_target_machine, bool arg_check_ir)
      : target_machine(arg_target_machine),
        pass_builder(target_machine, PipelineTuningOptions(), {}),
        target_library_info(Triple(target_machine->getTargetTriple()))
   {
      /* Build the pipeline and optimize.
       * Any custom analyses should be registered
       * before LLVM's default analysis sets.
       */
      function_am.registerPass(
         [&] { return TargetLibraryAnalysis(target_library_info); }
      );

      pass_builder.registerModuleAnalyses(module_am);
      pass_builder.registerCGSCCAnalyses(cgscc_am);
      pass_builder.registerFunctionAnalyses(function_am);
      pass_builder.registerLoopAnalyses(loop_am);
      pass_builder.crossRegisterProxies(loop_am, function_am, cgscc_am, module_am);

      if (arg_check_ir)
         module_pm.addPass(VerifierPass());

      /* Adding the inliner pass to the module pass manager directly
       * ensures that the pass is run on all functions first, which makes sure
       * that the following passes are only run on the remaining non-inlined
       * functions, so it removes useless work done on dead inlined functions.
       */
      module_pm.addPass(AlwaysInlinerPass());

      /* The following set of passes runs on an individual function/loop first
       * before proceeding to the next.
       */
#if LLVM_VERSION_MAJOR >= 16
      function_pm.addPass(SROAPass(SROAOptions::ModifyCFG));
#else
      /* Before LLVM 16, SROAPass takes no options. */
      function_pm.addPass(SROAPass());
#endif

      loop_pm.addPass(LICMPass(LICMOptions()));
      function_pm.addPass(createFunctionToLoopPassAdaptor(std::move(loop_pm), true));
      function_pm.addPass(SimplifyCFGPass());
      function_pm.addPass(EarlyCSEPass(true));

      module_pm.addPass(createModuleToFunctionPassAdaptor(std::move(function_pm)));
   }

   void run(Module &module)
   {
      module_pm.run(module, module_am);

      /* After a run(), the results in the analysis managers
       * aren't useful to optimize a subsequent LLVM module.
       * If used, they can lead to unexpected crashes.
       * Hence, the results in the analysis managers
       * need to be invalidated and cleared before
       * running optimizations on a new LLVM module.
       */
      module_am.invalidate(module, PreservedAnalyses::none());
      module_am.clear();
      cgscc_am.clear();
      function_am.clear();
      loop_am.clear();
   }
};

/* The backend passes for optimizations, instruction selection,
 * and code generation in the LLVM compiler still require the
 * legacy::PassManager. The use of the legacy PM will be
 * deprecated when the new PM can handle backend passes.
 */
struct ac_backend_optimizer
{
   raw_memory_ostream ostream; /* ELF shader binary stream */
   legacy::PassManager backend_pass_manager; /* for codegen only */

   ac_backend_optimizer(TargetMachine *arg_target_machine)
   {
      /* add backend passes */
      if (arg_target_machine->addPassesToEmitFile(backend_pass_manager, ostream, nullptr,
#if LLVM_VERSION_MAJOR >= 18
                                                  CodeGenFileType::ObjectFile)) {
#else
                                                  CGFT_ObjectFile)) {
#endif
         fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n");
      }
   }

   void run(Module &module, char *&out_buffer, size_t &out_size)
   {
      backend_pass_manager.run(module);
      ostream.take(out_buffer, out_size);
   }
};
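
/* Note (an assumption of this sketch, not a documented LLVM guarantee): a
 * single ac_backend_optimizer can be reused for multiple compiles, e.g. one
 * instance per compiler thread, since each run() re-runs the legacy PM and
 * take() leaves the stream empty for the next compile.
 */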

ac_midend_optimizer *ac_create_midend_optimizer(LLVMTargetMachineRef tm,
                                                bool check_ir)
{
   TargetMachine *TM = reinterpret_cast<TargetMachine *>(tm);
   return new ac_midend_optimizer(TM, check_ir);
}

void ac_destroy_midend_optimizer(ac_midend_optimizer *meo)
{
   delete meo;
}

bool ac_llvm_optimize_module(ac_midend_optimizer *meo, LLVMModuleRef module)
{
   if (!meo)
      return false;

   /* Runs all the middle-end optimizations, no code generation */
   meo->run(*unwrap(module));
   return true;
}

ac_backend_optimizer *ac_create_backend_optimizer(LLVMTargetMachineRef tm)
{
   TargetMachine *TM = reinterpret_cast<TargetMachine *>(tm);
   return new ac_backend_optimizer(TM);
}

void ac_destroy_backend_optimizer(ac_backend_optimizer *beo)
{
   delete beo;
}

bool ac_compile_module_to_elf(ac_backend_optimizer *beo, LLVMModuleRef module,
                              char **pelf_buffer, size_t *pelf_size)
{
   if (!beo)
      return false;

   /* Runs all backend optimizations and code generation */
   beo->run(*unwrap(module), *pelf_buffer, *pelf_size);
   return true;
}
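
/* Putting the C API together (illustrative sketch; error handling omitted):
 *
 *    ac_midend_optimizer *meo = ac_create_midend_optimizer(tm, check_ir);
 *    ac_backend_optimizer *beo = ac_create_backend_optimizer(tm);
 *
 *    ac_llvm_optimize_module(meo, module);   // middle-end, no codegen
 *
 *    char *elf_buffer;
 *    size_t elf_size;
 *    ac_compile_module_to_elf(beo, module, &elf_buffer, &elf_size);
 *    // ... use the ELF binary, then free(elf_buffer) ...
 *
 *    ac_destroy_midend_optimizer(meo);
 *    ac_destroy_backend_optimizer(beo);
 */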

LLVMValueRef ac_build_atomic_rmw(struct ac_llvm_context *ctx, LLVMAtomicRMWBinOp op,
                                 LLVMValueRef ptr, LLVMValueRef val, const char *sync_scope)
{
   AtomicRMWInst::BinOp binop;
   switch (op) {
   case LLVMAtomicRMWBinOpXchg:
      binop = AtomicRMWInst::Xchg;
      break;
   case LLVMAtomicRMWBinOpAdd:
      binop = AtomicRMWInst::Add;
      break;
   case LLVMAtomicRMWBinOpSub:
      binop = AtomicRMWInst::Sub;
      break;
   case LLVMAtomicRMWBinOpAnd:
      binop = AtomicRMWInst::And;
      break;
   case LLVMAtomicRMWBinOpNand:
      binop = AtomicRMWInst::Nand;
      break;
   case LLVMAtomicRMWBinOpOr:
      binop = AtomicRMWInst::Or;
      break;
   case LLVMAtomicRMWBinOpXor:
      binop = AtomicRMWInst::Xor;
      break;
   case LLVMAtomicRMWBinOpMax:
      binop = AtomicRMWInst::Max;
      break;
   case LLVMAtomicRMWBinOpMin:
      binop = AtomicRMWInst::Min;
      break;
   case LLVMAtomicRMWBinOpUMax:
      binop = AtomicRMWInst::UMax;
      break;
   case LLVMAtomicRMWBinOpUMin:
      binop = AtomicRMWInst::UMin;
      break;
   case LLVMAtomicRMWBinOpFAdd:
      binop = AtomicRMWInst::FAdd;
      break;
   default:
      unreachable("invalid LLVMAtomicRMWBinOp");
      break;
   }
   unsigned SSID = unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
   return wrap(unwrap(ctx->builder)
                  ->CreateAtomicRMW(binop, unwrap(ptr), unwrap(val),
                                    MaybeAlign(0),
                                    AtomicOrdering::SequentiallyConsistent, SSID));
}

LLVMValueRef ac_build_atomic_cmp_xchg(struct ac_llvm_context *ctx, LLVMValueRef ptr,
                                      LLVMValueRef cmp, LLVMValueRef val, const char *sync_scope)
{
   unsigned SSID = unwrap(ctx->context)->getOrInsertSyncScopeID(sync_scope);
   return wrap(unwrap(ctx->builder)
                  ->CreateAtomicCmpXchg(unwrap(ptr), unwrap(cmp),
                                        unwrap(val),
                                        MaybeAlign(0),
                                        AtomicOrdering::SequentiallyConsistent,
                                        AtomicOrdering::SequentiallyConsistent, SSID));
}
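
/* Note on sync_scope in the two helpers above: the string is resolved through
 * LLVM's sync scope registry; besides the builtin "" (system) and
 * "singlethread" scopes, the AMDGPU backend defines scopes such as "agent",
 * "workgroup", and "wavefront" (see LLVM's AMDGPUUsage documentation).
 */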