/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "aco_ir.h"

#include "util/crc32.h"

#include <algorithm>
#include <deque>
#include <set>
#include <vector>

namespace aco {

namespace {

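/* Estimates the cycle count of a basic block by tracking when each execution
 * resource, each physical register, and each outstanding memory operation
 * becomes available. State from predecessor blocks is merged via join().
 */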
class BlockCycleEstimator {
public:
   enum resource {
      null = 0,
      scalar,
      branch_sendmsg,
      valu,
      valu_complex,
      lds,
      export_gds,
      vmem,
      resource_count,
   };

   BlockCycleEstimator(Program* program_) : program(program_) {}

   Program* program;

   int32_t cur_cycle = 0;
   int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
   unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
   int32_t reg_available[512] = {0};
   std::deque<int32_t> mem_ops[wait_type_num];

   void add(aco_ptr<Instruction>& instr);
   void join(const BlockCycleEstimator& other);

private:
   unsigned get_waitcnt_cost(wait_imm imm);
   unsigned get_dependency_cost(aco_ptr<Instruction>& instr);

   void use_resources(aco_ptr<Instruction>& instr);
   int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
};

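/* Latency and per-resource issue cost of a single instruction. */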
struct perf_info {
   int latency;

   BlockCycleEstimator::resource rsrc0;
   unsigned cost0;

   BlockCycleEstimator::resource rsrc1;
   unsigned cost1;
};

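/* Whether GFX11+ can dual-issue this VALU instruction, i.e. execute both
 * halves of a wave64 in a single issue cycle.
 */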
static bool
is_dual_issue_capable(const Program& program, const Instruction& instr)
{
   if (program.gfx_level < GFX11 || !instr.isVALU() || instr.isDPP())
      return false;

   switch (instr.opcode) {
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmaak_f32:
   case aco_opcode::v_fmamk_f32:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_sub_f32:
   case aco_opcode::v_subrev_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mov_b32:
   case aco_opcode::v_movreld_b32:
   case aco_opcode::v_movrels_b32:
   case aco_opcode::v_movrelsd_b32:
   case aco_opcode::v_movrelsd_2_b32:
   case aco_opcode::v_cndmask_b32:
   case aco_opcode::v_writelane_b32_e64:
   case aco_opcode::v_mov_b16:
   case aco_opcode::v_cndmask_b16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2c_f32_f16: return true;
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16: {
      /* dst and acc type must match */
      if (instr.valu().opsel_hi[2] == (instr.opcode == aco_opcode::v_fma_mix_f32))
         return false;

      /* If all operands are vgprs, two must be the same. */
      for (unsigned i = 0; i < 3; i++) {
         if (instr.operands[i].isConstant() || instr.operands[i].isOfType(RegType::sgpr))
            return true;
         for (unsigned j = 0; j < i; j++) {
            if (instr.operands[i].physReg() == instr.operands[j].physReg())
               return true;
         }
      }
      return false;
   }
   default:
      if (instr.isVINTERP_INREG())
         return program.gfx_level >= GFX11_5;
      if (instr.isVOPC() && instr_info.classes[(int)instr.opcode] == instr_class::valu32)
         return program.gfx_level == GFX11_5;
      return false;
   }
}

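/* Latency and resource cost of an instruction, selected by its instruction
 * class and the target generation.
 */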
static perf_info
get_perf_info(const Program& program, const Instruction& instr)
{
   instr_class cls = instr_info.classes[(int)instr.opcode];

#define WAIT(res)          BlockCycleEstimator::res, 0
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt

   if (program.gfx_level >= GFX10) {
      /* fp64 might be incorrect */
      switch (cls) {
      case instr_class::valu32:
      case instr_class::valu_convert32:
      case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
      case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
      case instr_class::valu_quarter_rate32:
         return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
      case instr_class::valu_transcendental32:
         return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
      case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_add:
         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_convert:
         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_transcendental:
         return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
      case instr_class::sfpu: return {4, WAIT_USE(scalar, 1)};
      case instr_class::valu_pseudo_scalar_trans:
         return {7, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 1)};
      case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
      case instr_class::branch:
      case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 3)};
      case instr_class::ds:
         return instr.isDS() && instr.ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
                                               : perf_info{0, WAIT_USE(lds, 1)};
      case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
      case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
      case instr_class::wmma: {
         /* int8 and (b)f16 have the same performance. */
         uint8_t cost = instr.opcode == aco_opcode::v_wmma_i32_16x16x16_iu4 ? 16 : 32;
         return {cost, WAIT_USE(valu, cost)};
      }
      case instr_class::barrier:
      case instr_class::waitcnt:
      case instr_class::other:
      default: return {0};
      }
   } else {
      switch (cls) {
      case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
      case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
      case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_fma:
         return program.dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
                                           : perf_info{16, WAIT_USE(valu, 16)};
      case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
      case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
      case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
      case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
      case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
      case instr_class::branch: return {4, WAIT_USE(branch_sendmsg, 4)};
      case instr_class::ds:
         return instr.isDS() && instr.ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
                                               : perf_info{4, WAIT_USE(lds, 4)};
      case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
      case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
      case instr_class::barrier:
      case instr_class::waitcnt:
      case instr_class::other:
      default: return {4};
      }
   }

#undef WAIT_USE
#undef WAIT
}

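/* Mark the execution resources used by an instruction as busy until their
 * cost has elapsed and accumulate per-resource usage.
 */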
void
BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   if (perf.rsrc0 != resource_count) {
      res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
      res_usage[(int)perf.rsrc0] += perf.cost0;
   }

   if (perf.rsrc1 != resource_count) {
      res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
      res_usage[(int)perf.rsrc1] += perf.cost1;
   }
}

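/* Cycles until the resources needed by an instruction become available. */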
int32_t
BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   int32_t cost = 0;
   if (perf.rsrc0 != resource_count)
      cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
   if (perf.rsrc1 != resource_count)
      cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);

   return cost;
}

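/* Rough latency estimate, per wait counter type, for an instruction that
 * increments one of the counters.
 */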
static std::array<unsigned, wait_type_num>
get_wait_counter_info(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr)
{
   /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
    * depends a lot on the situation. */

   std::array<unsigned, wait_type_num> info{};

   if (instr->isEXP()) {
      info[wait_type_exp] = 16;
   } else if (instr->isLDSDIR()) {
      info[wait_type_exp] = 13;
   } else if (instr->isFlatLike()) {
      info[wait_type_lgkm] = instr->isFlat() ? 20 : 0;
      if (!instr->definitions.empty() || gfx_level < GFX10)
         info[wait_type_vm] = 320;
      else
         info[wait_type_vs] = 320;
   } else if (instr->isSMEM()) {
      wait_type type = gfx_level >= GFX12 ? wait_type_km : wait_type_lgkm;
      if (instr->definitions.empty()) {
         info[type] = 200;
      } else if (instr->operands.empty()) { /* s_memtime and s_memrealtime */
         info[type] = 1;
      } else {
         bool likely_desc_load = instr->operands[0].size() == 2;
         bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
         bool const_offset =
            instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());

         if (likely_desc_load || const_offset)
            info[type] = 30; /* likely to hit L0 cache */
         else
            info[type] = 200;
      }
   } else if (instr->isDS()) {
      info[wait_type_lgkm] = 20;
   } else if (instr->isVMEM() && instr->definitions.empty() && gfx_level >= GFX10) {
      info[wait_type_vs] = 320;
   } else if (instr->isVMEM()) {
      uint8_t vm_type = get_vmem_type(gfx_level, instr.get());
      wait_type type = wait_type_vm;
      if (gfx_level >= GFX12 && vm_type == vmem_bvh)
         type = wait_type_bvh;
      else if (gfx_level >= GFX12 && vm_type == vmem_sampler)
         type = wait_type_sample;
      info[type] = 320;
   }

   return info;
}

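/* Wait counter values an instruction effectively waits on before issue:
 * everything for s_endpgm, the encoded values for wait instructions, and an
 * implicit wait when a counter the instruction increments is at its maximum.
 */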
static wait_imm
get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
{
   wait_imm imm;
   if (instr->opcode == aco_opcode::s_endpgm) {
      for (unsigned i = 0; i < wait_type_num; i++)
         imm[i] = 0;
   } else if (imm.unpack(program->gfx_level, instr.get())) {
   } else if (instr->isVINTERP_INREG()) {
      imm.exp = instr->vinterp_inreg().wait_exp;
      if (imm.exp == 0x7)
         imm.exp = wait_imm::unset_counter;
   } else {
      /* If an instruction increases a counter, it waits for it to be below maximum first. */
      std::array<unsigned, wait_type_num> wait_info =
         get_wait_counter_info(program->gfx_level, instr);
      wait_imm max = wait_imm::max(program->gfx_level);
      for (unsigned i = 0; i < wait_type_num; i++) {
         if (wait_info[i])
            imm[i] = max[i] - 1;
      }
   }
   return imm;
}

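/* Cycles until the instruction's register operands and implied counter waits
 * are satisfied, relative to the current cycle.
 */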
unsigned
BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
{
   int deps_available = cur_cycle;

   wait_imm imm = get_wait_imm(program, instr);
   for (unsigned i = 0; i < wait_type_num; i++) {
      if (imm[i] == wait_imm::unset_counter)
         continue;
      for (int j = 0; j < (int)mem_ops[i].size() - imm[i]; j++)
         deps_available = MAX2(deps_available, mem_ops[i][j]);
   }

   if (instr->opcode == aco_opcode::s_endpgm) {
      for (unsigned i = 0; i < 512; i++)
         deps_available = MAX2(deps_available, reg_available[i]);
   } else if (program->gfx_level >= GFX10) {
      for (Operand& op : instr->operands) {
         if (op.isConstant() || op.isUndefined())
            continue;
         for (unsigned i = 0; i < op.size(); i++)
            deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
      }
   }

   if (program->gfx_level < GFX10)
      deps_available = align(deps_available, 4);

   return deps_available - cur_cycle;
}

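/* Instruction classes that execute on the vector (per-lane) hardware. */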
static bool
is_vector(aco_opcode op)
{
   switch (instr_info.classes[(int)op]) {
   case instr_class::valu32:
   case instr_class::valu_convert32:
   case instr_class::valu_fma:
   case instr_class::valu_double:
   case instr_class::valu_double_add:
   case instr_class::valu_double_convert:
   case instr_class::valu_double_transcendental:
   case instr_class::vmem:
   case instr_class::ds:
   case instr_class::exp:
   case instr_class::valu64:
   case instr_class::valu_quarter_rate32:
   case instr_class::valu_transcendental32: return true;
   default: return false;
   }
}

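/* Advance the block estimate by one instruction: wait for its dependencies
 * and resources, issue it (twice on wave64 when it cannot be dual-issued),
 * then record when its outstanding memory operations and results complete.
 */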
void
BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   cur_cycle += get_dependency_cost(instr);

   unsigned start;
   bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 &&
                     is_vector(instr->opcode) && !is_dual_issue_capable(*program, *instr) &&
                     program->workgroup_size > 32;
   for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
      cur_cycle += cycles_until_res_available(instr);

      start = cur_cycle;
      use_resources(instr);

      /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
      cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency;
   }

   wait_imm imm = get_wait_imm(program, instr);
   for (unsigned i = 0; i < wait_type_num; i++) {
      while (mem_ops[i].size() > imm[i])
         mem_ops[i].pop_front();
   }

   std::array<unsigned, wait_type_num> wait_info = get_wait_counter_info(program->gfx_level, instr);
   for (unsigned i = 0; i < wait_type_num; i++) {
      if (wait_info[i])
         mem_ops[i].push_back(cur_cycle + wait_info[i]);
   }

   /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
    * Before waitcnt insertion, this is necessary to consider memory operations.
    */
   unsigned latency = 0;
   for (unsigned i = 0; i < wait_type_num; i++)
      latency = MAX2(latency, i == wait_type_vs ? 0 : wait_info[i]);
   int32_t result_available = start + MAX2(perf.latency, (int32_t)latency);

   for (Definition& def : instr->definitions) {
      int32_t* available = &reg_available[def.physReg().reg()];
      for (unsigned i = 0; i < def.size(); i++)
         available[i] = MAX2(available[i], result_available);
   }
}

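/* Merge a predecessor's end-of-block state into this block's start state. */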
void
BlockCycleEstimator::join(const BlockCycleEstimator& pred)
{
   assert(cur_cycle == 0);

   for (unsigned i = 0; i < (unsigned)resource_count; i++) {
      assert(res_usage[i] == 0);
      res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
   }

   for (unsigned i = 0; i < 512; i++)
      reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);

   for (unsigned i = 0; i < wait_type_num; i++) {
      std::deque<int32_t>& ops = mem_ops[i];
      const std::deque<int32_t>& pred_ops = pred.mem_ops[i];
      for (unsigned j = 0; j < MIN2(ops.size(), pred_ops.size()); j++)
         ops.rbegin()[j] = MAX2(ops.rbegin()[j], pred_ops.rbegin()[j] - pred.cur_cycle);
      for (int j = pred_ops.size() - ops.size() - 1; j >= 0; j--)
         ops.push_front(pred_ops[j] - pred.cur_cycle);
   }
}

} /* end namespace */

/* sgpr_presched/vgpr_presched */
void
collect_presched_stats(Program* program)
{
   RegisterDemand presched_demand;
   for (Block& block : program->blocks)
      presched_demand.update(block.register_demand);
   program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr;
   program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr;
}

/* instructions/branches/vmem_clauses/smem_clauses/cycles */
void
collect_preasm_stats(Program* program)
{
   for (Block& block : program->blocks) {
      std::set<Instruction*> vmem_clause;
      std::set<Instruction*> smem_clause;

      program->statistics[aco_statistic_instructions] += block.instructions.size();

      for (aco_ptr<Instruction>& instr : block.instructions) {
         const bool is_branch =
            instr->isSOPP() && instr_info.classes[(int)instr->opcode] == instr_class::branch;
         if (is_branch)
            program->statistics[aco_statistic_branches]++;

         if (instr->isVALU() || instr->isVINTRP())
            program->statistics[aco_statistic_valu]++;
         if (instr->isSALU() && !instr->isSOPP() &&
             instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
            program->statistics[aco_statistic_salu]++;
         if (instr->isVOPD())
            program->statistics[aco_statistic_vopd]++;

         if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
             !instr->operands.empty()) {
            if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
                             [&](Instruction* other)
                             { return should_form_clause(instr.get(), other); }))
               program->statistics[aco_statistic_vmem_clauses]++;
            vmem_clause.insert(instr.get());

            program->statistics[aco_statistic_vmem]++;
         } else {
            vmem_clause.clear();
         }

         if (instr->isSMEM() && !instr->operands.empty()) {
            if (std::none_of(smem_clause.begin(), smem_clause.end(),
                             [&](Instruction* other)
                             { return should_form_clause(instr.get(), other); }))
               program->statistics[aco_statistic_smem_clauses]++;
            smem_clause.insert(instr.get());

            program->statistics[aco_statistic_smem]++;
         } else {
            smem_clause.clear();
         }
      }
   }

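   /* Estimate per-block cycle counts and resource usage, weighted below by
    * how often each block is expected to execute.
    */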
   double latency = 0;
   double usage[(int)BlockCycleEstimator::resource_count] = {0};
   std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);

   constexpr const unsigned vmem_latency = 320;
   for (const Definition def : program->args_pending_vmem) {
      blocks[0].mem_ops[wait_type_vm].push_back(vmem_latency);
      for (unsigned i = 0; i < def.size(); i++)
         blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency;
   }

   for (Block& block : program->blocks) {
      BlockCycleEstimator& block_est = blocks[block.index];
      for (unsigned pred : block.linear_preds)
         block_est.join(blocks[pred]);

      for (aco_ptr<Instruction>& instr : block.instructions) {
         unsigned before = block_est.cur_cycle;
         block_est.add(instr);
         instr->pass_flags = block_est.cur_cycle - before;
      }

      /* TODO: it would be nice to be able to consider estimated loop trip
       * counts used for loop unrolling.
       */

      /* TODO: estimate the trip_count of divergent loops (those which break
       * divergent) higher than of uniform loops
       */

      /* Assume loops execute 8, 4, or 2 times depending on nesting depth,
       * uniform branches are taken 50% of the time, and any lane in the wave
       * takes a side of a divergent branch 75% of the time.
       */
      double iter = 1.0;
      iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
      iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
      iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
      iter *= pow(0.5, block.uniform_if_depth);
      iter *= pow(0.75, block.divergent_if_logical_depth);

      bool divergent_if_linear_else =
         block.logical_preds.empty() && block.linear_preds.size() == 1 &&
         block.linear_succs.size() == 1 &&
         program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
      if (divergent_if_linear_else)
         iter *= 0.25;

      latency += block_est.cur_cycle * iter;
      for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
         usage[i] += block_est.res_usage[i] * iter;
   }

   /* This likely exaggerates the effectiveness of parallelism because it
    * ignores instruction ordering. It can assume there is SALU/VALU/etc work
    * from other waves to do while one wave is idle, but that might not be the
    * case because those other waves have not reached such a point yet.
    */

   double parallelism = program->num_waves;
   for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
      if (usage[i] > 0.0)
         parallelism = MIN2(parallelism, latency / usage[i]);
   }
   double waves_per_cycle = 1.0 / latency * parallelism;
   double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);

   double max_utilization = 1.0;
   if (program->workgroup_size != UINT_MAX)
      max_utilization =
         program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
   wave64_per_cycle *= max_utilization;

   program->statistics[aco_statistic_latency] = round(latency);
   program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle);

   if (debug_flags & DEBUG_PERF_INFO) {
      aco_print_program(program, stderr, print_no_ssa | print_perf_info);

      fprintf(stderr, "num_waves: %u\n", program->num_waves);
      fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
      fprintf(stderr, "branch_sendmsg_usage: %f\n",
              usage[(int)BlockCycleEstimator::branch_sendmsg]);
      fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
      fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
      fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
      fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
      fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
      fprintf(stderr, "latency: %f\n", latency);
      fprintf(stderr, "parallelism: %f\n", parallelism);
      fprintf(stderr, "max_utilization: %f\n", max_utilization);
      fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
      fprintf(stderr, "\n");
   }
}

void
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
{
   program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
}

Instruction_cycle_info
get_cycle_info(const Program& program, const Instruction& instr)
{
   perf_info info = get_perf_info(program, instr);
   return Instruction_cycle_info{(unsigned)info.latency, std::max(info.cost0, info.cost1)};
}

} // namespace aco