/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_ir.h"

#include "util/crc32.h"

#include <algorithm>
#include <deque>
#include <set>
#include <vector>

namespace aco {

/* sgpr_presched/vgpr_presched */
void
collect_presched_stats(Program* program)
{
   RegisterDemand presched_demand;
   for (Block& block : program->blocks)
      presched_demand.update(block.register_demand);
   program->statistics[aco_statistic_sgpr_presched] = presched_demand.sgpr;
   program->statistics[aco_statistic_vgpr_presched] = presched_demand.vgpr;
}

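/* Estimates the cycle at which each instruction of a block issues, assuming
 * in-order issue: cur_cycle is the current issue cycle, res_available[r] is
 * the cycle at which execution resource r becomes free again, res_usage[r]
 * accumulates busy cycles per resource, reg_available[i] is the cycle at
 * which the result in physical register i is ready, and the lgkm/exp/vm/vs
 * deques hold estimated completion cycles of outstanding operations tracked
 * by the corresponding wait counters.
 */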
class BlockCycleEstimator {
public:
   enum resource {
      null = 0,
      scalar,
      branch_sendmsg,
      valu,
      valu_complex,
      lds,
      export_gds,
      vmem,
      resource_count,
   };

   BlockCycleEstimator(Program* program_) : program(program_) {}

   Program* program;

   int32_t cur_cycle = 0;
   int32_t res_available[(int)BlockCycleEstimator::resource_count] = {0};
   unsigned res_usage[(int)BlockCycleEstimator::resource_count] = {0};
   int32_t reg_available[512] = {0};
   std::deque<int32_t> lgkm;
   std::deque<int32_t> exp;
   std::deque<int32_t> vm;
   std::deque<int32_t> vs;

   unsigned predict_cost(aco_ptr<Instruction>& instr);
   void add(aco_ptr<Instruction>& instr);
   void join(const BlockCycleEstimator& other);

private:
   unsigned get_waitcnt_cost(wait_imm imm);
   unsigned get_dependency_cost(aco_ptr<Instruction>& instr);

   void use_resources(aco_ptr<Instruction>& instr);
   int32_t cycles_until_res_available(aco_ptr<Instruction>& instr);
};

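/* Rough estimate of how many cycles pass until an instruction decrements
 * each of the vm/exp/lgkm/vs wait counters, i.e. its completion latency per
 * counter. Zero means the counter is not affected.
 */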
struct wait_counter_info {
   wait_counter_info(unsigned vm_, unsigned exp_, unsigned lgkm_, unsigned vs_)
       : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_)
   {}

   unsigned vm;
   unsigned exp;
   unsigned lgkm;
   unsigned vs;
};

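/* Issue latency of an instruction plus up to two execution resources it
 * occupies and the number of cycles each stays busy.
 */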
struct perf_info {
   int latency;

   BlockCycleEstimator::resource rsrc0;
   unsigned cost0;

   BlockCycleEstimator::resource rsrc1;
   unsigned cost1;
};

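/* Whether a wave64 VALU instruction can execute both halves in a single pass
 * on GFX11+; instructions that can't are modelled as being issued twice (see
 * BlockCycleEstimator::add).
 */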
static bool
is_dual_issue_capable(const Program& program, const Instruction& instr)
{
   if (program.gfx_level < GFX11 || !instr.isVALU() || instr.isDPP())
      return false;

   switch (instr.opcode) {
   case aco_opcode::v_fma_f32:
   case aco_opcode::v_fmac_f32:
   case aco_opcode::v_fmaak_f32:
   case aco_opcode::v_fmamk_f32:
   case aco_opcode::v_mul_f32:
   case aco_opcode::v_add_f32:
   case aco_opcode::v_sub_f32:
   case aco_opcode::v_subrev_f32:
   case aco_opcode::v_mul_legacy_f32:
   case aco_opcode::v_fma_legacy_f32:
   case aco_opcode::v_fmac_legacy_f32:
   case aco_opcode::v_fma_f16:
   case aco_opcode::v_fmac_f16:
   case aco_opcode::v_fmaak_f16:
   case aco_opcode::v_fmamk_f16:
   case aco_opcode::v_mul_f16:
   case aco_opcode::v_add_f16:
   case aco_opcode::v_sub_f16:
   case aco_opcode::v_subrev_f16:
   case aco_opcode::v_mov_b32:
   case aco_opcode::v_movreld_b32:
   case aco_opcode::v_movrels_b32:
   case aco_opcode::v_movrelsd_b32:
   case aco_opcode::v_movrelsd_2_b32:
   case aco_opcode::v_cndmask_b32:
   case aco_opcode::v_writelane_b32_e64:
   case aco_opcode::v_mov_b16:
   case aco_opcode::v_cndmask_b16:
   case aco_opcode::v_max_f32:
   case aco_opcode::v_min_f32:
   case aco_opcode::v_max_f16:
   case aco_opcode::v_min_f16:
   case aco_opcode::v_max_i16_e64:
   case aco_opcode::v_min_i16_e64:
   case aco_opcode::v_max_u16_e64:
   case aco_opcode::v_min_u16_e64:
   case aco_opcode::v_add_i16:
   case aco_opcode::v_sub_i16:
   case aco_opcode::v_mad_i16:
   case aco_opcode::v_add_u16_e64:
   case aco_opcode::v_sub_u16_e64:
   case aco_opcode::v_mad_u16:
   case aco_opcode::v_mul_lo_u16_e64:
   case aco_opcode::v_not_b16:
   case aco_opcode::v_and_b16:
   case aco_opcode::v_or_b16:
   case aco_opcode::v_xor_b16:
   case aco_opcode::v_lshrrev_b16_e64:
   case aco_opcode::v_ashrrev_i16_e64:
   case aco_opcode::v_lshlrev_b16_e64:
   case aco_opcode::v_dot2_bf16_bf16:
   case aco_opcode::v_dot2_f32_bf16:
   case aco_opcode::v_dot2_f16_f16:
   case aco_opcode::v_dot2_f32_f16:
   case aco_opcode::v_dot2c_f32_f16: return true;
   case aco_opcode::v_fma_mix_f32:
   case aco_opcode::v_fma_mixlo_f16:
   case aco_opcode::v_fma_mixhi_f16: {
      /* dst and acc type must match */
      if (instr.valu().opsel_hi[2] == (instr.opcode == aco_opcode::v_fma_mix_f32))
         return false;

      /* If all operands are vgprs, two must be the same. */
      for (unsigned i = 0; i < 3; i++) {
         if (instr.operands[i].isConstant() || instr.operands[i].isOfType(RegType::sgpr))
            return true;
         for (unsigned j = 0; j < i; j++) {
            if (instr.operands[i].physReg() == instr.operands[j].physReg())
               return true;
         }
      }
      return false;
   }
   default: return false;
   }
}

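/* Maps an instruction to its issue latency and the execution resources it
 * occupies, based on its instruction class and the target gfx level.
 */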
static perf_info
get_perf_info(const Program& program, const Instruction& instr)
{
   instr_class cls = instr_info.classes[(int)instr.opcode];

#define WAIT(res)          BlockCycleEstimator::res, 0
#define WAIT_USE(res, cnt) BlockCycleEstimator::res, cnt

   if (program.gfx_level >= GFX10) {
      /* fp64 might be incorrect */
      switch (cls) {
      case instr_class::valu32:
      case instr_class::valu_convert32:
      case instr_class::valu_fma: return {5, WAIT_USE(valu, 1)};
      case instr_class::valu64: return {6, WAIT_USE(valu, 2), WAIT_USE(valu_complex, 2)};
      case instr_class::valu_quarter_rate32:
         return {8, WAIT_USE(valu, 4), WAIT_USE(valu_complex, 4)};
      case instr_class::valu_transcendental32:
         return {10, WAIT_USE(valu, 1), WAIT_USE(valu_complex, 4)};
      case instr_class::valu_double: return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_add:
         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_convert:
         return {22, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::valu_double_transcendental:
         return {24, WAIT_USE(valu, 16), WAIT_USE(valu_complex, 16)};
      case instr_class::salu: return {2, WAIT_USE(scalar, 1)};
      case instr_class::smem: return {0, WAIT_USE(scalar, 1)};
      case instr_class::branch:
      case instr_class::sendmsg: return {0, WAIT_USE(branch_sendmsg, 1)};
      case instr_class::ds:
         return instr.isDS() && instr.ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)}
                                               : perf_info{0, WAIT_USE(lds, 1)};
      case instr_class::exp: return {0, WAIT_USE(export_gds, 1)};
      case instr_class::vmem: return {0, WAIT_USE(vmem, 1)};
      case instr_class::wmma: {
         /* int8 and (b)f16 have the same performance. */
         uint8_t cost = instr.opcode == aco_opcode::v_wmma_i32_16x16x16_iu4 ? 16 : 32;
         return {cost, WAIT_USE(valu, cost)};
      }
      case instr_class::barrier:
      case instr_class::waitcnt:
      case instr_class::other:
      default: return {0};
      }
   } else {
      switch (cls) {
      case instr_class::valu32: return {4, WAIT_USE(valu, 4)};
      case instr_class::valu_convert32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu64: return {8, WAIT_USE(valu, 8)};
      case instr_class::valu_quarter_rate32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_fma:
         return program.dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)}
                                           : perf_info{16, WAIT_USE(valu, 16)};
      case instr_class::valu_transcendental32: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_double: return {64, WAIT_USE(valu, 64)};
      case instr_class::valu_double_add: return {32, WAIT_USE(valu, 32)};
      case instr_class::valu_double_convert: return {16, WAIT_USE(valu, 16)};
      case instr_class::valu_double_transcendental: return {64, WAIT_USE(valu, 64)};
      case instr_class::salu: return {4, WAIT_USE(scalar, 4)};
      case instr_class::smem: return {4, WAIT_USE(scalar, 4)};
      case instr_class::branch: return {8, WAIT_USE(branch_sendmsg, 8)};
      case instr_class::ds:
         return instr.isDS() && instr.ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)}
                                               : perf_info{4, WAIT_USE(lds, 4)};
      case instr_class::exp: return {16, WAIT_USE(export_gds, 16)};
      case instr_class::vmem: return {4, WAIT_USE(vmem, 4)};
      case instr_class::barrier:
      case instr_class::waitcnt:
      case instr_class::other:
      default: return {4};
      }
   }

#undef WAIT_USE
#undef WAIT
}

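/* Marks the resources instr uses as busy until their cost has elapsed and
 * accumulates per-resource usage.
 */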
void
BlockCycleEstimator::use_resources(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   if (perf.rsrc0 != resource_count) {
      res_available[(int)perf.rsrc0] = cur_cycle + perf.cost0;
      res_usage[(int)perf.rsrc0] += perf.cost0;
   }

   if (perf.rsrc1 != resource_count) {
      res_available[(int)perf.rsrc1] = cur_cycle + perf.cost1;
      res_usage[(int)perf.rsrc1] += perf.cost1;
   }
}

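/* Cycles from cur_cycle until every resource instr needs is free again. */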
int32_t
BlockCycleEstimator::cycles_until_res_available(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   int32_t cost = 0;
   if (perf.rsrc0 != resource_count)
      cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle);
   if (perf.rsrc1 != resource_count)
      cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle);

   return cost;
}

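/* Rough per-counter completion latency of an instruction, i.e. how long it
 * takes until the instruction would decrement vm/exp/lgkm/vs.
 */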
static wait_counter_info
get_wait_counter_info(aco_ptr<Instruction>& instr)
{
   /* These numbers are all a bit nonsense. LDS/VMEM/SMEM/EXP performance
    * depends a lot on the situation. */

   if (instr->isEXP())
      return wait_counter_info(0, 16, 0, 0);

   if (instr->isFlatLike()) {
      unsigned lgkm = instr->isFlat() ? 20 : 0;
      if (!instr->definitions.empty())
         return wait_counter_info(320, 0, lgkm, 0);
      else
         return wait_counter_info(0, 0, lgkm, 320);
   }

   if (instr->isSMEM()) {
      if (instr->definitions.empty())
         return wait_counter_info(0, 0, 200, 0);
      if (instr->operands.empty()) /* s_memtime and s_memrealtime */
         return wait_counter_info(0, 0, 1, 0);

      bool likely_desc_load = instr->operands[0].size() == 2;
      bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4);
      bool const_offset =
         instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant());

      if (likely_desc_load || const_offset)
         return wait_counter_info(0, 0, 30, 0); /* likely to hit L0 cache */

      return wait_counter_info(0, 0, 200, 0);
   }

   if (instr->format == Format::DS)
      return wait_counter_info(0, 0, 20, 0);

   if (instr->isVMEM() && !instr->definitions.empty())
      return wait_counter_info(320, 0, 0, 0);

   if (instr->isVMEM() && instr->definitions.empty())
      return wait_counter_info(0, 0, 0, 320);

   return wait_counter_info(0, 0, 0, 0);
}

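/* The waitcnt effectively enforced when issuing this instruction: explicit
 * for s_waitcnt/s_waitcnt_vscnt/s_endpgm, otherwise the hardware counter
 * limit for each counter the instruction increments (modelling the stall
 * when a counter would otherwise overflow).
 */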
static wait_imm
get_wait_imm(Program* program, aco_ptr<Instruction>& instr)
{
   if (instr->opcode == aco_opcode::s_endpgm) {
      return wait_imm(0, 0, 0, 0);
   } else if (instr->opcode == aco_opcode::s_waitcnt) {
      return wait_imm(GFX10_3, instr->sopp().imm);
   } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) {
      return wait_imm(0, 0, 0, instr->sopk().imm);
   } else {
      unsigned max_lgkm_cnt = program->gfx_level >= GFX10 ? 62 : 14;
      unsigned max_exp_cnt = 6;
      unsigned max_vm_cnt = program->gfx_level >= GFX9 ? 62 : 14;
      unsigned max_vs_cnt = 62;

      wait_counter_info wait_info = get_wait_counter_info(instr);
      wait_imm imm;
      imm.lgkm = wait_info.lgkm ? max_lgkm_cnt : wait_imm::unset_counter;
      imm.exp = wait_info.exp ? max_exp_cnt : wait_imm::unset_counter;
      imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter;
      imm.vs = wait_info.vs ? max_vs_cnt : wait_imm::unset_counter;
      return imm;
   }
}

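/* Cycles until the instruction's dependencies are satisfied: the outstanding
 * operations covered by its (implicit or explicit) waitcnt and, on GFX10+,
 * the availability of its source registers.
 */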
unsigned
BlockCycleEstimator::get_dependency_cost(aco_ptr<Instruction>& instr)
{
   int deps_available = cur_cycle;

   wait_imm imm = get_wait_imm(program, instr);
   if (imm.vm != wait_imm::unset_counter) {
      for (int i = 0; i < (int)vm.size() - imm.vm; i++)
         deps_available = MAX2(deps_available, vm[i]);
   }
   if (imm.exp != wait_imm::unset_counter) {
      for (int i = 0; i < (int)exp.size() - imm.exp; i++)
         deps_available = MAX2(deps_available, exp[i]);
   }
   if (imm.lgkm != wait_imm::unset_counter) {
      for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++)
         deps_available = MAX2(deps_available, lgkm[i]);
   }
   if (imm.vs != wait_imm::unset_counter) {
      for (int i = 0; i < (int)vs.size() - imm.vs; i++)
         deps_available = MAX2(deps_available, vs[i]);
   }

   if (instr->opcode == aco_opcode::s_endpgm) {
      for (unsigned i = 0; i < 512; i++)
         deps_available = MAX2(deps_available, reg_available[i]);
   } else if (program->gfx_level >= GFX10) {
      for (Operand& op : instr->operands) {
         if (op.isConstant() || op.isUndefined())
            continue;
         for (unsigned i = 0; i < op.size(); i++)
            deps_available = MAX2(deps_available, reg_available[op.physReg().reg() + i]);
      }
   }

   if (program->gfx_level < GFX10)
      deps_available = align(deps_available, 4);

   return deps_available - cur_cycle;
}

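/* Estimated stall before instr could issue: the later of its dependency wait
 * and the cycles until its resources are free. Does not modify any state.
 */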
unsigned
BlockCycleEstimator::predict_cost(aco_ptr<Instruction>& instr)
{
   int32_t dep = get_dependency_cost(instr);
   return dep + std::max(cycles_until_res_available(instr) - dep, 0);
}

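/* Whether the instruction executes on vector hardware (VALU/VMEM/LDS/export),
 * which issues wave64 instructions as two wave32 halves.
 */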
static bool
is_vector(aco_opcode op)
{
   switch (instr_info.classes[(int)op]) {
   case instr_class::valu32:
   case instr_class::valu_convert32:
   case instr_class::valu_fma:
   case instr_class::valu_double:
   case instr_class::valu_double_add:
   case instr_class::valu_double_convert:
   case instr_class::valu_double_transcendental:
   case instr_class::vmem:
   case instr_class::ds:
   case instr_class::exp:
   case instr_class::valu64:
   case instr_class::valu_quarter_rate32:
   case instr_class::valu_transcendental32: return true;
   default: return false;
   }
}

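/* Advances the estimator past instr: waits for its dependencies and
 * resources, occupies those resources (twice for wave64 vector instructions
 * that can't be dual issued), retires the counter entries the instruction
 * waits on, records its own outstanding operations and notes when its
 * definitions become available.
 */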
void
BlockCycleEstimator::add(aco_ptr<Instruction>& instr)
{
   perf_info perf = get_perf_info(*program, *instr);

   cur_cycle += get_dependency_cost(instr);

   unsigned start;
   bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 &&
                     is_vector(instr->opcode) && !is_dual_issue_capable(*program, *instr) &&
                     program->workgroup_size > 32;
   for (unsigned i = 0; i < (dual_issue ? 2 : 1); i++) {
      cur_cycle += cycles_until_res_available(instr);

      start = cur_cycle;
      use_resources(instr);

      /* GCN is in-order and doesn't begin the next instruction until the current one finishes */
      cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency;
   }

   wait_imm imm = get_wait_imm(program, instr);
   while (lgkm.size() > imm.lgkm)
      lgkm.pop_front();
   while (exp.size() > imm.exp)
      exp.pop_front();
   while (vm.size() > imm.vm)
      vm.pop_front();
   while (vs.size() > imm.vs)
      vs.pop_front();

   wait_counter_info wait_info = get_wait_counter_info(instr);
   if (wait_info.exp)
      exp.push_back(cur_cycle + wait_info.exp);
   if (wait_info.lgkm)
      lgkm.push_back(cur_cycle + wait_info.lgkm);
   if (wait_info.vm)
      vm.push_back(cur_cycle + wait_info.vm);
   if (wait_info.vs)
      vs.push_back(cur_cycle + wait_info.vs);

   /* This is inaccurate but shouldn't affect anything after waitcnt insertion.
    * Before waitcnt insertion, it is needed so that memory operations are
    * taken into account.
    */
   int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm);
   int32_t result_available = start + MAX2(perf.latency, latency);

   for (Definition& def : instr->definitions) {
      int32_t* available = &reg_available[def.physReg().reg()];
      for (unsigned i = 0; i < def.size(); i++)
         available[i] = MAX2(available[i], result_available);
   }
}

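/* Merges a predecessor's queue of outstanding operations into this block's
 * queue, rebasing the predecessor's cycles by cycle_diff.
 */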
static void
join_queue(std::deque<int32_t>& queue, const std::deque<int32_t>& pred, int cycle_diff)
{
   for (unsigned i = 0; i < MIN2(queue.size(), pred.size()); i++)
      queue.rbegin()[i] = MAX2(queue.rbegin()[i], pred.rbegin()[i] + cycle_diff);
   for (int i = pred.size() - queue.size() - 1; i >= 0; i--)
      queue.push_front(pred[i] + cycle_diff);
}

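/* Merges a predecessor's end-of-block state (resource busy times, register
 * availability, outstanding operations) into this block's start state.
 */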
void
BlockCycleEstimator::join(const BlockCycleEstimator& pred)
{
   assert(cur_cycle == 0);

   for (unsigned i = 0; i < (unsigned)resource_count; i++) {
      assert(res_usage[i] == 0);
      res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle);
   }

   for (unsigned i = 0; i < 512; i++)
      reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle);

   join_queue(lgkm, pred.lgkm, -pred.cur_cycle);
   join_queue(exp, pred.exp, -pred.cur_cycle);
   join_queue(vm, pred.vm, -pred.cur_cycle);
   join_queue(vs, pred.vs, -pred.cur_cycle);
}

/* instructions/branches/vmem_clauses/smem_clauses/cycles */
void
collect_preasm_stats(Program* program)
{
   for (Block& block : program->blocks) {
      std::set<Instruction*> vmem_clause;
      std::set<Instruction*> smem_clause;

      program->statistics[aco_statistic_instructions] += block.instructions.size();

      for (aco_ptr<Instruction>& instr : block.instructions) {
         bool is_branch = instr->isSOPP() && instr->sopp().block != -1;
         if (is_branch)
            program->statistics[aco_statistic_branches]++;

         if (instr->isVALU() || instr->isVINTRP())
            program->statistics[aco_statistic_valu]++;
         if (instr->isSALU() && !instr->isSOPP() &&
             instr_info.classes[(int)instr->opcode] != instr_class::waitcnt)
            program->statistics[aco_statistic_salu]++;
         if (instr->isVOPD())
            program->statistics[aco_statistic_vopd]++;

         if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) &&
             !instr->operands.empty()) {
            if (std::none_of(vmem_clause.begin(), vmem_clause.end(),
                             [&](Instruction* other)
                             { return should_form_clause(instr.get(), other); }))
               program->statistics[aco_statistic_vmem_clauses]++;
            vmem_clause.insert(instr.get());

            program->statistics[aco_statistic_vmem]++;
         } else {
            vmem_clause.clear();
         }

         if (instr->isSMEM() && !instr->operands.empty()) {
            if (std::none_of(smem_clause.begin(), smem_clause.end(),
                             [&](Instruction* other)
                             { return should_form_clause(instr.get(), other); }))
               program->statistics[aco_statistic_smem_clauses]++;
            smem_clause.insert(instr.get());

            program->statistics[aco_statistic_smem]++;
         } else {
            smem_clause.clear();
         }
      }
   }

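   /* Estimate latency and resource usage per block with BlockCycleEstimator,
    * weighting each block by its expected execution count.
    */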
   double latency = 0;
   double usage[(int)BlockCycleEstimator::resource_count] = {0};
   std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program);

   constexpr const unsigned vmem_latency = 320;
   for (const Definition def : program->args_pending_vmem) {
      blocks[0].vm.push_back(vmem_latency);
      for (unsigned i = 0; i < def.size(); i++)
         blocks[0].reg_available[def.physReg().reg() + i] = vmem_latency;
   }

   for (Block& block : program->blocks) {
      BlockCycleEstimator& block_est = blocks[block.index];
      for (unsigned pred : block.linear_preds)
         block_est.join(blocks[pred]);

      for (aco_ptr<Instruction>& instr : block.instructions) {
         unsigned before = block_est.cur_cycle;
         block_est.add(instr);
         instr->pass_flags = block_est.cur_cycle - before;
      }

      /* TODO: it would be nice to be able to consider estimated loop trip
       * counts used for loop unrolling.
       */

      /* TODO: estimate the trip_count of divergent loops (those which break
       * divergent) higher than of uniform loops
       */

      /* Assume the outermost loop executes 8 times, the next level 4 times and
       * each deeper level 2 times, that uniform branches are taken 50% of the
       * time, and that any lane in the wave takes a side of a divergent branch
       * 75% of the time.
       */
      double iter = 1.0;
      iter *= block.loop_nest_depth > 0 ? 8.0 : 1.0;
      iter *= block.loop_nest_depth > 1 ? 4.0 : 1.0;
      iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0;
      iter *= pow(0.5, block.uniform_if_depth);
      iter *= pow(0.75, block.divergent_if_logical_depth);

      bool divergent_if_linear_else =
         block.logical_preds.empty() && block.linear_preds.size() == 1 &&
         block.linear_succs.size() == 1 &&
         program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert);
      if (divergent_if_linear_else)
         iter *= 0.25;

      latency += block_est.cur_cycle * iter;
      for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++)
         usage[i] += block_est.res_usage[i] * iter;
   }

   /* This likely exaggerates the effectiveness of parallelism because it
    * ignores instruction ordering. It assumes there might be SALU/VALU/etc.
    * work to do from other waves while one is idle, but that might not be the
    * case because those other waves have not reached such a point yet.
    */

   double parallelism = program->num_waves;
   for (unsigned i = 0; i < (unsigned)BlockCycleEstimator::resource_count; i++) {
      if (usage[i] > 0.0)
         parallelism = MIN2(parallelism, latency / usage[i]);
   }
   double waves_per_cycle = 1.0 / latency * parallelism;
   double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0);

   double max_utilization = 1.0;
   if (program->workgroup_size != UINT_MAX)
      max_utilization =
         program->workgroup_size / (double)align(program->workgroup_size, program->wave_size);
   wave64_per_cycle *= max_utilization;

   program->statistics[aco_statistic_latency] = round(latency);
   program->statistics[aco_statistic_inv_throughput] = round(1.0 / wave64_per_cycle);

   if (debug_flags & DEBUG_PERF_INFO) {
      aco_print_program(program, stderr, print_no_ssa | print_perf_info);

      fprintf(stderr, "num_waves: %u\n", program->num_waves);
      fprintf(stderr, "salu_smem_usage: %f\n", usage[(int)BlockCycleEstimator::scalar]);
      fprintf(stderr, "branch_sendmsg_usage: %f\n",
              usage[(int)BlockCycleEstimator::branch_sendmsg]);
      fprintf(stderr, "valu_usage: %f\n", usage[(int)BlockCycleEstimator::valu]);
      fprintf(stderr, "valu_complex_usage: %f\n", usage[(int)BlockCycleEstimator::valu_complex]);
      fprintf(stderr, "lds_usage: %f\n", usage[(int)BlockCycleEstimator::lds]);
      fprintf(stderr, "export_gds_usage: %f\n", usage[(int)BlockCycleEstimator::export_gds]);
      fprintf(stderr, "vmem_usage: %f\n", usage[(int)BlockCycleEstimator::vmem]);
      fprintf(stderr, "latency: %f\n", latency);
      fprintf(stderr, "parallelism: %f\n", parallelism);
      fprintf(stderr, "max_utilization: %f\n", max_utilization);
      fprintf(stderr, "wave64_per_cycle: %f\n", wave64_per_cycle);
      fprintf(stderr, "\n");
   }
}

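/* hash */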
void
collect_postasm_stats(Program* program, const std::vector<uint32_t>& code)
{
   program->statistics[aco_statistic_hash] = util_hash_crc32(code.data(), code.size() * 4);
}

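/* Latency and maximum resource cost of a single instruction. */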
Instruction_cycle_info
get_cycle_info(const Program& program, const Instruction& instr)
{
   perf_info info = get_perf_info(program, instr);
   return Instruction_cycle_info{(unsigned)info.latency, std::max(info.cost0, info.cost1)};
}

} // namespace aco