Lines Matching +full:vm +full:- +full:other
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
41 for (Block& block : program->blocks) in collect_presched_stats()
43 program->statistics[statistic_sgpr_presched] = presched_demand.sgpr; in collect_presched_stats()
44 program->statistics[statistic_vgpr_presched] = presched_demand.vgpr; in collect_presched_stats()
71 std::deque<int32_t> vm; member in aco::BlockCycleEstimator
76 void join(const BlockCycleEstimator& other);
88 : vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) in wait_counter_info()
91 unsigned vm; member
110 instr_class cls = instr_info.classes[(int)instr->opcode]; in get_perf_info()
115 if (program->gfx_level >= GFX10) { in get_perf_info()
138 return instr->ds().gds ? perf_info{0, WAIT_USE(export_gds, 1)} in get_perf_info()
144 case instr_class::other: in get_perf_info()
154 return program->dev.has_fast_fma32 ? perf_info{4, WAIT_USE(valu, 4)} in get_perf_info()
167 return instr->ds().gds ? perf_info{4, WAIT_USE(export_gds, 4)} in get_perf_info()
173 case instr_class::other: in get_perf_info()
205 cost = MAX2(cost, res_available[(int)perf.rsrc0] - cur_cycle); in cycles_until_res_available()
207 cost = MAX2(cost, res_available[(int)perf.rsrc1] - cur_cycle); in cycles_until_res_available()
218 if (instr->isEXP()) in get_wait_counter_info()
221 if (instr->isFlatLike()) { in get_wait_counter_info()
222 unsigned lgkm = instr->isFlat() ? 20 : 0; in get_wait_counter_info()
223 if (!instr->definitions.empty()) in get_wait_counter_info()
229 if (instr->isSMEM()) { in get_wait_counter_info()
230 if (instr->definitions.empty()) in get_wait_counter_info()
232 if (instr->operands.empty()) /* s_memtime and s_memrealtime */ in get_wait_counter_info()
235 bool likely_desc_load = instr->operands[0].size() == 2; in get_wait_counter_info()
236 bool soe = instr->operands.size() >= (!instr->definitions.empty() ? 3 : 4); in get_wait_counter_info()
238 instr->operands[1].isConstant() && (!soe || instr->operands.back().isConstant()); in get_wait_counter_info()
246 if (instr->format == Format::DS) in get_wait_counter_info()
249 if (instr->isVMEM() && !instr->definitions.empty()) in get_wait_counter_info()
252 if (instr->isVMEM() && instr->definitions.empty()) in get_wait_counter_info()
261 if (instr->opcode == aco_opcode::s_endpgm) { in get_wait_imm()
263 } else if (instr->opcode == aco_opcode::s_waitcnt) { in get_wait_imm()
264 return wait_imm(GFX10_3, instr->sopp().imm); in get_wait_imm()
265 } else if (instr->opcode == aco_opcode::s_waitcnt_vscnt) { in get_wait_imm()
266 return wait_imm(0, 0, 0, instr->sopk().imm); in get_wait_imm()
268 unsigned max_lgkm_cnt = program->gfx_level >= GFX10 ? 62 : 14; in get_wait_imm()
270 unsigned max_vm_cnt = program->gfx_level >= GFX9 ? 62 : 14; in get_wait_imm()
277 imm.vm = wait_info.vm ? max_vm_cnt : wait_imm::unset_counter; in get_wait_imm()
289 if (imm.vm != wait_imm::unset_counter) { in get_dependency_cost()
290 for (int i = 0; i < (int)vm.size() - imm.vm; i++) in get_dependency_cost()
291 deps_available = MAX2(deps_available, vm[i]); in get_dependency_cost()
294 for (int i = 0; i < (int)exp.size() - imm.exp; i++) in get_dependency_cost()
298 for (int i = 0; i < (int)lgkm.size() - imm.lgkm; i++) in get_dependency_cost()
302 for (int i = 0; i < (int)vs.size() - imm.vs; i++) in get_dependency_cost()
306 if (instr->opcode == aco_opcode::s_endpgm) { in get_dependency_cost()
309 } else if (program->gfx_level >= GFX10) { in get_dependency_cost()
310 for (Operand& op : instr->operands) { in get_dependency_cost()
318 if (program->gfx_level < GFX10) in get_dependency_cost()
321 return deps_available - cur_cycle; in get_dependency_cost()
328 return dep + std::max(cycles_until_res_available(instr) - dep, 0); in predict_cost()
360 bool dual_issue = program->gfx_level >= GFX10 && program->wave_size == 64 && in add()
361 is_vector(instr->opcode) && program->workgroup_size > 32; in add()
368 /* GCN is in-order and doesn't begin the next instruction until the current one finishes */ in add()
369 cur_cycle += program->gfx_level >= GFX10 ? 1 : perf.latency; in add()
377 while (vm.size() > imm.vm) in add()
378 vm.pop_front(); in add()
387 if (wait_info.vm) in add()
388 vm.push_back(cur_cycle + wait_info.vm); in add()
395 int latency = MAX3(wait_info.exp, wait_info.lgkm, wait_info.vm); in add()
398 for (Definition& def : instr->definitions) { in add()
410 for (int i = pred.size() - queue.size() - 1; i >= 0; i--) in join_queue()
421 res_available[i] = MAX2(res_available[i], pred.res_available[i] - pred.cur_cycle); in join()
425 reg_available[i] = MAX2(reg_available[i], pred.reg_available[i] - pred.cur_cycle + cur_cycle); in join()
427 join_queue(lgkm, pred.lgkm, -pred.cur_cycle); in join()
428 join_queue(exp, pred.exp, -pred.cur_cycle); in join()
429 join_queue(vm, pred.vm, -pred.cur_cycle); in join()
430 join_queue(vs, pred.vs, -pred.cur_cycle); in join()
437 for (Block& block : program->blocks) { in collect_preasm_stats()
441 program->statistics[statistic_instructions] += block.instructions.size(); in collect_preasm_stats()
444 if (instr->isSOPP() && instr->sopp().block != -1) in collect_preasm_stats()
445 program->statistics[statistic_branches]++; in collect_preasm_stats()
447 if (instr->opcode == aco_opcode::p_constaddr) in collect_preasm_stats()
448 program->statistics[statistic_instructions] += 2; in collect_preasm_stats()
450 if ((instr->isVMEM() || instr->isScratch() || instr->isGlobal()) && in collect_preasm_stats()
451 !instr->operands.empty()) { in collect_preasm_stats()
453 [&](Instruction* other) in collect_preasm_stats()
454 { return should_form_clause(instr.get(), other); })) in collect_preasm_stats()
455 program->statistics[statistic_vmem_clauses]++; in collect_preasm_stats()
461 if (instr->isSMEM() && !instr->operands.empty()) { in collect_preasm_stats()
463 [&](Instruction* other) in collect_preasm_stats() argument
464 { return should_form_clause(instr.get(), other); })) in collect_preasm_stats() argument
465 program->statistics[statistic_smem_clauses]++; in collect_preasm_stats()
475 std::vector<BlockCycleEstimator> blocks(program->blocks.size(), program); in collect_preasm_stats()
477 if (program->stage.has(SWStage::VS) && program->info.vs.has_prolog) { in collect_preasm_stats()
479 for (Definition def : program->vs_inputs) { in collect_preasm_stats()
480 blocks[0].vm.push_back(vs_input_latency); in collect_preasm_stats()
486 for (Block& block : program->blocks) { in collect_preasm_stats()
494 instr->pass_flags = block_est.cur_cycle - before; in collect_preasm_stats()
505 /* Assume loops execute 8-2 times, uniform branches are taken 50% the time, in collect_preasm_stats()
512 iter *= block.loop_nest_depth > 2 ? pow(2.0, block.loop_nest_depth - 2) : 1.0; in collect_preasm_stats()
519 program->blocks[block.linear_preds[0]].kind & (block_kind_branch | block_kind_invert); in collect_preasm_stats()
530 * work to from other waves while one is idle but that might not be the case in collect_preasm_stats()
531 * because those other waves have not reached such a point yet. in collect_preasm_stats()
534 double parallelism = program->num_waves; in collect_preasm_stats()
540 double wave64_per_cycle = waves_per_cycle * (program->wave_size / 64.0); in collect_preasm_stats()
543 if (program->workgroup_size != UINT_MAX) in collect_preasm_stats()
545 program->workgroup_size / (double)align(program->workgroup_size, program->wave_size); in collect_preasm_stats()
548 program->statistics[statistic_latency] = round(latency); in collect_preasm_stats()
549 program->statistics[statistic_inv_throughput] = round(1.0 / wave64_per_cycle); in collect_preasm_stats()
554 fprintf(stderr, "num_waves: %u\n", program->num_waves); in collect_preasm_stats()
574 program->statistics[aco::statistic_hash] = util_hash_crc32(code.data(), code.size() * 4); in collect_postasm_stats()