1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30 #include "brw_fs_live_variables.h"
31 #include "brw_cfg.h"
32 #include <new>
33
34 using namespace brw;
35
36 /** @file
37 *
38 * List scheduling of FS instructions.
39 *
40 * The basic model of the list scheduler is to take a basic block,
41 * compute a DAG of the dependencies (RAW ordering with latency, WAW
42 * ordering with latency, WAR ordering), and make a list of the DAG heads.
43 * Heuristically pick a DAG head, then put all the children that are
44 * now DAG heads into the list of things to schedule.
45 *
46 * The heuristic is the important part. We're trying to be cheap,
47 * since actually computing the optimal scheduling is NP-complete.
48 * What we do is track a "current clock". When we schedule a node, we
49 * update the earliest-unblocked clock time of its children, and
50 * increment the clock. Then, when trying to schedule, we just pick
51 * the earliest-unblocked instruction to schedule.
52 *
53 * Note that often there will be many things which could execute
54 * immediately, and there is a range of heuristic options to choose
55 * from in picking among those.
56 */
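
/* A small worked example of the heuristic (hypothetical latencies, not the
 * measured numbers used below): suppose A and B are both DAG heads, A
 * unblocked at clock 0 and B at clock 4. We schedule A, add A's issue time
 * to the clock, and bump the unblocked time of each of A's children by the
 * edge's effective latency; children whose parents are now all scheduled
 * join the candidate list. The next pick is again whichever candidate has
 * the earliest unblocked time.
 */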
57
58 static bool debug = false;
59
60 struct schedule_node_child;
61
62 class schedule_node : public exec_node
63 {
64 public:
65 void set_latency(const struct brw_isa_info *isa);
66
67 fs_inst *inst;
68 schedule_node_child *children;
69 int children_count;
70 int children_cap;
71 int initial_parent_count;
72 int initial_unblocked_time;
73 int latency;
74
75 /**
76 * This is the sum of the instruction's latency plus the maximum delay of
77 * its children, or just the issue_time if it's a leaf node.
78 */
79 int delay;
80
81 /**
82 * Preferred exit node among the (direct or indirect) successors of this
83 * node. Among the scheduler nodes blocked by this node, this will be the
84 * one that may cause earliest program termination, or NULL if none of the
85 * successors is an exit node.
86 */
87 schedule_node *exit;
88
89 /**
90 * How many cycles this instruction takes to issue.
91 *
92 * Instructions in gen hardware are handled one SIMD4 vector at a time,
93 * with 1 cycle per vector dispatched. Thus SIMD8 pixel shaders take 2
94 * cycles to dispatch and SIMD16 (compressed) instructions take 4.
95 */
96 int issue_time;
97
98 /**
99 * Scheduler nodes that wrote the address register values read by this
100 * instruction (tracked to speed up dependency checks).
101 */
102 schedule_node **address_read;
103 int address_read_count;
104 int address_read_cap;
105
106 /* Temporary data used during the scheduling process. */
107 struct {
108 int parent_count;
109 int unblocked_time;
110
111 /**
112 * Which iteration of pushing groups of children onto the candidates list
113 * this node was a part of.
114 */
115 unsigned cand_generation;
116 } tmp;
117 };
118
119 struct schedule_node_child {
120 schedule_node *n;
121 int effective_latency;
122 };
123
124 static inline void
125 reset_node_tmp(schedule_node *n)
126 {
127 n->tmp.parent_count = n->initial_parent_count;
128 n->tmp.unblocked_time = n->initial_unblocked_time;
129 n->tmp.cand_generation = 0;
130 }
131
132 /**
133 * Lower bound of the scheduling time after which one of the instructions
134 * blocked by this node may lead to program termination.
135 *
136 * exit_unblocked_time() determines a strict partial ordering relation '«' on
137 * the set of scheduler nodes as follows:
138 *
139 * n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
140 *
141 * which can be used to heuristically order nodes according to how early they
142 * can unblock an exit node and lead to program termination.
143 */
144 static inline int
145 exit_tmp_unblocked_time(const schedule_node *n)
146 {
147 return n->exit ? n->exit->tmp.unblocked_time : INT_MAX;
148 }
149
150 static inline int
151 exit_initial_unblocked_time(const schedule_node *n)
152 {
153 return n->exit ? n->exit->initial_unblocked_time : INT_MAX;
154 }
155
156 void
157 schedule_node::set_latency(const struct brw_isa_info *isa)
158 {
159 switch (inst->opcode) {
160 case BRW_OPCODE_MAD:
161 /* 2 cycles
162 * (since the last two src operands are in different register banks):
163 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
164 *
165 * 3 cycles on IVB, 4 on HSW
166 * (since the last two src operands are in the same register bank):
167 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
168 *
169 * 18 cycles on IVB, 16 on HSW
170 * (since the last two src operands are in different register banks):
171 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
172 * mov(8) null g4<4,5,1>F { align16 WE_normal 1Q };
173 *
174 * 20 cycles on IVB, 18 on HSW
175 * (since the last two src operands are in the same register bank):
176 * mad(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
177 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
178 */
179
180 /* Our register allocator doesn't know about register banks, so use the
181 * higher latency.
182 */
183 latency = 18;
184 break;
185
186 case BRW_OPCODE_LRP:
187 /* 2 cycles
188 * (since the last two src operands are in different register banks):
189 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
190 *
191 * 3 cycles on IVB, 4 on HSW
192 * (since the last two src operands are in the same register bank):
193 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
194 *
195 * 16 cycles on IVB, 14 on HSW
196 * (since the last two src operands are in different register banks):
197 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
198 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
199 *
200 * 16 cycles
201 * (since the last two src operands are in the same register bank):
202 * lrp(8) g4<1>F g2.2<4,4,1>F.x g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
203 * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
204 */
205
206 /* Our register allocator doesn't know about register banks, so use the
207 * higher latency.
208 */
209 latency = 14;
210 break;
211
212 case SHADER_OPCODE_RCP:
213 case SHADER_OPCODE_RSQ:
214 case SHADER_OPCODE_SQRT:
215 case SHADER_OPCODE_LOG2:
216 case SHADER_OPCODE_EXP2:
217 case SHADER_OPCODE_SIN:
218 case SHADER_OPCODE_COS:
219 /* 2 cycles:
220 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
221 *
222 * 18 cycles:
223 * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
224 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
225 *
226 * Same for exp2, log2, rsq, sqrt, sin, cos.
227 */
228 latency = 16;
229 break;
230
231 case SHADER_OPCODE_POW:
232 /* 2 cycles:
233 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
234 *
235 * 26 cycles:
236 * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
237 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
238 */
239 latency = 24;
240 break;
241
242 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
243 /* testing using varying-index pull constants:
244 *
245 * 16 cycles:
246 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
247 * send(8) g4<1>F g4<8,8,1>D
248 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
249 *
250 * ~480 cycles:
251 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
252 * send(8) g4<1>F g4<8,8,1>D
253 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
254 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
255 *
256 * ~620 cycles:
257 * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
258 * send(8) g4<1>F g4<8,8,1>D
259 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
260 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
261 * send(8) g4<1>F g4<8,8,1>D
262 * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
263 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
264 *
265 * So, if it's cache-hot, it's about 140. If it's cache cold, it's
266 * about 460. We expect to mostly be cache hot, so pick something more
267 * in that direction.
268 */
269 latency = 200;
270 break;
271
272 case SHADER_OPCODE_SEND:
273 switch (inst->sfid) {
274 case BRW_SFID_SAMPLER: {
275 unsigned msg_type = (inst->desc >> 12) & 0x1f;
276 switch (msg_type) {
277 case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO:
278 case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO:
279 /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
280 * cycles (n=15):
281 * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
282 * send(8) g6<1>UW g114<8,8,1>F
283 * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
284 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
285 *
286 *
287 * Two loads was 535 +/- 30 cycles (n=19):
288 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
289 * send(16) g6<1>UW g114<8,8,1>F
290 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
291 * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
292 * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
293 * send(16) g8<1>UW g114<8,8,1>F
294 * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
295 * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
296 * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
297 *
298 * Since the only caches that should matter are just the
299 * instruction/state cache containing the surface state,
300 * assume that we always have hot caches.
301 */
302 latency = 100;
303 break;
304
305 default:
306 /* 18 cycles:
307 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
308 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
309 * send(8) g4<1>UW g114<8,8,1>F
310 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
311 *
312 * 697 +/-49 cycles (min 610, n=26):
313 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
314 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
315 * send(8) g4<1>UW g114<8,8,1>F
316 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
317 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
318 *
319 * So the latency on our first texture load of the batchbuffer
320 * takes ~700 cycles, since the caches are cold at that point.
321 *
322 * 840 +/- 92 cycles (min 720, n=25):
323 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
324 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
325 * send(8) g4<1>UW g114<8,8,1>F
326 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
327 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
328 * send(8) g4<1>UW g114<8,8,1>F
329 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
330 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
331 *
332 * On the second load, it takes just an extra ~140 cycles, and
333 * after accounting for the 14 cycles of the MOV's latency, that
334 * makes ~130.
335 *
336 * 683 +/- 49 cycles (min = 602, n=47):
337 * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
338 * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
339 * send(8) g4<1>UW g114<8,8,1>F
340 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
341 * send(8) g50<1>UW g114<8,8,1>F
342 * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
343 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
344 *
345 * The unit appears to be pipelined, since this matches up with
346 * the cache-cold case, despite there being two loads here. If
347 * you replace the g4 in the MOV to null with g50, it's still
348 * 693 +/- 52 (n=39).
349 *
350 * So, take some number between the cache-hot 140 cycles and the
351 * cache-cold 700 cycles. No particular tuning was done on this.
352 *
353 * I haven't done significant testing of the non-TEX opcodes.
354 * TXL at least looked about the same as TEX.
355 */
356 latency = 200;
357 break;
358 }
359 break;
360 }
361
362 case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
363 /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
364 latency = 200;
365 break;
366
367 case GFX6_SFID_DATAPORT_RENDER_CACHE:
368 switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) {
369 case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE:
370 case GFX7_DATAPORT_RC_TYPED_SURFACE_READ:
371 /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
372 latency = 600;
373 break;
374
375 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
376 /* See also SHADER_OPCODE_TYPED_ATOMIC */
377 latency = 14000;
378 break;
379
380 case GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE:
381 /* completely fabricated number */
382 latency = 600;
383 break;
384
385 default:
386 unreachable("Unknown render cache message");
387 }
388 break;
389
390 case GFX7_SFID_DATAPORT_DATA_CACHE:
391 switch ((inst->desc >> 14) & 0x1f) {
392 case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ:
393 case GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ:
394 case GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE:
395 /* We have no data for this but assume it's a little faster than
396 * untyped surface read/write.
397 */
398 latency = 200;
399 break;
400
401 case GFX7_DATAPORT_DC_DWORD_SCATTERED_READ:
402 case GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE:
403 case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
404 case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
405 /* We have no data for this but assume it's roughly the same as
406 * untyped surface read/write.
407 */
408 latency = 300;
409 break;
410
411 case GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ:
412 case GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
413 /* Test code:
414 * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
415 * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
416 * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
417 * send(8) g4<1>UD g112<8,8,1>UD
418 * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
419 * .
420 * . [repeats 8 times]
421 * .
422 * mov(8) g112<1>UD 0x00000000UD { align1 WE_all 1Q };
423 * mov(1) g112.7<1>UD g1.7<0,1,0>UD { align1 WE_all };
424 * mov(8) g113<1>UD 0x00000000UD { align1 WE_normal 1Q };
425 * send(8) g4<1>UD g112<8,8,1>UD
426 * data (38, 6, 5) mlen 2 rlen 1 { align1 WE_normal 1Q };
427 *
428 * Running it 100 times as fragment shader on a 128x128 quad
429 * gives an average latency of 583 cycles per surface read,
430 * standard deviation 0.9%.
431 */
432 latency = 600;
433 break;
434
435 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
436 /* Test code:
437 * mov(8) g112<1>ud 0x00000000ud { align1 WE_all 1Q };
438 * mov(1) g112.7<1>ud g1.7<0,1,0>ud { align1 WE_all };
439 * mov(8) g113<1>ud 0x00000000ud { align1 WE_normal 1Q };
440 * send(8) g4<1>ud g112<8,8,1>ud
441 * data (38, 5, 6) mlen 2 rlen 1 { align1 WE_normal 1Q };
442 *
443 * Running it 100 times as fragment shader on a 128x128 quad
444 * gives an average latency of 13867 cycles per atomic op,
445 * standard deviation 3%. Note that this is a rather
446 * pessimistic estimate, the actual latency in cases with few
447 * collisions between threads and favorable pipelining has been
448 * seen to be reduced by a factor of 100.
449 */
450 latency = 14000;
451 break;
452
453 default:
454 unreachable("Unknown data cache message");
455 }
456 break;
457
458 case HSW_SFID_DATAPORT_DATA_CACHE_1:
459 switch (brw_dp_desc_msg_type(isa->devinfo, inst->desc)) {
460 case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
461 case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
462 case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
463 case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
464 case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
465 case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
466 case GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
467 case GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
468 case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ:
469 case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE:
470 /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */
471 latency = 300;
472 break;
473
474 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
475 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
476 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
477 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
478 case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
479 case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP:
480 case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP:
481 case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP:
482 case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP:
483 /* See also GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
484 latency = 14000;
485 break;
486
487 default:
488 unreachable("Unknown data cache message");
489 }
490 break;
491
492 case GFX7_SFID_PIXEL_INTERPOLATOR:
493 latency = 50; /* TODO */
494 break;
495
496 case GFX12_SFID_UGM:
497 case GFX12_SFID_TGM:
498 case GFX12_SFID_SLM:
499 switch (lsc_msg_desc_opcode(isa->devinfo, inst->desc)) {
500 case LSC_OP_LOAD:
501 case LSC_OP_STORE:
502 case LSC_OP_LOAD_CMASK:
503 case LSC_OP_STORE_CMASK:
504 latency = 300;
505 break;
506 case LSC_OP_FENCE:
507 case LSC_OP_ATOMIC_INC:
508 case LSC_OP_ATOMIC_DEC:
509 case LSC_OP_ATOMIC_LOAD:
510 case LSC_OP_ATOMIC_STORE:
511 case LSC_OP_ATOMIC_ADD:
512 case LSC_OP_ATOMIC_SUB:
513 case LSC_OP_ATOMIC_MIN:
514 case LSC_OP_ATOMIC_MAX:
515 case LSC_OP_ATOMIC_UMIN:
516 case LSC_OP_ATOMIC_UMAX:
517 case LSC_OP_ATOMIC_CMPXCHG:
518 case LSC_OP_ATOMIC_FADD:
519 case LSC_OP_ATOMIC_FSUB:
520 case LSC_OP_ATOMIC_FMIN:
521 case LSC_OP_ATOMIC_FMAX:
522 case LSC_OP_ATOMIC_FCMPXCHG:
523 case LSC_OP_ATOMIC_AND:
524 case LSC_OP_ATOMIC_OR:
525 case LSC_OP_ATOMIC_XOR:
526 latency = 1400;
527 break;
528 default:
529 unreachable("unsupported new data port message instruction");
530 }
531 break;
532
533 case BRW_SFID_MESSAGE_GATEWAY:
534 case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */
535 case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
536 /* TODO.
537 *
538 * We'll assume for the moment that this is pretty quick as it
539 * doesn't actually return any data.
540 */
541 latency = 200;
542 break;
543
544 case BRW_SFID_URB:
545 latency = 200;
546 break;
547
548 default:
549 unreachable("Unknown SFID");
550 }
551 break;
552
553 case BRW_OPCODE_DPAS:
554 switch (inst->rcount) {
555 case 1:
556 latency = 21;
557 break;
558 case 2:
559 latency = 22;
560 break;
561 case 8:
562 default:
563 latency = 32;
564 break;
565 }
566 break;
567
568 default:
569 /* 2 cycles:
570 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
571 *
572 * 16 cycles:
573 * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
574 * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
575 */
576 latency = 14;
577 break;
578 }
579 }
580
581 class brw_instruction_scheduler {
582 public:
583 brw_instruction_scheduler(void *mem_ctx, const fs_visitor *s, int grf_count, int hw_reg_count,
584 int block_count, bool post_reg_alloc);
585
586 void add_barrier_deps(schedule_node *n);
587 void add_cross_lane_deps(schedule_node *n);
588 void add_dep(schedule_node *before, schedule_node *after, int latency);
589 void add_dep(schedule_node *before, schedule_node *after);
590 void add_address_dep(schedule_node *before, schedule_node *after);
591
592 void set_current_block(bblock_t *block);
593 void compute_delays();
594 void compute_exits();
595
596 void schedule(schedule_node *chosen);
597 void update_children(schedule_node *chosen);
598
599 void calculate_deps();
600 bool is_compressed(const fs_inst *inst);
601 bool register_needs_barrier(const brw_reg &reg);
602 bool address_register_interfere(const schedule_node *n);
603 schedule_node *choose_instruction_to_schedule();
604 int calculate_issue_time(const fs_inst *inst);
605
606 void count_reads_remaining(const fs_inst *inst);
607 void setup_liveness(cfg_t *cfg);
608 void update_register_pressure(const fs_inst *inst);
609 int get_register_pressure_benefit(const fs_inst *inst);
610 void clear_last_grf_write();
611
612 void schedule_instructions();
613 void run(brw_instruction_scheduler_mode mode);
614
615 int grf_index(const brw_reg &reg);
616
617 void *mem_ctx;
618 linear_ctx *lin_ctx;
619
620 schedule_node *nodes;
621 int nodes_len;
622
623 /* Current block being processed. */
624 struct {
625 bblock_t *block;
626
627 /* Range of nodes in the block. End will point to first node
628 * address after the block, i.e. the range is [start, end).
629 */
630 schedule_node *start;
631 schedule_node *end;
632 int len;
633
634 int scheduled;
635
636 unsigned cand_generation;
637 int time;
638 exec_list available;
639
640 /* Currently used address register */
641 uint32_t address_register[16];
642 } current;
643
644 bool post_reg_alloc;
645 int grf_count;
646 const fs_visitor *s;
647
648 /**
649 * Last instruction to have written the grf (or a channel in the grf, for the
650 * scalar backend)
651 */
652 schedule_node **last_grf_write;
653
654 unsigned hw_reg_count;
655 int reg_pressure;
656 brw_instruction_scheduler_mode mode;
657
658 /*
659 * The register pressure at the beginning of each basic block.
660 */
661
662 int *reg_pressure_in;
663
664 /*
665 * The virtual GRF's whose range overlaps the beginning of each basic block.
666 */
667
668 BITSET_WORD **livein;
669
670 /*
671 * The virtual GRF's whose range overlaps the end of each basic block.
672 */
673
674 BITSET_WORD **liveout;
675
676 /*
677 * The hardware GRF's whose range overlaps the end of each basic block.
678 */
679
680 BITSET_WORD **hw_liveout;
681
682 /*
683 * Whether we've scheduled a write for this virtual GRF yet.
684 */
685
686 bool *written;
687
688 /*
689 * How many reads we haven't scheduled for this virtual GRF yet.
690 */
691
692 int *reads_remaining;
693
694 /*
695 * How many reads we haven't scheduled for this hardware GRF yet.
696 */
697
698 int *hw_reads_remaining;
699 };
700
701 brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const fs_visitor *s,
702 int grf_count, int hw_reg_count,
703 int block_count, bool post_reg_alloc)
704 : s(s)
705 {
706 this->mem_ctx = mem_ctx;
707 this->lin_ctx = linear_context(this->mem_ctx);
708 this->grf_count = grf_count;
709 this->post_reg_alloc = post_reg_alloc;
710
711 const unsigned grf_write_scale = MAX_VGRF_SIZE(s->devinfo);
712 this->last_grf_write = linear_zalloc_array(lin_ctx, schedule_node *, grf_count * grf_write_scale);
713
714 this->nodes_len = s->cfg->last_block()->end_ip + 1;
715 this->nodes = linear_zalloc_array(lin_ctx, schedule_node, this->nodes_len);
716
717 const struct brw_isa_info *isa = &s->compiler->isa;
718
719 schedule_node *n = nodes;
720 foreach_block_and_inst(block, fs_inst, inst, s->cfg) {
721 n->inst = inst;
722
723 if (!post_reg_alloc)
724 n->latency = 1;
725 else
726 n->set_latency(isa);
727
728 n++;
729 }
730 assert(n == nodes + nodes_len);
731
732 current.block = NULL;
733 current.start = NULL;
734 current.end = NULL;
735 current.len = 0;
736 current.time = 0;
737 current.cand_generation = 0;
738 current.available.make_empty();
739
740 this->hw_reg_count = hw_reg_count;
741 this->mode = BRW_SCHEDULE_NONE;
742 this->reg_pressure = 0;
743
744 if (!post_reg_alloc) {
745 this->reg_pressure_in = linear_zalloc_array(lin_ctx, int, block_count);
746
747 this->livein = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
748 for (int i = 0; i < block_count; i++)
749 this->livein[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
750 BITSET_WORDS(grf_count));
751
752 this->liveout = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
753 for (int i = 0; i < block_count; i++)
754 this->liveout[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
755 BITSET_WORDS(grf_count));
756
757 this->hw_liveout = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
758 for (int i = 0; i < block_count; i++)
759 this->hw_liveout[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
760 BITSET_WORDS(hw_reg_count));
761
762 setup_liveness(s->cfg);
763
764 this->written = linear_alloc_array(lin_ctx, bool, grf_count);
765
766 this->reads_remaining = linear_alloc_array(lin_ctx, int, grf_count);
767
768 this->hw_reads_remaining = linear_alloc_array(lin_ctx, int, hw_reg_count);
769 } else {
770 this->reg_pressure_in = NULL;
771 this->livein = NULL;
772 this->liveout = NULL;
773 this->hw_liveout = NULL;
774 this->written = NULL;
775 this->reads_remaining = NULL;
776 this->hw_reads_remaining = NULL;
777 }
778
779 foreach_block(block, s->cfg) {
780 set_current_block(block);
781
782 for (schedule_node *n = current.start; n < current.end; n++)
783 n->issue_time = calculate_issue_time(n->inst);
784
785 calculate_deps();
786 compute_delays();
787 compute_exits();
788 }
789 }
790
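/**
 * Check whether source \p src of \p inst is identical to one of the earlier
 * sources of the same instruction, so its reads are only counted once for
 * register-pressure bookkeeping.
 */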
791 static bool
792 is_src_duplicate(const fs_inst *inst, int src)
793 {
794 for (int i = 0; i < src; i++)
795 if (inst->src[i].equals(inst->src[src]))
796 return true;
797
798 return false;
799 }
800
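/**
 * Count one outstanding read for every VGRF and in-range fixed GRF that
 * \p inst sources, skipping duplicated sources.
 */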
801 void
802 brw_instruction_scheduler::count_reads_remaining(const fs_inst *inst)
803 {
804 assert(reads_remaining);
805
806 for (int i = 0; i < inst->sources; i++) {
807 if (is_src_duplicate(inst, i))
808 continue;
809
810 if (inst->src[i].file == VGRF) {
811 reads_remaining[inst->src[i].nr]++;
812 } else if (inst->src[i].file == FIXED_GRF) {
813 if (inst->src[i].nr >= hw_reg_count)
814 continue;
815
816 for (unsigned j = 0; j < regs_read(s->devinfo, inst, i); j++)
817 hw_reads_remaining[inst->src[i].nr + j]++;
818 }
819 }
820 }
821
822 void
823 brw_instruction_scheduler::setup_liveness(cfg_t *cfg)
824 {
825 const fs_live_variables &live = s->live_analysis.require();
826
827 /* First, compute liveness on a per-GRF level using the in/out sets from
828 * liveness calculation.
829 */
830 for (int block = 0; block < cfg->num_blocks; block++) {
831 for (int i = 0; i < live.num_vars; i++) {
832 if (BITSET_TEST(live.block_data[block].livein, i)) {
833 int vgrf = live.vgrf_from_var[i];
834 if (!BITSET_TEST(livein[block], vgrf)) {
835 reg_pressure_in[block] += s->alloc.sizes[vgrf];
836 BITSET_SET(livein[block], vgrf);
837 }
838 }
839
840 if (BITSET_TEST(live.block_data[block].liveout, i))
841 BITSET_SET(liveout[block], live.vgrf_from_var[i]);
842 }
843 }
844
845 /* Now, extend the live in/live out sets for when a range crosses a block
846 * boundary, which matches what our register allocator/interference code
847 * does to account for force_writemask_all and incompatible exec_mask's.
848 */
849 for (int block = 0; block < cfg->num_blocks - 1; block++) {
850 for (int i = 0; i < grf_count; i++) {
851 if (live.vgrf_start[i] <= cfg->blocks[block]->end_ip &&
852 live.vgrf_end[i] >= cfg->blocks[block + 1]->start_ip) {
853 if (!BITSET_TEST(livein[block + 1], i)) {
854 reg_pressure_in[block + 1] += s->alloc.sizes[i];
855 BITSET_SET(livein[block + 1], i);
856 }
857
858 BITSET_SET(liveout[block], i);
859 }
860 }
861 }
862
863 int *payload_last_use_ip = ralloc_array(NULL, int, hw_reg_count);
864 s->calculate_payload_ranges(true, hw_reg_count, payload_last_use_ip);
865
866 for (unsigned i = 0; i < hw_reg_count; i++) {
867 if (payload_last_use_ip[i] == -1)
868 continue;
869
870 for (int block = 0; block < cfg->num_blocks; block++) {
871 if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
872 reg_pressure_in[block]++;
873
874 if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
875 BITSET_SET(hw_liveout[block], i);
876 }
877 }
878
879 ralloc_free(payload_last_use_ip);
880 }
881
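/**
 * Update the pressure-tracking state once \p inst has been scheduled: mark
 * its VGRF destination as written and retire one outstanding read from each
 * non-duplicate VGRF or fixed-GRF source.
 */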
882 void
883 brw_instruction_scheduler::update_register_pressure(const fs_inst *inst)
884 {
885 assert(reads_remaining);
886
887 if (inst->dst.file == VGRF) {
888 written[inst->dst.nr] = true;
889 }
890
891 for (int i = 0; i < inst->sources; i++) {
892 if (is_src_duplicate(inst, i))
893 continue;
894
895 if (inst->src[i].file == VGRF) {
896 reads_remaining[inst->src[i].nr]--;
897 } else if (inst->src[i].file == FIXED_GRF &&
898 inst->src[i].nr < hw_reg_count) {
899 for (unsigned off = 0; off < regs_read(s->devinfo, inst, i); off++)
900 hw_reads_remaining[inst->src[i].nr + off]--;
901 }
902 }
903 }
904
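/**
 * Estimate, in register units, how much scheduling \p inst next would reduce
 * pressure: sources seeing their last read (and not live-out of the block)
 * become dead and add to the benefit, while a destination that is neither
 * live-in nor already written becomes newly live and subtracts from it.
 */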
905 int
906 brw_instruction_scheduler::get_register_pressure_benefit(const fs_inst *inst)
907 {
908 int benefit = 0;
909 const int block_idx = current.block->num;
910
911 if (inst->dst.file == VGRF) {
912 if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
913 !written[inst->dst.nr])
914 benefit -= s->alloc.sizes[inst->dst.nr];
915 }
916
917 for (int i = 0; i < inst->sources; i++) {
918 if (is_src_duplicate(inst, i))
919 continue;
920
921 if (inst->src[i].file == VGRF &&
922 !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
923 reads_remaining[inst->src[i].nr] == 1)
924 benefit += s->alloc.sizes[inst->src[i].nr];
925
926 if (inst->src[i].file == FIXED_GRF &&
927 inst->src[i].nr < hw_reg_count) {
928 for (unsigned off = 0; off < regs_read(s->devinfo, inst, i); off++) {
929 int reg = inst->src[i].nr + off;
930 if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
931 hw_reads_remaining[reg] == 1) {
932 benefit++;
933 }
934 }
935 }
936 }
937
938 return benefit;
939 }
940
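/** Point the scheduler's per-block state at \p block and reset its clock and counters. */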
941 void
942 brw_instruction_scheduler::set_current_block(bblock_t *block)
943 {
944 current.block = block;
945 current.start = nodes + block->start_ip;
946 current.len = block->end_ip - block->start_ip + 1;
947 current.end = current.start + current.len;
948 current.time = 0;
949 current.scheduled = 0;
950 current.cand_generation = 1;
951 }
952
953 /** Computation of the delay member of each node. */
954 void
955 brw_instruction_scheduler::compute_delays()
956 {
957 for (schedule_node *n = current.end - 1; n >= current.start; n--) {
958 if (!n->children_count) {
959 n->delay = n->issue_time;
960 } else {
961 for (int i = 0; i < n->children_count; i++) {
962 if (n->children[i].n->delay == 0) {
963 /* This is a special case for the address register, where a child
964 * could be a prior instruction.
965 *
966 * This ensures that an address register write instruction will
967 * always unblock the reader of the address register. Otherwise
968 * we could end up with scheduling deadlocks.
969 */
970 assert(n->children[i].n->inst->dst.is_address());
971 n->delay = MAX2(n->delay, 1);
972 } else {
973 n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
974 }
975 }
976 }
977 }
978 }
979
980 void
981 brw_instruction_scheduler::compute_exits()
982 {
983 /* Calculate a lower bound of the scheduling time of each node in the
984 * graph. This is analogous to the node's critical path but calculated
985 * from the top instead of from the bottom of the block.
986 */
987 for (schedule_node *n = current.start; n < current.end; n++) {
988 for (int i = 0; i < n->children_count; i++) {
989 schedule_node_child *child = &n->children[i];
990 child->n->initial_unblocked_time =
991 MAX2(child->n->initial_unblocked_time,
992 n->initial_unblocked_time + n->issue_time + child->effective_latency);
993 }
994 }
995
996 /* Calculate the exit of each node by induction based on the exit nodes of
997 * its children. The preferred exit of a node is the one among the exit
998 * nodes of its children which can be unblocked first according to the
999 * optimistic unblocked time estimate calculated above.
1000 */
1001 for (schedule_node *n = current.end - 1; n >= current.start; n--) {
1002 n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL);
1003
1004 for (int i = 0; i < n->children_count; i++) {
1005 if (exit_initial_unblocked_time(n->children[i].n) < exit_initial_unblocked_time(n))
1006 n->exit = n->children[i].n->exit;
1007 }
1008 }
1009 }
1010
1011 /**
1012 * Add a dependency between two instruction nodes.
1013 *
1014 * The @after node will be scheduled after @before. We will try to
1015 * schedule it @latency cycles after @before, but no guarantees there.
1016 */
1017 void
1018 brw_instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
1019 int latency)
1020 {
1021 if (!before || !after)
1022 return;
1023
1024 assert(before != after);
1025
1026 for (int i = 0; i < before->children_count; i++) {
1027 schedule_node_child *child = &before->children[i];
1028 if (child->n == after) {
1029 child->effective_latency = MAX2(child->effective_latency, latency);
1030 return;
1031 }
1032 }
1033
1034 if (before->children_cap <= before->children_count) {
1035 if (before->children_cap < 16)
1036 before->children_cap = 16;
1037 else
1038 before->children_cap *= 2;
1039
1040 before->children = reralloc(mem_ctx, before->children,
1041 schedule_node_child,
1042 before->children_cap);
1043 }
1044
1045 schedule_node_child *child = &before->children[before->children_count];
1046 child->n = after;
1047 child->effective_latency = latency;
1048 before->children_count++;
1049 after->initial_parent_count++;
1050
1051 /* Propagate the dependency to the address register instructions. */
1052 for (int i = 0; i < after->address_read_count; i++)
1053 add_dep(before, after->address_read[i]);
1054 }
1055
1056 void
1057 brw_instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
1058 {
1059 if (!before)
1060 return;
1061
1062 add_dep(before, after, before->latency);
1063 }
1064
1065 void
1066 brw_instruction_scheduler::add_address_dep(schedule_node *before, schedule_node *after)
1067 {
1068 assert(before && after);
1069
1070 add_dep(before, after, before->latency);
1071
1072 if (after->address_read_cap <= after->address_read_count) {
1073 after->address_read_cap = MAX2(2 * after->address_read_cap, 1);
1074
1075 after->address_read = reralloc(mem_ctx, after->address_read,
1076 schedule_node *,
1077 after->address_read_cap);
1078 }
1079
1080 after->address_read[after->address_read_count++] = before;
1081 }
1082
1083 static bool
1084 is_scheduling_barrier(const fs_inst *inst)
1085 {
1086 return inst->opcode == SHADER_OPCODE_HALT_TARGET ||
1087 inst->is_control_flow() ||
1088 inst->has_side_effects();
1089 }
1090
1091 static bool
1092 has_cross_lane_access(const fs_inst *inst)
1093 {
1094 /* FINISHME:
1095 *
1096 * This function is likely incomplete in terms of identifying cross-lane
1097 * accesses.
1098 */
1099 if (inst->opcode == SHADER_OPCODE_BROADCAST ||
1100 inst->opcode == SHADER_OPCODE_CLUSTER_BROADCAST ||
1101 inst->opcode == SHADER_OPCODE_SHUFFLE ||
1102 inst->opcode == FS_OPCODE_LOAD_LIVE_CHANNELS ||
1103 inst->opcode == SHADER_OPCODE_LOAD_LIVE_CHANNELS ||
1104 inst->opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1105 inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL)
1106 return true;
1107
1108 for (unsigned s = 0; s < inst->sources; s++) {
1109 if (inst->src[s].file == VGRF) {
1110 if (inst->src[s].stride == 0)
1111 return true;
1112 }
1113 }
1114
1115 return false;
1116 }
1117
1118 /**
1119 * Some register accesses need dependencies on other instructions.
1120 */
1121 bool
1122 brw_instruction_scheduler::register_needs_barrier(const brw_reg &reg)
1123 {
1124 if (reg.file != ARF || reg.is_null())
1125 return false;
1126
1127 /* If you look at the SR register layout, there is nothing in there that
1128 * depends on other instructions. This is just fixed dispatch information.
1129 *
1130 * ATSM PRMs, Volume 9: Render Engine, State Register Fields :
1131 * sr0.0:
1132 * - 0:2 TID
1133 * - 4:13 Slice, DSS, Subslice, EU IDs
1134 * - 20:22 Priority
1135 * - 23:23 Priority class
1136 * - 24:27 FFID
1137 * sr0.1:
1138 * - 0:5 IEEE Exception
1139 * - 21:31 FFTID
1140 * sr0.2:
1141 * - 0:31 Dispatch Mask
1142 * sr0.3:
1143 * - 0:31 Vector Mask
1144 */
1145 if (reg.nr == BRW_ARF_STATE)
1146 return false;
1147
1148 return true;
1149 }
1150
1151 /**
1152 * Sometimes we really want this node to execute after everything that
1153 * came before it and before everything that follows it. This adds
1154 * the deps to do so.
1155 */
1156 void
1157 brw_instruction_scheduler::add_barrier_deps(schedule_node *n)
1158 {
1159 for (schedule_node *prev = n - 1; prev >= current.start; prev--) {
1160 add_dep(prev, n, 0);
1161 if (is_scheduling_barrier(prev->inst))
1162 break;
1163 }
1164
1165 for (schedule_node *next = n + 1; next < current.end; next++) {
1166 add_dep(n, next, 0);
1167 if (is_scheduling_barrier(next->inst))
1168 break;
1169 }
1170 }
1171
1172 /**
1173 * Because some instructions like HALT can disable lanes, scheduling prior to
1174 * a cross lane access should not be allowed, otherwise we could end up with
1175 * later instructions accessing uninitialized data.
1176 */
1177 void
1178 brw_instruction_scheduler::add_cross_lane_deps(schedule_node *n)
1179 {
1180 for (schedule_node *prev = n - 1; prev >= current.start; prev--) {
1181 if (has_cross_lane_access((fs_inst*)prev->inst))
1182 add_dep(prev, n, 0);
1183 }
1184 }
1185
1186 /* Instruction scheduling needs to be aware of when a compressed (SIMD16)
1187 * instruction actually reads or writes two registers at a time.
1188 */
1189 bool
1190 brw_instruction_scheduler::is_compressed(const fs_inst *inst)
1191 {
1192 return inst->exec_size == 16;
1193 }
1194
1195 /* Clears last_grf_write to be ready to start calculating deps for a block
1196 * again.
1197 *
1198 * Since pre-ra grf_count scales with instructions, and instructions scale with
1199 * BBs, we don't want to memset all of last_grf_write per block or we'd end up
1200 * O(n^2) in the number of blocks. For shaders using softfp64, we get a *lot* of
1201 * blocks.
1202 *
1203 * We don't bother being careful for post-ra, since then grf_count doesn't scale
1204 * with instructions.
1205 */
1206 void
1207 brw_instruction_scheduler::clear_last_grf_write()
1208 {
1209 if (!post_reg_alloc) {
1210 for (schedule_node *n = current.start; n < current.end; n++) {
1211 fs_inst *inst = (fs_inst *)n->inst;
1212
1213 if (inst->dst.file == VGRF) {
1214 /* Don't bother being careful with regs_written(), quicker to just clear 2 cachelines. */
1215 memset(&last_grf_write[inst->dst.nr * MAX_VGRF_SIZE(s->devinfo)], 0,
1216 sizeof(*last_grf_write) * MAX_VGRF_SIZE(s->devinfo));
1217 }
1218 }
1219 } else {
1220 memset(last_grf_write, 0,
1221 sizeof(*last_grf_write) * grf_count * MAX_VGRF_SIZE(s->devinfo));
1222 }
1223 }
1224
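/**
 * Index into last_grf_write[] for a register access. Post-RA this is the
 * hardware GRF number; pre-RA each VGRF gets MAX_VGRF_SIZE slots, one per
 * REG_SIZE of offset, so writes to different offsets of the same VGRF are
 * tracked independently.
 */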
1225 int
1226 brw_instruction_scheduler::grf_index(const brw_reg &reg)
1227 {
1228 if (post_reg_alloc)
1229 return reg.nr;
1230 return reg.nr * MAX_VGRF_SIZE(s->devinfo) + reg.offset / REG_SIZE;
1231 }
1232
1233 void
1234 brw_instruction_scheduler::calculate_deps()
1235 {
1236 /* Pre-register-allocation, this tracks the last write per VGRF offset.
1237 * After register allocation, reg_offsets are gone and we track individual
1238 * GRF registers.
1239 */
1240 schedule_node *last_conditional_mod[8] = {};
1241 schedule_node *last_accumulator_write = NULL;
1242 /* Fixed HW registers are assumed to be separate from the virtual
1243 * GRFs, so they can be tracked separately. We don't really write
1244 * to fixed GRFs much, so don't bother tracking them on a more
1245 * granular level.
1246 */
1247 schedule_node *last_fixed_grf_write = NULL;
1248 schedule_node *last_address_write[16] = {};
1249
1250 /* top-to-bottom dependencies: RAW and WAW. */
1251
1252 if (!post_reg_alloc) {
1253 /* Address registers have virtual identifiers, allowing us to identify
1254 * which instructions need the values written to the register. The
1255 * address register is written/read in pairs of instructions (enforced
1256 * by brw_fs_validate.cpp).
1257 *
1258 * To allow scheduling of SEND messages, out of order, without the
1259 * address register tracking generating serialized dependency between
1260 * all the messages, we first track all the dependencies of the address
1261 * register. Those dependencies are added to the instructions consuming
1262 * the address register value. Then when doing the normal dependency
1263 * tracking, any node adding a dependency to an instruction consuming
1264 * the address register is also added as a dependency to the instruction
1265 * writing the value to the address register.
1266 *
1267 * This scheme allows the scheduling done by
1268 * choose_instruction_to_schedule() to ensure that once an instruction
1269 * writing the address register is scheduled, we can always schedule all
1270 * instructions making use of the address register value. Otherwise we
1271 * could run into scheduling deadlocks.
1272 *
1273 * Here is a deadlock example :
1274 *
1275 * mov a0, 0x42
1276 * send grf1, ..., a0
1277 * mov a0, 0x43
1278 * send grf2, grf1, a0
1279 *
1280 * Let's say choose_instruction_to_schedule() chooses the second mov
1281 * instruction first (mov a0, 0x43). Then it cannot schedule the second
1282 * send instruction because the first send instruction, which populates
1283 * grf1, has not been scheduled yet, and it cannot schedule the first mov
1284 * either because the address register is already in use for another
1285 * message.
1286 *
1287 * In post-register-allocation mode, this scheme cannot work as all GRFs
1288 * can get reused and we have to serialize all address register usages
1289 * (like the accumulator, flag, etc...).
1290 */
1291 for (schedule_node *n = current.start; n < current.end; n++) {
1292 fs_inst *inst = (fs_inst *)n->inst;
1293
1294 /* Pre-pass over instructions that use the address register as a
1295 * source.
1296 */
1297 for (int i = 0; i < inst->sources; i++) {
1298 if (!inst->src[i].is_address())
1299 continue;
1300
1301 for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
1302 assert(inst->src[i].address_slot(byte) < ARRAY_SIZE(last_address_write));
1303 schedule_node *write_addr_node =
1304 last_address_write[inst->src[i].address_slot(byte)];
1305 assert(write_addr_node->inst->dst.nr == inst->src[i].nr);
1306 add_address_dep(write_addr_node, n);
1307 }
1308 }
1309
1310 if (inst->dst.is_address()) {
1311 for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
1312 last_address_write[inst->dst.address_slot(byte)] = n;
1313 }
1314 }
1315 }
1316 }
1317
1318 for (schedule_node *n = current.start; n < current.end; n++) {
1319 fs_inst *inst = (fs_inst *)n->inst;
1320
1321 if (is_scheduling_barrier(inst))
1322 add_barrier_deps(n);
1323
1324 if (inst->opcode == BRW_OPCODE_HALT ||
1325 inst->opcode == SHADER_OPCODE_HALT_TARGET)
1326 add_cross_lane_deps(n);
1327
1328 /* read-after-write deps. */
1329 for (int i = 0; i < inst->sources; i++) {
1330 if (inst->src[i].file == VGRF) {
1331 for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1332 add_dep(last_grf_write[grf_index(inst->src[i]) + r], n);
1333 } else if (inst->src[i].file == FIXED_GRF) {
1334 if (post_reg_alloc) {
1335 for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1336 add_dep(last_grf_write[inst->src[i].nr + r], n);
1337 } else {
1338 add_dep(last_fixed_grf_write, n);
1339 }
1340 } else if (inst->src[i].is_accumulator()) {
1341 add_dep(last_accumulator_write, n);
1342 } else if (inst->src[i].is_address()) {
1343 if (post_reg_alloc) {
1344 for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2)
1345 add_dep(last_address_write[inst->src[i].address_slot(byte)], n);
1346 }
1347 } else if (register_needs_barrier(inst->src[i])) {
1348 add_barrier_deps(n);
1349 }
1350 }
1351
1352 if (const unsigned mask = inst->flags_read(s->devinfo)) {
1353 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1354
1355 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1356 if (mask & (1 << i))
1357 add_dep(last_conditional_mod[i], n);
1358 }
1359 }
1360
1361 if (inst->reads_accumulator_implicitly()) {
1362 add_dep(last_accumulator_write, n);
1363 }
1364
1365 /* write-after-write deps. */
1366 if (inst->dst.file == VGRF) {
1367 int grf_idx = grf_index(inst->dst);
1368 for (unsigned r = 0; r < regs_written(inst); r++) {
1369 add_dep(last_grf_write[grf_idx + r], n);
1370 last_grf_write[grf_idx + r] = n;
1371 }
1372 } else if (inst->dst.file == FIXED_GRF) {
1373 if (post_reg_alloc) {
1374 for (unsigned r = 0; r < regs_written(inst); r++) {
1375 add_dep(last_grf_write[inst->dst.nr + r], n);
1376 last_grf_write[inst->dst.nr + r] = n;
1377 }
1378 } else {
1379 add_dep(last_fixed_grf_write, n);
1380 last_fixed_grf_write = n;
1381 }
1382 } else if (inst->dst.is_accumulator()) {
1383 add_dep(last_accumulator_write, n);
1384 last_accumulator_write = n;
1385 } else if (inst->dst.is_address()) {
1386 if (post_reg_alloc) {
1387 for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
1388 add_dep(last_address_write[inst->dst.address_slot(byte)], n);
1389 last_address_write[inst->dst.address_slot(byte)] = n;
1390 }
1391 }
1392 } else if (register_needs_barrier(inst->dst)) {
1393 add_barrier_deps(n);
1394 }
1395
1396 if (const unsigned mask = inst->flags_written(s->devinfo)) {
1397 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1398
1399 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1400 if (mask & (1 << i)) {
1401 add_dep(last_conditional_mod[i], n, 0);
1402 last_conditional_mod[i] = n;
1403 }
1404 }
1405 }
1406
1407 if (inst->writes_accumulator_implicitly(s->devinfo) &&
1408 !inst->dst.is_accumulator()) {
1409 add_dep(last_accumulator_write, n);
1410 last_accumulator_write = n;
1411 }
1412
1413 if (post_reg_alloc && inst->uses_address_register_implicitly()) {
1414 for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++) {
1415 add_dep(last_address_write[i], n);
1416 last_address_write[i] = n;
1417 }
1418 }
1419 }
1420
1421 clear_last_grf_write();
1422
1423 /* bottom-to-top dependencies: WAR */
1424 memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
1425 last_accumulator_write = NULL;
1426 last_fixed_grf_write = NULL;
1427 memset(last_address_write, 0, sizeof(last_address_write));
1428
1429 for (schedule_node *n = current.end - 1; n >= current.start; n--) {
1430 fs_inst *inst = (fs_inst *)n->inst;
1431
1432 /* write-after-read deps. */
1433 for (int i = 0; i < inst->sources; i++) {
1434 if (inst->src[i].file == VGRF) {
1435 for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1436 add_dep(n, last_grf_write[grf_index(inst->src[i]) + r], 0);
1437 } else if (inst->src[i].file == FIXED_GRF) {
1438 if (post_reg_alloc) {
1439 for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1440 add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
1441 } else {
1442 add_dep(n, last_fixed_grf_write, 0);
1443 }
1444 } else if (inst->src[i].is_accumulator()) {
1445 add_dep(n, last_accumulator_write, 0);
1446 } else if (inst->src[i].is_address()) {
1447 if (post_reg_alloc) {
1448 for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
1449 add_dep(n, last_address_write[inst->src[i].address_slot(byte)], 0);
1450 }
1451 }
1452 } else if (register_needs_barrier(inst->src[i])) {
1453 add_barrier_deps(n);
1454 }
1455 }
1456
1457 if (const unsigned mask = inst->flags_read(s->devinfo)) {
1458 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1459
1460 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1461 if (mask & (1 << i))
1462 add_dep(n, last_conditional_mod[i]);
1463 }
1464 }
1465
1466 if (inst->reads_accumulator_implicitly()) {
1467 add_dep(n, last_accumulator_write);
1468 }
1469
1470 if (post_reg_alloc && inst->uses_address_register_implicitly()) {
1471 for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++)
1472 last_address_write[i] = n;
1473 }
1474
1475 /* Update the things this instruction wrote, so earlier reads
1476 * can mark this as WAR dependency.
1477 */
1478 if (inst->dst.file == VGRF) {
1479 for (unsigned r = 0; r < regs_written(inst); r++)
1480 last_grf_write[grf_index(inst->dst) + r] = n;
1481 } else if (inst->dst.file == FIXED_GRF) {
1482 if (post_reg_alloc) {
1483 for (unsigned r = 0; r < regs_written(inst); r++)
1484 last_grf_write[inst->dst.nr + r] = n;
1485 } else {
1486 last_fixed_grf_write = n;
1487 }
1488 } else if (inst->dst.is_accumulator()) {
1489 last_accumulator_write = n;
1490 } else if (inst->dst.is_address()) {
1491 if (post_reg_alloc) {
1492 for (unsigned byte = 0; byte < inst->size_written; byte += 2)
1493 last_address_write[inst->dst.address_slot(byte)] = n;
1494 }
1495 } else if (register_needs_barrier(inst->dst)) {
1496 add_barrier_deps(n);
1497 }
1498
1499 if (const unsigned mask = inst->flags_written(s->devinfo)) {
1500 assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1501
1502 for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1503 if (mask & (1 << i))
1504 last_conditional_mod[i] = n;
1505 }
1506 }
1507
1508 if (inst->writes_accumulator_implicitly(s->devinfo)) {
1509 last_accumulator_write = n;
1510 }
1511 }
1512
1513 clear_last_grf_write();
1514 }
1515
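/**
 * Check whether scheduling \p n now would clash with address register slots
 * that currently hold a value for a different, not yet scheduled, consumer.
 */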
1516 bool
1517 brw_instruction_scheduler::address_register_interfere(const schedule_node *n)
1518 {
1519 if (n->inst->uses_address_register_implicitly()) {
1520 for (unsigned i = 0; i < ARRAY_SIZE(current.address_register); i++)
1521 if (current.address_register[i] != 0)
1522 return true;
1523 return false;
1524 }
1525
1526 if (n->inst->dst.is_address()) {
1527 for (unsigned byte = 0; byte < n->inst->size_written; byte += 2) {
1528 if (current.address_register[n->inst->dst.address_slot(byte)] != 0 &&
1529 current.address_register[n->inst->dst.address_slot(byte)] != n->inst->dst.nr)
1530 return true;
1531 }
1532 }
1533
1534 if (n->address_read_count > 0) {
1535 for (unsigned i = 0; i < n->inst->sources; i++) {
1536 if (!n->inst->src[i].is_address())
1537 continue;
1538 for (unsigned byte = 0; byte < n->inst->size_read(s->devinfo, i); byte += 2) {
1539 if (current.address_register[n->inst->src[i].address_slot(byte)] !=
1540 n->inst->src[i].nr)
1541 return true;
1542 }
1543 }
1544 }
1545
1546 return false;
1547 }
1548
1549 schedule_node *
1550 brw_instruction_scheduler::choose_instruction_to_schedule()
1551 {
1552 schedule_node *chosen = NULL;
1553
1554 if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_POST) {
1555 int chosen_time = 0;
1556
1557 /* Of the instructions ready to execute or the closest to being ready,
1558 * choose the one most likely to unblock an early program exit, or
1559 * otherwise the oldest one.
1560 */
1561 foreach_in_list(schedule_node, n, &current.available) {
1562 if (!post_reg_alloc && address_register_interfere(n))
1563 continue;
1564
1565 if (!chosen ||
1566 exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen) ||
1567 (exit_tmp_unblocked_time(n) == exit_tmp_unblocked_time(chosen) &&
1568 n->tmp.unblocked_time < chosen_time)) {
1569 chosen = n;
1570 chosen_time = n->tmp.unblocked_time;
1571 }
1572 }
1573 } else {
1574 int chosen_register_pressure_benefit = 0;
1575
1576 /* Before register allocation, we don't care about the latencies of
1577 * instructions. All we care about is reducing live intervals of
1578 * variables so that we can avoid register spilling, or get SIMD16
1579 * shaders which naturally do a better job of hiding instruction
1580 * latency.
1581 */
1582 foreach_in_list(schedule_node, n, &current.available) {
1583 if (!post_reg_alloc && address_register_interfere(n))
1584 continue;
1585
1586 if (!chosen) {
1587 chosen = n;
1588 chosen_register_pressure_benefit =
1589 get_register_pressure_benefit(chosen->inst);
1590 continue;
1591 }
1592
1593 /* Most important: If we can definitely reduce register pressure, do
1594 * so immediately.
1595 */
1596 int register_pressure_benefit = get_register_pressure_benefit(n->inst);
1597
1598 if (register_pressure_benefit > 0 &&
1599 register_pressure_benefit > chosen_register_pressure_benefit) {
1600 chosen = n;
1601 chosen_register_pressure_benefit = register_pressure_benefit;
1602 continue;
1603 } else if (chosen_register_pressure_benefit > 0 &&
1604 (register_pressure_benefit <
1605 chosen_register_pressure_benefit)) {
1606 continue;
1607 }
1608
1609 if (mode == BRW_SCHEDULE_PRE_LIFO) {
1610 /* Prefer instructions that recently became available for
1611 * scheduling. These are the things that are most likely to
1612 * (eventually) make a variable dead and reduce register pressure.
1613 * Typical register pressure estimates don't work for us because
1614 * most of our pressure comes from texturing, where no single
1615 * instruction to schedule will make a vec4 value dead.
1616 */
1617 if (n->tmp.cand_generation > chosen->tmp.cand_generation) {
1618 chosen = n;
1619 chosen_register_pressure_benefit = register_pressure_benefit;
1620 continue;
1621 } else if (n->tmp.cand_generation < chosen->tmp.cand_generation) {
1622 continue;
1623 }
1624 }
1625
1626 /* For instructions pushed on the cands list at the same time, prefer
1627 * the one with the highest delay to the end of the program. This is
1628 * most likely to have its values able to be consumed first (such as
1629 * for a large tree of lowered ubo loads, which appear reversed in
1630 * the instruction stream with respect to when they can be consumed).
1631 */
1632 if (n->delay > chosen->delay) {
1633 chosen = n;
1634 chosen_register_pressure_benefit = register_pressure_benefit;
1635 continue;
1636 } else if (n->delay < chosen->delay) {
1637 continue;
1638 }
1639
1640 /* Prefer the node most likely to unblock an early program exit.
1641 */
1642 if (exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen)) {
1643 chosen = n;
1644 chosen_register_pressure_benefit = register_pressure_benefit;
1645 continue;
1646 } else if (exit_tmp_unblocked_time(n) > exit_tmp_unblocked_time(chosen)) {
1647 continue;
1648 }
1649
1650 /* If all other metrics are equal, we prefer the first instruction in
1651 * the list (program execution order).
1652 */
1653 }
1654 }
1655
1656 return chosen;
1657 }
1658
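/**
 * Issue time of \p inst in cycles: 2 for an uncompressed instruction, 4 for
 * a compressed (SIMD16) one, plus an estimated penalty when a register bank
 * conflict is detected after register allocation.
 */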
1659 int
1660 brw_instruction_scheduler::calculate_issue_time(const fs_inst *inst)
1661 {
1662 const struct brw_isa_info *isa = &s->compiler->isa;
1663 const unsigned overhead = s->grf_used && has_bank_conflict(isa, inst) ?
1664 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE) : 0;
1665 if (is_compressed(inst))
1666 return 4 + overhead;
1667 else
1668 return 2 + overhead;
1669 }
1670
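/**
 * Commit \p chosen to the schedule: append its instruction to the current
 * block and advance the scheduling clock past its unblocked time and issue
 * time.
 */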
1671 void
1672 brw_instruction_scheduler::schedule(schedule_node *chosen)
1673 {
1674 assert(current.scheduled < current.len);
1675 current.scheduled++;
1676
1677 assert(chosen);
1678 chosen->remove();
1679 current.block->instructions.push_tail(chosen->inst);
1680
1681 /* If we expected a delay for scheduling, then bump the clock to reflect
1682 * that. In reality, the hardware will switch to another hyperthread
1683 * and may not return to dispatching our thread for a while even after
1684 * we're unblocked. After this, we have the time when the chosen
1685 * instruction will start executing.
1686 */
1687 current.time = MAX2(current.time, chosen->tmp.unblocked_time);
1688
1689 /* Update the clock for how soon an instruction could start after the
1690 * chosen one.
1691 */
1692 current.time += chosen->issue_time;
1693
1694 if (debug) {
1695 fprintf(stderr, "clock %4d, scheduled: ", current.time);
1696 brw_print_instruction(*s, chosen->inst);
1697 }
1698 }
1699
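/**
 * Post-scheduling bookkeeping for \p chosen: release the address register
 * slots it consumed, claim the slots it writes, and decrement the pending
 * parent count of each child, moving newly unblocked children onto the
 * available list.
 */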
1700 void
1701 brw_instruction_scheduler::update_children(schedule_node *chosen)
1702 {
1703 if (chosen->address_read_count > 0) {
1704 for (unsigned i = 0; i < chosen->inst->sources; i++) {
1705 if (!chosen->inst->src[i].is_address())
1706 continue;
1707 for (unsigned byte = 0; byte < chosen->inst->size_read(s->devinfo, i); byte += 2) {
1708 assert(chosen->inst->src[i].address_slot(byte) <
1709 ARRAY_SIZE(current.address_register));
1710 current.address_register[chosen->inst->src[i].address_slot(byte)] = 0;
1711 }
1712 }
1713 }
1714
1715 if (chosen->inst->dst.is_address()) {
1716 for (unsigned byte = 0; byte < chosen->inst->size_written; byte += 2) {
1717 assert(chosen->inst->dst.address_slot(byte) <
1718 ARRAY_SIZE(current.address_register));
1719 current.address_register[
1720 chosen->inst->dst.address_slot(byte)] = chosen->inst->dst.nr;
1721 }
1722 } else if (chosen->inst->uses_address_register_implicitly()) {
1723 memset(current.address_register, 0, sizeof(current.address_register));
1724 }
1725
1726 /* Now that we've scheduled a new instruction, some of its
1727 * children can be promoted to the list of instructions ready to
1728 * be scheduled. Update the children's unblocked time for this
1729 * DAG edge as we do so.
1730 */
1731 for (int i = chosen->children_count - 1; i >= 0; i--) {
1732 schedule_node_child *child = &chosen->children[i];
1733
1734 child->n->tmp.unblocked_time = MAX2(child->n->tmp.unblocked_time,
1735 current.time + child->effective_latency);
1736
1737 if (debug) {
1738 fprintf(stderr, "\tchild %d, %d parents: ", i, child->n->tmp.parent_count);
1739 brw_print_instruction(*s, child->n->inst);
1740 }
1741
1742 child->n->tmp.cand_generation = current.cand_generation;
1743 child->n->tmp.parent_count--;
1744 if (child->n->tmp.parent_count == 0) {
1745 if (debug) {
1746 fprintf(stderr, "\t\tnow available\n");
1747 }
1748 current.available.push_head(child->n);
1749 }
1750 }
1751 current.cand_generation++;
1752 }
1753
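/**
 * Schedule the current block: seed the available list with the DAG heads,
 * then repeatedly pick, emit, and unblock children until nothing is left.
 */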
1754 void
1755 brw_instruction_scheduler::schedule_instructions()
1756 {
1757 if (!post_reg_alloc)
1758 reg_pressure = reg_pressure_in[current.block->num];
1759
1760 assert(current.available.is_empty());
1761 for (schedule_node *n = current.start; n < current.end; n++) {
1762 reset_node_tmp(n);
1763
1764 /* Add DAG heads to the list of available instructions. */
1765 if (n->tmp.parent_count == 0)
1766 current.available.push_tail(n);
1767 }
1768
1769 current.block->instructions.make_empty();
1770
1771 memset(current.address_register, 0, sizeof(current.address_register));
1772
1773 while (!current.available.is_empty()) {
1774 schedule_node *chosen = choose_instruction_to_schedule();
1775 schedule(chosen);
1776
1777 if (!post_reg_alloc) {
1778 reg_pressure -= get_register_pressure_benefit(chosen->inst);
1779 update_register_pressure(chosen->inst);
1780 if (debug)
1781 fprintf(stderr, "(register pressure %d)\n", reg_pressure);
1782 }
1783
1784 update_children(chosen);
1785 }
1786 }
1787
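/** Run the scheduler in \p mode over every basic block of the program. */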
1788 void
1789 brw_instruction_scheduler::run(brw_instruction_scheduler_mode mode)
1790 {
1791 this->mode = mode;
1792
1793 if (debug && !post_reg_alloc) {
1794 fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
1795 post_reg_alloc);
1796 brw_print_instructions(*s);
1797 }
1798
1799 if (!post_reg_alloc) {
1800 memset(reads_remaining, 0, grf_count * sizeof(*reads_remaining));
1801 memset(hw_reads_remaining, 0, hw_reg_count * sizeof(*hw_reads_remaining));
1802 memset(written, 0, grf_count * sizeof(*written));
1803 }
1804
1805 foreach_block(block, s->cfg) {
1806 set_current_block(block);
1807
1808 if (!post_reg_alloc) {
1809 for (schedule_node *n = current.start; n < current.end; n++)
1810 count_reads_remaining(n->inst);
1811 }
1812
1813 schedule_instructions();
1814 }
1815
1816 if (debug && !post_reg_alloc) {
1817 fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
1818 post_reg_alloc);
1819 brw_print_instructions(*s);
1820 }
1821 }
1822
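/**
 * Construct the pre-register-allocation scheduler for \p s; the returned
 * object is allocated out of \p mem_ctx and consumed by
 * brw_schedule_instructions_pre_ra().
 */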
1823 brw_instruction_scheduler *
1824 brw_prepare_scheduler(fs_visitor &s, void *mem_ctx)
1825 {
1826 const int grf_count = s.alloc.count;
1827
1828 brw_instruction_scheduler *empty = rzalloc(mem_ctx, brw_instruction_scheduler);
1829 return new (empty) brw_instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
1830 s.cfg->num_blocks, /* post_reg_alloc */ false);
1831 }
1832
1833 void
1834 brw_schedule_instructions_pre_ra(fs_visitor &s, brw_instruction_scheduler *sched,
1835 brw_instruction_scheduler_mode mode)
1836 {
1837 if (mode == BRW_SCHEDULE_NONE)
1838 return;
1839
1840 sched->run(mode);
1841
1842 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1843 }
1844
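/**
 * Post-register-allocation scheduling entry point: builds a temporary
 * scheduler over the allocated GRFs and runs it in BRW_SCHEDULE_POST mode.
 */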
1845 void
1846 brw_schedule_instructions_post_ra(fs_visitor &s)
1847 {
1848 const bool post_reg_alloc = true;
1849 const int grf_count = reg_unit(s.devinfo) * s.grf_used;
1850
1851 void *mem_ctx = ralloc_context(NULL);
1852
1853 brw_instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
1854 s.cfg->num_blocks, post_reg_alloc);
1855 sched.run(BRW_SCHEDULE_POST);
1856
1857 ralloc_free(mem_ctx);
1858
1859 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1860 }
1861