1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Eric Anholt <eric@anholt.net>
25  *
26  */
27 
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30 #include "brw_fs_live_variables.h"
31 #include "brw_cfg.h"
32 #include <new>
33 
34 using namespace brw;
35 
36 /** @file
37  *
38  * List scheduling of FS instructions.
39  *
40  * The basic model of the list scheduler is to take a basic block,
41  * compute a DAG of the dependencies (RAW ordering with latency, WAW
42  * ordering with latency, WAR ordering), and make a list of the DAG heads.
43  * Heuristically pick a DAG head, then put all the children that are
44  * now DAG heads into the list of things to schedule.
45  *
46  * The heuristic is the important part.  We're trying to be cheap,
47  * since actually computing the optimal scheduling is NP complete.
48  * What we do is track a "current clock".  When we schedule a node, we
49  * update the earliest-unblocked clock time of its children, and
50  * increment the clock.  Then, when trying to schedule, we just pick
51  * the earliest-unblocked instruction to schedule.
52  *
53  * Note that often there will be many things which could execute
54  * immediately, and there are a range of heuristic options to choose
55  * from in picking among those.
56  */
57 
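/* Illustrative sketch only (not part of the original file): a minimal version
 * of the greedy loop described above, using a hypothetical toy_node type.  It
 * assumes <vector> and <algorithm>; the real logic lives in
 * brw_instruction_scheduler below.
 *
 *    struct toy_node {
 *       int unblocked_time;             // earliest clock this node may issue at
 *       int latency;                    // cycles until its result is usable
 *       int issue_time;                 // cycles the node occupies the front end
 *       int parent_count;               // unscheduled parents remaining
 *       std::vector<toy_node *> children;
 *    };
 *
 *    static void toy_schedule(std::vector<toy_node *> candidates)
 *    {
 *       int clock = 0;
 *       while (!candidates.empty()) {
 *          // "Heuristically pick a DAG head": here, the earliest-unblocked one.
 *          auto it = std::min_element(candidates.begin(), candidates.end(),
 *                                     [](const toy_node *a, const toy_node *b) {
 *                                        return a->unblocked_time < b->unblocked_time;
 *                                     });
 *          toy_node *chosen = *it;
 *          candidates.erase(it);
 *
 *          clock = std::max(clock, chosen->unblocked_time) + chosen->issue_time;
 *
 *          // Scheduling a node may unblock its children: once all of a child's
 *          // parents are scheduled, it becomes a new DAG head.
 *          for (toy_node *child : chosen->children) {
 *             child->unblocked_time = std::max(child->unblocked_time,
 *                                              clock + chosen->latency);
 *             if (--child->parent_count == 0)
 *                candidates.push_back(child);
 *          }
 *       }
 *    }
 */
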
58 static bool debug = false;
59 
60 struct schedule_node_child;
61 
62 class schedule_node : public exec_node
63 {
64 public:
65    void set_latency(const struct brw_isa_info *isa);
66 
67    fs_inst *inst;
68    schedule_node_child *children;
69    int children_count;
70    int children_cap;
71    int initial_parent_count;
72    int initial_unblocked_time;
73    int latency;
74 
75    /**
76     * The instruction's latency plus the maximum delay of its children, or
77     * just the issue_time if it's a leaf node.
78     */
79    int delay;
80 
81    /**
82     * Preferred exit node among the (direct or indirect) successors of this
83     * node.  Among the scheduler nodes blocked by this node, this will be the
84     * one that may cause earliest program termination, or NULL if none of the
85     * successors is an exit node.
86     */
87    schedule_node *exit;
88 
89    /**
90     * How many cycles this instruction takes to issue.
91     *
92     * Instructions in Gen hardware are handled one SIMD4 vector at a time,
93     * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
94     * cycles to dispatch and SIMD16 (compressed) instructions take 4.
95     */
96    int issue_time;
97 
98    /**
99     * Scheduler nodes that wrote the parts of the address register this
100     * instruction reads (used to propagate dependencies to those writes).
101     */
102    schedule_node **address_read;
103    int address_read_count;
104    int address_read_cap;
105 
106    /* Temporary data used during the scheduling process. */
107    struct {
108       int parent_count;
109       int unblocked_time;
110 
111       /**
112        * Which iteration of pushing groups of children onto the candidates list
113        * this node was a part of.
114        */
115       unsigned cand_generation;
116    } tmp;
117 };
118 
119 struct schedule_node_child {
120    schedule_node *n;
121    int effective_latency;
122 };
123 
124 static inline void
125 reset_node_tmp(schedule_node *n)
126 {
127    n->tmp.parent_count = n->initial_parent_count;
128    n->tmp.unblocked_time = n->initial_unblocked_time;
129    n->tmp.cand_generation = 0;
130 }
131 
132 /**
133  * Lower bound of the scheduling time after which one of the instructions
134  * blocked by this node may lead to program termination.
135  *
136  * exit_unblocked_time() determines a strict partial ordering relation '«' on
137  * the set of scheduler nodes as follows:
138  *
139  *   n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
140  *
141  * which can be used to heuristically order nodes according to how early they
142  * can unblock an exit node and lead to program termination.
143  */
144 static inline int
145 exit_tmp_unblocked_time(const schedule_node *n)
146 {
147    return n->exit ? n->exit->tmp.unblocked_time : INT_MAX;
148 }
149 
150 static inline int
151 exit_initial_unblocked_time(const schedule_node *n)
152 {
153    return n->exit ? n->exit->initial_unblocked_time : INT_MAX;
154 }
155 
156 void
157 schedule_node::set_latency(const struct brw_isa_info *isa)
158 {
159    switch (inst->opcode) {
160    case BRW_OPCODE_MAD:
161       /* 2 cycles
162        *  (since the last two src operands are in different register banks):
163        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
164        *
165        * 3 cycles on IVB, 4 on HSW
166        *  (since the last two src operands are in the same register bank):
167        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
168        *
169        * 18 cycles on IVB, 16 on HSW
170        *  (since the last two src operands are in different register banks):
171        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
172        * mov(8) null   g4<4,5,1>F                     { align16 WE_normal 1Q };
173        *
174        * 20 cycles on IVB, 18 on HSW
175        *  (since the last two src operands are in the same register bank):
176        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
177        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
178        */
179 
180       /* Our register allocator doesn't know about register banks, so use the
181        * higher latency.
182        */
183       latency = 18;
184       break;
185 
186    case BRW_OPCODE_LRP:
187       /* 2 cycles
188        *  (since the last two src operands are in different register banks):
189        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
190        *
191        * 3 cycles on IVB, 4 on HSW
192        *  (since the last two src operands are in the same register bank):
193        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
194        *
195        * 16 cycles on IVB, 14 on HSW
196        *  (since the last two src operands are in different register banks):
197        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
198        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
199        *
200        * 16 cycles
201        *  (since the last two src operands are in the same register bank):
202        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
203        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
204        */
205 
206       /* Our register allocator doesn't know about register banks, so use the
207        * higher latency.
208        */
209       latency = 14;
210       break;
211 
212    case SHADER_OPCODE_RCP:
213    case SHADER_OPCODE_RSQ:
214    case SHADER_OPCODE_SQRT:
215    case SHADER_OPCODE_LOG2:
216    case SHADER_OPCODE_EXP2:
217    case SHADER_OPCODE_SIN:
218    case SHADER_OPCODE_COS:
219       /* 2 cycles:
220        * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
221        *
222        * 18 cycles:
223        * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
224        * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
225        *
226        * Same for exp2, log2, rsq, sqrt, sin, cos.
227        */
228       latency = 16;
229       break;
230 
231    case SHADER_OPCODE_POW:
232       /* 2 cycles:
233        * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
234        *
235        * 26 cycles:
236        * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
237        * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
238        */
239       latency = 24;
240       break;
241 
242    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
243       /* testing using varying-index pull constants:
244        *
245        * 16 cycles:
246        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
247        * send(8) g4<1>F  g4<8,8,1>D
248        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
249        *
250        * ~480 cycles:
251        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
252        * send(8) g4<1>F  g4<8,8,1>D
253        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
254        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
255        *
256        * ~620 cycles:
257        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
258        * send(8) g4<1>F  g4<8,8,1>D
259        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
260        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
261        * send(8) g4<1>F  g4<8,8,1>D
262        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
263        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
264        *
265        * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
266        * about 460.  We expect to mostly be cache hot, so pick something more
267        * in that direction.
268        */
269       latency = 200;
270       break;
271 
272    case SHADER_OPCODE_SEND:
273       switch (inst->sfid) {
274       case BRW_SFID_SAMPLER: {
275          unsigned msg_type = (inst->desc >> 12) & 0x1f;
276          switch (msg_type) {
277          case GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO:
278          case GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO:
279             /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
280              * cycles (n=15):
281              * mov(8)   g114<1>UD  0D                  { align1 WE_normal 1Q };
282              * send(8)  g6<1>UW    g114<8,8,1>F
283              *   sampler (10, 0, 10, 1) mlen 1 rlen 4  { align1 WE_normal 1Q };
284              * mov(16)  g6<1>F     g6<8,8,1>D                { align1 WE_normal 1Q };
285              *
286              *
287              * Two loads was 535 +/- 30 cycles (n=19):
288              * mov(16)   g114<1>UD  0D                 { align1 WE_normal 1H };
289              * send(16)  g6<1>UW    g114<8,8,1>F
290              *   sampler (10, 0, 10, 2) mlen 2 rlen 8  { align1 WE_normal 1H };
291              * mov(16)   g114<1>UD  0D                 { align1 WE_normal 1H };
292              * mov(16)   g6<1>F     g6<8,8,1>D         { align1 WE_normal 1H };
293              * send(16)  g8<1>UW    g114<8,8,1>F
294              *   sampler (10, 0, 10, 2) mlen 2 rlen 8  { align1 WE_normal 1H };
295              * mov(16)   g8<1>F     g8<8,8,1>D         { align1 WE_normal 1H };
296              * add(16)   g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
297              *
298              * Since the only caches that should matter are just the
299              * instruction/state cache containing the surface state,
300              * assume that we always have hot caches.
301              */
302             latency = 100;
303             break;
304 
305          default:
306             /* 18 cycles:
307              * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
308              * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
309              * send(8) g4<1>UW    g114<8,8,1>F
310              *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
311              *
312              * 697 +/-49 cycles (min 610, n=26):
313              * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
314              * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
315              * send(8) g4<1>UW    g114<8,8,1>F
316              *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
317              * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
318              *
319              * So the latency on our first texture load of the batchbuffer
320              * takes ~700 cycles, since the caches are cold at that point.
321              *
322              * 840 +/- 92 cycles (min 720, n=25):
323              * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
324              * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
325              * send(8) g4<1>UW    g114<8,8,1>F
326              *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
327              * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
328              * send(8) g4<1>UW    g114<8,8,1>F
329              *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
330              * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
331              *
332              * On the second load, it takes just an extra ~140 cycles, and
333              * after accounting for the 14 cycles of the MOV's latency, that
334              * makes ~130.
335              *
336              * 683 +/- 49 cycles (min = 602, n=47):
337              * mov(8)  g115<1>F   0F                  { align1 WE_normal 1Q };
338              * mov(8)  g114<1>F   0F                  { align1 WE_normal 1Q };
339              * send(8) g4<1>UW    g114<8,8,1>F
340              *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
341              * send(8) g50<1>UW   g114<8,8,1>F
342              *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
343              * mov(8)  null       g4<8,8,1>F          { align1 WE_normal 1Q };
344              *
345              * The unit appears to be pipelined, since this matches up with
346              * the cache-cold case, despite there being two loads here.  If
347              * you replace the g4 in the MOV to null with g50, it's still
348              * 693 +/- 52 (n=39).
349              *
350              * So, take some number between the cache-hot 140 cycles and the
351              * cache-cold 700 cycles.  No particular tuning was done on this.
352              *
353              * I haven't done significant testing of the non-TEX opcodes.
354              * TXL at least looked about the same as TEX.
355              */
356             latency = 200;
357             break;
358          }
359          break;
360       }
361 
362       case GFX6_SFID_DATAPORT_CONSTANT_CACHE:
363          /* See FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD */
364          latency = 200;
365          break;
366 
367       case GFX6_SFID_DATAPORT_RENDER_CACHE:
368          switch (brw_fb_desc_msg_type(isa->devinfo, inst->desc)) {
369          case GFX7_DATAPORT_RC_TYPED_SURFACE_WRITE:
370          case GFX7_DATAPORT_RC_TYPED_SURFACE_READ:
371             /* See also SHADER_OPCODE_TYPED_SURFACE_READ */
372             latency = 600;
373             break;
374 
375          case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP:
376             /* See also SHADER_OPCODE_TYPED_ATOMIC */
377             latency = 14000;
378             break;
379 
380          case GFX6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE:
381             /* completely fabricated number */
382             latency = 600;
383             break;
384 
385          default:
386             unreachable("Unknown render cache message");
387          }
388          break;
389 
390       case GFX7_SFID_DATAPORT_DATA_CACHE:
391          switch ((inst->desc >> 14) & 0x1f) {
392          case BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ:
393          case GFX7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ:
394          case GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE:
395             /* We have no data for this but assume it's a little faster than
396              * untyped surface read/write.
397              */
398             latency = 200;
399             break;
400 
401          case GFX7_DATAPORT_DC_DWORD_SCATTERED_READ:
402          case GFX6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE:
403          case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ:
404          case HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE:
405             /* We have no data for this but assume it's roughly the same as
406              * untyped surface read/write.
407              */
408             latency = 300;
409             break;
410 
411          case GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ:
412          case GFX7_DATAPORT_DC_UNTYPED_SURFACE_WRITE:
413             /* Test code:
414              *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
415              *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
416              *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
417              *   send(8)   g4<1>UD         g112<8,8,1>UD
418              *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
419              *   .
420              *   . [repeats 8 times]
421              *   .
422              *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
423              *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
424              *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
425              *   send(8)   g4<1>UD         g112<8,8,1>UD
426              *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
427              *
428              * Running it 100 times as fragment shader on a 128x128 quad
429              * gives an average latency of 583 cycles per surface read,
430              * standard deviation 0.9%.
431              */
432             latency = 600;
433             break;
434 
435          case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
436             /* Test code:
437              *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
438              *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
439              *   mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
440              *   send(8)   g4<1>ud         g112<8,8,1>ud
441              *             data (38, 5, 6) mlen 2 rlen 1      { align1 WE_normal 1Q };
442              *
443              * Running it 100 times as fragment shader on a 128x128 quad
444              * gives an average latency of 13867 cycles per atomic op,
445              * standard deviation 3%.  Note that this is a rather
446              * pessimistic estimate, the actual latency in cases with few
447              * collisions between threads and favorable pipelining has been
448              * seen to be reduced by a factor of 100.
449              */
450             latency = 14000;
451             break;
452 
453          default:
454             unreachable("Unknown data cache message");
455          }
456          break;
457 
458       case HSW_SFID_DATAPORT_DATA_CACHE_1:
459          switch (brw_dp_desc_msg_type(isa->devinfo, inst->desc)) {
460          case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
461          case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
462          case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
463          case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE:
464          case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_WRITE:
465          case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_SURFACE_READ:
466          case GFX8_DATAPORT_DC_PORT1_A64_SCATTERED_WRITE:
467          case GFX9_DATAPORT_DC_PORT1_A64_SCATTERED_READ:
468          case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_READ:
469          case GFX9_DATAPORT_DC_PORT1_A64_OWORD_BLOCK_WRITE:
470             /* See also GFX7_DATAPORT_DC_UNTYPED_SURFACE_READ */
471             latency = 300;
472             break;
473 
474          case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
475          case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
476          case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
477          case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
478          case GFX9_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_FLOAT_OP:
479          case GFX8_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_OP:
480          case GFX9_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_FLOAT_OP:
481          case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_INT_OP:
482          case GFX12_DATAPORT_DC_PORT1_A64_UNTYPED_ATOMIC_HALF_FLOAT_OP:
483             /* See also GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP */
484             latency = 14000;
485             break;
486 
487          default:
488             unreachable("Unknown data cache message");
489          }
490          break;
491 
492       case GFX7_SFID_PIXEL_INTERPOLATOR:
493          latency = 50; /* TODO */
494          break;
495 
496       case GFX12_SFID_UGM:
497       case GFX12_SFID_TGM:
498       case GFX12_SFID_SLM:
499          switch (lsc_msg_desc_opcode(isa->devinfo, inst->desc)) {
500          case LSC_OP_LOAD:
501          case LSC_OP_STORE:
502          case LSC_OP_LOAD_CMASK:
503          case LSC_OP_STORE_CMASK:
504             latency = 300;
505             break;
506          case LSC_OP_FENCE:
507          case LSC_OP_ATOMIC_INC:
508          case LSC_OP_ATOMIC_DEC:
509          case LSC_OP_ATOMIC_LOAD:
510          case LSC_OP_ATOMIC_STORE:
511          case LSC_OP_ATOMIC_ADD:
512          case LSC_OP_ATOMIC_SUB:
513          case LSC_OP_ATOMIC_MIN:
514          case LSC_OP_ATOMIC_MAX:
515          case LSC_OP_ATOMIC_UMIN:
516          case LSC_OP_ATOMIC_UMAX:
517          case LSC_OP_ATOMIC_CMPXCHG:
518          case LSC_OP_ATOMIC_FADD:
519          case LSC_OP_ATOMIC_FSUB:
520          case LSC_OP_ATOMIC_FMIN:
521          case LSC_OP_ATOMIC_FMAX:
522          case LSC_OP_ATOMIC_FCMPXCHG:
523          case LSC_OP_ATOMIC_AND:
524          case LSC_OP_ATOMIC_OR:
525          case LSC_OP_ATOMIC_XOR:
526             latency = 1400;
527             break;
528          default:
529             unreachable("unsupported new data port message instruction");
530          }
531          break;
532 
533       case BRW_SFID_MESSAGE_GATEWAY:
534       case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: /* or THREAD_SPAWNER */
535       case GEN_RT_SFID_RAY_TRACE_ACCELERATOR:
536          /* TODO.
537           *
538           * We'll assume for the moment that this is pretty quick as it
539           * doesn't actually return any data.
540           */
541          latency = 200;
542          break;
543 
544       case BRW_SFID_URB:
545          latency = 200;
546          break;
547 
548       default:
549          unreachable("Unknown SFID");
550       }
551       break;
552 
553    case BRW_OPCODE_DPAS:
554       switch (inst->rcount) {
555       case 1:
556          latency = 21;
557          break;
558       case 2:
559          latency = 22;
560          break;
561       case 8:
562       default:
563          latency = 32;
564          break;
565       }
566       break;
567 
568    default:
569       /* 2 cycles:
570        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
571        *
572        * 16 cycles:
573        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
574        * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
575        */
576       latency = 14;
577       break;
578    }
579 }
580 
581 class brw_instruction_scheduler {
582 public:
583    brw_instruction_scheduler(void *mem_ctx, const fs_visitor *s, int grf_count, int hw_reg_count,
584                          int block_count, bool post_reg_alloc);
585 
586    void add_barrier_deps(schedule_node *n);
587    void add_cross_lane_deps(schedule_node *n);
588    void add_dep(schedule_node *before, schedule_node *after, int latency);
589    void add_dep(schedule_node *before, schedule_node *after);
590    void add_address_dep(schedule_node *before, schedule_node *after);
591 
592    void set_current_block(bblock_t *block);
593    void compute_delays();
594    void compute_exits();
595 
596    void schedule(schedule_node *chosen);
597    void update_children(schedule_node *chosen);
598 
599    void calculate_deps();
600    bool is_compressed(const fs_inst *inst);
601    bool register_needs_barrier(const brw_reg &reg);
602    bool address_register_interfere(const schedule_node *n);
603    schedule_node *choose_instruction_to_schedule();
604    int calculate_issue_time(const fs_inst *inst);
605 
606    void count_reads_remaining(const fs_inst *inst);
607    void setup_liveness(cfg_t *cfg);
608    void update_register_pressure(const fs_inst *inst);
609    int get_register_pressure_benefit(const fs_inst *inst);
610    void clear_last_grf_write();
611 
612    void schedule_instructions();
613    void run(brw_instruction_scheduler_mode mode);
614 
615    int grf_index(const brw_reg &reg);
616 
617    void *mem_ctx;
618    linear_ctx *lin_ctx;
619 
620    schedule_node *nodes;
621    int nodes_len;
622 
623    /* Current block being processed. */
624    struct {
625       bblock_t *block;
626 
627       /* Range of nodes in the block.  End will point to first node
628        * address after the block, i.e. the range is [start, end).
629        */
630       schedule_node *start;
631       schedule_node *end;
632       int len;
633 
634       int scheduled;
635 
636       unsigned cand_generation;
637       int time;
638       exec_list available;
639 
640       /* Currently used address register */
641       uint32_t address_register[16];
642    } current;
643 
644    bool post_reg_alloc;
645    int grf_count;
646    const fs_visitor *s;
647 
648    /**
649     * Last instruction to have written the grf (or a channel in the grf, for the
650     * scalar backend)
651     */
652    schedule_node **last_grf_write;
653 
654    unsigned hw_reg_count;
655    int reg_pressure;
656    brw_instruction_scheduler_mode mode;
657 
658    /*
659     * The register pressure at the beginning of each basic block.
660     */
661 
662    int *reg_pressure_in;
663 
664    /*
665     * The virtual GRF's whose range overlaps the beginning of each basic block.
666     */
667 
668    BITSET_WORD **livein;
669 
670    /*
671     * The virtual GRF's whose range overlaps the end of each basic block.
672     */
673 
674    BITSET_WORD **liveout;
675 
676    /*
677     * The hardware GRF's whose range overlaps the end of each basic block.
678     */
679 
680    BITSET_WORD **hw_liveout;
681 
682    /*
683     * Whether we've scheduled a write for this virtual GRF yet.
684     */
685 
686    bool *written;
687 
688    /*
689     * How many reads we haven't scheduled for this virtual GRF yet.
690     */
691 
692    int *reads_remaining;
693 
694    /*
695     * How many reads we haven't scheduled for this hardware GRF yet.
696     */
697 
698    int *hw_reads_remaining;
699 };
700 
701 brw_instruction_scheduler::brw_instruction_scheduler(void *mem_ctx, const fs_visitor *s,
702                                              int grf_count, int hw_reg_count,
703                                              int block_count, bool post_reg_alloc)
704    : s(s)
705 {
706    this->mem_ctx = mem_ctx;
707    this->lin_ctx = linear_context(this->mem_ctx);
708    this->grf_count = grf_count;
709    this->post_reg_alloc = post_reg_alloc;
710 
711    const unsigned grf_write_scale = MAX_VGRF_SIZE(s->devinfo);
712    this->last_grf_write = linear_zalloc_array(lin_ctx, schedule_node *, grf_count * grf_write_scale);
713 
714    this->nodes_len = s->cfg->last_block()->end_ip + 1;
715    this->nodes = linear_zalloc_array(lin_ctx, schedule_node, this->nodes_len);
716 
717    const struct brw_isa_info *isa = &s->compiler->isa;
718 
719    schedule_node *n = nodes;
720    foreach_block_and_inst(block, fs_inst, inst, s->cfg) {
721       n->inst = inst;
722 
723       if (!post_reg_alloc)
724          n->latency = 1;
725       else
726          n->set_latency(isa);
727 
728       n++;
729    }
730    assert(n == nodes + nodes_len);
731 
732    current.block = NULL;
733    current.start = NULL;
734    current.end = NULL;
735    current.len = 0;
736    current.time = 0;
737    current.cand_generation = 0;
738    current.available.make_empty();
739 
740    this->hw_reg_count = hw_reg_count;
741    this->mode = BRW_SCHEDULE_NONE;
742    this->reg_pressure = 0;
743 
744    if (!post_reg_alloc) {
745       this->reg_pressure_in = linear_zalloc_array(lin_ctx, int, block_count);
746 
747       this->livein = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
748       for (int i = 0; i < block_count; i++)
749          this->livein[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
750                                          BITSET_WORDS(grf_count));
751 
752       this->liveout = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
753       for (int i = 0; i < block_count; i++)
754          this->liveout[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
755                                           BITSET_WORDS(grf_count));
756 
757       this->hw_liveout = linear_alloc_array(lin_ctx, BITSET_WORD *, block_count);
758       for (int i = 0; i < block_count; i++)
759          this->hw_liveout[i] = linear_zalloc_array(lin_ctx, BITSET_WORD,
760                                              BITSET_WORDS(hw_reg_count));
761 
762       setup_liveness(s->cfg);
763 
764       this->written = linear_alloc_array(lin_ctx, bool, grf_count);
765 
766       this->reads_remaining = linear_alloc_array(lin_ctx, int, grf_count);
767 
768       this->hw_reads_remaining = linear_alloc_array(lin_ctx, int, hw_reg_count);
769    } else {
770       this->reg_pressure_in = NULL;
771       this->livein = NULL;
772       this->liveout = NULL;
773       this->hw_liveout = NULL;
774       this->written = NULL;
775       this->reads_remaining = NULL;
776       this->hw_reads_remaining = NULL;
777    }
778 
779    foreach_block(block, s->cfg) {
780       set_current_block(block);
781 
782       for (schedule_node *n = current.start; n < current.end; n++)
783          n->issue_time = calculate_issue_time(n->inst);
784 
785       calculate_deps();
786       compute_delays();
787       compute_exits();
788    }
789 }
790 
791 static bool
792 is_src_duplicate(const fs_inst *inst, int src)
793 {
794    for (int i = 0; i < src; i++)
795      if (inst->src[i].equals(inst->src[src]))
796        return true;
797 
798   return false;
799 }
800 
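/* Record this instruction's reads against the per-VGRF and per-hardware-GRF
 * counters; update_register_pressure() and get_register_pressure_benefit()
 * consume them later to detect when the last outstanding read of a register
 * is scheduled.
 */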
801 void
802 brw_instruction_scheduler::count_reads_remaining(const fs_inst *inst)
803 {
804    assert(reads_remaining);
805 
806    for (int i = 0; i < inst->sources; i++) {
807       if (is_src_duplicate(inst, i))
808          continue;
809 
810       if (inst->src[i].file == VGRF) {
811          reads_remaining[inst->src[i].nr]++;
812       } else if (inst->src[i].file == FIXED_GRF) {
813          if (inst->src[i].nr >= hw_reg_count)
814             continue;
815 
816          for (unsigned j = 0; j < regs_read(s->devinfo, inst, i); j++)
817             hw_reads_remaining[inst->src[i].nr + j]++;
818       }
819    }
820 }
821 
822 void
823 brw_instruction_scheduler::setup_liveness(cfg_t *cfg)
824 {
825    const fs_live_variables &live = s->live_analysis.require();
826 
827    /* First, compute liveness on a per-GRF level using the in/out sets from
828     * liveness calculation.
829     */
830    for (int block = 0; block < cfg->num_blocks; block++) {
831       for (int i = 0; i < live.num_vars; i++) {
832          if (BITSET_TEST(live.block_data[block].livein, i)) {
833             int vgrf = live.vgrf_from_var[i];
834             if (!BITSET_TEST(livein[block], vgrf)) {
835                reg_pressure_in[block] += s->alloc.sizes[vgrf];
836                BITSET_SET(livein[block], vgrf);
837             }
838          }
839 
840          if (BITSET_TEST(live.block_data[block].liveout, i))
841             BITSET_SET(liveout[block], live.vgrf_from_var[i]);
842       }
843    }
844 
845    /* Now, extend the live in/live out sets for when a range crosses a block
846     * boundary, which matches what our register allocator/interference code
847     * does to account for force_writemask_all and incompatible exec_mask's.
848     */
849    for (int block = 0; block < cfg->num_blocks - 1; block++) {
850       for (int i = 0; i < grf_count; i++) {
851          if (live.vgrf_start[i] <= cfg->blocks[block]->end_ip &&
852              live.vgrf_end[i] >= cfg->blocks[block + 1]->start_ip) {
853             if (!BITSET_TEST(livein[block + 1], i)) {
854                 reg_pressure_in[block + 1] += s->alloc.sizes[i];
855                 BITSET_SET(livein[block + 1], i);
856             }
857 
858             BITSET_SET(liveout[block], i);
859          }
860       }
861    }
862 
863    int *payload_last_use_ip = ralloc_array(NULL, int, hw_reg_count);
864    s->calculate_payload_ranges(true, hw_reg_count, payload_last_use_ip);
865 
866    for (unsigned i = 0; i < hw_reg_count; i++) {
867       if (payload_last_use_ip[i] == -1)
868          continue;
869 
870       for (int block = 0; block < cfg->num_blocks; block++) {
871          if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
872             reg_pressure_in[block]++;
873 
874          if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
875             BITSET_SET(hw_liveout[block], i);
876       }
877    }
878 
879    ralloc_free(payload_last_use_ip);
880 }
881 
882 void
883 brw_instruction_scheduler::update_register_pressure(const fs_inst *inst)
884 {
885    assert(reads_remaining);
886 
887    if (inst->dst.file == VGRF) {
888       written[inst->dst.nr] = true;
889    }
890 
891    for (int i = 0; i < inst->sources; i++) {
892       if (is_src_duplicate(inst, i))
893           continue;
894 
895       if (inst->src[i].file == VGRF) {
896          reads_remaining[inst->src[i].nr]--;
897       } else if (inst->src[i].file == FIXED_GRF &&
898                  inst->src[i].nr < hw_reg_count) {
899          for (unsigned off = 0; off < regs_read(s->devinfo, inst, i); off++)
900             hw_reads_remaining[inst->src[i].nr + off]--;
901       }
902    }
903 }
904 
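/* Estimate how scheduling this instruction next would change register
 * pressure: negative when its destination starts a new VGRF live range (first
 * write of a register that is not live-in to the block), positive when a
 * source is the last outstanding read of a register that is not live-out of
 * the block.
 */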
905 int
906 brw_instruction_scheduler::get_register_pressure_benefit(const fs_inst *inst)
907 {
908    int benefit = 0;
909    const int block_idx = current.block->num;
910 
911    if (inst->dst.file == VGRF) {
912       if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
913           !written[inst->dst.nr])
914          benefit -= s->alloc.sizes[inst->dst.nr];
915    }
916 
917    for (int i = 0; i < inst->sources; i++) {
918       if (is_src_duplicate(inst, i))
919          continue;
920 
921       if (inst->src[i].file == VGRF &&
922           !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
923           reads_remaining[inst->src[i].nr] == 1)
924          benefit += s->alloc.sizes[inst->src[i].nr];
925 
926       if (inst->src[i].file == FIXED_GRF &&
927           inst->src[i].nr < hw_reg_count) {
928          for (unsigned off = 0; off < regs_read(s->devinfo, inst, i); off++) {
929             int reg = inst->src[i].nr + off;
930             if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
931                 hw_reads_remaining[reg] == 1) {
932                benefit++;
933             }
934          }
935       }
936    }
937 
938    return benefit;
939 }
940 
941 void
942 brw_instruction_scheduler::set_current_block(bblock_t *block)
943 {
944    current.block = block;
945    current.start = nodes + block->start_ip;
946    current.len = block->end_ip - block->start_ip + 1;
947    current.end = current.start + current.len;
948    current.time = 0;
949    current.scheduled = 0;
950    current.cand_generation = 1;
951 }
952 
953 /** Computation of the delay member of each node. */
954 void
955 brw_instruction_scheduler::compute_delays()
956 {
957    for (schedule_node *n = current.end - 1; n >= current.start; n--) {
958       if (!n->children_count) {
959          n->delay = n->issue_time;
960       } else {
961          for (int i = 0; i < n->children_count; i++) {
962             if (n->children[i].n->delay == 0) {
963                /* This is a special case for the address register, where a child
964                 * could be a prior instruction.
965                 *
966                 * This ensures that an address register write instruction will
967                 * always unblock the reader of the address register. Otherwise
968                 * we could end up with scheduling deadlocks.
969                 */
970                assert(n->children[i].n->inst->dst.is_address());
971                n->delay = MAX2(n->delay, 1);
972             } else {
973                n->delay = MAX2(n->delay, n->latency + n->children[i].n->delay);
974             }
975          }
976       }
977    }
978 }
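
/* Illustrative example (not part of the original source): for a dependency
 * chain A -> B -> C where C is a leaf with issue_time 2, B has latency 14 and
 * A has latency 200, the bottom-up walk above computes
 *
 *    delay(C) = 2
 *    delay(B) = 14 + 2   = 16
 *    delay(A) = 200 + 16 = 216
 *
 * so delay is the length of the dependency chain hanging off each node, which
 * the scheduling heuristics can use to prioritize long critical paths.
 */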
979 
980 void
981 brw_instruction_scheduler::compute_exits()
982 {
983    /* Calculate a lower bound of the scheduling time of each node in the
984     * graph.  This is analogous to the node's critical path but calculated
985     * from the top instead of from the bottom of the block.
986     */
987    for (schedule_node *n = current.start; n < current.end; n++) {
988       for (int i = 0; i < n->children_count; i++) {
989          schedule_node_child *child = &n->children[i];
990          child->n->initial_unblocked_time =
991             MAX2(child->n->initial_unblocked_time,
992                  n->initial_unblocked_time + n->issue_time + child->effective_latency);
993       }
994    }
995 
996    /* Calculate the exit of each node by induction based on the exit nodes of
997     * its children.  The preferred exit of a node is the one among the exit
998     * nodes of its children which can be unblocked first according to the
999     * optimistic unblocked time estimate calculated above.
1000     */
1001    for (schedule_node *n = current.end - 1; n >= current.start; n--) {
1002       n->exit = (n->inst->opcode == BRW_OPCODE_HALT ? n : NULL);
1003 
1004       for (int i = 0; i < n->children_count; i++) {
1005          if (exit_initial_unblocked_time(n->children[i].n) < exit_initial_unblocked_time(n))
1006             n->exit = n->children[i].n->exit;
1007       }
1008    }
1009 }
1010 
1011 /**
1012  * Add a dependency between two instruction nodes.
1013  *
1014  * The @after node will be scheduled after @before.  We will try to
1015  * schedule it @latency cycles after @before, but no guarantees there.
1016  */
1017 void
1018 brw_instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
1019                                int latency)
1020 {
1021    if (!before || !after)
1022       return;
1023 
1024    assert(before != after);
1025 
1026    for (int i = 0; i < before->children_count; i++) {
1027       schedule_node_child *child = &before->children[i];
1028       if (child->n == after) {
1029          child->effective_latency = MAX2(child->effective_latency, latency);
1030          return;
1031       }
1032    }
1033 
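   /* Grow the children array geometrically (16, 32, 64, ...) so that adding a
    * dependency stays amortized O(1).
    */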
1034    if (before->children_cap <= before->children_count) {
1035       if (before->children_cap < 16)
1036          before->children_cap = 16;
1037       else
1038          before->children_cap *= 2;
1039 
1040       before->children = reralloc(mem_ctx, before->children,
1041                                   schedule_node_child,
1042                                   before->children_cap);
1043    }
1044 
1045    schedule_node_child *child = &before->children[before->children_count];
1046    child->n = after;
1047    child->effective_latency = latency;
1048    before->children_count++;
1049    after->initial_parent_count++;
1050 
1051    /* Propagate the dependency to the address register instructions. */
1052    for (int i = 0; i < after->address_read_count; i++)
1053       add_dep(before, after->address_read[i]);
1054 }
1055 
1056 void
1057 brw_instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
1058 {
1059    if (!before)
1060       return;
1061 
1062    add_dep(before, after, before->latency);
1063 }
1064 
1065 void
1066 brw_instruction_scheduler::add_address_dep(schedule_node *before, schedule_node *after)
1067 {
1068    assert(before && after);
1069 
1070    add_dep(before, after, before->latency);
1071 
1072    if (after->address_read_cap <= after->address_read_count) {
1073       after->address_read_cap = MAX2(2 * after->address_read_cap, 1);
1074 
1075       after->address_read = reralloc(mem_ctx, after->address_read,
1076                                      schedule_node *,
1077                                      after->address_read_cap);
1078    }
1079 
1080    after->address_read[after->address_read_count++] = before;
1081 }
1082 
1083 static bool
1084 is_scheduling_barrier(const fs_inst *inst)
1085 {
1086    return inst->opcode == SHADER_OPCODE_HALT_TARGET ||
1087           inst->is_control_flow() ||
1088           inst->has_side_effects();
1089 }
1090 
1091 static bool
1092 has_cross_lane_access(const fs_inst *inst)
1093 {
1094    /* FINISHME:
1095     *
1096     * This function is likely incomplete in terms of identifying cross-lane
1097     * accesses.
1098     */
1099    if (inst->opcode == SHADER_OPCODE_BROADCAST ||
1100        inst->opcode == SHADER_OPCODE_CLUSTER_BROADCAST ||
1101        inst->opcode == SHADER_OPCODE_SHUFFLE ||
1102        inst->opcode == FS_OPCODE_LOAD_LIVE_CHANNELS ||
1103        inst->opcode == SHADER_OPCODE_LOAD_LIVE_CHANNELS ||
1104        inst->opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1105        inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL)
1106       return true;
1107 
1108    for (unsigned s = 0; s < inst->sources; s++) {
1109       if (inst->src[s].file == VGRF) {
1110          if (inst->src[s].stride == 0)
1111             return true;
1112       }
1113    }
1114 
1115    return false;
1116 }
1117 
1118 /**
1119  * Some register accesses need dependencies on other instructions.
1120  */
1121 bool
1122 brw_instruction_scheduler::register_needs_barrier(const brw_reg &reg)
1123 {
1124    if (reg.file != ARF || reg.is_null())
1125       return false;
1126 
1127    /* If you look at the SR register layout, there is nothing in there that
1128     * depends on other instructions. This is just fixed dispatch information.
1129     *
1130     * ATSM PRMs, Volume 9: Render Engine, State Register Fields :
1131     *    sr0.0:
1132     *      - 0:2   TID
1133     *      - 4:13  Slice, DSS, Subslice, EU IDs
1134     *      - 20:22 Priority
1135     *      - 23:23 Priority class
1136     *      - 24:27 FFID
1137     *    sr0.1:
1138     *      - 0:5   IEEE Exception
1139     *      - 21:31 FFTID
1140     *    sr0.2:
1141     *      - 0:31  Dispatch Mask
1142     *    sr0.3:
1143     *      - 0:31  Vector Mask
1144     */
1145    if (reg.nr == BRW_ARF_STATE)
1146       return false;
1147 
1148    return true;
1149 }
1150 
1151 /**
1152  * Sometimes we really want this node to execute after everything that
1153  * came before it and before everything that follows it.  This adds
1154  * the deps to do so.
1155  */
1156 void
1157 brw_instruction_scheduler::add_barrier_deps(schedule_node *n)
1158 {
1159    for (schedule_node *prev = n - 1; prev >= current.start; prev--) {
1160       add_dep(prev, n, 0);
1161       if (is_scheduling_barrier(prev->inst))
1162          break;
1163    }
1164 
1165    for (schedule_node *next = n + 1; next < current.end; next++) {
1166       add_dep(n, next, 0);
1167       if (is_scheduling_barrier(next->inst))
1168          break;
1169    }
1170 }
1171 
1172 /**
1173  * Because some instructions like HALT can disable lanes, scheduling them
1174  * prior to a cross-lane access should not be allowed, otherwise we could end
1175  * up with later instructions accessing uninitialized data.
1176  */
1177 void
1178 brw_instruction_scheduler::add_cross_lane_deps(schedule_node *n)
1179 {
1180    for (schedule_node *prev = n - 1; prev >= current.start; prev--) {
1181       if (has_cross_lane_access((fs_inst*)prev->inst))
1182          add_dep(prev, n, 0);
1183    }
1184 }
1185 
1186 /* instruction scheduling needs to be aware of when an MRF write
1187  * actually writes 2 MRFs.
1188  */
1189 bool
1190 brw_instruction_scheduler::is_compressed(const fs_inst *inst)
1191 {
1192    return inst->exec_size == 16;
1193 }
1194 
1195 /* Clears last_grf_write to be ready to start calculating deps for a block
1196  * again.
1197  *
1198  * Since pre-ra grf_count scales with instructions, and instructions scale with
1199  * BBs, we don't want to memset all of last_grf_write per block or you'll end up
1200  * BBs, we don't want to memset all of last_grf_write per block or we'd end up
1201  * O(n^2) in the number of blocks.  For shaders using softfp64, we get a *lot* of
1202  *
1203  * We don't bother being careful for post-ra, since then grf_count doesn't scale
1204  * with instructions.
1205  */
1206 void
1207 brw_instruction_scheduler::clear_last_grf_write()
1208 {
1209    if (!post_reg_alloc) {
1210       for (schedule_node *n = current.start; n < current.end; n++) {
1211          fs_inst *inst = (fs_inst *)n->inst;
1212 
1213          if (inst->dst.file == VGRF) {
1214             /* Don't bother being careful with regs_written(), quicker to just clear 2 cachelines. */
1215             memset(&last_grf_write[inst->dst.nr * MAX_VGRF_SIZE(s->devinfo)], 0,
1216                    sizeof(*last_grf_write) * MAX_VGRF_SIZE(s->devinfo));
1217          }
1218       }
1219    } else {
1220       memset(last_grf_write, 0,
1221              sizeof(*last_grf_write) * grf_count * MAX_VGRF_SIZE(s->devinfo));
1222    }
1223 }
1224 
1225 int
1226 brw_instruction_scheduler::grf_index(const brw_reg &reg)
1227 {
1228    if (post_reg_alloc)
1229       return reg.nr;
1230    return reg.nr * MAX_VGRF_SIZE(s->devinfo) + reg.offset / REG_SIZE;
1231 }
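
/* Worked example (illustrative): pre-RA, with REG_SIZE == 32 bytes and
 * assuming MAX_VGRF_SIZE(devinfo) == 16 for the sake of the arithmetic (the
 * actual value is device-dependent), a read of VGRF 5 at byte offset 64 maps
 * to index 5 * 16 + 64 / 32 = 82, so every 32-byte slice of a virtual GRF
 * gets its own last-write slot.  Post-RA the index is simply the hardware GRF
 * number.
 */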
1232 
1233 void
1234 brw_instruction_scheduler::calculate_deps()
1235 {
1236    /* Pre-register-allocation, this tracks the last write per VGRF offset.
1237     * After register allocation, reg_offsets are gone and we track individual
1238     * GRF registers.
1239     */
1240    schedule_node *last_conditional_mod[8] = {};
1241    schedule_node *last_accumulator_write = NULL;
1242    /* Fixed HW registers are assumed to be separate from the virtual
1243     * GRFs, so they can be tracked separately.  We don't really write
1244     * to fixed GRFs much, so don't bother tracking them on a more
1245     * granular level.
1246     */
1247    schedule_node *last_fixed_grf_write = NULL;
1248    schedule_node *last_address_write[16] = {};
1249 
1250    /* top-to-bottom dependencies: RAW and WAW. */
1251 
1252    if (!post_reg_alloc) {
1253       /* Address registers have a virtual identifier, allowing us to identify
1254        * which instructions need the values written to the register. The
1255        * address register is written/read in pairs of instructions (enforced
1256        * by brw_fs_validate.cpp).
1257        *
1258        * To allow scheduling of SEND messages, out of order, without the
1259        * address register tracking generating serialized dependency between
1260        * all the messages, we first track all the dependencies of the address
1261        * register. Those dependencies are added to the instructions consuming
1262        * the address register value. Then when doing the normal dependency
1263        * tracking, any node adding a dependency to an instruction consuming
1264        * the address register is also added as dependency to the instruction
1265        * writing the value to the address register.
1266        *
1267        * This scheme allows the scheduling done by
1268        * choose_instruction_to_schedule() to ensure that once an instruction
1269        * writing the address register is scheduled, we can always schedule all
1270        * instructions making use of the address register value. Otherwise we
1271        * could run into scheduling deadlocks.
1272        *
1273        * Here is a deadlock example :
1274        *
1275        *    mov    a0, 0x42
1276        *    send grf1, ..., a0
1277        *    mov    a0, 0x43
1278        *    send grf2, grf1, a0
1279        *
1280        * Let's say choose_instruction_to_schedule() chooses the second mov
1281        * instruction first (mov a0, 0x43). Then it cannot schedule the second
1282        * send instruction because the first send instruction populating grf1
1283        * has not been scheduled yet, and we cannot schedule the first mov
1284        * either because the address register is already in use for another
1285        * message.
1286        *
1287        * In post-register-allocation mode, this scheme cannot work as all GRFs
1288        * can get reused and we have to serialize all address register usages
1289        * (like the accumulator, flag, etc...).
1290        */
1291       for (schedule_node *n = current.start; n < current.end; n++) {
1292          fs_inst *inst = (fs_inst *)n->inst;
1293 
1294          /* Pre-pass going over instructions using the address register as a
1295           * source.
1296           */
1297          for (int i = 0; i < inst->sources; i++) {
1298             if (!inst->src[i].is_address())
1299                continue;
1300 
1301             for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
1302                assert(inst->src[i].address_slot(byte) < ARRAY_SIZE(last_address_write));
1303                schedule_node *write_addr_node =
1304                   last_address_write[inst->src[i].address_slot(byte)];
1305                assert(write_addr_node->inst->dst.nr == inst->src[i].nr);
1306                add_address_dep(write_addr_node, n);
1307             }
1308          }
1309 
1310          if (inst->dst.is_address()) {
1311             for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
1312                last_address_write[inst->dst.address_slot(byte)] = n;
1313             }
1314          }
1315       }
1316    }
1317 
1318    for (schedule_node *n = current.start; n < current.end; n++) {
1319       fs_inst *inst = (fs_inst *)n->inst;
1320 
1321       if (is_scheduling_barrier(inst))
1322          add_barrier_deps(n);
1323 
1324       if (inst->opcode == BRW_OPCODE_HALT ||
1325           inst->opcode == SHADER_OPCODE_HALT_TARGET)
1326           add_cross_lane_deps(n);
1327 
1328       /* read-after-write deps. */
1329       for (int i = 0; i < inst->sources; i++) {
1330          if (inst->src[i].file == VGRF) {
1331             for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1332                add_dep(last_grf_write[grf_index(inst->src[i]) + r], n);
1333          } else if (inst->src[i].file == FIXED_GRF) {
1334             if (post_reg_alloc) {
1335                for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1336                   add_dep(last_grf_write[inst->src[i].nr + r], n);
1337             } else {
1338                add_dep(last_fixed_grf_write, n);
1339             }
1340          } else if (inst->src[i].is_accumulator()) {
1341             add_dep(last_accumulator_write, n);
1342          } else if (inst->src[i].is_address()) {
1343             if (post_reg_alloc) {
1344                for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2)
1345                   add_dep(last_address_write[inst->src[i].address_slot(byte)], n);
1346             }
1347          } else if (register_needs_barrier(inst->src[i])) {
1348             add_barrier_deps(n);
1349          }
1350       }
1351 
1352       if (const unsigned mask = inst->flags_read(s->devinfo)) {
1353          assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1354 
1355          for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1356             if (mask & (1 << i))
1357                add_dep(last_conditional_mod[i], n);
1358          }
1359       }
1360 
1361       if (inst->reads_accumulator_implicitly()) {
1362          add_dep(last_accumulator_write, n);
1363       }
1364 
1365       /* write-after-write deps. */
1366       if (inst->dst.file == VGRF) {
1367          int grf_idx = grf_index(inst->dst);
1368          for (unsigned r = 0; r < regs_written(inst); r++) {
1369             add_dep(last_grf_write[grf_idx + r], n);
1370             last_grf_write[grf_idx + r] = n;
1371          }
1372       } else if (inst->dst.file == FIXED_GRF) {
1373          if (post_reg_alloc) {
1374             for (unsigned r = 0; r < regs_written(inst); r++) {
1375                add_dep(last_grf_write[inst->dst.nr + r], n);
1376                last_grf_write[inst->dst.nr + r] = n;
1377             }
1378          } else {
1379             add_dep(last_fixed_grf_write, n);
1380             last_fixed_grf_write = n;
1381          }
1382       } else if (inst->dst.is_accumulator()) {
1383          add_dep(last_accumulator_write, n);
1384          last_accumulator_write = n;
1385       } else if (inst->dst.is_address()) {
1386          if (post_reg_alloc) {
1387             for (unsigned byte = 0; byte < inst->size_written; byte += 2) {
1388                add_dep(last_address_write[inst->dst.address_slot(byte)], n);
1389                last_address_write[inst->dst.address_slot(byte)] = n;
1390             }
1391          }
1392       } else if (register_needs_barrier(inst->dst)) {
1393          add_barrier_deps(n);
1394       }
1395 
1396       if (const unsigned mask = inst->flags_written(s->devinfo)) {
1397          assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1398 
1399          for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1400             if (mask & (1 << i)) {
1401                add_dep(last_conditional_mod[i], n, 0);
1402                last_conditional_mod[i] = n;
1403             }
1404          }
1405       }
1406 
1407       if (inst->writes_accumulator_implicitly(s->devinfo) &&
1408           !inst->dst.is_accumulator()) {
1409          add_dep(last_accumulator_write, n);
1410          last_accumulator_write = n;
1411       }
1412 
1413       if (post_reg_alloc && inst->uses_address_register_implicitly()) {
1414          for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++) {
1415             add_dep(last_address_write[i], n);
1416             last_address_write[i] = n;
1417          }
1418       }
1419    }
1420 
1421    clear_last_grf_write();
1422 
1423    /* bottom-to-top dependencies: WAR */
1424    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
1425    last_accumulator_write = NULL;
1426    last_fixed_grf_write = NULL;
1427    memset(last_address_write, 0, sizeof(last_address_write));
1428 
1429    for (schedule_node *n = current.end - 1; n >= current.start; n--) {
1430       fs_inst *inst = (fs_inst *)n->inst;
1431 
1432       /* write-after-read deps. */
1433       for (int i = 0; i < inst->sources; i++) {
1434          if (inst->src[i].file == VGRF) {
1435             for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1436                add_dep(n, last_grf_write[grf_index(inst->src[i]) + r], 0);
1437          } else if (inst->src[i].file == FIXED_GRF) {
1438             if (post_reg_alloc) {
1439                for (unsigned r = 0; r < regs_read(s->devinfo, inst, i); r++)
1440                   add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
1441             } else {
1442                add_dep(n, last_fixed_grf_write, 0);
1443             }
1444          } else if (inst->src[i].is_accumulator()) {
1445             add_dep(n, last_accumulator_write, 0);
1446          } else if (inst->src[i].is_address()) {
1447             if (post_reg_alloc) {
1448                for (unsigned byte = 0; byte < inst->size_read(s->devinfo, i); byte += 2) {
1449                   add_dep(n, last_address_write[inst->src[i].address_slot(byte)], 0);
1450                }
1451             }
1452          } else if (register_needs_barrier(inst->src[i])) {
1453             add_barrier_deps(n);
1454          }
1455       }
1456 
1457       if (const unsigned mask = inst->flags_read(s->devinfo)) {
1458          assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1459 
1460          for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1461             if (mask & (1 << i))
1462                add_dep(n, last_conditional_mod[i]);
1463          }
1464       }
1465 
1466       if (inst->reads_accumulator_implicitly()) {
1467          add_dep(n, last_accumulator_write);
1468       }
1469 
1470       if (post_reg_alloc && inst->uses_address_register_implicitly()) {
1471          for (unsigned i = 0; i < ARRAY_SIZE(last_address_write); i++)
1472             last_address_write[i] = n;
1473       }
1474 
1475       /* Update the things this instruction wrote, so that earlier reads
1476        * can be marked as WAR dependencies on it.
1477        */
1478       if (inst->dst.file == VGRF) {
1479          for (unsigned r = 0; r < regs_written(inst); r++)
1480             last_grf_write[grf_index(inst->dst) + r] = n;
1481       } else if (inst->dst.file == FIXED_GRF) {
1482          if (post_reg_alloc) {
1483             for (unsigned r = 0; r < regs_written(inst); r++)
1484                last_grf_write[inst->dst.nr + r] = n;
1485          } else {
1486             last_fixed_grf_write = n;
1487          }
1488       } else if (inst->dst.is_accumulator()) {
1489          last_accumulator_write = n;
1490       } else if (inst->dst.is_address()) {
1491          if (post_reg_alloc) {
1492             for (unsigned byte = 0; byte < inst->size_written; byte += 2)
1493                last_address_write[inst->dst.address_slot(byte)] = n;
1494          }
1495       } else if (register_needs_barrier(inst->dst)) {
1496          add_barrier_deps(n);
1497       }
1498 
1499       if (const unsigned mask = inst->flags_written(s->devinfo)) {
1500          assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
1501 
1502          for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
1503             if (mask & (1 << i))
1504                last_conditional_mod[i] = n;
1505          }
1506       }
1507 
1508       if (inst->writes_accumulator_implicitly(s->devinfo)) {
1509          last_accumulator_write = n;
1510       }
1511    }
1512 
1513    clear_last_grf_write();
1514 }
1515 
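/* Check whether scheduling this node next would conflict with the address
 * register contents tracked in current.address_register: an implicit
 * address-register user needs every slot to be free, an address write must
 * not clobber a slot holding a different register, and an address read must
 * still find the value it expects.  Only consulted before register
 * allocation (see choose_instruction_to_schedule()).
 */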
1516 bool
1517 brw_instruction_scheduler::address_register_interfere(const schedule_node *n)
1518 {
1519    if (n->inst->uses_address_register_implicitly()) {
1520       for (unsigned i = 0; i < ARRAY_SIZE(current.address_register); i++)
1521          if (current.address_register[i] != 0)
1522             return true;
1523       return false;
1524    }
1525 
1526    if (n->inst->dst.is_address()) {
1527       for (unsigned byte = 0; byte < n->inst->size_written; byte += 2) {
1528          if (current.address_register[n->inst->dst.address_slot(byte)] != 0 &&
1529              current.address_register[n->inst->dst.address_slot(byte)] != n->inst->dst.nr)
1530             return true;
1531       }
1532    }
1533 
1534    if (n->address_read_count > 0) {
1535       for (unsigned i = 0; i < n->inst->sources; i++) {
1536          if (!n->inst->src[i].is_address())
1537             continue;
1538          for (unsigned byte = 0; byte < n->inst->size_read(s->devinfo, i); byte += 2) {
1539             if (current.address_register[n->inst->src[i].address_slot(byte)] !=
1540                 n->inst->src[i].nr)
1541                return true;
1542          }
1543       }
1544    }
1545 
1546    return false;
1547 }
1548 
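/* Pick the next node to schedule from current.available.  In the PRE and
 * POST modes this is the node most likely to unblock an early program exit,
 * tie-broken by the earliest unblocked time.  In the pressure-sensitive
 * pre-RA modes the choice is made by register-pressure benefit first, then
 * (for PRE_LIFO) candidate generation, then critical-path delay, exit
 * proximity, and finally program order.
 */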
1549 schedule_node *
1550 brw_instruction_scheduler::choose_instruction_to_schedule()
1551 {
1552    schedule_node *chosen = NULL;
1553 
1554    if (mode == BRW_SCHEDULE_PRE || mode == BRW_SCHEDULE_POST) {
1555       int chosen_time = 0;
1556 
1557       /* Of the instructions ready to execute or the closest to being ready,
1558        * choose the one most likely to unblock an early program exit, or
1559        * otherwise the oldest one.
1560        */
1561       foreach_in_list(schedule_node, n, &current.available) {
1562          if (!post_reg_alloc && address_register_interfere(n))
1563             continue;
1564 
1565          if (!chosen ||
1566              exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen) ||
1567              (exit_tmp_unblocked_time(n) == exit_tmp_unblocked_time(chosen) &&
1568               n->tmp.unblocked_time < chosen_time)) {
1569             chosen = n;
1570             chosen_time = n->tmp.unblocked_time;
1571          }
1572       }
1573    } else {
1574       int chosen_register_pressure_benefit = 0;
1575 
1576       /* Before register allocation, we don't care about the latencies of
1577        * instructions.  All we care about is reducing live intervals of
1578        * variables so that we can avoid register spilling, or get SIMD16
1579        * shaders which naturally do a better job of hiding instruction
1580        * latency.
1581        */
1582       foreach_in_list(schedule_node, n, &current.available) {
1583          if (!post_reg_alloc && address_register_interfere(n))
1584             continue;
1585 
1586          if (!chosen) {
1587             chosen = n;
1588             chosen_register_pressure_benefit =
1589                   get_register_pressure_benefit(chosen->inst);
1590             continue;
1591          }
1592 
1593          /* Most important: If we can definitely reduce register pressure, do
1594           * so immediately.
1595           */
1596          int register_pressure_benefit = get_register_pressure_benefit(n->inst);
1597 
1598          if (register_pressure_benefit > 0 &&
1599              register_pressure_benefit > chosen_register_pressure_benefit) {
1600             chosen = n;
1601             chosen_register_pressure_benefit = register_pressure_benefit;
1602             continue;
1603          } else if (chosen_register_pressure_benefit > 0 &&
1604                     (register_pressure_benefit <
1605                      chosen_register_pressure_benefit)) {
1606             continue;
1607          }
1608 
1609          if (mode == BRW_SCHEDULE_PRE_LIFO) {
1610             /* Prefer instructions that recently became available for
1611              * scheduling.  These are the things that are most likely to
1612              * (eventually) make a variable dead and reduce register pressure.
1613              * Typical register pressure estimates don't work for us because
1614              * most of our pressure comes from texturing, where no single
1615              * instruction to schedule will make a vec4 value dead.
1616              */
1617             if (n->tmp.cand_generation > chosen->tmp.cand_generation) {
1618                chosen = n;
1619                chosen_register_pressure_benefit = register_pressure_benefit;
1620                continue;
1621             } else if (n->tmp.cand_generation < chosen->tmp.cand_generation) {
1622                continue;
1623             }
1624          }
1625 
1626          /* For instructions pushed onto the candidate list at the same time,
1627           * prefer the one with the highest delay to the end of the program.
1628           * Its results are the most likely to be consumable soonest (such as
1629           * a large tree of lowered UBO loads, which appears reversed in the
1630           * instruction stream with respect to when the loads can be consumed).
1631           */
1632          if (n->delay > chosen->delay) {
1633             chosen = n;
1634             chosen_register_pressure_benefit = register_pressure_benefit;
1635             continue;
1636          } else if (n->delay < chosen->delay) {
1637             continue;
1638          }
1639 
1640          /* Prefer the node most likely to unblock an early program exit.
1641           */
1642          if (exit_tmp_unblocked_time(n) < exit_tmp_unblocked_time(chosen)) {
1643             chosen = n;
1644             chosen_register_pressure_benefit = register_pressure_benefit;
1645             continue;
1646          } else if (exit_tmp_unblocked_time(n) > exit_tmp_unblocked_time(chosen)) {
1647             continue;
1648          }
1649 
1650          /* If all other metrics are equal, we prefer the first instruction in
1651           * the list (program execution).
1652           */
1653       }
1654    }
1655 
1656    return chosen;
1657 }
1658 
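/* Estimate how many cycles this instruction occupies the issue logic:
 * 2 for a regular instruction, 4 if it is compressed, plus an extra
 * penalty when the operands are expected to cause a register bank
 * conflict.
 */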
1659 int
1660 brw_instruction_scheduler::calculate_issue_time(const fs_inst *inst)
1661 {
1662    const struct brw_isa_info *isa = &s->compiler->isa;
1663    const unsigned overhead = s->grf_used && has_bank_conflict(isa, inst) ?
1664       DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE) : 0;
1665    if (is_compressed(inst))
1666       return 4 + overhead;
1667    else
1668       return 2 + overhead;
1669 }
1670 
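/* Commit the chosen node: append its instruction to the block being
 * rebuilt and advance the scheduling clock past the node's unblocked
 * time and issue time.
 */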
1671 void
1672 brw_instruction_scheduler::schedule(schedule_node *chosen)
1673 {
1674    assert(current.scheduled < current.len);
1675    current.scheduled++;
1676 
1677    assert(chosen);
1678    chosen->remove();
1679    current.block->instructions.push_tail(chosen->inst);
1680 
1681    /* If we expected a delay for scheduling, then bump the clock to reflect
1682     * that.  In reality, the hardware will switch to another hyperthread
1683     * and may not return to dispatching our thread for a while even after
1684     * we're unblocked.  After this, we have the time when the chosen
1685     * instruction will start executing.
1686     */
1687    current.time = MAX2(current.time, chosen->tmp.unblocked_time);
1688 
1689    /* Update the clock for how soon an instruction could start after the
1690     * chosen one.
1691     */
1692    current.time += chosen->issue_time;
1693 
1694    if (debug) {
1695       fprintf(stderr, "clock %4d, scheduled: ", current.time);
1696       brw_print_instruction(*s, chosen->inst);
1697    }
1698 }
1699 
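/* After scheduling `chosen`, update the tracked address register contents
 * and walk its DAG children: propagate the edge latency into each child's
 * unblocked time and, once a child has no unscheduled parents left, move
 * it onto the available list.
 */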
1700 void
1701 brw_instruction_scheduler::update_children(schedule_node *chosen)
1702 {
1703    if (chosen->address_read_count > 0) {
1704       for (unsigned i = 0; i < chosen->inst->sources; i++) {
1705          if (!chosen->inst->src[i].is_address())
1706             continue;
1707          for (unsigned byte = 0; byte < chosen->inst->size_read(s->devinfo, i); byte += 2) {
1708             assert(chosen->inst->src[i].address_slot(byte) <
1709                    ARRAY_SIZE(current.address_register));
1710             current.address_register[chosen->inst->src[i].address_slot(byte)] = 0;
1711          }
1712       }
1713    }
1714 
1715    if (chosen->inst->dst.is_address()) {
1716       for (unsigned byte = 0; byte < chosen->inst->size_written; byte += 2) {
1717          assert(chosen->inst->dst.address_slot(byte) <
1718                 ARRAY_SIZE(current.address_register));
1719          current.address_register[
1720             chosen->inst->dst.address_slot(byte)] = chosen->inst->dst.nr;
1721       }
1722    } else if (chosen->inst->uses_address_register_implicitly()) {
1723       memset(current.address_register, 0, sizeof(current.address_register));
1724    }
1725 
1726    /* Now that we've scheduled a new instruction, some of its
1727     * children can be promoted to the list of instructions ready to
1728     * be scheduled.  Update the children's unblocked time for this
1729     * DAG edge as we do so.
1730     */
1731    for (int i = chosen->children_count - 1; i >= 0; i--) {
1732       schedule_node_child *child = &chosen->children[i];
1733 
1734       child->n->tmp.unblocked_time = MAX2(child->n->tmp.unblocked_time,
1735                                           current.time + child->effective_latency);
1736 
1737       if (debug) {
1738          fprintf(stderr, "\tchild %d, %d parents: ", i, child->n->tmp.parent_count);
1739          brw_print_instruction(*s, child->n->inst);
1740       }
1741 
1742       child->n->tmp.cand_generation = current.cand_generation;
1743       child->n->tmp.parent_count--;
1744       if (child->n->tmp.parent_count == 0) {
1745          if (debug) {
1746             fprintf(stderr, "\t\tnow available\n");
1747          }
1748          current.available.push_head(child->n);
1749       }
1750    }
1751    current.cand_generation++;
1752 }
1753 
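/* Schedule one basic block: seed the available list with the DAG heads,
 * empty the block's instruction list, then repeatedly pick and schedule
 * nodes until everything has been re-emitted, tracking register pressure
 * in the pre-RA case.
 */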
1754 void
1755 brw_instruction_scheduler::schedule_instructions()
1756 {
1757    if (!post_reg_alloc)
1758       reg_pressure = reg_pressure_in[current.block->num];
1759 
1760    assert(current.available.is_empty());
1761    for (schedule_node *n = current.start; n < current.end; n++) {
1762       reset_node_tmp(n);
1763 
1764       /* Add DAG heads to the list of available instructions. */
1765       if (n->tmp.parent_count == 0)
1766          current.available.push_tail(n);
1767    }
1768 
1769    current.block->instructions.make_empty();
1770 
1771    memset(current.address_register, 0, sizeof(current.address_register));
1772 
1773    while (!current.available.is_empty()) {
1774       schedule_node *chosen = choose_instruction_to_schedule();
1775       schedule(chosen);
1776 
1777       if (!post_reg_alloc) {
1778          reg_pressure -= get_register_pressure_benefit(chosen->inst);
1779          update_register_pressure(chosen->inst);
1780          if (debug)
1781             fprintf(stderr, "(register pressure %d)\n", reg_pressure);
1782       }
1783 
1784       update_children(chosen);
1785    }
1786 }
1787 
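/* Top-level entry point: set up per-block state for every basic block in
 * the CFG and schedule its instructions; with debug enabled, the pre-RA
 * passes also dump the IR before and after scheduling.
 */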
1788 void
1789 brw_instruction_scheduler::run(brw_instruction_scheduler_mode mode)
1790 {
1791    this->mode = mode;
1792 
1793    if (debug && !post_reg_alloc) {
1794       fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
1795               post_reg_alloc);
1796       brw_print_instructions(*s);
1797    }
1798 
1799    if (!post_reg_alloc) {
1800       memset(reads_remaining, 0, grf_count * sizeof(*reads_remaining));
1801       memset(hw_reads_remaining, 0, hw_reg_count * sizeof(*hw_reads_remaining));
1802       memset(written, 0, grf_count * sizeof(*written));
1803    }
1804 
1805    foreach_block(block, s->cfg) {
1806       set_current_block(block);
1807 
1808       if (!post_reg_alloc) {
1809          for (schedule_node *n = current.start; n < current.end; n++)
1810             count_reads_remaining(n->inst);
1811       }
1812 
1813       schedule_instructions();
1814    }
1815 
1816    if (debug && !post_reg_alloc) {
1817       fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
1818               post_reg_alloc);
1819       brw_print_instructions(*s);
1820    }
1821 }
1822 
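/* Allocate and construct a pre-RA scheduler for the shader on the given
 * ralloc context.  A minimal usage sketch, assuming the caller owns both
 * the fs_visitor `s` and the ralloc context `mem_ctx`:
 *
 *    brw_instruction_scheduler *sched = brw_prepare_scheduler(s, mem_ctx);
 *    brw_schedule_instructions_pre_ra(s, sched, BRW_SCHEDULE_PRE);
 *    ...register allocation runs in between...
 *    brw_schedule_instructions_post_ra(s);
 */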
1823 brw_instruction_scheduler *
1824 brw_prepare_scheduler(fs_visitor &s, void *mem_ctx)
1825 {
1826    const int grf_count = s.alloc.count;
1827 
1828    brw_instruction_scheduler *empty = rzalloc(mem_ctx, brw_instruction_scheduler);
1829    return new (empty) brw_instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
1830                                                 s.cfg->num_blocks, /* post_reg_alloc */ false);
1831 }
1832 
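/* Run the scheduler prepared by brw_prepare_scheduler() in the requested
 * pre-RA mode, unless scheduling is disabled (BRW_SCHEDULE_NONE), and
 * invalidate instruction-level analyses afterwards.
 */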
1833 void
1834 brw_schedule_instructions_pre_ra(fs_visitor &s, brw_instruction_scheduler *sched,
1835                                  brw_instruction_scheduler_mode mode)
1836 {
1837    if (mode == BRW_SCHEDULE_NONE)
1838       return;
1839 
1840    sched->run(mode);
1841 
1842    s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1843 }
1844 
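/* Post-RA scheduling: build a temporary scheduler sized for the allocated
 * register file and run it in BRW_SCHEDULE_POST mode, where hiding latency
 * matters more than register pressure.
 */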
1845 void
1846 brw_schedule_instructions_post_ra(fs_visitor &s)
1847 {
1848    const bool post_reg_alloc = true;
1849    const int grf_count = reg_unit(s.devinfo) * s.grf_used;
1850 
1851    void *mem_ctx = ralloc_context(NULL);
1852 
1853    brw_instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
1854                                    s.cfg->num_blocks, post_reg_alloc);
1855    sched.run(BRW_SCHEDULE_POST);
1856 
1857    ralloc_free(mem_ctx);
1858 
1859    s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1860 }
1861