1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_visitor.cpp
25  *
26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
27  * makes it easier to do backend-specific optimizations than doing so
28  * in the GLSL IR or in the native code.
29  */
30 #include "brw_eu.h"
31 #include "brw_fs.h"
32 #include "brw_fs_builder.h"
33 #include "brw_nir.h"
34 #include "compiler/glsl_types.h"
35 
36 using namespace brw;
37 
38 /* Input data is organized with the per-primitive values first, followed
39  * by the per-vertex values.  The per-vertex values have interpolation
40  * information associated, so use 4 components for each value.
41  */
42 
43 /* The register location here is relative to the start of the URB
44  * data.  It will get adjusted to be a real location before
45  * generate_code() time.
46  */
47 fs_reg
48 fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
49                        unsigned channel, unsigned comp)
50 {
51    assert(stage == MESA_SHADER_FRAGMENT);
52    assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);
53 
54    const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
55 
56    assert(prog_data->urb_setup[location] >= 0);
57    unsigned nr = prog_data->urb_setup[location];
58    channel += prog_data->urb_setup_channel[location];
59 
60    /* Adjust so we start counting from the first per_vertex input. */
61    assert(nr >= prog_data->num_per_primitive_inputs);
62    nr -= prog_data->num_per_primitive_inputs;
63 
64    const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
65    const unsigned regnr = per_vertex_start + (nr * 4) + channel;
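   /* Worked example with illustrative (made-up) numbers: with
    * num_per_primitive_inputs = 2, urb_setup[location] = 5,
    * urb_setup_channel[location] = 0 and channel = 1, this gives
    * nr = 5 - 2 = 3 and regnr = 2 + 3 * 4 + 1 = 15, i.e. the per-vertex
    * values start right after the per-primitive ones and each per-vertex
    * value occupies 4 consecutive ATTR slots.
    */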
66 
67    if (max_polygons > 1) {
68       /* In multipolygon dispatch each plane parameter is a
69        * dispatch_width-wide SIMD vector (see comment in
70        * assign_urb_setup()), so we need to use offset() instead of
71        * component() to select the specified parameter.
72        */
73       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
74       bld.MOV(tmp, offset(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_UD),
75                           dispatch_width, comp));
76       return retype(tmp, BRW_REGISTER_TYPE_F);
77    } else {
78       return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp);
79    }
80 }
81 
82 /* The register location here is relative to the start of the URB
83  * data.  It will get adjusted to be a real location before
84  * generate_code() time.
85  */
86 fs_reg
87 fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
88 {
89    assert(stage == MESA_SHADER_FRAGMENT);
90    assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
91 
92    const struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
93 
94    comp += prog_data->urb_setup_channel[location];
95 
96    assert(prog_data->urb_setup[location] >= 0);
97 
98    const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
99 
100    assert(regnr < prog_data->num_per_primitive_inputs);
101 
102    if (max_polygons > 1) {
103       /* In multipolygon dispatch each primitive constant is a
104        * dispatch_width-wide SIMD vector (see comment in
105        * assign_urb_setup()), so we need to use offset() instead of
106        * component() to select the specified parameter.
107        */
108       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
109       bld.MOV(tmp, offset(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_UD),
110                           dispatch_width, comp % 4));
111       return retype(tmp, BRW_REGISTER_TYPE_F);
112    } else {
113       return component(fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F), comp % 4);
114    }
115 }
116 
117 /** Emits the interpolation for the varying inputs. */
118 void
119 fs_visitor::emit_interpolation_setup()
120 {
121    const fs_builder bld = fs_builder(this).at_end();
122    fs_builder abld = bld.annotate("compute pixel centers");
123 
124    this->pixel_x = vgrf(glsl_float_type());
125    this->pixel_y = vgrf(glsl_float_type());
126 
127    const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) this->key;
128    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
129 
130    fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
131    fs_reg int_sample_offset_xy; /* Used on Gen8+ */
132    fs_reg half_int_sample_offset_x, half_int_sample_offset_y;
133    if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
134       /* The thread payload only delivers subspan locations (ss0, ss1,
135        * ss2, ...). Since a subspan covers a 2x2 pixel block, we need to
136        * generate 4 pixel coordinates out of each subspan location. We do this
137        * by replicating a subspan coordinate 4 times and adding an offset of 1
138        * in each direction from the initial top left (tl) location to generate
139        * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
140        * (br = +1 in x, +1 in y).
141        *
142        * The locations we build look like this in SIMD8 :
143        *
144        *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
145        *
146        * The value 0x11001010 is a vector of 8 half bytes (nibbles). It adds the
147        * following to generate the 4 pixel coordinates out of subspan 0:
148        *
149        *  0x
150        *    1 : ss0.y + 1 -> ss0.br.y
151        *    1 : ss0.y + 1 -> ss0.bl.y
152        *    0 : ss0.y + 0 -> ss0.tr.y
153        *    0 : ss0.y + 0 -> ss0.tl.y
154        *    1 : ss0.x + 1 -> ss0.br.x
155        *    0 : ss0.x + 0 -> ss0.bl.x
156        *    1 : ss0.x + 1 -> ss0.tr.x
157        *    0 : ss0.x + 0 -> ss0.tl.x
158        *
159        * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
160        * coordinates out of 2 subspan coordinates in a single ADD instruction
161        * (twice the operation above).
162        */
163       int_sample_offset_xy = fs_reg(brw_imm_v(0x11001010));
164       half_int_sample_offset_x = fs_reg(brw_imm_uw(0));
165       half_int_sample_offset_y = fs_reg(brw_imm_uw(0));
166       /* On Gfx12.5, because of regioning restrictions, the interpolation code
167        * is slightly different and works off X & Y only inputs. The ordering
168        * of the half bytes here is a bit odd, with each subspan replicated
169        * twice and every other element discarded:
170        *
171        *             ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
172        *  X offset:    0      0      1      0      0      0      1      0
173        *  Y offset:    0      0      0      0      1      0      1      0
174        */
175       int_sample_offset_x = fs_reg(brw_imm_v(0x01000100));
176       int_sample_offset_y = fs_reg(brw_imm_v(0x01010000));
177    }
178 
179    fs_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */
180    fs_reg int_coarse_offset_xy; /* Used on Gen8+ */
181    fs_reg half_int_coarse_offset_x, half_int_coarse_offset_y;
182    if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
183       /* In coarse pixel dispatch we have to do the same ADD instruction that
184        * we do in normal per pixel dispatch, except this time we're not adding
185        * 1 in each direction, but instead the coarse pixel size.
186        *
187        * The coarse pixel size is delivered as 2 u8 in r1.0
188        */
189       struct brw_reg r1_0 = retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), BRW_REGISTER_TYPE_UB);
190 
191       const fs_builder dbld =
192          abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);
193 
194       if (devinfo->verx10 >= 125) {
195          /* To build the array of half bytes we do an AND operation with the
196           * right mask in X.
197           */
198          int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
199          dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));
200 
201          /* And the right mask in Y. */
202          int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
203          dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
204       } else {
205          /* To build the array of half bytes we do an AND operation with the
206           * right mask in X.
207           */
208          int_coarse_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
209          dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
210 
211          /* And the right mask in Y. */
212          int_coarse_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
213          dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
214 
215          /* Finally OR the 2 registers. */
216          int_coarse_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
217          dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y);
218       }
219 
220       /* Also compute half the coarse pixel size, used to center the coarse pixels. */
221       half_int_coarse_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
222       half_int_coarse_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);
223 
224       bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
225       bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
226    }
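   /* Worked example with illustrative numbers: for a 2x2 coarse pixel size
    * the payload delivers r1.0 = {2, 2} (two u8 values), so the ANDs above
    * build per-channel offset vectors whose entries are either 0 or the
    * coarse size (here 2) instead of 0 or 1, and half_int_coarse_offset_x/y
    * each become 1, which is later used to center positions within the
    * coarse pixel.
    */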
227 
228    fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
229    fs_reg int_pixel_offset_xy; /* Used on Gen8+ */
230    fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
231    switch (wm_prog_data->coarse_pixel_dispatch) {
232    case BRW_NEVER:
233       int_pixel_offset_x = int_sample_offset_x;
234       int_pixel_offset_y = int_sample_offset_y;
235       int_pixel_offset_xy = int_sample_offset_xy;
236       half_int_pixel_offset_x = half_int_sample_offset_x;
237       half_int_pixel_offset_y = half_int_sample_offset_y;
238       break;
239 
240    case BRW_SOMETIMES: {
241       const fs_builder dbld =
242          abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);
243 
244       check_dynamic_msaa_flag(dbld, wm_prog_data,
245                               INTEL_MSAA_FLAG_COARSE_RT_WRITES);
246 
247       int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
248       set_predicate(BRW_PREDICATE_NORMAL,
249                     dbld.SEL(int_pixel_offset_x,
250                              int_coarse_offset_x,
251                              int_sample_offset_x));
252 
253       int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
254       set_predicate(BRW_PREDICATE_NORMAL,
255                     dbld.SEL(int_pixel_offset_y,
256                              int_coarse_offset_y,
257                              int_sample_offset_y));
258 
259       int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
260       set_predicate(BRW_PREDICATE_NORMAL,
261                     dbld.SEL(int_pixel_offset_xy,
262                              int_coarse_offset_xy,
263                              int_sample_offset_xy));
264 
265       half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
266       set_predicate(BRW_PREDICATE_NORMAL,
267                     bld.SEL(half_int_pixel_offset_x,
268                             half_int_coarse_offset_x,
269                             half_int_sample_offset_x));
270 
271       half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);
272       set_predicate(BRW_PREDICATE_NORMAL,
273                     bld.SEL(half_int_pixel_offset_y,
274                             half_int_coarse_offset_y,
275                             half_int_sample_offset_y));
276       break;
277    }
278 
279    case BRW_ALWAYS:
280       int_pixel_offset_x = int_coarse_offset_x;
281       int_pixel_offset_y = int_coarse_offset_y;
282       int_pixel_offset_xy = int_coarse_offset_xy;
283       half_int_pixel_offset_x = half_int_coarse_offset_x;
284       half_int_pixel_offset_y = half_int_coarse_offset_y;
285       break;
286    }
287 
288    for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
289       const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
290       /* According to the "PS Thread Payload for Normal Dispatch"
291        * pages on the BSpec, subspan X/Y coordinates are stored in
292        * R1.2-R1.5/R2.2-R2.5 on gfx6+, and on R0.10-R0.13/R1.10-R1.13
293        * on gfx20+.  gi_reg is the 32B section of the GRF that
294        * contains the subspan coordinates.
295        */
296       const struct brw_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
297                                     brw_vec1_grf(i + 1, 0);
298       const struct brw_reg gi_uw = retype(gi_reg, BRW_REGISTER_TYPE_UW);
299 
300       if (devinfo->verx10 >= 125) {
301          const fs_builder dbld =
302             abld.exec_all().group(hbld.dispatch_width() * 2, 0);
303          const fs_reg int_pixel_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
304          const fs_reg int_pixel_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
305 
306          dbld.ADD(int_pixel_x,
307                   fs_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
308                   int_pixel_offset_x);
309          dbld.ADD(int_pixel_y,
310                   fs_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
311                   int_pixel_offset_y);
312 
313          if (wm_prog_data->coarse_pixel_dispatch != BRW_NEVER) {
314             fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x,
315                                      horiz_stride(half_int_pixel_offset_x, 0));
316             fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y,
317                                      horiz_stride(half_int_pixel_offset_y, 0));
318             if (wm_prog_data->coarse_pixel_dispatch != BRW_ALWAYS) {
319                addx->predicate = BRW_PREDICATE_NORMAL;
320                addy->predicate = BRW_PREDICATE_NORMAL;
321             }
322          }
323 
324          hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
325          hbld.MOV(offset(pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));
326 
327       } else {
328          /* The "Register Region Restrictions" page says for BDW (and newer,
329           * presumably):
330           *
331           *     "When destination spans two registers, the source may be one or
332           *      two registers. The destination elements must be evenly split
333           *      between the two registers."
334           *
335           * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
336           * to compute our pixel centers.
337           */
338          const fs_builder dbld =
339             abld.exec_all().group(hbld.dispatch_width() * 2, 0);
340          fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
341 
342          dbld.ADD(int_pixel_xy,
343                   fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
344                   int_pixel_offset_xy);
345 
346          hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy,
347                                       horiz_stride(half_int_pixel_offset_x, 0));
348          hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
349                                       horiz_stride(half_int_pixel_offset_y, 0));
350       }
351    }
352 
353    abld = bld.annotate("compute pos.z");
354    fs_reg coarse_z;
355    if (wm_prog_data->uses_depth_w_coefficients) {
356       /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
357        * properly. In the same way that we have to add the coarse pixel size to
358        * the pixel locations, here we recompute the Z value with 2 coefficients
359        * in the X & Y axes.
360        */
361       fs_reg coef_payload = brw_vec8_grf(fs_payload().depth_w_coef_reg, 0);
362       const fs_reg x_start = brw_vec1_grf(coef_payload.nr, 2);
363       const fs_reg y_start = brw_vec1_grf(coef_payload.nr, 6);
364       const fs_reg z_cx    = brw_vec1_grf(coef_payload.nr, 1);
365       const fs_reg z_cy    = brw_vec1_grf(coef_payload.nr, 0);
366       const fs_reg z_c0    = brw_vec1_grf(coef_payload.nr, 3);
367 
368       const fs_reg float_pixel_x = abld.vgrf(BRW_REGISTER_TYPE_F);
369       const fs_reg float_pixel_y = abld.vgrf(BRW_REGISTER_TYPE_F);
370 
371       abld.ADD(float_pixel_x, this->pixel_x, negate(x_start));
372       abld.ADD(float_pixel_y, this->pixel_y, negate(y_start));
373 
374       /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
375       const fs_reg u8_cps_width = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
376       /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
377       const fs_reg u8_cps_height = byte_offset(u8_cps_width, 1);
378       const fs_reg u32_cps_width = abld.vgrf(BRW_REGISTER_TYPE_UD);
379       const fs_reg u32_cps_height = abld.vgrf(BRW_REGISTER_TYPE_UD);
380       abld.MOV(u32_cps_width, u8_cps_width);
381       abld.MOV(u32_cps_height, u8_cps_height);
382 
383       const fs_reg f_cps_width = abld.vgrf(BRW_REGISTER_TYPE_F);
384       const fs_reg f_cps_height = abld.vgrf(BRW_REGISTER_TYPE_F);
385       abld.MOV(f_cps_width, u32_cps_width);
386       abld.MOV(f_cps_height, u32_cps_height);
387 
388       /* Center in the middle of the coarse pixel. */
389       abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width);
390       abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height);
391 
392       coarse_z = abld.vgrf(BRW_REGISTER_TYPE_F);
393       abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x);
394       abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y);
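      /* The two MADs above evaluate the depth plane equation
       * z = z_c0 + z_cx * x + z_cy * y (assuming MAD(dst, a, b, c) computes
       * a + b * c) at the recentered coarse pixel location.
       */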
395    }
396 
397    if (wm_prog_data->uses_src_depth)
398       this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);
399 
400    if (wm_prog_data->uses_depth_w_coefficients ||
401        wm_prog_data->uses_src_depth) {
402       fs_reg sample_z = this->pixel_z;
403 
404       switch (wm_prog_data->coarse_pixel_dispatch) {
405       case BRW_NEVER:
406          assert(wm_prog_data->uses_src_depth);
407          assert(!wm_prog_data->uses_depth_w_coefficients);
408          this->pixel_z = sample_z;
409          break;
410 
411       case BRW_SOMETIMES:
412          assert(wm_prog_data->uses_src_depth);
413          assert(wm_prog_data->uses_depth_w_coefficients);
414          this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F);
415 
416          /* We re-use the check_dynamic_msaa_flag() call from above */
417          set_predicate(BRW_PREDICATE_NORMAL,
418                        abld.SEL(this->pixel_z, coarse_z, sample_z));
419          break;
420 
421       case BRW_ALWAYS:
422          assert(!wm_prog_data->uses_src_depth);
423          assert(wm_prog_data->uses_depth_w_coefficients);
424          this->pixel_z = coarse_z;
425          break;
426       }
427    }
428 
429    if (wm_prog_data->uses_src_w) {
430       abld = bld.annotate("compute pos.w");
431       this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg);
432       this->wpos_w = vgrf(glsl_float_type());
433       abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
434    }
435 
436    if (wm_key->persample_interp == BRW_SOMETIMES) {
437       assert(!devinfo->needs_unlit_centroid_workaround);
438 
439       const fs_builder ubld = bld.exec_all().group(16, 0);
440       bool loaded_flag = false;
441 
442       for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
443          if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
444             continue;
445 
446          /* The sample mode will always be the top bit set in the perspective
447           * or non-perspective section.  In the case where no SAMPLE mode was
448           * requested, wm_prog_data_barycentric_modes() will swap out the top
449           * mode for SAMPLE so this works regardless of whether SAMPLE was
450           * requested or not.
451           */
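         /* For example (illustrative): if only the PERSPECTIVE_PIXEL and
          * PERSPECTIVE_SAMPLE modes are enabled, util_last_bit() below picks
          * PERSPECTIVE_SAMPLE, the highest set bit of the perspective section.
          */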
452          int sample_mode;
453          if (BITFIELD_BIT(i) & BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) {
454             sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
455                                         BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
456          } else {
457             sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
458                                         BRW_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
459          }
460          assert(wm_prog_data->barycentric_interp_modes &
461                 BITFIELD_BIT(sample_mode));
462 
463          if (i == sample_mode)
464             continue;
465 
466          uint8_t *barys = fs_payload().barycentric_coord_reg[i];
467 
468          uint8_t *sample_barys = fs_payload().barycentric_coord_reg[sample_mode];
469          assert(barys[0] && sample_barys[0]);
470 
471          if (!loaded_flag) {
472             check_dynamic_msaa_flag(ubld, wm_prog_data,
473                                     INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
474          }
475 
476          for (unsigned j = 0; j < dispatch_width / 8; j++) {
477             set_predicate(
478                BRW_PREDICATE_NORMAL,
479                ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
480                         brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
481          }
482       }
483    }
484 
485    for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
486       this->delta_xy[i] = fetch_barycentric_reg(
487          bld, fs_payload().barycentric_coord_reg[i]);
488    }
489 
490    uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
491       (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
492        1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
493 
494    if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
495       /* Get the pixel/sample mask into f0 so that we know which
496        * pixels are lit.  Then, for each channel that is unlit,
497        * replace the centroid data with non-centroid data.
498        */
499       for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
500          bld.exec_all().group(1, 0)
501             .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW),
502                  retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW));
503       }
504 
505       for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
506          if (!(centroid_modes & (1 << i)))
507             continue;
508 
509          const fs_reg centroid_delta_xy = delta_xy[i];
510          const fs_reg &pixel_delta_xy = delta_xy[i - 1];
511 
512          delta_xy[i] = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
513 
514          for (unsigned c = 0; c < 2; c++) {
515             for (unsigned q = 0; q < dispatch_width / 8; q++) {
516                set_predicate(BRW_PREDICATE_NORMAL,
517                   bld.quarter(q).SEL(
518                      quarter(offset(delta_xy[i], bld, c), q),
519                      quarter(offset(centroid_delta_xy, bld, c), q),
520                      quarter(offset(pixel_delta_xy, bld, c), q)));
521             }
522          }
523       }
524    }
525 }
526 
527 fs_inst *
528 fs_visitor::emit_single_fb_write(const fs_builder &bld,
529                                  fs_reg color0, fs_reg color1,
530                                  fs_reg src0_alpha, unsigned components)
531 {
532    assert(stage == MESA_SHADER_FRAGMENT);
533    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
534 
535    /* Hand over gl_FragDepth or the payload depth. */
536    const fs_reg dst_depth = fetch_payload_reg(bld, fs_payload().dest_depth_reg);
537    fs_reg src_depth, src_stencil;
538 
539    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
540       src_depth = frag_depth;
541 
542    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
543       src_stencil = frag_stencil;
544 
545    const fs_reg sources[] = {
546       color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
547       (prog_data->uses_omask ? sample_mask : fs_reg()),
548       brw_imm_ud(components)
549    };
550    assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
551    fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
552                              sources, ARRAY_SIZE(sources));
553 
554    if (prog_data->uses_kill) {
555       write->predicate = BRW_PREDICATE_NORMAL;
556       write->flag_subreg = sample_mask_flag_subreg(*this);
557    }
558 
559    return write;
560 }
561 
562 void
563 fs_visitor::do_emit_fb_writes(int nr_color_regions, bool replicate_alpha)
564 {
565    const fs_builder bld = fs_builder(this).at_end();
566    fs_inst *inst = NULL;
567 
568    for (int target = 0; target < nr_color_regions; target++) {
569       /* Skip over outputs that weren't written. */
570       if (this->outputs[target].file == BAD_FILE)
571          continue;
572 
573       const fs_builder abld = bld.annotate(
574          ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
575 
576       fs_reg src0_alpha;
577       if (replicate_alpha && target != 0)
578          src0_alpha = offset(outputs[0], bld, 3);
579 
580       inst = emit_single_fb_write(abld, this->outputs[target],
581                                   this->dual_src_output, src0_alpha, 4);
582       inst->target = target;
583    }
584 
585    if (inst == NULL) {
586       /* Even if there are no color buffers enabled, we still need to send
587        * alpha out the pipeline to our null renderbuffer to support
588        * alpha-testing, alpha-to-coverage, and so on.
589        */
590       /* FINISHME: Factor out this frequently recurring pattern into a
591        * helper function.
592        */
593       const fs_reg srcs[] = { reg_undef, reg_undef,
594                               reg_undef, offset(this->outputs[0], bld, 3) };
595       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
596       bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
597 
598       inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
599       inst->target = 0;
600    }
601 
602    inst->last_rt = true;
603    inst->eot = true;
604 }
605 
606 void
607 fs_visitor::emit_fb_writes()
608 {
609    assert(stage == MESA_SHADER_FRAGMENT);
610    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
611    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
612 
613    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
614       /* From the 'Render Target Write message' section of the docs:
615        * "Output Stencil is not supported with SIMD16 Render Target Write
616        * Messages."
617        */
618       limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
619                            "in SIMD16+ mode.\n");
620    }
621 
622    /* ANV doesn't know about the sample mask output during wm key creation,
623     * so we compute here whether we need to replicate alpha and emit the
624     * alpha-to-coverage workaround.
625     */
626    const bool replicate_alpha = key->alpha_test_replicate_alpha ||
627       (key->nr_color_regions > 1 && key->alpha_to_coverage &&
628        sample_mask.file == BAD_FILE);
629 
630    prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
631                                 this->outputs[0].file != BAD_FILE);
632    assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
633 
634    /* The following condition implements Wa_14017468336:
635     *
636     * "If dual source blend is enabled do not enable SIMD32 dispatch" and
637     * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last
638     *  Render Target Select set."
639     */
640    if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
641        prog_data->dual_src_blend) {
642       /* The dual-source RT write messages fail to release the thread
643        * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
644        *
645        * XXX - Emit an extra single-source NULL RT-write marked LastRT in
646        *       order to release the thread dependency without disabling
647        *       SIMD32.
648        *
649        * The dual-source RT write messages may lead to hangs with SIMD16
650        * dispatch on ICL for unknown reasons, see
651        * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
652        */
653       limit_dispatch_width(8, "Dual source blending unsupported "
654                            "in SIMD16 and SIMD32 modes.\n");
655    }
656 
657    do_emit_fb_writes(key->nr_color_regions, replicate_alpha);
658 }
659 
660 void
661 fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
662 {
663    int slot, urb_offset, length;
664    int starting_urb_offset = 0;
665    const struct brw_vue_prog_data *vue_prog_data =
666       brw_vue_prog_data(this->prog_data);
667    const GLbitfield64 psiz_mask =
668       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ | VARYING_BIT_PRIMITIVE_SHADING_RATE;
669    const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
670    bool flush;
671    fs_reg sources[8];
672    fs_reg urb_handle;
673 
674    switch (stage) {
675    case MESA_SHADER_VERTEX:
676       urb_handle = vs_payload().urb_handles;
677       break;
678    case MESA_SHADER_TESS_EVAL:
679       urb_handle = tes_payload().urb_output;
680       break;
681    case MESA_SHADER_GEOMETRY:
682       urb_handle = gs_payload().urb_handles;
683       break;
684    default:
685       unreachable("invalid stage");
686    }
687 
688    const fs_builder bld = fs_builder(this).at_end();
689 
690    fs_reg per_slot_offsets;
691 
692    if (stage == MESA_SHADER_GEOMETRY) {
693       const struct brw_gs_prog_data *gs_prog_data =
694          brw_gs_prog_data(this->prog_data);
695 
696       /* We need to increment the Global Offset to skip over the control data
697        * header and the extra "Vertex Count" field (1 HWord) at the beginning
698        * of the VUE.  We're counting in OWords, so the units are doubled.
699        */
700       starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
701       if (gs_prog_data->static_vertex_count == -1)
702          starting_urb_offset += 2;
703 
704       /* The URB offset is in 128-bit units, so we need to multiply by 2 */
705       const int output_vertex_size_owords =
706          gs_prog_data->output_vertex_size_hwords * 2;
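      /* Worked example with illustrative numbers: an output vertex size of
       * 4 HWords gives output_vertex_size_owords = 8, so the per-slot offset
       * computed below for vertex 3 is 3 * 8 = 24 OWords.
       */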
707 
708       if (gs_vertex_count.file == IMM) {
709          per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
710                                        gs_vertex_count.ud);
711       } else {
712          per_slot_offsets = vgrf(glsl_uint_type());
713          bld.MUL(per_slot_offsets, gs_vertex_count,
714                  brw_imm_ud(output_vertex_size_owords));
715       }
716    }
717 
718    length = 0;
719    urb_offset = starting_urb_offset;
720    flush = false;
721 
722    /* SSO shaders can have VUE slots allocated which are never actually
723     * written to, so ignore them when looking for the last (written) slot.
724     */
725    int last_slot = vue_map->num_slots - 1;
726    while (last_slot > 0 &&
727           (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
728            outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
729       last_slot--;
730    }
731 
732    bool urb_written = false;
733    for (slot = 0; slot < vue_map->num_slots; slot++) {
734       int varying = vue_map->slot_to_varying[slot];
735       switch (varying) {
736       case VARYING_SLOT_PSIZ: {
737          /* The point size varying slot is the vue header and is always in the
738           * vue map.  But often none of the special varyings that live there
739           * are written and in that case we can skip writing to the vue
740           * header, provided the corresponding state properly clamps the
741           * values further down the pipeline. */
742          if ((vue_map->slots_valid & psiz_mask) == 0) {
743             assert(length == 0);
744             urb_offset++;
745             break;
746          }
747 
748          fs_reg zero(VGRF, alloc.allocate(dispatch_width / 8),
749                      BRW_REGISTER_TYPE_UD);
750          bld.MOV(zero, brw_imm_ud(0u));
751 
752          if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
753              this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
754             sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
755          } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
756             uint32_t one_fp16 = 0x3C00;
757             fs_reg one_by_one_fp16(VGRF, alloc.allocate(dispatch_width / 8),
758                                    BRW_REGISTER_TYPE_UD);
759             bld.MOV(one_by_one_fp16, brw_imm_ud((one_fp16 << 16) | one_fp16));
760             sources[length++] = one_by_one_fp16;
761          } else {
762             sources[length++] = zero;
763          }
764 
765          if (vue_map->slots_valid & VARYING_BIT_LAYER)
766             sources[length++] = this->outputs[VARYING_SLOT_LAYER];
767          else
768             sources[length++] = zero;
769 
770          if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
771             sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
772          else
773             sources[length++] = zero;
774 
775          if (vue_map->slots_valid & VARYING_BIT_PSIZ)
776             sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
777          else
778             sources[length++] = zero;
779          break;
780       }
781       case VARYING_SLOT_EDGE:
782          unreachable("unexpected scalar vs output");
783          break;
784 
785       default:
786          /* gl_Position is always in the vue map, but isn't always written by
787           * the shader.  Other varyings (clip distances) get added to the vue
788           * map but don't always get written.  In those cases, the
789          * corresponding this->outputs[] slot will be invalid and we can skip
790           * the urb write for the varying.  If we've already queued up a vue
791           * slot for writing we flush a mlen 5 urb write, otherwise we just
792           * advance the urb_offset.
793           */
794          if (varying == BRW_VARYING_SLOT_PAD ||
795              this->outputs[varying].file == BAD_FILE) {
796             if (length > 0)
797                flush = true;
798             else
799                urb_offset++;
800             break;
801          }
802 
803          int slot_offset = 0;
804 
805          /* When using Primitive Replication, there may be multiple slots
806           * assigned to POS.
807           */
808          if (varying == VARYING_SLOT_POS)
809             slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];
810 
811          for (unsigned i = 0; i < 4; i++) {
812             sources[length++] = offset(this->outputs[varying], bld,
813                                        i + (slot_offset * 4));
814          }
815          break;
816       }
817 
818       const fs_builder abld = bld.annotate("URB write");
819 
820       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
821        * the last slot or if we need to flush (see BAD_FILE varying case
822        * above), emit a URB write send now to flush out the data.
823        */
824       if (length == 8 || (length > 0 && slot == last_slot))
825          flush = true;
826       if (flush) {
827          fs_reg srcs[URB_LOGICAL_NUM_SRCS];
828 
829          srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
830          srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
831          srcs[URB_LOGICAL_SRC_DATA] = fs_reg(VGRF,
832                                              alloc.allocate((dispatch_width / 8) * length),
833                                              BRW_REGISTER_TYPE_F);
834          srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(length);
835          abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
836 
837          fs_inst *inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
838                                    srcs, ARRAY_SIZE(srcs));
839 
840          /* For ICL Wa_1805992985 one needs additional write in the end. */
841          if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
842             inst->eot = false;
843          else
844             inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;
845 
846          inst->offset = urb_offset;
847          urb_offset = starting_urb_offset + slot + 1;
848          length = 0;
849          flush = false;
850          urb_written = true;
851       }
852    }
853 
854    /* If we don't have any valid slots to write, just do a minimal urb write
855     * send to terminate the shader.  This includes 1 slot of undefined data,
856     * because it's invalid to write 0 data:
857     *
858     * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
859     * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
860     * Write Data Payload:
861     *
862     *    "The write data payload can be between 1 and 8 message phases long."
863     */
864    if (!urb_written) {
865       /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
866        * end the thread, and emit_gs_thread_end() already emits a SEND with
867        * EOT at the end of the program for us.
868        */
869       if (stage == MESA_SHADER_GEOMETRY)
870          return;
871 
872       fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
873                                          BRW_REGISTER_TYPE_UD);
874       fs_reg payload = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
875                               BRW_REGISTER_TYPE_UD);
876 
877       bld.exec_all().MOV(uniform_urb_handle, urb_handle);
878 
879       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
880       srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
881       srcs[URB_LOGICAL_SRC_DATA] = payload;
882       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
883 
884       fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
885                                srcs, ARRAY_SIZE(srcs));
886       inst->eot = true;
887       inst->offset = 1;
888       return;
889    }
890 
891    /* ICL Wa_1805992985:
892     *
893     * ICLLP GPU hangs on one of the tessellation vkcts tests with DS not done. The
894     * send cycle, which is a URB write with an EOT, must be 4 phases long and
895     * all 8 lanes must be valid.
896     */
897    if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
898       assert(dispatch_width == 8);
899       fs_reg uniform_urb_handle = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
900       fs_reg uniform_mask = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
901       fs_reg payload = fs_reg(VGRF, alloc.allocate(4), BRW_REGISTER_TYPE_UD);
902 
903       /* The workaround requires all 8 channels (lanes) to be valid. This is
904        * understood to mean they all need to be alive. The first trick is to find
905        * a live channel and copy its urb handle for all the other channels to
906        * make sure all handles are valid.
907        */
908       bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));
909 
910       /* The second trick is to use a masked URB write where one can tell the HW
911        * to actually write data only for selected channels even though all are
912        * active.
913        * The third trick is to take advantage of the must-be-zero (MBZ) area in
914        * the very beginning of the URB.
915        *
916        * One masks data to be written only for the first channel and uses
917        * offset zero explicitly to land data to the MBZ area avoiding trashing
918        * any other part of the URB.
919        *
920        * Since the WA says that the write needs to be 4 phases long, one uses
921        * 4 slots of data. All are explicitly zeros in order to keep the MBZ
922        * area written as zeros.
923        */
924       bld.exec_all().MOV(uniform_mask, brw_imm_ud(0x10000u));
925       bld.exec_all().MOV(offset(payload, bld, 0), brw_imm_ud(0u));
926       bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0u));
927       bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
928       bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
929 
930       fs_reg srcs[URB_LOGICAL_NUM_SRCS];
931       srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
932       srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
933       srcs[URB_LOGICAL_SRC_DATA] = payload;
934       srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(4);
935 
936       fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
937                                           reg_undef, srcs, ARRAY_SIZE(srcs));
938       inst->eot = true;
939       inst->offset = 0;
940    }
941 }
942 
943 void
944 fs_visitor::emit_urb_fence()
945 {
946    const fs_builder bld = fs_builder(this).at_end();
947    fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
948    fs_inst *fence = bld.emit(SHADER_OPCODE_MEMORY_FENCE, dst,
949                              brw_vec8_grf(0, 0),
950                              brw_imm_ud(true),
951                              brw_imm_ud(0));
952    fence->sfid = BRW_SFID_URB;
953    fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_LOCAL,
954                                     LSC_FLUSH_TYPE_NONE, true);
955 
956    bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE,
957                                    bld.null_reg_ud(),
958                                    &dst,
959                                    1);
960 }
961 
962 void
963 fs_visitor::emit_cs_terminate()
964 {
965    const fs_builder bld = fs_builder(this).at_end();
966 
967    /* We can't directly send from g0, since sends with EOT have to use
968     * g112-127. So, copy it to a virtual register; the register allocator will
969     * make sure it uses the appropriate register range.
970     */
971    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
972    fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
973    bld.group(8, 0).exec_all().MOV(payload, g0);
974 
975    /* Send a message to the thread spawner to terminate the thread. */
976    fs_inst *inst = bld.exec_all()
977                       .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
978    inst->eot = true;
979 }
980 
981 fs_visitor::fs_visitor(const struct brw_compiler *compiler,
982                        const struct brw_compile_params *params,
983                        const brw_base_prog_key *key,
984                        struct brw_stage_prog_data *prog_data,
985                        const nir_shader *shader,
986                        unsigned dispatch_width,
987                        bool needs_register_pressure,
988                        bool debug_enabled)
989    : backend_shader(compiler, params, shader, prog_data, debug_enabled),
990      key(key), gs_compile(NULL), prog_data(prog_data),
991      live_analysis(this), regpressure_analysis(this),
992      performance_analysis(this),
993      needs_register_pressure(needs_register_pressure),
994      dispatch_width(dispatch_width),
995      max_polygons(0),
996      api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width))
997 {
998    init();
999 }
1000 
1001 fs_visitor::fs_visitor(const struct brw_compiler *compiler,
1002                        const struct brw_compile_params *params,
1003                        const brw_wm_prog_key *key,
1004                        struct brw_wm_prog_data *prog_data,
1005                        const nir_shader *shader,
1006                        unsigned dispatch_width, unsigned max_polygons,
1007                        bool needs_register_pressure,
1008                        bool debug_enabled)
1009    : backend_shader(compiler, params, shader, &prog_data->base,
1010                     debug_enabled),
1011      key(&key->base), gs_compile(NULL), prog_data(&prog_data->base),
1012      live_analysis(this), regpressure_analysis(this),
1013      performance_analysis(this),
1014      needs_register_pressure(needs_register_pressure),
1015      dispatch_width(dispatch_width),
1016      max_polygons(max_polygons),
1017      api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width))
1018 {
1019    init();
1020    assert(api_subgroup_size == 0 ||
1021           api_subgroup_size == 8 ||
1022           api_subgroup_size == 16 ||
1023           api_subgroup_size == 32);
1024 }
1025 
1026 fs_visitor::fs_visitor(const struct brw_compiler *compiler,
1027                        const struct brw_compile_params *params,
1028                        struct brw_gs_compile *c,
1029                        struct brw_gs_prog_data *prog_data,
1030                        const nir_shader *shader,
1031                        bool needs_register_pressure,
1032                        bool debug_enabled)
1033    : backend_shader(compiler, params, shader, &prog_data->base.base,
1034                     debug_enabled),
1035      key(&c->key.base), gs_compile(c),
1036      prog_data(&prog_data->base.base),
1037      live_analysis(this), regpressure_analysis(this),
1038      performance_analysis(this),
1039      needs_register_pressure(needs_register_pressure),
1040      dispatch_width(compiler->devinfo->ver >= 20 ? 16 : 8),
1041      max_polygons(0),
1042      api_subgroup_size(brw_nir_api_subgroup_size(shader, dispatch_width))
1043 {
1044    init();
1045    assert(api_subgroup_size == 0 ||
1046           api_subgroup_size == 8 ||
1047           api_subgroup_size == 16 ||
1048           api_subgroup_size == 32);
1049 }
1050 
1051 void
1052 fs_visitor::init()
1053 {
1054    this->max_dispatch_width = 32;
1055    this->prog_data = this->stage_prog_data;
1056 
1057    this->failed = false;
1058    this->fail_msg = NULL;
1059 
1060    this->payload_ = NULL;
1061    this->source_depth_to_render_target = false;
1062    this->first_non_payload_grf = 0;
1063 
1064    this->uniforms = 0;
1065    this->last_scratch = 0;
1066    this->push_constant_loc = NULL;
1067 
1068    memset(&this->shader_stats, 0, sizeof(this->shader_stats));
1069 
1070    this->grf_used = 0;
1071    this->spilled_any_registers = false;
1072 }
1073 
1074 fs_visitor::~fs_visitor()
1075 {
1076    delete this->payload_;
1077 }
1078