1 /*
2  * Copyright © 2010 Intel Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "brw_eu.h"
7 #include "brw_fs.h"
8 #include "brw_fs_builder.h"
9 #include "brw_fs_live_variables.h"
10 #include "brw_generator.h"
11 #include "brw_nir.h"
12 #include "brw_cfg.h"
13 #include "brw_private.h"
14 #include "intel_nir.h"
15 #include "shader_enums.h"
16 #include "dev/intel_debug.h"
17 #include "dev/intel_wa.h"
18 
19 #include <memory>
20 
21 using namespace brw;
22 
23 static fs_inst *
24 brw_emit_single_fb_write(fs_visitor &s, const fs_builder &bld,
25                          brw_reg color0, brw_reg color1,
26                          brw_reg src0_alpha, unsigned components,
27                          bool null_rt)
28 {
29    assert(s.stage == MESA_SHADER_FRAGMENT);
30    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
31 
32    /* Hand over gl_FragDepth or the payload depth. */
33    const brw_reg dst_depth = fetch_payload_reg(bld, s.fs_payload().dest_depth_reg);
34 
35    brw_reg sources[FB_WRITE_LOGICAL_NUM_SRCS];
36    sources[FB_WRITE_LOGICAL_SRC_COLOR0]     = color0;
37    sources[FB_WRITE_LOGICAL_SRC_COLOR1]     = color1;
38    sources[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA] = src0_alpha;
39    sources[FB_WRITE_LOGICAL_SRC_DST_DEPTH]  = dst_depth;
40    sources[FB_WRITE_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(components);
41    sources[FB_WRITE_LOGICAL_SRC_NULL_RT]    = brw_imm_ud(null_rt);
42 
43    if (prog_data->uses_omask)
44       sources[FB_WRITE_LOGICAL_SRC_OMASK] = s.sample_mask;
45    if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
46       sources[FB_WRITE_LOGICAL_SRC_SRC_DEPTH] = s.frag_depth;
47    if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
48       sources[FB_WRITE_LOGICAL_SRC_SRC_STENCIL] = s.frag_stencil;
49 
50    fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, brw_reg(),
51                              sources, ARRAY_SIZE(sources));
52 
53    if (prog_data->uses_kill) {
54       write->predicate = BRW_PREDICATE_NORMAL;
55       write->flag_subreg = sample_mask_flag_subreg(s);
56    }
57 
58    return write;
59 }
60 
61 static void
62 brw_do_emit_fb_writes(fs_visitor &s, int nr_color_regions, bool replicate_alpha)
63 {
64    const fs_builder bld = fs_builder(&s).at_end();
65    fs_inst *inst = NULL;
66 
67    for (int target = 0; target < nr_color_regions; target++) {
68       /* Skip over outputs that weren't written. */
69       if (s.outputs[target].file == BAD_FILE)
70          continue;
71 
72       const fs_builder abld = bld.annotate(
73          ralloc_asprintf(s.mem_ctx, "FB write target %d", target));
74 
75       brw_reg src0_alpha;
76       if (replicate_alpha && target != 0)
77          src0_alpha = offset(s.outputs[0], bld, 3);
78 
79       inst = brw_emit_single_fb_write(s, abld, s.outputs[target],
80                                       s.dual_src_output, src0_alpha, 4,
81                                       false);
82       inst->target = target;
83    }
84 
85    if (inst == NULL) {
86       struct brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
87       struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
88       /* Disable null_rt if any non-color output is written or if
89        * alpha_to_coverage can be enabled, since the alpha_to_coverage bit
90        * comes from the BLEND_STATE structure and the HW will avoid reading
91        * it if null_rt is enabled.
92        */
93       const bool use_null_rt =
94          key->alpha_to_coverage == INTEL_NEVER &&
95          !prog_data->uses_omask;
96 
97       /* Even if there are no color buffers enabled, we still need to send
98        * alpha out the pipeline to our null renderbuffer to support
99        * alpha-testing, alpha-to-coverage, and so on.
100        */
101       /* FINISHME: Factor out this frequently recurring pattern into a
102        * helper function.
103        */
104       const brw_reg srcs[] = { reg_undef, reg_undef,
105                               reg_undef, offset(s.outputs[0], bld, 3) };
106       const brw_reg tmp = bld.vgrf(BRW_TYPE_UD, 4);
107       bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
108 
109       inst = brw_emit_single_fb_write(s, bld, tmp, reg_undef, reg_undef, 4,
110                                       use_null_rt);
111       inst->target = 0;
112    }
113 
114    inst->last_rt = true;
115    inst->eot = true;
116 }
117 
118 static void
119 brw_emit_fb_writes(fs_visitor &s)
120 {
121    const struct intel_device_info *devinfo = s.devinfo;
122    assert(s.stage == MESA_SHADER_FRAGMENT);
123    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
124    brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
125 
126    if (s.nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
127       /* From the 'Render Target Write message' section of the docs:
128        * "Output Stencil is not supported with SIMD16 Render Target Write
129        * Messages."
130        */
131       if (devinfo->ver >= 20)
132          s.limit_dispatch_width(16, "gl_FragStencilRefARB unsupported "
133                                 "in SIMD32+ mode.\n");
134       else
135          s.limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
136                                 "in SIMD16+ mode.\n");
137    }
138 
139    /* ANV doesn't know about the sample mask output during wm key creation,
140     * so we compute here whether we need to replicate alpha and emit the
141     * alpha-to-coverage workaround.
142     */
143    const bool replicate_alpha = key->alpha_test_replicate_alpha ||
144       (key->nr_color_regions > 1 && key->alpha_to_coverage &&
145        s.sample_mask.file == BAD_FILE);
146 
147    prog_data->dual_src_blend = (s.dual_src_output.file != BAD_FILE &&
148                                 s.outputs[0].file != BAD_FILE);
149    assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
150 
151    /* The following condition implements Wa_14017468336:
152     *
153     * "If dual source blend is enabled do not enable SIMD32 dispatch" and
154     * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last
155     *  Render Target Select set."
156     */
157    if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
158        prog_data->dual_src_blend) {
159       /* The dual-source RT write messages fail to release the thread
160        * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
161        *
162        * XXX - Emit an extra single-source NULL RT-write marked LastRT in
163        *       order to release the thread dependency without disabling
164        *       SIMD32.
165        *
166        * The dual-source RT write messages may lead to hangs with SIMD16
167        * dispatch on ICL due to unknown reasons, see
168        * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
169        */
170       if (devinfo->ver >= 20)
171          s.limit_dispatch_width(16, "Dual source blending unsupported "
172                                 "in SIMD32 mode.\n");
173       else
174          s.limit_dispatch_width(8, "Dual source blending unsupported "
175                                 "in SIMD16 and SIMD32 modes.\n");
176    }
177 
178    brw_do_emit_fb_writes(s, key->nr_color_regions, replicate_alpha);
179 }
180 
181 
182 /** Emits the interpolation for the varying inputs. */
183 static void
184 brw_emit_interpolation_setup(fs_visitor &s)
185 {
186    const struct intel_device_info *devinfo = s.devinfo;
187    const fs_builder bld = fs_builder(&s).at_end();
188    fs_builder abld = bld.annotate("compute pixel centers");
189 
190    s.pixel_x = bld.vgrf(BRW_TYPE_F);
191    s.pixel_y = bld.vgrf(BRW_TYPE_F);
192 
193    const struct brw_wm_prog_key *wm_key = (brw_wm_prog_key*) s.key;
194    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
195    fs_thread_payload &payload = s.fs_payload();
196 
197    brw_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
198    brw_reg int_sample_offset_xy; /* Used on Gen8+ */
199    brw_reg half_int_sample_offset_x, half_int_sample_offset_y;
200    if (wm_prog_data->coarse_pixel_dispatch != INTEL_ALWAYS) {
201       /* The thread payload only delivers subspan locations (ss0, ss1,
202        * ss2, ...). Since a subspan covers a 2x2 pixel block, we need to
203        * generate 4 pixel coordinates out of each subspan location. We do this
204        * by replicating a subspan coordinate 4 times and adding an offset of 1
205        * in each direction from the initial top left (tl) location to generate
206        * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
207        * (br = +1 in x, +1 in y).
208        *
209        * The locations we build look like this in SIMD8 :
210        *
211        *    ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
212        *
213        * The value 0x11001010 is a vector of 8 half bytes. It adds the
214        * following to generate the 4 pixel coordinates out of subspan 0:
215        *
216        *  0x
217        *    1 : ss0.y + 1 -> ss0.br.y
218        *    1 : ss0.y + 1 -> ss0.bl.y
219        *    0 : ss0.y + 0 -> ss0.tr.y
220        *    0 : ss0.y + 0 -> ss0.tl.y
221        *    1 : ss0.x + 1 -> ss0.br.x
222        *    0 : ss0.x + 0 -> ss0.bl.x
223        *    1 : ss0.x + 1 -> ss0.tr.x
224        *    0 : ss0.x + 0 -> ss0.tl.x
225        *
226        * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
227        * coordinates out of 2 subspan coordinates in a single ADD instruction
228        * (twice the operation above).
229        */
230       int_sample_offset_xy = brw_reg(brw_imm_v(0x11001010));
231       half_int_sample_offset_x = brw_reg(brw_imm_uw(0));
232       half_int_sample_offset_y = brw_reg(brw_imm_uw(0));
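      /* A quick decode of the immediate above (assuming brw_imm_v() packs its
       * eight 4-bit elements starting at the lowest nibble): 0x11001010 reads,
       * element 0 first, as 0,1,0,1,0,0,1,1 - the X offsets of tl/tr/bl/br
       * followed by their Y offsets, matching the table above.
       */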
233       /* On Gfx12.5, because of regioning restrictions, the interpolation code
234        * is slightly different and works off separate X & Y inputs. The ordering
235        * of the half bytes here is a bit odd, with each subspan replicated
236        * twice and every other element discarded:
237        *
238        *             ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
239        *  X offset:    0      0      1      0      0      0      1      0
240        *  Y offset:    0      0      0      0      1      0      1      0
241        */
242       int_sample_offset_x = brw_reg(brw_imm_v(0x01000100));
243       int_sample_offset_y = brw_reg(brw_imm_v(0x01010000));
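      /* Likewise (same nibble-packing assumption), 0x01000100 decodes, element
       * 0 first, to 0,0,1,0,0,0,1,0 and 0x01010000 to 0,0,0,0,1,0,1,0 -
       * exactly the X and Y offset rows of the Gfx12.5 table above.
       */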
244    }
245 
246    brw_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */
247    brw_reg int_coarse_offset_xy; /* Used on Gen8+ */
248    brw_reg half_int_coarse_offset_x, half_int_coarse_offset_y;
249    if (wm_prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
250       /* In coarse pixel dispatch we have to do the same ADD instruction that
251        * we do in normal per pixel dispatch, except this time we're not adding
252        * 1 in each direction, but instead the coarse pixel size.
253        *
254        * The coarse pixel size is delivered as 2 u8 in r1.0
255        */
256       struct brw_reg r1_0 = retype(brw_vec1_reg(FIXED_GRF, 1, 0), BRW_TYPE_UB);
257 
258       const fs_builder dbld =
259          abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);
260 
261       if (devinfo->verx10 >= 125) {
262          /* To build the array of half bytes we do an AND operation with the
263           * right mask in X.
264           */
265          int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
266          dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));
267 
268          /* And the right mask in Y. */
269          int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
270          dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
271       } else {
272          /* To build the array of half bytes we do an AND operation with the
273           * right mask in X.
274           */
275          int_coarse_offset_x = dbld.vgrf(BRW_TYPE_UW);
276          dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
277 
278          /* And the right mask in Y. */
279          int_coarse_offset_y = dbld.vgrf(BRW_TYPE_UW);
280          dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
281 
282          /* Finally OR the 2 registers. */
283          int_coarse_offset_xy = dbld.vgrf(BRW_TYPE_UW);
284          dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y);
285       }
286 
287       /* Also compute half the coarse pixel size, used to center coarse pixels. */
288       half_int_coarse_offset_x = bld.vgrf(BRW_TYPE_UW);
289       half_int_coarse_offset_y = bld.vgrf(BRW_TYPE_UW);
290 
291       bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
292       bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
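      /* For example, with a 4x2 coarse pixel size delivered in r1.0, the half
       * offsets computed above are 2 and 1 respectively.
       */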
293    }
294 
295    brw_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
296    brw_reg int_pixel_offset_xy; /* Used on Gen8+ */
297    brw_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
298    switch (wm_prog_data->coarse_pixel_dispatch) {
299    case INTEL_NEVER:
300       int_pixel_offset_x = int_sample_offset_x;
301       int_pixel_offset_y = int_sample_offset_y;
302       int_pixel_offset_xy = int_sample_offset_xy;
303       half_int_pixel_offset_x = half_int_sample_offset_x;
304       half_int_pixel_offset_y = half_int_sample_offset_y;
305       break;
306 
307    case INTEL_SOMETIMES: {
308       const fs_builder dbld =
309          abld.exec_all().group(MIN2(16, s.dispatch_width) * 2, 0);
310 
311       check_dynamic_msaa_flag(dbld, wm_prog_data,
312                               INTEL_MSAA_FLAG_COARSE_RT_WRITES);
313 
314       int_pixel_offset_x = dbld.vgrf(BRW_TYPE_UW);
315       set_predicate(BRW_PREDICATE_NORMAL,
316                     dbld.SEL(int_pixel_offset_x,
317                              int_coarse_offset_x,
318                              int_sample_offset_x));
319 
320       int_pixel_offset_y = dbld.vgrf(BRW_TYPE_UW);
321       set_predicate(BRW_PREDICATE_NORMAL,
322                     dbld.SEL(int_pixel_offset_y,
323                              int_coarse_offset_y,
324                              int_sample_offset_y));
325 
326       int_pixel_offset_xy = dbld.vgrf(BRW_TYPE_UW);
327       set_predicate(BRW_PREDICATE_NORMAL,
328                     dbld.SEL(int_pixel_offset_xy,
329                              int_coarse_offset_xy,
330                              int_sample_offset_xy));
331 
332       half_int_pixel_offset_x = bld.vgrf(BRW_TYPE_UW);
333       set_predicate(BRW_PREDICATE_NORMAL,
334                     bld.SEL(half_int_pixel_offset_x,
335                             half_int_coarse_offset_x,
336                             half_int_sample_offset_x));
337 
338       half_int_pixel_offset_y = bld.vgrf(BRW_TYPE_UW);
339       set_predicate(BRW_PREDICATE_NORMAL,
340                     bld.SEL(half_int_pixel_offset_y,
341                             half_int_coarse_offset_y,
342                             half_int_sample_offset_y));
343       break;
344    }
345 
346    case INTEL_ALWAYS:
347       int_pixel_offset_x = int_coarse_offset_x;
348       int_pixel_offset_y = int_coarse_offset_y;
349       int_pixel_offset_xy = int_coarse_offset_xy;
350       half_int_pixel_offset_x = half_int_coarse_offset_x;
351       half_int_pixel_offset_y = half_int_coarse_offset_y;
352       break;
353    }
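   /* At this point the int_pixel_offset_* / half_int_pixel_offset_* values
    * hold either the per-pixel (+1) offsets, the coarse-pixel-size offsets,
    * or a flag-predicated SEL between the two, depending on whether coarse
    * pixel dispatch is NEVER, ALWAYS or SOMETIMES.
    */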
354 
355    for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
356       const fs_builder hbld = abld.group(MIN2(16, s.dispatch_width), i);
357       /* According to the "PS Thread Payload for Normal Dispatch"
358        * pages on the BSpec, subspan X/Y coordinates are stored in
359        * R1.2-R1.5/R2.2-R2.5 on gfx6+, and on R0.10-R0.13/R1.10-R1.13
360        * on gfx20+.  gi_reg is the 32B section of the GRF that
361        * contains the subspan coordinates.
362        */
363       const struct brw_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
364                                     brw_vec1_grf(i + 1, 0);
365       const struct brw_reg gi_uw = retype(gi_reg, BRW_TYPE_UW);
366 
367       if (devinfo->verx10 >= 125) {
368          const fs_builder dbld =
369             abld.exec_all().group(hbld.dispatch_width() * 2, 0);
370          const brw_reg int_pixel_x = dbld.vgrf(BRW_TYPE_UW);
371          const brw_reg int_pixel_y = dbld.vgrf(BRW_TYPE_UW);
372 
373          dbld.ADD(int_pixel_x,
374                   brw_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
375                   int_pixel_offset_x);
376          dbld.ADD(int_pixel_y,
377                   brw_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
378                   int_pixel_offset_y);
379 
380          if (wm_prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
381             fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x,
382                                      horiz_stride(half_int_pixel_offset_x, 0));
383             fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y,
384                                      horiz_stride(half_int_pixel_offset_y, 0));
385             if (wm_prog_data->coarse_pixel_dispatch != INTEL_ALWAYS) {
386                addx->predicate = BRW_PREDICATE_NORMAL;
387                addy->predicate = BRW_PREDICATE_NORMAL;
388             }
389          }
390 
391          hbld.MOV(offset(s.pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
392          hbld.MOV(offset(s.pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));
393 
394       } else {
395          /* The "Register Region Restrictions" page says for BDW (and newer,
396           * presumably):
397           *
398           *     "When destination spans two registers, the source may be one or
399           *      two registers. The destination elements must be evenly split
400           *      between the two registers."
401           *
402           * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
403           * to compute our pixel centers.
404           */
405          const fs_builder dbld =
406             abld.exec_all().group(hbld.dispatch_width() * 2, 0);
407          brw_reg int_pixel_xy = dbld.vgrf(BRW_TYPE_UW);
408 
409          dbld.ADD(int_pixel_xy,
410                   brw_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
411                   int_pixel_offset_xy);
412 
413          hbld.emit(FS_OPCODE_PIXEL_X, offset(s.pixel_x, hbld, i), int_pixel_xy,
414                                       horiz_stride(half_int_pixel_offset_x, 0));
415          hbld.emit(FS_OPCODE_PIXEL_Y, offset(s.pixel_y, hbld, i), int_pixel_xy,
416                                       horiz_stride(half_int_pixel_offset_y, 0));
417       }
418    }
419 
420    abld = bld.annotate("compute pos.z");
421    brw_reg coarse_z;
422    if (wm_prog_data->coarse_pixel_dispatch != INTEL_NEVER &&
423        wm_prog_data->uses_depth_w_coefficients) {
424       /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
425        * properly. Just as we have to add the coarse pixel size to the pixel
426        * locations, here we recompute the Z value using 2 coefficients along
427        * the X & Y axes.
428        */
429       brw_reg coef_payload = brw_vec8_grf(payload.depth_w_coef_reg, 0);
430       const brw_reg x_start = devinfo->ver >= 20 ?
431          brw_vec1_grf(coef_payload.nr, 6) :
432          brw_vec1_grf(coef_payload.nr, 2);
433       const brw_reg y_start = devinfo->ver >= 20 ?
434          brw_vec1_grf(coef_payload.nr, 7) :
435          brw_vec1_grf(coef_payload.nr, 6);
436       const brw_reg z_cx    = devinfo->ver >= 20 ?
437          brw_vec1_grf(coef_payload.nr + 1, 1) :
438          brw_vec1_grf(coef_payload.nr, 1);
439       const brw_reg z_cy    = devinfo->ver >= 20 ?
440          brw_vec1_grf(coef_payload.nr + 1, 0) :
441          brw_vec1_grf(coef_payload.nr, 0);
442       const brw_reg z_c0    = devinfo->ver >= 20 ?
443          brw_vec1_grf(coef_payload.nr + 1, 2) :
444          brw_vec1_grf(coef_payload.nr, 3);
445 
446       const brw_reg float_pixel_x = abld.vgrf(BRW_TYPE_F);
447       const brw_reg float_pixel_y = abld.vgrf(BRW_TYPE_F);
448 
449       abld.ADD(float_pixel_x, s.pixel_x, negate(x_start));
450       abld.ADD(float_pixel_y, s.pixel_y, negate(y_start));
451 
452       /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
453       const brw_reg u8_cps_width = brw_reg(retype(brw_vec1_grf(1, 0), BRW_TYPE_UB));
454       /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
455       const brw_reg u8_cps_height = byte_offset(u8_cps_width, 1);
456       const brw_reg u32_cps_width = abld.vgrf(BRW_TYPE_UD);
457       const brw_reg u32_cps_height = abld.vgrf(BRW_TYPE_UD);
458       abld.MOV(u32_cps_width, u8_cps_width);
459       abld.MOV(u32_cps_height, u8_cps_height);
460 
461       const brw_reg f_cps_width = abld.vgrf(BRW_TYPE_F);
462       const brw_reg f_cps_height = abld.vgrf(BRW_TYPE_F);
463       abld.MOV(f_cps_width, u32_cps_width);
464       abld.MOV(f_cps_height, u32_cps_height);
465 
466       /* Center in the middle of the coarse pixel. */
467       abld.MAD(float_pixel_x, float_pixel_x, f_cps_width, brw_imm_f(0.5f));
468       abld.MAD(float_pixel_y, float_pixel_y, f_cps_height, brw_imm_f(0.5f));
469 
470       coarse_z = abld.vgrf(BRW_TYPE_F);
471       abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x);
472       abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y);
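      /* Assuming the builder's MAD(dst, a, b, c) computes dst = b * c + a,
       * the two MADs above evaluate the plane equation
       * coarse_z = z_c0 + z_cx * float_pixel_x + z_cy * float_pixel_y.
       */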
473    }
474 
475    if (wm_prog_data->uses_src_depth)
476       s.pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
477 
478    if (wm_prog_data->uses_depth_w_coefficients ||
479        wm_prog_data->uses_src_depth) {
480       brw_reg sample_z = s.pixel_z;
481 
482       switch (wm_prog_data->coarse_pixel_dispatch) {
483       case INTEL_NEVER:
484          break;
485 
486       case INTEL_SOMETIMES:
487          assert(wm_prog_data->uses_src_depth);
488          assert(wm_prog_data->uses_depth_w_coefficients);
489          s.pixel_z = abld.vgrf(BRW_TYPE_F);
490 
491          /* We re-use the check_dynamic_msaa_flag() call from above */
492          set_predicate(BRW_PREDICATE_NORMAL,
493                        abld.SEL(s.pixel_z, coarse_z, sample_z));
494          break;
495 
496       case INTEL_ALWAYS:
497          assert(!wm_prog_data->uses_src_depth);
498          assert(wm_prog_data->uses_depth_w_coefficients);
499          s.pixel_z = coarse_z;
500          break;
501       }
502    }
503 
504    if (wm_prog_data->uses_src_w) {
505       abld = bld.annotate("compute pos.w");
506       s.pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
507       s.wpos_w = bld.vgrf(BRW_TYPE_F);
508       abld.emit(SHADER_OPCODE_RCP, s.wpos_w, s.pixel_w);
509    }
510 
511    if (wm_key->persample_interp == INTEL_SOMETIMES) {
512       assert(!devinfo->needs_unlit_centroid_workaround);
513 
514       const fs_builder ubld = bld.exec_all().group(16, 0);
515       bool loaded_flag = false;
516 
517       for (int i = 0; i < INTEL_BARYCENTRIC_MODE_COUNT; ++i) {
518          if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
519             continue;
520 
521          /* The sample mode will always be the top bit set in the perspective
522           * or non-perspective section.  In the case where no SAMPLE mode was
523           * requested, wm_prog_data_barycentric_modes() will swap out the top
524           * mode for SAMPLE so this works regardless of whether SAMPLE was
525           * requested or not.
526           */
527          int sample_mode;
528          if (BITFIELD_BIT(i) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) {
529             sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
530                                         INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
531          } else {
532             sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
533                                         INTEL_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
534          }
535          assert(wm_prog_data->barycentric_interp_modes &
536                 BITFIELD_BIT(sample_mode));
537 
538          if (i == sample_mode)
539             continue;
540 
541          uint8_t *barys = payload.barycentric_coord_reg[i];
542 
543          uint8_t *sample_barys = payload.barycentric_coord_reg[sample_mode];
544          assert(barys[0] && sample_barys[0]);
545 
546          if (!loaded_flag) {
547             check_dynamic_msaa_flag(ubld, wm_prog_data,
548                                     INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
549          }
550 
551          for (unsigned j = 0; j < s.dispatch_width / 8; j++) {
552             set_predicate(
553                BRW_PREDICATE_NORMAL,
554                ubld.MOV(brw_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
555                         brw_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
556          }
557       }
558    }
559 
560    for (int i = 0; i < INTEL_BARYCENTRIC_MODE_COUNT; ++i) {
561       s.delta_xy[i] = fetch_barycentric_reg(
562          bld, payload.barycentric_coord_reg[i]);
563    }
564 
565    uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
566       (1 << INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID |
567        1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
568 
569    if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
570       /* Get the pixel/sample mask into f0 so that we know which
571        * pixels are lit.  Then, for each channel that is unlit,
572        * replace the centroid data with non-centroid data.
573        */
574       for (unsigned i = 0; i < DIV_ROUND_UP(s.dispatch_width, 16); i++) {
575          bld.exec_all().group(1, 0)
576             .MOV(retype(brw_flag_reg(0, i), BRW_TYPE_UW),
577                  retype(brw_vec1_grf(1 + i, 7), BRW_TYPE_UW));
578       }
579 
580       for (int i = 0; i < INTEL_BARYCENTRIC_MODE_COUNT; ++i) {
581          if (!(centroid_modes & (1 << i)))
582             continue;
583 
584          const brw_reg centroid_delta_xy = s.delta_xy[i];
585          const brw_reg &pixel_delta_xy = s.delta_xy[i - 1];
586 
587          s.delta_xy[i] = bld.vgrf(BRW_TYPE_F, 2);
588 
589          for (unsigned c = 0; c < 2; c++) {
590             for (unsigned q = 0; q < s.dispatch_width / 8; q++) {
591                set_predicate(BRW_PREDICATE_NORMAL,
592                   bld.quarter(q).SEL(
593                      quarter(offset(s.delta_xy[i], bld, c), q),
594                      quarter(offset(centroid_delta_xy, bld, c), q),
595                      quarter(offset(pixel_delta_xy, bld, c), q)));
596             }
597          }
598       }
599    }
600 }
601 
602 
603 /**
604  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
605  * instructions to FS_OPCODE_REP_FB_WRITE.
606  */
607 static void
608 brw_emit_repclear_shader(fs_visitor &s)
609 {
610    brw_wm_prog_key *key = (brw_wm_prog_key*) s.key;
611    fs_inst *write = NULL;
612 
613    assert(s.devinfo->ver < 20);
614    assert(s.uniforms == 0);
615    assume(key->nr_color_regions > 0);
616 
617    brw_reg color_output = retype(brw_vec4_grf(127, 0), BRW_TYPE_UD);
618    brw_reg header = retype(brw_vec8_grf(125, 0), BRW_TYPE_UD);
619 
620    /* We pass the clear color as a flat input.  Copy it to the output. */
621    brw_reg color_input =
622       brw_make_reg(FIXED_GRF, 2, 3, 0, 0, BRW_TYPE_UD,
623               BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
624               BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
625 
626    const fs_builder bld = fs_builder(&s).at_end();
627    bld.exec_all().group(4, 0).MOV(color_output, color_input);
628 
629    if (key->nr_color_regions > 1) {
630       /* Copy g0..g1 as the message header */
631       bld.exec_all().group(16, 0)
632          .MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
633    }
634 
635    for (int i = 0; i < key->nr_color_regions; ++i) {
636       if (i > 0)
637          bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
638 
639       write = bld.emit(SHADER_OPCODE_SEND);
640       write->resize_sources(3);
641 
642       /* We can use a headerless message for the first render target */
643       write->header_size = i == 0 ? 0 : 2;
644       write->mlen = 1 + write->header_size;
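      /* The replicated-data RT write carries a single GRF of color payload,
       * plus (for render targets other than 0) the 2-GRF header copied from
       * g0..g1 above, hence mlen = 1 + header_size.
       */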
645 
646       write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
647       write->src[0] = brw_imm_ud(
648          brw_fb_write_desc(
649             s.devinfo, i,
650             BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
651             i == key->nr_color_regions - 1, false) |
652          brw_message_desc(s.devinfo, write->mlen,
653                           0 /* rlen */, write->header_size));
654       write->src[1] = brw_imm_ud(0);
655       write->src[2] = i == 0 ? color_output : header;
656       write->check_tdr = true;
657       write->send_has_side_effects = true;
658 
659       /* We can use a headerless message for the first render target */
660       write->header_size = i == 0 ? 0 : 2;
661       write->mlen = 1 + write->header_size;
662    }
663    write->eot = true;
664    write->last_rt = true;
665 
666    brw_calculate_cfg(s);
667 
668    s.first_non_payload_grf = s.payload().num_regs;
669 
670    brw_lower_scoreboard(s);
671 }
672 
673 /**
674  * Turn one of the two CENTROID barycentric modes into PIXEL mode.
675  */
676 static enum intel_barycentric_mode
677 centroid_to_pixel(enum intel_barycentric_mode bary)
678 {
679    assert(bary == INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID ||
680           bary == INTEL_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
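   /* This relies on each CENTROID mode being declared immediately after its
    * PIXEL counterpart in enum intel_barycentric_mode, so subtracting one
    * yields the matching PIXEL mode.
    */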
681    return (enum intel_barycentric_mode) ((unsigned) bary - 1);
682 }
683 
684 static void
685 calculate_urb_setup(const struct intel_device_info *devinfo,
686                     const struct brw_wm_prog_key *key,
687                     struct brw_wm_prog_data *prog_data,
688                     const nir_shader *nir,
689                     const struct brw_mue_map *mue_map)
690 {
691    memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
692    memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
693 
694    int urb_next = 0; /* in vec4s */
695 
696    const uint64_t inputs_read =
697       nir->info.inputs_read & ~nir->info.per_primitive_inputs;
698 
699    /* Figure out where each of the incoming setup attributes lands. */
700    if (key->mesh_input != INTEL_NEVER) {
701       /* Per-Primitive Attributes are laid out by the hardware before the
702        * regular attributes, so order them like this to make it easier to map
703        * the setup into real HW registers later.
704        */
705       if (nir->info.per_primitive_inputs) {
706          uint64_t per_prim_inputs_read =
707                nir->info.inputs_read & nir->info.per_primitive_inputs;
708 
709          /* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
710           * are always at the beginning, because they come from MUE
711           * Primitive Header, not Per-Primitive Attributes.
712           */
713          const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
714                                                 VARYING_BIT_LAYER |
715                                                 VARYING_BIT_PRIMITIVE_SHADING_RATE;
716 
717          if (mue_map) {
718             unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
719             unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
720 
721             bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
722 
723             if (reads_header || mue_map->user_data_in_primitive_header) {
724                /* Primitive Shading Rate, Layer and Viewport live in the same
725                 * 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
726                 * is dword 2).
727                 */
728                if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
729                   prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
730 
731                if (per_prim_inputs_read & VARYING_BIT_LAYER)
732                   prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
733 
734                if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
735                   prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
736 
737                per_prim_inputs_read &= ~primitive_header_bits;
738             } else {
739                /* If fs doesn't need primitive header, then it won't be made
740                 * available through SBE_MESH, so we have to skip them when
741                 * calculating offset from start of per-prim data.
742                 */
743                per_prim_start_dw += mue_map->per_primitive_header_size_dw;
744                per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
745             }
746 
747             u_foreach_bit64(i, per_prim_inputs_read) {
748                int start = mue_map->start_dw[i];
749 
750                assert(start >= 0);
751                assert(mue_map->len_dw[i] > 0);
752 
753                assert(unsigned(start) >= per_prim_start_dw);
754                unsigned pos_dw = unsigned(start) - per_prim_start_dw;
755 
756                prog_data->urb_setup[i] = urb_next + pos_dw / 4;
757                prog_data->urb_setup_channel[i] = pos_dw % 4;
758             }
759 
760             urb_next = per_prim_size_dw / 4;
761          } else {
762             /* With no MUE map, we never read the primitive header, and
763              * per-primitive attributes won't be packed either, so just lay
764              * them out in varying order.
765              */
766             per_prim_inputs_read &= ~primitive_header_bits;
767 
768             for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
769                if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
770                   prog_data->urb_setup[i] = urb_next++;
771                }
772             }
773 
774             /* The actual setup attributes later must be aligned to a full GRF. */
775             urb_next = ALIGN(urb_next, 2);
776          }
777 
778          prog_data->num_per_primitive_inputs = urb_next;
779       }
780 
781       const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
782                                       VARYING_BIT_CLIP_DIST1;
783 
784       uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
785 
786       if (inputs_read & clip_dist_bits) {
787          assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
788          unique_fs_attrs &= ~clip_dist_bits;
789       }
790 
791       if (mue_map) {
792          unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
793          unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
794 
795          /* Per-Vertex header is available to fragment shader only if there's
796           * user data there.
797           */
798          if (!mue_map->user_data_in_vertex_header) {
799             per_vertex_start_dw += 8;
800             per_vertex_size_dw -= 8;
801          }
802 
803          /* In Mesh, CLIP_DIST slots are always at the beginning, because
804           * they come from MUE Vertex Header, not Per-Vertex Attributes.
805           */
806          if (inputs_read & clip_dist_bits) {
807             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
808             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
809          } else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
810             /* Clip distances are in MUE, but we are not reading them in FS. */
811             per_vertex_start_dw += 8;
812             per_vertex_size_dw -= 8;
813          }
814 
815          /* Per-Vertex attributes are laid out in order.  Because we always
816           * link Mesh and Fragment shaders, which slots are written and read
817           * by each of them will match. */
818          u_foreach_bit64(i, unique_fs_attrs) {
819             int start = mue_map->start_dw[i];
820 
821             assert(start >= 0);
822             assert(mue_map->len_dw[i] > 0);
823 
824             assert(unsigned(start) >= per_vertex_start_dw);
825             unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
826 
827             prog_data->urb_setup[i] = urb_next + pos_dw / 4;
828             prog_data->urb_setup_channel[i] = pos_dw % 4;
829          }
830 
831          urb_next += per_vertex_size_dw / 4;
832       } else {
833          /* If we don't have an MUE map, just lay down the inputs the FS reads
834           * in varying order, as we do for the legacy pipeline.
835           */
836          if (inputs_read & clip_dist_bits) {
837             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
838             prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
839          }
840 
841          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
842             if (unique_fs_attrs & BITFIELD64_BIT(i))
843                prog_data->urb_setup[i] = urb_next++;
844          }
845       }
846    } else {
847       assert(!nir->info.per_primitive_inputs);
848 
849       uint64_t vue_header_bits =
850          VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
851 
852       uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
853 
854       /* VUE header fields all live in the same URB slot, so we pass them
855        * as a single FS input attribute.  We want to only count them once.
856        */
857       if (inputs_read & vue_header_bits) {
858          unique_fs_attrs &= ~vue_header_bits;
859          unique_fs_attrs |= VARYING_BIT_PSIZ;
860       }
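      /* For example, a shader reading both gl_Layer and gl_ViewportIndex ends
       * up with only VARYING_BIT_PSIZ set here, so the shared VUE header slot
       * is counted a single time.
       */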
861 
862       if (util_bitcount64(unique_fs_attrs) <= 16) {
863          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
864           * first 16 varying inputs, so we can put them wherever we want.
865           * Just put them in order.
866           *
867           * This is useful because it means that (a) inputs not used by the
868           * fragment shader won't take up valuable register space, and (b) we
869           * won't have to recompile the fragment shader if it gets paired with
870           * a different vertex (or geometry) shader.
871           *
872           * VUE header fields share the same FS input attribute.
873           */
874          if (inputs_read & vue_header_bits) {
875             if (inputs_read & VARYING_BIT_PSIZ)
876                prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
877             if (inputs_read & VARYING_BIT_LAYER)
878                prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
879             if (inputs_read & VARYING_BIT_VIEWPORT)
880                prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
881 
882             urb_next++;
883          }
884 
885          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
886             if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
887                 BITFIELD64_BIT(i)) {
888                prog_data->urb_setup[i] = urb_next++;
889             }
890          }
891       } else {
892          /* We have enough input varyings that the SF/SBE pipeline stage can't
893           * arbitrarily rearrange them to suit our whim; we have to put them
894           * in an order that matches the output of the previous pipeline stage
895           * (geometry or vertex shader).
896           */
897 
898          /* Re-compute the VUE map here in the case that the one coming from
899           * geometry has more than one position slot (used for Primitive
900           * Replication).
901           */
902          struct intel_vue_map prev_stage_vue_map;
903          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
904                              key->input_slots_valid,
905                              nir->info.separate_shader, 1);
906 
907          int first_slot =
908             brw_compute_first_urb_slot_required(inputs_read,
909                                                 &prev_stage_vue_map);
910 
911          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
912          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
913               slot++) {
914             int varying = prev_stage_vue_map.slot_to_varying[slot];
915             if (varying != BRW_VARYING_SLOT_PAD &&
916                 (inputs_read & BRW_FS_VARYING_INPUT_MASK &
917                  BITFIELD64_BIT(varying))) {
918                prog_data->urb_setup[varying] = slot - first_slot;
919             }
920          }
921          urb_next = prev_stage_vue_map.num_slots - first_slot;
922       }
923    }
924 
925    prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
926    prog_data->inputs = inputs_read;
927 
928    brw_compute_urb_setup_index(prog_data);
929 }
930 static bool
931 is_used_in_not_interp_frag_coord(nir_def *def)
932 {
933    nir_foreach_use_including_if(src, def) {
934       if (nir_src_is_if(src))
935          return true;
936 
937       if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
938          return true;
939 
940       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
941       if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
942          return true;
943    }
944 
945    return false;
946 }
947 
948 /**
949  * Return a bitfield where bit n is set if barycentric interpolation mode n
950  * (see enum intel_barycentric_mode) is needed by the fragment shader.
951  *
952  * We examine the load_barycentric intrinsics rather than looking at input
953  * variables so that we catch interpolateAtCentroid() messages too, which
954  * also need the INTEL_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
955  */
956 static unsigned
957 brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
958                                      const struct brw_wm_prog_key *key,
959                                      const nir_shader *shader)
960 {
961    unsigned barycentric_interp_modes = 0;
962 
963    nir_foreach_function_impl(impl, shader) {
964       nir_foreach_block(block, impl) {
965          nir_foreach_instr(instr, block) {
966             if (instr->type != nir_instr_type_intrinsic)
967                continue;
968 
969             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
970             switch (intrin->intrinsic) {
971             case nir_intrinsic_load_barycentric_pixel:
972             case nir_intrinsic_load_barycentric_centroid:
973             case nir_intrinsic_load_barycentric_sample:
974             case nir_intrinsic_load_barycentric_at_sample:
975             case nir_intrinsic_load_barycentric_at_offset:
976                break;
977             default:
978                continue;
979             }
980 
981             /* Ignore WPOS; it doesn't require interpolation. */
982             if (!is_used_in_not_interp_frag_coord(&intrin->def))
983                continue;
984 
985             nir_intrinsic_op bary_op = intrin->intrinsic;
986             enum intel_barycentric_mode bary =
987                brw_barycentric_mode(key, intrin);
988 
989             barycentric_interp_modes |= 1 << bary;
990 
991             if (devinfo->needs_unlit_centroid_workaround &&
992                 bary_op == nir_intrinsic_load_barycentric_centroid)
993                barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
994          }
995       }
996    }
997 
998    return barycentric_interp_modes;
999 }
1000 
1001 /**
1002  * Return a bitfield where bit n is set if barycentric interpolation
1003  * mode n (see enum intel_barycentric_mode) is needed by the fragment
1004  * shader barycentric intrinsics that take an explicit offset or
1005  * sample as argument.
1006  */
1007 static unsigned
1008 brw_compute_offset_barycentric_interp_modes(const struct brw_wm_prog_key *key,
1009                                             const nir_shader *shader)
1010 {
1011    unsigned barycentric_interp_modes = 0;
1012 
1013    nir_foreach_function_impl(impl, shader) {
1014       nir_foreach_block(block, impl) {
1015          nir_foreach_instr(instr, block) {
1016             if (instr->type != nir_instr_type_intrinsic)
1017                continue;
1018 
1019             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
1020             if (intrin->intrinsic == nir_intrinsic_load_barycentric_at_offset ||
1021                 intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample)
1022                barycentric_interp_modes |= 1 << brw_barycentric_mode(key, intrin);
1023          }
1024       }
1025    }
1026 
1027    return barycentric_interp_modes;
1028 }
1029 
1030 static void
1031 brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
1032                         const nir_shader *shader)
1033 {
1034    prog_data->flat_inputs = 0;
1035 
1036    const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
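   /* urb_setup[] indices include any per-primitive slots laid out first, so
    * subtract per_vertex_start to express flat_inputs relative to the first
    * per-vertex attribute.
    */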
1037 
1038    nir_foreach_shader_in_variable(var, shader) {
1039       /* flat shading */
1040       if (var->data.interpolation != INTERP_MODE_FLAT)
1041          continue;
1042 
1043       if (var->data.per_primitive)
1044          continue;
1045 
1046       unsigned slots = glsl_count_attribute_slots(var->type, false);
1047       for (unsigned s = 0; s < slots; s++) {
1048          int input_index = prog_data->urb_setup[var->data.location + s] - per_vertex_start;
1049 
1050          if (input_index >= 0)
1051             prog_data->flat_inputs |= 1 << input_index;
1052       }
1053    }
1054 }
1055 
1056 static uint8_t
1057 computed_depth_mode(const nir_shader *shader)
1058 {
1059    if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1060       switch (shader->info.fs.depth_layout) {
1061       case FRAG_DEPTH_LAYOUT_NONE:
1062       case FRAG_DEPTH_LAYOUT_ANY:
1063          return BRW_PSCDEPTH_ON;
1064       case FRAG_DEPTH_LAYOUT_GREATER:
1065          return BRW_PSCDEPTH_ON_GE;
1066       case FRAG_DEPTH_LAYOUT_LESS:
1067          return BRW_PSCDEPTH_ON_LE;
1068       case FRAG_DEPTH_LAYOUT_UNCHANGED:
1069          /* We initially set this to OFF, but having the shader write the
1070           * depth means we allocate register space in the SEND message. The
1071           * difference between the SEND register count and the OFF state
1072           * programming makes the HW hang.
1073           *
1074           * Removing the depth writes also leads to test failures. So use
1075           * LesserThanOrEqual, which fits writing the same value
1076           * (unchanged/equal).
1077           *
1078           */
1079          return BRW_PSCDEPTH_ON_LE;
1080       }
1081    }
1082    return BRW_PSCDEPTH_OFF;
1083 }
1084 
1085 static void
1086 brw_nir_populate_wm_prog_data(nir_shader *shader,
1087                               const struct intel_device_info *devinfo,
1088                               const struct brw_wm_prog_key *key,
1089                               struct brw_wm_prog_data *prog_data,
1090                               const struct brw_mue_map *mue_map)
1091 {
1092    prog_data->uses_kill = shader->info.fs.uses_discard;
1093    prog_data->uses_omask = !key->ignore_sample_mask_out &&
1094       (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
1095    prog_data->max_polygons = 1;
1096    prog_data->computed_depth_mode = computed_depth_mode(shader);
1097    prog_data->computed_stencil =
1098       shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
1099 
1100    prog_data->sample_shading =
1101       shader->info.fs.uses_sample_shading ||
1102       shader->info.outputs_read;
1103 
1104    assert(key->multisample_fbo != INTEL_NEVER ||
1105           key->persample_interp == INTEL_NEVER);
1106 
1107    prog_data->persample_dispatch = key->persample_interp;
1108    if (prog_data->sample_shading)
1109       prog_data->persample_dispatch = INTEL_ALWAYS;
1110 
1111    /* We can only persample dispatch if we have a multisample FBO */
1112    prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
1113                                         key->multisample_fbo);
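   /* The MIN2() above clamps persample_dispatch (assuming the tristate enum is
    * ordered INTEL_NEVER < INTEL_SOMETIMES < INTEL_ALWAYS): e.g. INTEL_ALWAYS
    * with a never-multisampled FBO resolves to INTEL_NEVER.
    */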
1114 
1115    /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
1116     * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
1117     * to definitively tell whether alpha_to_coverage is on or off.
1118     */
1119    prog_data->alpha_to_coverage = key->alpha_to_coverage;
1120 
1121    prog_data->uses_sample_mask =
1122       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
1123 
1124    /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
1125     *
1126     *    "MSDISPMODE_PERSAMPLE is required in order to select
1127     *    POSOFFSET_SAMPLE"
1128     *
1129     * So we can only really get sample positions if we are doing real
1130     * per-sample dispatch.  If we need gl_SamplePosition and we don't have
1131     * persample dispatch, we hard-code it to 0.5.
1132     */
1133    prog_data->uses_pos_offset =
1134       prog_data->persample_dispatch != INTEL_NEVER &&
1135       (BITSET_TEST(shader->info.system_values_read,
1136                    SYSTEM_VALUE_SAMPLE_POS) ||
1137        BITSET_TEST(shader->info.system_values_read,
1138                    SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
1139 
1140    prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
1141    prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
1142    prog_data->inner_coverage = shader->info.fs.inner_coverage;
1143 
1144    prog_data->barycentric_interp_modes =
1145       brw_compute_barycentric_interp_modes(devinfo, key, shader);
1146 
1147    /* From the BDW PRM documentation for 3DSTATE_WM:
1148     *
1149     *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
1150     *     Sample or Non- perspective Sample barycentric coordinates."
1151     *
1152     * So cleanup any potentially set sample barycentric mode when not in per
1153     * sample dispatch.
1154     */
1155    if (prog_data->persample_dispatch == INTEL_NEVER) {
1156       prog_data->barycentric_interp_modes &=
1157          ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
1158    }
1159 
1160    if (devinfo->ver >= 20) {
1161       const unsigned offset_bary_modes =
1162          brw_compute_offset_barycentric_interp_modes(key, shader);
1163 
1164       prog_data->uses_npc_bary_coefficients =
1165          offset_bary_modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS;
1166       prog_data->uses_pc_bary_coefficients =
1167          offset_bary_modes & ~INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS;
1168       prog_data->uses_sample_offsets =
1169          offset_bary_modes & ((1 << INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE) |
1170                               (1 << INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE));
1171    }
1172 
1173    prog_data->uses_nonperspective_interp_modes =
1174       (prog_data->barycentric_interp_modes & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
1175       prog_data->uses_npc_bary_coefficients;
1176 
1177    /* The current VK_EXT_graphics_pipeline_library specification requires
1178     * coarse to be specified at compile time. But per-sample interpolation
1179     * can be dynamic. So we should never be in a situation where coarse &
1180     * persample_interp are respectively true & INTEL_ALWAYS.
1181     *
1182     * Coarse will be dynamically turned off when persample_interp is active.
1183     */
1184    assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);
1185 
1186    prog_data->coarse_pixel_dispatch =
1187       intel_sometimes_invert(prog_data->persample_dispatch);
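   /* Coarse dispatch starts out as the complement of per-sample dispatch
    * (NEVER when per-sample is ALWAYS and vice versa, SOMETIMES staying
    * SOMETIMES) before the additional restrictions below.
    */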
1188    if (!key->coarse_pixel ||
1189        prog_data->uses_omask ||
1190        prog_data->sample_shading ||
1191        prog_data->uses_sample_mask ||
1192        (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
1193        prog_data->computed_stencil) {
1194       prog_data->coarse_pixel_dispatch = INTEL_NEVER;
1195    }
1196 
1197    /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
1198     * Message Descriptor :
1199     *
1200     *    "Message Type. Specifies the type of message being sent when
1201     *     pixel-rate evaluation is requested :
1202     *
1203     *     Format = U2
1204     *       0: Per Message Offset (eval_snapped with immediate offset)
1205     *       1: Sample Position Offset (eval_sindex)
1206     *       2: Centroid Position Offset (eval_centroid)
1207     *       3: Per Slot Offset (eval_snapped with register offset)
1208     *
1209     *     Message Type. Specifies the type of message being sent when
1210     *     coarse-rate evaluation is requested :
1211     *
1212     *     Format = U2
1213     *       0: Coarse to Pixel Mapping Message (internal message)
1214     *       1: Reserved
1215     *       2: Coarse Centroid Position (eval_centroid)
1216     *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
1217     *
1218     * The Sample Position Offset is marked as reserved for coarse rate
1219     * evaluation and leads to hangs if we try to use it. So disable coarse
1220     * pixel shading if we have any intrinsic that will result in a pixel
1221     * interpolater message at sample.
1222     */
1223    if (intel_nir_pulls_at_sample(shader))
1224       prog_data->coarse_pixel_dispatch = INTEL_NEVER;
1225 
1226    /* We choose to always enable VMask prior to XeHP, as it would cause
1227     * us to lose out on the eliminate_find_live_channel() optimization.
1228     */
1229    prog_data->uses_vmask = devinfo->verx10 < 125 ||
1230                            shader->info.fs.needs_quad_helper_invocations ||
1231                            shader->info.uses_wide_subgroup_intrinsics ||
1232                            prog_data->coarse_pixel_dispatch != INTEL_NEVER;
1233 
1234    prog_data->uses_src_w =
1235       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
1236    prog_data->uses_src_depth =
1237       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
1238       prog_data->coarse_pixel_dispatch != INTEL_ALWAYS;
1239    prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients ||
1240       (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
1241        prog_data->coarse_pixel_dispatch != INTEL_NEVER);
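   /* In other words, the depth/W coefficients are needed whenever we may have
    * to reconstruct Z for coarse pixels (gl_FragCoord read while coarse
    * dispatch is possible) or when pulling perspective-corrected barycentric
    * coefficients.
    */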
1242 
1243    calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
1244    brw_compute_flat_inputs(prog_data, shader);
1245 }
1246 
1247 /* From the SKL PRM, Volume 16, Workarounds:
1248  *
1249  *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
1250  *              only header phases (R0-R2)
1251  *
1252  *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
1253  *       have been header only.
1254  *
1255  * Instead of enabling push constants one can alternatively enable one of the
1256  * inputs. Here one simply chooses "layer" which shouldn't impose much
1257  * overhead.
1258  */
1259 static void
1260 gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
1261 {
1262    if (wm_prog_data->num_varying_inputs)
1263       return;
1264 
1265    if (wm_prog_data->base.curb_read_length)
1266       return;
1267 
1268    wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
1269    wm_prog_data->num_varying_inputs = 1;
1270 
1271    brw_compute_urb_setup_index(wm_prog_data);
1272 }
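/* The workaround above claims a dummy "layer" varying in setup slot 0 so
 * that a non-header phase gets enabled.  It is applied from run_fs() below,
 * right before brw_assign_urb_setup(), and only when devinfo->ver == 9.
 */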
1273 
1274 static void
1275 brw_assign_urb_setup(fs_visitor &s)
1276 {
1277    assert(s.stage == MESA_SHADER_FRAGMENT);
1278 
1279    const struct intel_device_info *devinfo = s.devinfo;
1280    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data);
1281 
1282    int urb_start = s.payload().num_regs + prog_data->base.curb_read_length;
1283 
1284    /* Offset all the urb_setup[] index by the actual position of the
1285     * setup regs, now that the location of the constants has been chosen.
1286     */
1287    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1288       for (int i = 0; i < inst->sources; i++) {
1289          if (inst->src[i].file == ATTR) {
1290             /* ATTR brw_reg::nr in the FS is in units of logical scalar
1291              * inputs, each of which consumes 16B on Gfx4-Gfx12.  In
1292              * single polygon mode this leads to the following layout
1293              * of the vertex setup plane parameters in the ATTR
1294              * register file:
1295              *
1296              *  brw_reg::nr   Input   Comp0  Comp1  Comp2  Comp3
1297              *      0       Attr0.x  a1-a0  a2-a0   N/A    a0
1298              *      1       Attr0.y  a1-a0  a2-a0   N/A    a0
1299              *      2       Attr0.z  a1-a0  a2-a0   N/A    a0
1300              *      3       Attr0.w  a1-a0  a2-a0   N/A    a0
1301              *      4       Attr1.x  a1-a0  a2-a0   N/A    a0
1302              *     ...
1303              *
1304              * In multipolygon mode that no longer works since
1305              * different channels may be processing polygons with
1306              * different plane parameters, so each parameter above is
1307              * represented as a dispatch_width-wide vector:
1308              *
1309              *  brw_reg::nr     brw_reg::offset    Input      Comp0     ...    CompN
1310              *      0                 0          Attr0.x  a1[0]-a0[0] ... a1[N]-a0[N]
1311              *      0        4 * dispatch_width  Attr0.x  a2[0]-a0[0] ... a2[N]-a0[N]
1312              *      0        8 * dispatch_width  Attr0.x     N/A      ...     N/A
1313              *      0       12 * dispatch_width  Attr0.x    a0[0]     ...    a0[N]
1314              *      1                 0          Attr0.y  a1[0]-a0[0] ... a1[N]-a0[N]
1315              *     ...
1316              *
1317              * Note that many of the components on a single row above
1318              * are likely to be replicated multiple times (if, say, a
1319              * single SIMD thread is only processing 2 different
1320              * polygons), so plane parameters aren't actually stored
1321              * in GRF memory with that layout to avoid wasting space.
1322              * Instead we compose ATTR register regions with a 2D
1323              * region that walks through the parameters of each
1324              * polygon with the correct stride, reading the parameter
1325              * corresponding to each channel directly from the PS
1326              * thread payload.
1327              *
1328              * The latter layout corresponds to a param_width equal to
1329              * dispatch_width, while the former (scalar parameter)
1330              * layout has a param_width of 1.
1331              *
1332              * Gfx20+ represent plane parameters in a format similar
1333              * to the above, except the parameters are packed in 12B
1334              * and ordered like "a0, a1-a0, a2-a0" instead of the
1335              * above vec4 representation with a missing component.
1336              */
1337             const unsigned param_width = (s.max_polygons > 1 ? s.dispatch_width : 1);
1338 
1339             /* Size of a single scalar component of a plane parameter
1340              * in bytes.
1341              */
1342             const unsigned chan_sz = 4;
1343             struct brw_reg reg;
1344             assert(s.max_polygons > 0);
1345 
1346             /* Calculate the base register on the thread payload of
1347              * either the block of vertex setup data or the block of
1348              * per-primitive constant data depending on whether we're
1349              * accessing a primitive or vertex input.  Also calculate
1350              * the index of the input within that block.
1351              */
1352             const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1353             const unsigned base = urb_start +
1354                (per_prim ? 0 :
1355                 ALIGN(prog_data->num_per_primitive_inputs / 2,
1356                       reg_unit(devinfo)) * s.max_polygons);
1357             const unsigned idx = per_prim ? inst->src[i].nr :
1358                inst->src[i].nr - prog_data->num_per_primitive_inputs;
1359 
1360             /* Translate the offset within the param_width-wide
1361              * representation described above into an offset and a
1362              * grf, which contains the plane parameters for the first
1363              * polygon processed by the thread.
1364              */
1365             if (devinfo->ver >= 20 && !per_prim) {
1366                /* Gfx20+ is able to pack 5 logical input components
1367                 * per 64B register for vertex setup data.
1368                 */
1369                const unsigned grf = base + idx / 5 * 2 * s.max_polygons;
1370                assert(inst->src[i].offset / param_width < 12);
1371                const unsigned delta = idx % 5 * 12 +
1372                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1373                   inst->src[i].offset % chan_sz;
1374                reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1375                                  delta);
1376             } else {
1377                /* Earlier platforms and per-primitive block pack 2 logical
1378                 * input components per 32B register.
1379                 */
1380                const unsigned grf = base + idx / 2 * s.max_polygons;
1381                assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1382                const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1383                   inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1384                   inst->src[i].offset % chan_sz;
1385                reg = byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1386                                  delta);
1387             }
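            /* Illustrative example: with a single polygon (param_width == 1)
             * on a pre-Gfx20 part and no per-primitive inputs, a read of
             * Attr0.w's second plane component (idx == brw_reg::nr == 3,
             * offset == 4) lands at grf == base + 3 / 2 == base + 1 and
             * delta == (3 % 2) * 16 + 4 == 20, i.e. byte 20 of that GRF,
             * matching the two-inputs-per-32B-register layout described above.
             */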
1388 
1389             if (s.max_polygons > 1) {
1390                assert(devinfo->ver >= 12);
1391                /* Misaligned channel strides that would lead to
1392                 * cross-channel access in the representation above are
1393                 * disallowed.
1394                 */
1395                assert(inst->src[i].stride * brw_type_size_bytes(inst->src[i].type) == chan_sz);
1396 
1397                /* Number of channels processing the same polygon. */
1398                const unsigned poly_width = s.dispatch_width / s.max_polygons;
1399                assert(s.dispatch_width % s.max_polygons == 0);
1400 
1401                /* However, accessing a subset of channels of a parameter
1402                 * vector starting from "chan" is necessary to handle
1403                 * SIMD-lowered instructions.
1404                 */
1405                const unsigned chan = inst->src[i].offset %
1406                   (param_width * chan_sz) / chan_sz;
1407                assert(chan < s.dispatch_width);
1408                assert(chan % poly_width == 0);
1409                const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
1410                reg = byte_offset(reg, chan / poly_width * reg_size);
1411 
1412                if (inst->exec_size > poly_width) {
1413                   /* Accessing the parameters for multiple polygons.
1414                    * Corresponding parameters for different polygons
1415                    * are stored a GRF apart on the thread payload, so
1416                    * use that as vertical stride.
1417                    */
1418                   const unsigned vstride = reg_size / brw_type_size_bytes(inst->src[i].type);
1419                   assert(vstride <= 32);
1420                   assert(chan % poly_width == 0);
1421                   reg = stride(reg, vstride, poly_width, 0);
1422                } else {
1423                   /* Accessing one parameter for a single polygon --
1424                    * Translate to a scalar region.
1425                    */
1426                   assert(chan % poly_width + inst->exec_size <= poly_width);
1427                   reg = stride(reg, 0, 1, 0);
1428                }
1429 
1430             } else {
1431                const unsigned width = inst->src[i].stride == 0 ?
1432                   1 : MIN2(inst->exec_size, 8);
1433                reg = stride(reg, width * inst->src[i].stride,
1434                             width, inst->src[i].stride);
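               /* E.g. a SIMD16 access with a channel stride of 1 yields an
                * <8;8,1> region here, while a stride of 0 yields a <0;1,0>
                * scalar region.
                */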
1435             }
1436 
1437             reg.abs = inst->src[i].abs;
1438             reg.negate = inst->src[i].negate;
1439             inst->src[i] = reg;
1440          }
1441       }
1442    }
1443 
1444    /* Each attribute is 4 setup channels, each of which is half a reg,
1445     * but they may be replicated multiple times for multipolygon
1446     * dispatch.
1447     */
1448    s.first_non_payload_grf += prog_data->num_varying_inputs * 2 * s.max_polygons;
1449 
1450    /* Unlike regular attributes, per-primitive attributes have all 4 channels
1451     * in the same slot, so each GRF can store two slots.
1452     */
1453    assert(prog_data->num_per_primitive_inputs % 2 == 0);
1454    s.first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * s.max_polygons;
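   /* For example, a single-polygon shader with 4 varying inputs and 2
    * per-primitive inputs advances first_non_payload_grf by
    * 4 * 2 * 1 + 2 / 2 * 1 = 9 GRFs.
    */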
1455 }
1456 
1457 static bool
1458 run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
1459 {
1460    const struct intel_device_info *devinfo = s.devinfo;
1461    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(s.prog_data);
1462    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) s.key;
1463    const fs_builder bld = fs_builder(&s).at_end();
1464    const nir_shader *nir = s.nir;
1465 
1466    assert(s.stage == MESA_SHADER_FRAGMENT);
1467 
1468    s.payload_ = new fs_thread_payload(s, s.source_depth_to_render_target);
1469 
1470    if (nir->info.ray_queries > 0)
1471       s.limit_dispatch_width(16, "SIMD32 not supported with ray queries.\n");
1472 
1473    if (do_rep_send) {
1474       assert(s.dispatch_width == 16);
1475       brw_emit_repclear_shader(s);
1476    } else {
1477       if (nir->info.inputs_read > 0 ||
1478           BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
1479           (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
1480          brw_emit_interpolation_setup(s);
1481       }
1482 
1483       /* We handle discards by keeping track of the still-live pixels in f0.1.
1484        * Initialize it with the dispatched pixels.
1485        */
1486       if (devinfo->ver >= 20 || wm_prog_data->uses_kill) {
1487          const unsigned lower_width = MIN2(s.dispatch_width, 16);
1488          for (unsigned i = 0; i < s.dispatch_width / lower_width; i++) {
1489             /* According to the "PS Thread Payload for Normal
1490              * Dispatch" pages on the BSpec, the dispatch mask is
1491              * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
1492              * gfx6+.
1493              */
1494             const brw_reg dispatch_mask =
1495                devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
1496                                     brw_vec1_grf(i + 1, 7);
1497             bld.exec_all().group(1, 0)
1498                .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
1499                     retype(dispatch_mask, BRW_TYPE_UW));
1500          }
1501       }
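      /* For SIMD32 the loop above runs twice, seeding the flag for each
       * 16-wide half from R1.7 and R2.7 (R0.15 and R1.15 on Gfx20+)
       * respectively.
       */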
1502 
1503       if (nir->info.writes_memory)
1504          wm_prog_data->has_side_effects = true;
1505 
1506       nir_to_brw(&s);
1507 
1508       if (s.failed)
1509          return false;
1510 
1511       brw_emit_fb_writes(s);
1512 
1513       brw_calculate_cfg(s);
1514 
1515       brw_optimize(s);
1516 
1517       s.assign_curb_setup();
1518 
1519       if (devinfo->ver == 9)
1520          gfx9_ps_header_only_workaround(wm_prog_data);
1521 
1522       brw_assign_urb_setup(s);
1523 
1524       brw_lower_3src_null_dest(s);
1525       brw_workaround_memory_fence_before_eot(s);
1526       brw_workaround_emit_dummy_mov_instruction(s);
1527 
1528       brw_allocate_registers(s, allow_spilling);
1529 
1530       brw_workaround_source_arf_before_eot(s);
1531    }
1532 
1533    return !s.failed;
1534 }
1535 
1536 const unsigned *
1537 brw_compile_fs(const struct brw_compiler *compiler,
1538                struct brw_compile_fs_params *params)
1539 {
1540    struct nir_shader *nir = params->base.nir;
1541    const struct brw_wm_prog_key *key = params->key;
1542    struct brw_wm_prog_data *prog_data = params->prog_data;
1543    bool allow_spilling = params->allow_spilling;
1544    const bool debug_enabled =
1545       brw_should_print_shader(nir, params->base.debug_flag ?
1546                                    params->base.debug_flag : DEBUG_WM);
1547 
1548    prog_data->base.stage = MESA_SHADER_FRAGMENT;
1549    prog_data->base.ray_queries = nir->info.ray_queries;
1550    prog_data->base.total_scratch = 0;
1551 
1552    const struct intel_device_info *devinfo = compiler->devinfo;
1553    const unsigned max_subgroup_size = 32;
1554 
1555    brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
1556    brw_nir_lower_fs_inputs(nir, devinfo, key);
1557    brw_nir_lower_fs_outputs(nir);
1558 
1559    /* From the SKL PRM, Volume 7, "Alpha Coverage":
1560     *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
1561     *   hardware, regardless of the state setting for this feature."
1562     */
1563    if (key->alpha_to_coverage != INTEL_NEVER) {
1564       /* Run the constant folding optimization in order to get the correct
1565        * source offset needed to identify the render target 0 store
1566        * instruction in the emit_alpha_to_coverage pass.
1567        */
1568       NIR_PASS(_, nir, nir_opt_constant_folding);
1569       NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
1570    }
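   /* Roughly, brw_nir_lower_alpha_to_coverage() folds the alpha-to-coverage
    * behaviour into the shader-written sample mask, since the fixed-function
    * path is ignored whenever oMask is written, per the PRM quote above.
    */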
1571 
1572    NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
1573    brw_postprocess_nir(nir, compiler, debug_enabled,
1574                        key->base.robust_flags);
1575 
1576    brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
1577                                  params->mue_map);
1578 
1579    std::unique_ptr<fs_visitor> v8, v16, v32, vmulti;
1580    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
1581       *multi_cfg = NULL;
1582    float throughput = 0;
1583    bool has_spilled = false;
1584 
1585    if (devinfo->ver < 20) {
1586       v8 = std::make_unique<fs_visitor>(compiler, &params->base, key,
1587                                         prog_data, nir, 8, 1,
1588                                         params->base.stats != NULL,
1589                                         debug_enabled);
1590       if (!run_fs(*v8, allow_spilling, false /* do_rep_send */)) {
1591          params->base.error_str = ralloc_strdup(params->base.mem_ctx,
1592                                                 v8->fail_msg);
1593          return NULL;
1594       } else if (INTEL_SIMD(FS, 8)) {
1595          simd8_cfg = v8->cfg;
1596 
1597          assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
1598          prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
1599 
1600          const performance &perf = v8->performance_analysis.require();
1601          throughput = MAX2(throughput, perf.throughput);
1602          has_spilled = v8->spilled_any_registers;
1603          allow_spilling = false;
1604       }
1605    }
1606 
1607    if (key->coarse_pixel && devinfo->ver < 20) {
1608       if (prog_data->dual_src_blend) {
1609          v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
1610                                   " use SIMD8 messages.\n");
1611       }
1612       v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
1613                                " pixel shading.\n");
1614    }
1615 
1616    if (!has_spilled &&
1617        (!v8 || v8->max_dispatch_width >= 16) &&
1618        (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
1619       /* Try a SIMD16 compile */
1620       v16 = std::make_unique<fs_visitor>(compiler, &params->base, key,
1621                                          prog_data, nir, 16, 1,
1622                                          params->base.stats != NULL,
1623                                          debug_enabled);
1624       if (v8)
1625          v16->import_uniforms(v8.get());
1626       if (!run_fs(*v16, allow_spilling, params->use_rep_send)) {
1627          brw_shader_perf_log(compiler, params->base.log_data,
1628                              "SIMD16 shader failed to compile: %s\n",
1629                              v16->fail_msg);
1630       } else {
1631          simd16_cfg = v16->cfg;
1632 
1633          assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
1634          prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
1635 
1636          const performance &perf = v16->performance_analysis.require();
1637          throughput = MAX2(throughput, perf.throughput);
1638          has_spilled = v16->spilled_any_registers;
1639          allow_spilling = false;
1640       }
1641    }
1642 
1643    const bool simd16_failed = v16 && !simd16_cfg;
1644 
1645    /* Currently, the compiler only supports SIMD32 on SNB+ */
1646    if (!has_spilled &&
1647        (!v8 || v8->max_dispatch_width >= 32) &&
1648        (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
1649        !simd16_failed &&
1650        INTEL_SIMD(FS, 32)) {
1651       /* Try a SIMD32 compile */
1652       v32 = std::make_unique<fs_visitor>(compiler, &params->base, key,
1653                                          prog_data, nir, 32, 1,
1654                                          params->base.stats != NULL,
1655                                          debug_enabled);
1656       if (v8)
1657          v32->import_uniforms(v8.get());
1658       else if (v16)
1659          v32->import_uniforms(v16.get());
1660 
1661       if (!run_fs(*v32, allow_spilling, false)) {
1662          brw_shader_perf_log(compiler, params->base.log_data,
1663                              "SIMD32 shader failed to compile: %s\n",
1664                              v32->fail_msg);
1665       } else {
1666          const performance &perf = v32->performance_analysis.require();
1667 
1668          if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
1669             brw_shader_perf_log(compiler, params->base.log_data,
1670                                 "SIMD32 shader inefficient\n");
1671          } else {
1672             simd32_cfg = v32->cfg;
1673 
1674             assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
1675             prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
1676 
1677             throughput = MAX2(throughput, perf.throughput);
1678          }
1679       }
1680    }
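   /* Note the pattern above: once a narrower variant compiles, allow_spilling
    * is cleared so wider variants are only kept if they register-allocate
    * without spilling, and a spill in a narrower variant stops wider compiles
    * entirely.  SIMD32 is additionally discarded unless its estimated
    * throughput beats the best narrower variant or INTEL_DEBUG=do32 is set.
    */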
1681 
1682    if (devinfo->ver >= 12 && !has_spilled &&
1683        params->max_polygons >= 2 && !key->coarse_pixel) {
1684       fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
1685       assert(vbase);
1686 
1687       if (devinfo->ver >= 20 &&
1688           params->max_polygons >= 4 &&
1689           vbase->max_dispatch_width >= 32 &&
1690           4 * prog_data->num_varying_inputs <= MAX_VARYING &&
1691           INTEL_SIMD(FS, 4X8)) {
1692          /* Try a quad-SIMD8 compile */
1693          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
1694                                                prog_data, nir, 32, 4,
1695                                                params->base.stats != NULL,
1696                                                debug_enabled);
1697          vmulti->import_uniforms(vbase);
1698          if (!run_fs(*vmulti, false, params->use_rep_send)) {
1699             brw_shader_perf_log(compiler, params->base.log_data,
1700                                 "Quad-SIMD8 shader failed to compile: %s\n",
1701                                 vmulti->fail_msg);
1702          } else {
1703             multi_cfg = vmulti->cfg;
1704             assert(!vmulti->spilled_any_registers);
1705          }
1706       }
1707 
1708       if (!multi_cfg && devinfo->ver >= 20 &&
1709           vbase->max_dispatch_width >= 32 &&
1710           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
1711           INTEL_SIMD(FS, 2X16)) {
1712          /* Try a dual-SIMD16 compile */
1713          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
1714                                                prog_data, nir, 32, 2,
1715                                                params->base.stats != NULL,
1716                                                debug_enabled);
1717          vmulti->import_uniforms(vbase);
1718          if (!run_fs(*vmulti, false, params->use_rep_send)) {
1719             brw_shader_perf_log(compiler, params->base.log_data,
1720                                 "Dual-SIMD16 shader failed to compile: %s\n",
1721                                 vmulti->fail_msg);
1722          } else {
1723             multi_cfg = vmulti->cfg;
1724             assert(!vmulti->spilled_any_registers);
1725          }
1726       }
1727 
1728       if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
1729           2 * prog_data->num_varying_inputs <= MAX_VARYING &&
1730           INTEL_SIMD(FS, 2X8)) {
1731          /* Try a dual-SIMD8 compile */
1732          vmulti = std::make_unique<fs_visitor>(compiler, &params->base, key,
1733                                                prog_data, nir, 16, 2,
1734                                                params->base.stats != NULL,
1735                                                debug_enabled);
1736          vmulti->import_uniforms(vbase);
1737          if (!run_fs(*vmulti, allow_spilling, params->use_rep_send)) {
1738             brw_shader_perf_log(compiler, params->base.log_data,
1739                                 "Dual-SIMD8 shader failed to compile: %s\n",
1740                                 vmulti->fail_msg);
1741          } else {
1742             multi_cfg = vmulti->cfg;
1743          }
1744       }
1745 
1746       if (multi_cfg) {
1747          assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
1748          prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
1749       }
1750    }
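   /* Multipolygon candidates are tried widest first: quad-SIMD8 (SIMD32,
    * 4 polygons) and dual-SIMD16 (SIMD32, 2 polygons) on Gfx20+, then
    * dual-SIMD8 (SIMD16, 2 polygons) on Gfx12+.  The first candidate that
    * compiles successfully provides multi_cfg and overrides the dispatch
    * GRF start register.
    */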
1751 
1752    /* When the caller requests a repclear shader, they want SIMD16-only */
1753    if (params->use_rep_send)
1754       simd8_cfg = NULL;
1755 
1756    brw_generator g(compiler, &params->base, &prog_data->base,
1757                   MESA_SHADER_FRAGMENT);
1758 
1759    if (unlikely(debug_enabled)) {
1760       g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
1761                                      "%s fragment shader %s",
1762                                      nir->info.label ?
1763                                         nir->info.label : "unnamed",
1764                                      nir->info.name));
1765    }
1766 
1767    struct brw_compile_stats *stats = params->base.stats;
1768    uint32_t max_dispatch_width = 0;
1769 
1770    if (multi_cfg) {
1771       prog_data->dispatch_multi = vmulti->dispatch_width;
1772       prog_data->max_polygons = vmulti->max_polygons;
1773       g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
1774                       vmulti->performance_analysis.require(),
1775                       stats, vmulti->max_polygons);
1776       stats = stats ? stats + 1 : NULL;
1777       max_dispatch_width = vmulti->dispatch_width;
1778 
1779    } else if (simd8_cfg) {
1780       prog_data->dispatch_8 = true;
1781       g.generate_code(simd8_cfg, 8, v8->shader_stats,
1782                       v8->performance_analysis.require(), stats, 1);
1783       stats = stats ? stats + 1 : NULL;
1784       max_dispatch_width = 8;
1785    }
1786 
1787    if (simd16_cfg) {
1788       prog_data->dispatch_16 = true;
1789       prog_data->prog_offset_16 = g.generate_code(
1790          simd16_cfg, 16, v16->shader_stats,
1791          v16->performance_analysis.require(), stats, 1);
1792       stats = stats ? stats + 1 : NULL;
1793       max_dispatch_width = 16;
1794    }
1795 
1796    if (simd32_cfg) {
1797       prog_data->dispatch_32 = true;
1798       prog_data->prog_offset_32 = g.generate_code(
1799          simd32_cfg, 32, v32->shader_stats,
1800          v32->performance_analysis.require(), stats, 1);
1801       stats = stats ? stats + 1 : NULL;
1802       max_dispatch_width = 32;
1803    }
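   /* prog_offset_16/prog_offset_32 record where each variant begins within
    * the combined assembly, so that state setup can later derive the
    * corresponding kernel start pointers from them.
    */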
1804 
1805    for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
1806       s->max_dispatch_width = max_dispatch_width;
1807 
1808    g.add_const_data(nir->constant_data, nir->constant_data_size);
1809    return g.get_assembly();
1810 }
1811