1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file elk_fs_visitor.cpp
25 *
26 * This file supports generating the FS LIR from the GLSL IR. The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
29 */
30 #include "elk_eu.h"
31 #include "elk_fs.h"
32 #include "elk_fs_builder.h"
33 #include "elk_nir.h"
34 #include "compiler/glsl_types.h"
35
36 using namespace elk;
37
38 /* Input data is organized with the per-primitive values first, followed
39 * by the per-vertex values. The per-vertex values have interpolation
40 * information associated with them, so use 4 components for each value.
41 */
42
43 /* The register location here is relative to the start of the URB
44 * data. It will get adjusted to be a real location before
45 * generate_code() time.
46 */
47 elk_fs_reg
48 elk_fs_visitor::interp_reg(const fs_builder &bld, unsigned location,
49 unsigned channel, unsigned comp)
50 {
51 assert(stage == MESA_SHADER_FRAGMENT);
52 assert(BITFIELD64_BIT(location) & ~nir->info.per_primitive_inputs);
53
54 const struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
55
56 assert(prog_data->urb_setup[location] >= 0);
57 unsigned nr = prog_data->urb_setup[location];
58 channel += prog_data->urb_setup_channel[location];
59
60 /* Adjust so we start counting from the first per_vertex input. */
61 assert(nr >= prog_data->num_per_primitive_inputs);
62 nr -= prog_data->num_per_primitive_inputs;
63
64 const unsigned per_vertex_start = prog_data->num_per_primitive_inputs;
65 const unsigned regnr = per_vertex_start + (nr * 4) + channel;
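/* A worked example (hypothetical numbers): with 2 per-primitive inputs,
 * urb_setup[location] == 3 and channel == 1 after adding
 * urb_setup_channel[location], nr becomes 1 and regnr == 2 + (1 * 4) + 1 == 7.
 */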
66
67 if (max_polygons > 1) {
68 /* In multipolygon dispatch each plane parameter is a
69 * dispatch_width-wide SIMD vector (see comment in
70 * assign_urb_setup()), so we need to use offset() instead of
71 * component() to select the specified parameter.
72 */
73 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
74 bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD),
75 dispatch_width, comp));
76 return retype(tmp, ELK_REGISTER_TYPE_F);
77 } else {
78 return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp);
79 }
80 }
81
82 /* The register location here is relative to the start of the URB
83 * data. It will get adjusted to be a real location before
84 * generate_code() time.
85 */
86 elk_fs_reg
87 elk_fs_visitor::per_primitive_reg(const fs_builder &bld, int location, unsigned comp)
88 {
89 assert(stage == MESA_SHADER_FRAGMENT);
90 assert(BITFIELD64_BIT(location) & nir->info.per_primitive_inputs);
91
92 const struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
93
94 comp += prog_data->urb_setup_channel[location];
95
96 assert(prog_data->urb_setup[location] >= 0);
97
98 const unsigned regnr = prog_data->urb_setup[location] + comp / 4;
99
100 assert(regnr < prog_data->num_per_primitive_inputs);
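/* A worked example (hypothetical numbers): with urb_setup[location] == 1 and
 * comp == 5 after adding urb_setup_channel[location], this reads component
 * 5 % 4 == 1 of ATTR register 1 + 5 / 4 == 2.
 */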
101
102 if (max_polygons > 1) {
103 /* In multipolygon dispatch each primitive constant is a
104 * dispatch_width-wide SIMD vector (see comment in
105 * assign_urb_setup()), so we need to use offset() instead of
106 * component() to select the specified parameter.
107 */
108 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
109 bld.MOV(tmp, offset(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_UD),
110 dispatch_width, comp % 4));
111 return retype(tmp, ELK_REGISTER_TYPE_F);
112 } else {
113 return component(elk_fs_reg(ATTR, regnr, ELK_REGISTER_TYPE_F), comp % 4);
114 }
115 }
116
117 /** Emits the interpolation for the varying inputs. */
118 void
119 elk_fs_visitor::emit_interpolation_setup_gfx4()
120 {
121 struct elk_reg g1_uw = retype(elk_vec1_grf(1, 0), ELK_REGISTER_TYPE_UW);
122
123 fs_builder abld = fs_builder(this).at_end().annotate("compute pixel centers");
124 this->pixel_x = vgrf(glsl_uint_type());
125 this->pixel_y = vgrf(glsl_uint_type());
126 this->pixel_x.type = ELK_REGISTER_TYPE_UW;
127 this->pixel_y.type = ELK_REGISTER_TYPE_UW;
128 abld.ADD(this->pixel_x,
129 elk_fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
130 elk_fs_reg(elk_imm_v(0x10101010)));
131 abld.ADD(this->pixel_y,
132 elk_fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
133 elk_fs_reg(elk_imm_v(0x11001100)));
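/* The packed immediates above are vectors of 4-bit values: 0x10101010 adds
 * {0,1,0,1,...} to the replicated subspan X coordinates and 0x11001100 adds
 * {0,0,1,1,...} to the replicated Y coordinates, expanding each 2x2 subspan
 * origin into its four pixel coordinates (see the longer explanation in
 * emit_interpolation_setup_gfx6()).
 */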
134
135 const fs_builder bld = fs_builder(this).at_end();
136 abld = bld.annotate("compute pixel deltas from v0");
137
138 this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL] =
139 vgrf(glsl_vec2_type());
140 const elk_fs_reg &delta_xy = this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL];
141 const elk_fs_reg xstart(negate(elk_vec1_grf(1, 0)));
142 const elk_fs_reg ystart(negate(elk_vec1_grf(1, 1)));
143
144 if (devinfo->has_pln) {
145 for (unsigned i = 0; i < dispatch_width / 8; i++) {
146 abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
147 quarter(this->pixel_x, i), xstart);
148 abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
149 quarter(this->pixel_y, i), ystart);
150 }
151 } else {
152 abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
153 abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
154 }
155
156 this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);
157
158 /* The SF program automatically handles doing the perspective correction or
159 * not based on wm_prog_data::interp_mode[] so we can use the same pixel
160 * offsets for both perspective and non-perspective.
161 */
162 this->delta_xy[ELK_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
163 this->delta_xy[ELK_BARYCENTRIC_PERSPECTIVE_PIXEL];
164
165 abld = bld.annotate("compute pos.w and 1/pos.w");
166 /* Compute wpos.w. It's always in our setup, since it's needed to
167 * interpolate the other attributes.
168 */
169 this->wpos_w = vgrf(glsl_float_type());
170 abld.emit(ELK_FS_OPCODE_LINTERP, wpos_w, delta_xy,
171 interp_reg(abld, VARYING_SLOT_POS, 3, 0));
172 /* Compute the pixel 1/W value from wpos.w. */
173 this->pixel_w = vgrf(glsl_float_type());
174 abld.emit(ELK_SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
175 }
176
177 /** Emits the interpolation for the varying inputs. */
178 void
179 elk_fs_visitor::emit_interpolation_setup_gfx6()
180 {
181 const fs_builder bld = fs_builder(this).at_end();
182 fs_builder abld = bld.annotate("compute pixel centers");
183
184 this->pixel_x = vgrf(glsl_float_type());
185 this->pixel_y = vgrf(glsl_float_type());
186
187 const struct elk_wm_prog_key *wm_key = (elk_wm_prog_key*) this->key;
188 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(prog_data);
189
190 elk_fs_reg int_sample_offset_x, int_sample_offset_y; /* Used on Gen12HP+ */
191 elk_fs_reg int_sample_offset_xy; /* Used on Gen8+ */
192 elk_fs_reg half_int_sample_offset_x, half_int_sample_offset_y;
193 if (wm_prog_data->coarse_pixel_dispatch != ELK_ALWAYS) {
194 /* The thread payload only delivers subspan locations (ss0, ss1,
195 * ss2, ...). Since a subspan covers a 2x2 pixel block, we need to
196 * generate 4 pixel coordinates out of each subspan location. We do this
197 * by replicating a subspan coordinate 4 times and adding an offset of 1
198 * in each direction from the initial top left (tl) location to generate
199 * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
200 * (br = +1 in x, +1 in y).
201 *
202 * The locations we build look like this in SIMD8 :
203 *
204 * ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
205 *
206 * The value 0x11001010 is a vector of 8 half bytes (nibbles). It adds the
207 * following to generate the 4 pixel coordinates out of subspan 0:
208 *
209 * 0x
210 * 1 : ss0.y + 1 -> ss0.br.y
211 * 1 : ss0.y + 1 -> ss0.bl.y
212 * 0 : ss0.y + 0 -> ss0.tr.y
213 * 0 : ss0.y + 0 -> ss0.tl.y
214 * 1 : ss0.x + 1 -> ss0.br.x
215 * 0 : ss0.x + 0 -> ss0.bl.x
216 * 1 : ss0.x + 1 -> ss0.tr.x
217 * 0 : ss0.x + 0 -> ss0.tl.x
218 *
219 * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
220 * coordinates out of 2 subspan coordinates in a single ADD instruction
221 * (twice the operation above).
222 */
223 int_sample_offset_xy = elk_fs_reg(elk_imm_v(0x11001010));
224 half_int_sample_offset_x = elk_fs_reg(elk_imm_uw(0));
225 half_int_sample_offset_y = elk_fs_reg(elk_imm_uw(0));
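/* A worked example: if subspan 0's origin is at (16, 8), the packed nibbles
 * add {0,1,0,1} to the four replicated X values and {0,0,1,1} to the four
 * replicated Y values, yielding the pixel coordinates (16,8) (17,8) (16,9)
 * (17,9).
 */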
226 /* On Gfx12.5, because of regioning restrictions, the interpolation code
227 * is slightly different and works off X & Y only inputs. The ordering
228 * of the half bytes here is a bit odd, with each subspan replicated
229 * twice and every other element discarded:
230 *
231 * ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
232 * X offset: 0 0 1 0 0 0 1 0
233 * Y offset: 0 0 0 0 1 0 1 0
234 */
235 int_sample_offset_x = elk_fs_reg(elk_imm_v(0x01000100));
236 int_sample_offset_y = elk_fs_reg(elk_imm_v(0x01010000));
237 }
238
239 elk_fs_reg int_coarse_offset_x, int_coarse_offset_y; /* Used on Gen12HP+ */
240 elk_fs_reg int_coarse_offset_xy; /* Used on Gen8+ */
241 elk_fs_reg half_int_coarse_offset_x, half_int_coarse_offset_y;
242 if (wm_prog_data->coarse_pixel_dispatch != ELK_NEVER) {
243 /* In coarse pixel dispatch we have to do the same ADD instruction that
244 * we do in normal per pixel dispatch, except this time we're not adding
245 * 1 in each direction, but instead the coarse pixel size.
246 *
247 * The coarse pixel size is delivered as 2 u8 in r1.0
248 */
249 struct elk_reg r1_0 = retype(elk_vec1_reg(ELK_GENERAL_REGISTER_FILE, 1, 0), ELK_REGISTER_TYPE_UB);
250
251 const fs_builder dbld =
252 abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);
253
254 if (devinfo->verx10 >= 125) {
255 /* To build the array of half bytes we do an AND operation with the
256 * right mask in X.
257 */
258 int_coarse_offset_x = dbld.vgrf(ELK_REGISTER_TYPE_UW);
259 dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), elk_imm_v(0x0f000f00));
260
261 /* And the right mask in Y. */
262 int_coarse_offset_y = dbld.vgrf(ELK_REGISTER_TYPE_UW);
263 dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), elk_imm_v(0x0f0f0000));
264 } else {
265 /* To build the array of half bytes we do an AND operation with the
266 * right mask in X.
267 */
268 int_coarse_offset_x = dbld.vgrf(ELK_REGISTER_TYPE_UW);
269 dbld.AND(int_coarse_offset_x, byte_offset(r1_0, 0), elk_imm_v(0x0000f0f0));
270
271 /* And the right mask in Y. */
272 int_coarse_offset_y = dbld.vgrf(ELK_REGISTER_TYPE_UW);
273 dbld.AND(int_coarse_offset_y, byte_offset(r1_0, 1), elk_imm_v(0xff000000));
274
275 /* Finally OR the 2 registers. */
276 int_coarse_offset_xy = dbld.vgrf(ELK_REGISTER_TYPE_UW);
277 dbld.OR(int_coarse_offset_xy, int_coarse_offset_x, int_coarse_offset_y);
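/* A sketch of the result, assuming e.g. a 2x4 coarse pixel size in r1.0:
 * the two ANDs produce {0,2,0,2,0,0,0,0} and {0,0,0,0,0,0,4,4}, and the OR
 * yields {0,2,0,2,0,0,4,4}, which plays the same role for coarse pixels
 * that 0x11001010 plays for 1x1 pixels above.
 */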
278 }
279
280 /* Also compute the half coarse pixel size used to center coarse pixels. */
281 half_int_coarse_offset_x = bld.vgrf(ELK_REGISTER_TYPE_UW);
282 half_int_coarse_offset_y = bld.vgrf(ELK_REGISTER_TYPE_UW);
283
284 bld.SHR(half_int_coarse_offset_x, suboffset(r1_0, 0), elk_imm_ud(1));
285 bld.SHR(half_int_coarse_offset_y, suboffset(r1_0, 1), elk_imm_ud(1));
286 }
287
288 elk_fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
289 elk_fs_reg int_pixel_offset_xy; /* Used on Gen8+ */
290 elk_fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
291 switch (wm_prog_data->coarse_pixel_dispatch) {
292 case ELK_NEVER:
293 int_pixel_offset_x = int_sample_offset_x;
294 int_pixel_offset_y = int_sample_offset_y;
295 int_pixel_offset_xy = int_sample_offset_xy;
296 half_int_pixel_offset_x = half_int_sample_offset_x;
297 half_int_pixel_offset_y = half_int_sample_offset_y;
298 break;
299
300 case ELK_SOMETIMES: {
301 const fs_builder dbld =
302 abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);
303
304 check_dynamic_msaa_flag(dbld, wm_prog_data,
305 INTEL_MSAA_FLAG_COARSE_RT_WRITES);
306
307 int_pixel_offset_x = dbld.vgrf(ELK_REGISTER_TYPE_UW);
308 set_predicate(ELK_PREDICATE_NORMAL,
309 dbld.SEL(int_pixel_offset_x,
310 int_coarse_offset_x,
311 int_sample_offset_x));
312
313 int_pixel_offset_y = dbld.vgrf(ELK_REGISTER_TYPE_UW);
314 set_predicate(ELK_PREDICATE_NORMAL,
315 dbld.SEL(int_pixel_offset_y,
316 int_coarse_offset_y,
317 int_sample_offset_y));
318
319 int_pixel_offset_xy = dbld.vgrf(ELK_REGISTER_TYPE_UW);
320 set_predicate(ELK_PREDICATE_NORMAL,
321 dbld.SEL(int_pixel_offset_xy,
322 int_coarse_offset_xy,
323 int_sample_offset_xy));
324
325 half_int_pixel_offset_x = bld.vgrf(ELK_REGISTER_TYPE_UW);
326 set_predicate(ELK_PREDICATE_NORMAL,
327 bld.SEL(half_int_pixel_offset_x,
328 half_int_coarse_offset_x,
329 half_int_sample_offset_x));
330
331 half_int_pixel_offset_y = bld.vgrf(ELK_REGISTER_TYPE_UW);
332 set_predicate(ELK_PREDICATE_NORMAL,
333 bld.SEL(half_int_pixel_offset_y,
334 half_int_coarse_offset_y,
335 half_int_sample_offset_y));
336 break;
337 }
338
339 case ELK_ALWAYS:
340 int_pixel_offset_x = int_coarse_offset_x;
341 int_pixel_offset_y = int_coarse_offset_y;
342 int_pixel_offset_xy = int_coarse_offset_xy;
343 half_int_pixel_offset_x = half_int_coarse_offset_x;
344 half_int_pixel_offset_y = half_int_coarse_offset_y;
345 break;
346 }
347
348 for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
349 const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
350 /* According to the "PS Thread Payload for Normal Dispatch"
351 * pages on the BSpec, subspan X/Y coordinates are stored in
352 * R1.2-R1.5/R2.2-R2.5 on gfx6+, and in R0.10-R0.13/R1.10-R1.13
353 * on gfx20+. gi_reg is the 32B section of the GRF that
354 * contains the subspan coordinates.
355 */
356 const struct elk_reg gi_reg = devinfo->ver >= 20 ? xe2_vec1_grf(i, 8) :
357 elk_vec1_grf(i + 1, 0);
358 const struct elk_reg gi_uw = retype(gi_reg, ELK_REGISTER_TYPE_UW);
359
360 if (devinfo->verx10 >= 125) {
361 const fs_builder dbld =
362 abld.exec_all().group(hbld.dispatch_width() * 2, 0);
363 const elk_fs_reg int_pixel_x = dbld.vgrf(ELK_REGISTER_TYPE_UW);
364 const elk_fs_reg int_pixel_y = dbld.vgrf(ELK_REGISTER_TYPE_UW);
365
366 dbld.ADD(int_pixel_x,
367 elk_fs_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
368 int_pixel_offset_x);
369 dbld.ADD(int_pixel_y,
370 elk_fs_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
371 int_pixel_offset_y);
372
373 if (wm_prog_data->coarse_pixel_dispatch != ELK_NEVER) {
374 elk_fs_inst *addx = dbld.ADD(int_pixel_x, int_pixel_x,
375 horiz_stride(half_int_pixel_offset_x, 0));
376 elk_fs_inst *addy = dbld.ADD(int_pixel_y, int_pixel_y,
377 horiz_stride(half_int_pixel_offset_y, 0));
378 if (wm_prog_data->coarse_pixel_dispatch != ELK_ALWAYS) {
379 addx->predicate = ELK_PREDICATE_NORMAL;
380 addy->predicate = ELK_PREDICATE_NORMAL;
381 }
382 }
383
384 hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
385 hbld.MOV(offset(pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));
386
387 } else if (devinfo->ver >= 8 || dispatch_width == 8) {
388 /* The "Register Region Restrictions" page says for BDW (and newer,
389 * presumably):
390 *
391 * "When destination spans two registers, the source may be one or
392 * two registers. The destination elements must be evenly split
393 * between the two registers."
394 *
395 * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
396 * to compute our pixel centers.
397 */
398 const fs_builder dbld =
399 abld.exec_all().group(hbld.dispatch_width() * 2, 0);
400 elk_fs_reg int_pixel_xy = dbld.vgrf(ELK_REGISTER_TYPE_UW);
401
402 dbld.ADD(int_pixel_xy,
403 elk_fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
404 int_pixel_offset_xy);
405
406 hbld.emit(ELK_FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy,
407 horiz_stride(half_int_pixel_offset_x, 0));
408 hbld.emit(ELK_FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
409 horiz_stride(half_int_pixel_offset_y, 0));
410 } else {
411 /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
412 *
413 * "When destination spans two registers, the source MUST span
414 * two registers."
415 *
416 * Since the GRF source of the ADD will only read a single register,
417 * we must do two separate ADDs in SIMD16.
418 */
419 const elk_fs_reg int_pixel_x = hbld.vgrf(ELK_REGISTER_TYPE_UW);
420 const elk_fs_reg int_pixel_y = hbld.vgrf(ELK_REGISTER_TYPE_UW);
421
422 hbld.ADD(int_pixel_x,
423 elk_fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
424 elk_fs_reg(elk_imm_v(0x10101010)));
425 hbld.ADD(int_pixel_y,
426 elk_fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
427 elk_fs_reg(elk_imm_v(0x11001100)));
428
429 /* As of gfx6, we can no longer mix float and int sources. We have
430 * to turn the integer pixel centers into floats for their actual
431 * use.
432 */
433 hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
434 hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
435 }
436 }
437
438 abld = bld.annotate("compute pos.z");
439 elk_fs_reg coarse_z;
440 if (wm_prog_data->uses_depth_w_coefficients) {
441 /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
442 * properly. In the same way that we have to add the coarse pixel size to
443 * pixel locations, here we recompute the Z value with 2 coefficients
444 * in the X & Y axes.
445 */
446 elk_fs_reg coef_payload = elk_vec8_grf(fs_payload().depth_w_coef_reg, 0);
447 const elk_fs_reg x_start = elk_vec1_grf(coef_payload.nr, 2);
448 const elk_fs_reg y_start = elk_vec1_grf(coef_payload.nr, 6);
449 const elk_fs_reg z_cx = elk_vec1_grf(coef_payload.nr, 1);
450 const elk_fs_reg z_cy = elk_vec1_grf(coef_payload.nr, 0);
451 const elk_fs_reg z_c0 = elk_vec1_grf(coef_payload.nr, 3);
452
453 const elk_fs_reg float_pixel_x = abld.vgrf(ELK_REGISTER_TYPE_F);
454 const elk_fs_reg float_pixel_y = abld.vgrf(ELK_REGISTER_TYPE_F);
455
456 abld.ADD(float_pixel_x, this->pixel_x, negate(x_start));
457 abld.ADD(float_pixel_y, this->pixel_y, negate(y_start));
458
459 /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
460 const elk_fs_reg u8_cps_width = elk_fs_reg(retype(elk_vec1_grf(1, 0), ELK_REGISTER_TYPE_UB));
461 /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
462 const elk_fs_reg u8_cps_height = byte_offset(u8_cps_width, 1);
463 const elk_fs_reg u32_cps_width = abld.vgrf(ELK_REGISTER_TYPE_UD);
464 const elk_fs_reg u32_cps_height = abld.vgrf(ELK_REGISTER_TYPE_UD);
465 abld.MOV(u32_cps_width, u8_cps_width);
466 abld.MOV(u32_cps_height, u8_cps_height);
467
468 const elk_fs_reg f_cps_width = abld.vgrf(ELK_REGISTER_TYPE_F);
469 const elk_fs_reg f_cps_height = abld.vgrf(ELK_REGISTER_TYPE_F);
470 abld.MOV(f_cps_width, u32_cps_width);
471 abld.MOV(f_cps_height, u32_cps_height);
472
473 /* Center in the middle of the coarse pixel. */
474 abld.MAD(float_pixel_x, float_pixel_x, elk_imm_f(0.5f), f_cps_width);
475 abld.MAD(float_pixel_y, float_pixel_y, elk_imm_f(0.5f), f_cps_height);
476
477 coarse_z = abld.vgrf(ELK_REGISTER_TYPE_F);
478 abld.MAD(coarse_z, z_c0, z_cx, float_pixel_x);
479 abld.MAD(coarse_z, coarse_z, z_cy, float_pixel_y);
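/* I.e. (roughly) coarse_z = z_c0 + z_cx * x + z_cy * y, the Z plane
 * equation evaluated at the recentered coarse pixel position computed
 * above.
 */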
480 }
481
482 if (wm_prog_data->uses_src_depth)
483 this->pixel_z = fetch_payload_reg(bld, fs_payload().source_depth_reg);
484
485 if (wm_prog_data->uses_depth_w_coefficients ||
486 wm_prog_data->uses_src_depth) {
487 elk_fs_reg sample_z = this->pixel_z;
488
489 switch (wm_prog_data->coarse_pixel_dispatch) {
490 case ELK_NEVER:
491 assert(wm_prog_data->uses_src_depth);
492 assert(!wm_prog_data->uses_depth_w_coefficients);
493 this->pixel_z = sample_z;
494 break;
495
496 case ELK_SOMETIMES:
497 assert(wm_prog_data->uses_src_depth);
498 assert(wm_prog_data->uses_depth_w_coefficients);
499 this->pixel_z = abld.vgrf(ELK_REGISTER_TYPE_F);
500
501 /* We re-use the check_dynamic_msaa_flag() call from above */
502 set_predicate(ELK_PREDICATE_NORMAL,
503 abld.SEL(this->pixel_z, coarse_z, sample_z));
504 break;
505
506 case ELK_ALWAYS:
507 assert(!wm_prog_data->uses_src_depth);
508 assert(wm_prog_data->uses_depth_w_coefficients);
509 this->pixel_z = coarse_z;
510 break;
511 }
512 }
513
514 if (wm_prog_data->uses_src_w) {
515 abld = bld.annotate("compute pos.w");
516 this->pixel_w = fetch_payload_reg(abld, fs_payload().source_w_reg);
517 this->wpos_w = vgrf(glsl_float_type());
518 abld.emit(ELK_SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
519 }
520
521 if (wm_key->persample_interp == ELK_SOMETIMES) {
522 assert(!devinfo->needs_unlit_centroid_workaround);
523
524 const fs_builder ubld = bld.exec_all().group(16, 0);
525 bool loaded_flag = false;
526
527 for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
528 if (!(wm_prog_data->barycentric_interp_modes & BITFIELD_BIT(i)))
529 continue;
530
531 /* The sample mode will always be the top bit set in the perspective
532 * or non-perspective section. In the case where no SAMPLE mode was
533 * requested, elk_wm_prog_data_barycentric_modes() will swap out the top
534 * mode for SAMPLE so this works regardless of whether SAMPLE was
535 * requested or not.
536 */
537 int sample_mode;
538 if (BITFIELD_BIT(i) & ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) {
539 sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
540 ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) - 1;
541 } else {
542 sample_mode = util_last_bit(wm_prog_data->barycentric_interp_modes &
543 ELK_BARYCENTRIC_PERSPECTIVE_BITS) - 1;
544 }
545 assert(wm_prog_data->barycentric_interp_modes &
546 BITFIELD_BIT(sample_mode));
547
548 if (i == sample_mode)
549 continue;
550
551 uint8_t *barys = fs_payload().barycentric_coord_reg[i];
552
553 uint8_t *sample_barys = fs_payload().barycentric_coord_reg[sample_mode];
554 assert(barys[0] && sample_barys[0]);
555
556 if (!loaded_flag) {
557 check_dynamic_msaa_flag(ubld, wm_prog_data,
558 INTEL_MSAA_FLAG_PERSAMPLE_INTERP);
559 }
560
561 for (unsigned j = 0; j < dispatch_width / 8; j++) {
562 set_predicate(
563 ELK_PREDICATE_NORMAL,
564 ubld.MOV(elk_vec8_grf(barys[j / 2] + (j % 2) * 2, 0),
565 elk_vec8_grf(sample_barys[j / 2] + (j % 2) * 2, 0)));
566 }
567 }
568 }
569
570 for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
571 this->delta_xy[i] = fetch_barycentric_reg(
572 bld, fs_payload().barycentric_coord_reg[i]);
573 }
574
575 uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
576 (1 << ELK_BARYCENTRIC_PERSPECTIVE_CENTROID |
577 1 << ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
578
579 if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
580 /* Get the pixel/sample mask into f0 so that we know which
581 * pixels are lit. Then, for each channel that is unlit,
582 * replace the centroid data with non-centroid data.
583 */
584 for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
585 bld.exec_all().group(1, 0)
586 .MOV(retype(elk_flag_reg(0, i), ELK_REGISTER_TYPE_UW),
587 retype(elk_vec1_grf(1 + i, 7), ELK_REGISTER_TYPE_UW));
588 }
589
590 for (int i = 0; i < ELK_BARYCENTRIC_MODE_COUNT; ++i) {
591 if (!(centroid_modes & (1 << i)))
592 continue;
593
594 const elk_fs_reg centroid_delta_xy = delta_xy[i];
595 const elk_fs_reg &pixel_delta_xy = delta_xy[i - 1];
596
597 delta_xy[i] = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
598
599 for (unsigned c = 0; c < 2; c++) {
600 for (unsigned q = 0; q < dispatch_width / 8; q++) {
601 set_predicate(ELK_PREDICATE_NORMAL,
602 bld.quarter(q).SEL(
603 quarter(offset(delta_xy[i], bld, c), q),
604 quarter(offset(centroid_delta_xy, bld, c), q),
605 quarter(offset(pixel_delta_xy, bld, c), q)));
606 }
607 }
608 }
609 }
610 }
611
612 static enum elk_conditional_mod
613 cond_for_alpha_func(enum compare_func func)
614 {
615 switch(func) {
616 case COMPARE_FUNC_GREATER:
617 return ELK_CONDITIONAL_G;
618 case COMPARE_FUNC_GEQUAL:
619 return ELK_CONDITIONAL_GE;
620 case COMPARE_FUNC_LESS:
621 return ELK_CONDITIONAL_L;
622 case COMPARE_FUNC_LEQUAL:
623 return ELK_CONDITIONAL_LE;
624 case COMPARE_FUNC_EQUAL:
625 return ELK_CONDITIONAL_EQ;
626 case COMPARE_FUNC_NOTEQUAL:
627 return ELK_CONDITIONAL_NEQ;
628 default:
629 unreachable("Not reached");
630 }
631 }
632
633 /**
634 * Alpha test support for when we compile it into the shader instead
635 * of using the normal fixed-function alpha test.
636 */
637 void
638 elk_fs_visitor::emit_alpha_test()
639 {
640 assert(stage == MESA_SHADER_FRAGMENT);
641 elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
642 const fs_builder bld = fs_builder(this).at_end();
643 const fs_builder abld = bld.annotate("Alpha test");
644
645 elk_fs_inst *cmp;
646 if (key->alpha_test_func == COMPARE_FUNC_ALWAYS)
647 return;
648
649 if (key->alpha_test_func == COMPARE_FUNC_NEVER) {
650 /* f0.1 = 0 */
651 elk_fs_reg some_reg = elk_fs_reg(retype(elk_vec8_grf(0, 0),
652 ELK_REGISTER_TYPE_UW));
653 cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
654 ELK_CONDITIONAL_NEQ);
655 } else {
656 /* RT0 alpha */
657 elk_fs_reg color = offset(outputs[0], bld, 3);
658
659 /* f0.1 &= func(color, ref) */
660 cmp = abld.CMP(bld.null_reg_f(), color, elk_imm_f(key->alpha_test_ref),
661 cond_for_alpha_func(key->alpha_test_func));
662 }
663 cmp->predicate = ELK_PREDICATE_NORMAL;
664 cmp->flag_subreg = 1;
665 }
666
667 elk_fs_inst *
668 elk_fs_visitor::emit_single_fb_write(const fs_builder &bld,
669 elk_fs_reg color0, elk_fs_reg color1,
670 elk_fs_reg src0_alpha, unsigned components)
671 {
672 assert(stage == MESA_SHADER_FRAGMENT);
673 struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
674
675 /* Hand over gl_FragDepth or the payload depth. */
676 const elk_fs_reg dst_depth = fetch_payload_reg(bld, fs_payload().dest_depth_reg);
677 elk_fs_reg src_depth, src_stencil;
678
679 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
680 src_depth = frag_depth;
681 } else if (source_depth_to_render_target) {
682 /* If we got here, we're in one of those strange Gen4-5 cases where
683 * we're forced to pass the source depth, unmodified, to the FB write.
684 * In this case, we don't want to use pixel_z because we may not have
685 * set up interpolation. It's also perfectly safe because it only
686 * happens on old hardware (no coarse interpolation) and this is
687 * explicitly the pass-through case.
688 */
689 assert(devinfo->ver <= 5);
690 src_depth = fetch_payload_reg(bld, fs_payload().source_depth_reg);
691 }
692
693 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
694 src_stencil = frag_stencil;
695
696 const elk_fs_reg sources[] = {
697 color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
698 (prog_data->uses_omask ? sample_mask : elk_fs_reg()),
699 elk_imm_ud(components)
700 };
701 assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
702 elk_fs_inst *write = bld.emit(ELK_FS_OPCODE_FB_WRITE_LOGICAL, elk_fs_reg(),
703 sources, ARRAY_SIZE(sources));
704
705 if (prog_data->uses_kill) {
706 write->predicate = ELK_PREDICATE_NORMAL;
707 write->flag_subreg = sample_mask_flag_subreg(*this);
708 }
709
710 return write;
711 }
712
713 void
714 elk_fs_visitor::do_emit_fb_writes(int nr_color_regions, bool replicate_alpha)
715 {
716 const fs_builder bld = fs_builder(this).at_end();
717 elk_fs_inst *inst = NULL;
718
719 for (int target = 0; target < nr_color_regions; target++) {
720 /* Skip over outputs that weren't written. */
721 if (this->outputs[target].file == BAD_FILE)
722 continue;
723
724 const fs_builder abld = bld.annotate(
725 ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
726
727 elk_fs_reg src0_alpha;
728 if (devinfo->ver >= 6 && replicate_alpha && target != 0)
729 src0_alpha = offset(outputs[0], bld, 3);
730
731 inst = emit_single_fb_write(abld, this->outputs[target],
732 this->dual_src_output, src0_alpha, 4);
733 inst->target = target;
734 }
735
736 if (inst == NULL) {
737 /* Even if there are no color buffers enabled, we still need to send
738 * alpha out the pipeline to our null renderbuffer to support
739 * alpha-testing, alpha-to-coverage, and so on.
740 */
741 /* FINISHME: Factor out this frequently recurring pattern into a
742 * helper function.
743 */
744 const elk_fs_reg srcs[] = { reg_undef, reg_undef,
745 reg_undef, offset(this->outputs[0], bld, 3) };
746 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD, 4);
747 bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
748
749 inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
750 inst->target = 0;
751 }
752
753 inst->last_rt = true;
754 inst->eot = true;
755 }
756
757 void
758 elk_fs_visitor::emit_fb_writes()
759 {
760 assert(stage == MESA_SHADER_FRAGMENT);
761 struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
762 elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
763
764 if (source_depth_to_render_target && devinfo->ver == 6) {
765 /* For outputting oDepth on gfx6, SIMD8 writes have to be used. This
766 * would require SIMD8 moves of each half to message regs, e.g. by using
767 * the SIMD lowering pass. Unfortunately this is more difficult than it
768 * sounds because the SIMD8 single-source message lacks channel selects
769 * for the second and third subspans.
770 */
771 limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
772 }
773
774 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
775 /* From the 'Render Target Write message' section of the docs:
776 * "Output Stencil is not supported with SIMD16 Render Target Write
777 * Messages."
778 */
779 limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
780 "in SIMD16+ mode.\n");
781 }
782
783 /* ANV doesn't know about the sample mask output during wm key creation,
784 * so we compute here whether we need to replicate alpha and emit the
785 * alpha-to-coverage workaround.
786 */
787 const bool replicate_alpha = key->alpha_test_replicate_alpha ||
788 (key->nr_color_regions > 1 && key->alpha_to_coverage &&
789 (sample_mask.file == BAD_FILE || devinfo->ver == 6));
790
791 prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
792 this->outputs[0].file != BAD_FILE);
793 assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
794
795 /* The following condition implements Wa_14017468336:
796 *
797 * "If dual source blend is enabled do not enable SIMD32 dispatch" and
798 * "For a thread dispatched as SIMD32, must not issue SIMD8 message with Last
799 * Render Target Select set."
800 */
801 if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
802 prog_data->dual_src_blend) {
803 /* The dual-source RT write messages fail to release the thread
804 * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
805 *
806 * XXX - Emit an extra single-source NULL RT-write marked LastRT in
807 * order to release the thread dependency without disabling
808 * SIMD32.
809 *
810 * The dual-source RT write messages may lead to hangs with SIMD16
811 * dispatch on ICL for unknown reasons, see
812 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
813 */
814 limit_dispatch_width(8, "Dual source blending unsupported "
815 "in SIMD16 and SIMD32 modes.\n");
816 }
817
818 do_emit_fb_writes(key->nr_color_regions, replicate_alpha);
819 }
820
821 void
822 elk_fs_visitor::emit_urb_writes(const elk_fs_reg &gs_vertex_count)
823 {
824 int slot, urb_offset, length;
825 int starting_urb_offset = 0;
826 const struct elk_vue_prog_data *vue_prog_data =
827 elk_vue_prog_data(this->prog_data);
828 const struct elk_vs_prog_key *vs_key =
829 (const struct elk_vs_prog_key *) this->key;
830 const GLbitfield64 psiz_mask =
831 VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ | VARYING_BIT_PRIMITIVE_SHADING_RATE;
832 const struct intel_vue_map *vue_map = &vue_prog_data->vue_map;
833 bool flush;
834 elk_fs_reg sources[8];
835 elk_fs_reg urb_handle;
836
837 switch (stage) {
838 case MESA_SHADER_VERTEX:
839 urb_handle = vs_payload().urb_handles;
840 break;
841 case MESA_SHADER_TESS_EVAL:
842 urb_handle = tes_payload().urb_output;
843 break;
844 case MESA_SHADER_GEOMETRY:
845 urb_handle = gs_payload().urb_handles;
846 break;
847 default:
848 unreachable("invalid stage");
849 }
850
851 const fs_builder bld = fs_builder(this).at_end();
852
853 elk_fs_reg per_slot_offsets;
854
855 if (stage == MESA_SHADER_GEOMETRY) {
856 const struct elk_gs_prog_data *gs_prog_data =
857 elk_gs_prog_data(this->prog_data);
858
859 /* We need to increment the Global Offset to skip over the control data
860 * header and the extra "Vertex Count" field (1 HWord) at the beginning
861 * of the VUE. We're counting in OWords, so the units are doubled.
862 */
863 starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
864 if (gs_prog_data->static_vertex_count == -1)
865 starting_urb_offset += 2;
866
867 /* The URB offset is in 128-bit units, so we need to multiply by 2 */
868 const int output_vertex_size_owords =
869 gs_prog_data->output_vertex_size_hwords * 2;
870
871 if (gs_vertex_count.file == IMM) {
872 per_slot_offsets = elk_imm_ud(output_vertex_size_owords *
873 gs_vertex_count.ud);
874 } else {
875 per_slot_offsets = vgrf(glsl_uint_type());
876 bld.MUL(per_slot_offsets, gs_vertex_count,
877 elk_imm_ud(output_vertex_size_owords));
878 }
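/* A worked example (hypothetical sizes): with a 2 HWord control data
 * header and a non-static vertex count, starting_urb_offset == 2 * 2 + 2
 * == 6 OWords; with an output vertex size of 4 HWords (8 OWords) and
 * gs_vertex_count == 3, the per-slot offset is 8 * 3 == 24 OWords.
 */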
879 }
880
881 length = 0;
882 urb_offset = starting_urb_offset;
883 flush = false;
884
885 /* SSO shaders can have VUE slots allocated which are never actually
886 * written to, so ignore them when looking for the last (written) slot.
887 */
888 int last_slot = vue_map->num_slots - 1;
889 while (last_slot > 0 &&
890 (vue_map->slot_to_varying[last_slot] == ELK_VARYING_SLOT_PAD ||
891 outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
892 last_slot--;
893 }
894
895 bool urb_written = false;
896 for (slot = 0; slot < vue_map->num_slots; slot++) {
897 int varying = vue_map->slot_to_varying[slot];
898 switch (varying) {
899 case VARYING_SLOT_PSIZ: {
900 /* The point size varying slot is the vue header and is always in the
901 * vue map. But often none of the special varyings that live there
902 * are written and in that case we can skip writing to the vue
903 * header, provided the corresponding state properly clamps the
904 * values further down the pipeline. */
905 if ((vue_map->slots_valid & psiz_mask) == 0) {
906 assert(length == 0);
907 urb_offset++;
908 break;
909 }
910
911 elk_fs_reg zero(VGRF, alloc.allocate(dispatch_width / 8),
912 ELK_REGISTER_TYPE_UD);
913 bld.MOV(zero, elk_imm_ud(0u));
914
915 if (vue_map->slots_valid & VARYING_BIT_PRIMITIVE_SHADING_RATE &&
916 this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE].file != BAD_FILE) {
917 sources[length++] = this->outputs[VARYING_SLOT_PRIMITIVE_SHADING_RATE];
918 } else if (devinfo->has_coarse_pixel_primitive_and_cb) {
919 uint32_t one_fp16 = 0x3C00;
920 elk_fs_reg one_by_one_fp16(VGRF, alloc.allocate(dispatch_width / 8),
921 ELK_REGISTER_TYPE_UD);
922 bld.MOV(one_by_one_fp16, elk_imm_ud((one_fp16 << 16) | one_fp16));
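/* The dword written above packs fp16 1.0 (0x3C00) into both halves,
 * presumably requesting a default 1x1 coarse shading rate.
 */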
923 sources[length++] = one_by_one_fp16;
924 } else {
925 sources[length++] = zero;
926 }
927
928 if (vue_map->slots_valid & VARYING_BIT_LAYER)
929 sources[length++] = this->outputs[VARYING_SLOT_LAYER];
930 else
931 sources[length++] = zero;
932
933 if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
934 sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
935 else
936 sources[length++] = zero;
937
938 if (vue_map->slots_valid & VARYING_BIT_PSIZ)
939 sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
940 else
941 sources[length++] = zero;
942 break;
943 }
944 case ELK_VARYING_SLOT_NDC:
945 case VARYING_SLOT_EDGE:
946 unreachable("unexpected scalar vs output");
947 break;
948
949 default:
950 /* gl_Position is always in the vue map, but isn't always written by
951 * the shader. Other varyings (clip distances) get added to the vue
952 * map but don't always get written. In those cases, the
953 * corresponding this->outputs[] slot will be invalid and we can skip
954 * the urb write for the varying. If we've already queued up a vue
955 * slot for writing we flush a mlen 5 urb write, otherwise we just
956 * advance the urb_offset.
957 */
958 if (varying == ELK_VARYING_SLOT_PAD ||
959 this->outputs[varying].file == BAD_FILE) {
960 if (length > 0)
961 flush = true;
962 else
963 urb_offset++;
964 break;
965 }
966
967 if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
968 (varying == VARYING_SLOT_COL0 ||
969 varying == VARYING_SLOT_COL1 ||
970 varying == VARYING_SLOT_BFC0 ||
971 varying == VARYING_SLOT_BFC1)) {
972 /* We need to clamp these guys, so do a saturating MOV into a
973 * temp register and use that for the payload.
974 */
975 for (int i = 0; i < 4; i++) {
976 elk_fs_reg reg = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
977 outputs[varying].type);
978 elk_fs_reg src = offset(this->outputs[varying], bld, i);
979 set_saturate(true, bld.MOV(reg, src));
980 sources[length++] = reg;
981 }
982 } else {
983 int slot_offset = 0;
984
985 /* When using Primitive Replication, there may be multiple slots
986 * assigned to POS.
987 */
988 if (varying == VARYING_SLOT_POS)
989 slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];
990
991 for (unsigned i = 0; i < 4; i++) {
992 sources[length++] = offset(this->outputs[varying], bld,
993 i + (slot_offset * 4));
994 }
995 }
996 break;
997 }
998
999 const fs_builder abld = bld.annotate("URB write");
1000
1001 /* If we've queued up 8 registers of payload (2 VUE slots), if this is
1002 * the last slot or if we need to flush (see BAD_FILE varying case
1003 * above), emit a URB write send now to flush out the data.
1004 */
1005 if (length == 8 || (length > 0 && slot == last_slot))
1006 flush = true;
1007 if (flush) {
1008 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1009
1010 srcs[URB_LOGICAL_SRC_HANDLE] = urb_handle;
1011 srcs[URB_LOGICAL_SRC_PER_SLOT_OFFSETS] = per_slot_offsets;
1012 srcs[URB_LOGICAL_SRC_DATA] = elk_fs_reg(VGRF,
1013 alloc.allocate((dispatch_width / 8) * length),
1014 ELK_REGISTER_TYPE_F);
1015 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(length);
1016 abld.LOAD_PAYLOAD(srcs[URB_LOGICAL_SRC_DATA], sources, length, 0);
1017
1018 elk_fs_inst *inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1019 srcs, ARRAY_SIZE(srcs));
1020
1021 /* For ICL Wa_1805992985 one needs additional write in the end. */
1022 if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
1023 inst->eot = false;
1024 else
1025 inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;
1026
1027 inst->offset = urb_offset;
1028 urb_offset = starting_urb_offset + slot + 1;
1029 length = 0;
1030 flush = false;
1031 urb_written = true;
1032 }
1033 }
1034
1035 /* If we don't have any valid slots to write, just do a minimal urb write
1036 * send to terminate the shader. This includes 1 slot of undefined data,
1037 * because it's invalid to write 0 data:
1038 *
1039 * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
1040 * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
1041 * Write Data Payload:
1042 *
1043 * "The write data payload can be between 1 and 8 message phases long."
1044 */
1045 if (!urb_written) {
1046 /* For GS, just turn EmitVertex() into a no-op. We don't want it to
1047 * end the thread, and emit_gs_thread_end() already emits a SEND with
1048 * EOT at the end of the program for us.
1049 */
1050 if (stage == MESA_SHADER_GEOMETRY)
1051 return;
1052
1053 elk_fs_reg uniform_urb_handle = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
1054 ELK_REGISTER_TYPE_UD);
1055 elk_fs_reg payload = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
1056 ELK_REGISTER_TYPE_UD);
1057
1058 bld.exec_all().MOV(uniform_urb_handle, urb_handle);
1059
1060 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1061 srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
1062 srcs[URB_LOGICAL_SRC_DATA] = payload;
1063 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
1064
1065 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1066 srcs, ARRAY_SIZE(srcs));
1067 inst->eot = true;
1068 inst->offset = 1;
1069 return;
1070 }
1071
1072 /* ICL Wa_1805992985:
1073 *
1074 * ICLLP GPU hangs on one of the tessellation vkcts tests when DS is not
1075 * done. The send cycle, which is a urb write with an eot, must be 4
1076 * phases long and all 8 lanes must be valid.
1077 */
1078 if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
1079 assert(dispatch_width == 8);
1080 elk_fs_reg uniform_urb_handle = elk_fs_reg(VGRF, alloc.allocate(1), ELK_REGISTER_TYPE_UD);
1081 elk_fs_reg uniform_mask = elk_fs_reg(VGRF, alloc.allocate(1), ELK_REGISTER_TYPE_UD);
1082 elk_fs_reg payload = elk_fs_reg(VGRF, alloc.allocate(4), ELK_REGISTER_TYPE_UD);
1083
1084 /* The workaround requires all 8 channels (lanes) to be valid. This is
1085 * understood to mean they all need to be alive. The first trick is to find
1086 * a live channel and copy its urb handle to all the other channels to
1087 * make sure all handles are valid.
1088 */
1089 bld.exec_all().MOV(uniform_urb_handle, bld.emit_uniformize(urb_handle));
1090
1091 /* The second trick is to use a masked URB write, where one can tell the
1092 * HW to actually write data only for selected channels even though all
1093 * are active.
1094 * The third trick is to take advantage of the must-be-zero (MBZ) area at
1095 * the very beginning of the URB.
1096 *
1097 * One masks the data to be written only for the first channel and uses
1098 * offset zero explicitly to land the data in the MBZ area, avoiding
1099 * trashing any other part of the URB.
1100 *
1101 * Since the WA says that the write needs to be 4 phases long, one uses
1102 * 4 slots of data. All are explicitly zeros in order to keep the MBZ
1103 * area written as zeros.
1104 */
1105 bld.exec_all().MOV(uniform_mask, elk_imm_ud(0x10000u));
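/* Presumably the per-channel write enables live in the high half of the
 * mask dword, so 0x10000 enables only channel 0, matching the comment
 * above.
 */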
1106 bld.exec_all().MOV(offset(payload, bld, 0), elk_imm_ud(0u));
1107 bld.exec_all().MOV(offset(payload, bld, 1), elk_imm_ud(0u));
1108 bld.exec_all().MOV(offset(payload, bld, 2), elk_imm_ud(0u));
1109 bld.exec_all().MOV(offset(payload, bld, 3), elk_imm_ud(0u));
1110
1111 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1112 srcs[URB_LOGICAL_SRC_HANDLE] = uniform_urb_handle;
1113 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = uniform_mask;
1114 srcs[URB_LOGICAL_SRC_DATA] = payload;
1115 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(4);
1116
1117 elk_fs_inst *inst = bld.exec_all().emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL,
1118 reg_undef, srcs, ARRAY_SIZE(srcs));
1119 inst->eot = true;
1120 inst->offset = 0;
1121 }
1122 }
1123
1124 void
1125 elk_fs_visitor::emit_urb_fence()
1126 {
1127 const fs_builder bld = fs_builder(this).at_end();
1128 elk_fs_reg dst = bld.vgrf(ELK_REGISTER_TYPE_UD);
1129 elk_fs_inst *fence = bld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE, dst,
1130 elk_vec8_grf(0, 0),
1131 elk_imm_ud(true),
1132 elk_imm_ud(0));
1133 fence->sfid = ELK_SFID_URB;
1134 fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_LOCAL,
1135 LSC_FLUSH_TYPE_NONE, true);
1136
1137 bld.exec_all().group(1, 0).emit(ELK_FS_OPCODE_SCHEDULING_FENCE,
1138 bld.null_reg_ud(),
1139 &dst,
1140 1);
1141 }
1142
1143 void
1144 elk_fs_visitor::emit_cs_terminate()
1145 {
1146 assert(devinfo->ver >= 7);
1147 const fs_builder bld = fs_builder(this).at_end();
1148
1149 /* We can't directly send from g0, since sends with EOT have to use
1150 * g112-127. So, copy it to a virtual register; the register allocator will
1151 * make sure it uses the appropriate register range.
1152 */
1153 struct elk_reg g0 = retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD);
1154 elk_fs_reg payload = elk_fs_reg(VGRF, alloc.allocate(1), ELK_REGISTER_TYPE_UD);
1155 bld.group(8, 0).exec_all().MOV(payload, g0);
1156
1157 /* Send a message to the thread spawner to terminate the thread. */
1158 elk_fs_inst *inst = bld.exec_all()
1159 .emit(ELK_CS_OPCODE_CS_TERMINATE, reg_undef, payload);
1160 inst->eot = true;
1161 }
1162
1163 elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
1164 const struct elk_compile_params *params,
1165 const elk_base_prog_key *key,
1166 struct elk_stage_prog_data *prog_data,
1167 const nir_shader *shader,
1168 unsigned dispatch_width,
1169 bool needs_register_pressure,
1170 bool debug_enabled)
1171 : elk_backend_shader(compiler, params, shader, prog_data, debug_enabled),
1172 key(key), gs_compile(NULL), prog_data(prog_data),
1173 live_analysis(this), regpressure_analysis(this),
1174 performance_analysis(this),
1175 needs_register_pressure(needs_register_pressure),
1176 dispatch_width(dispatch_width),
1177 max_polygons(0),
1178 api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
1179 {
1180 init();
1181 }
1182
1183 elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
1184 const struct elk_compile_params *params,
1185 const elk_wm_prog_key *key,
1186 struct elk_wm_prog_data *prog_data,
1187 const nir_shader *shader,
1188 unsigned dispatch_width, unsigned max_polygons,
1189 bool needs_register_pressure,
1190 bool debug_enabled)
1191 : elk_backend_shader(compiler, params, shader, &prog_data->base,
1192 debug_enabled),
1193 key(&key->base), gs_compile(NULL), prog_data(&prog_data->base),
1194 live_analysis(this), regpressure_analysis(this),
1195 performance_analysis(this),
1196 needs_register_pressure(needs_register_pressure),
1197 dispatch_width(dispatch_width),
1198 max_polygons(max_polygons),
1199 api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
1200 {
1201 init();
1202 assert(api_subgroup_size == 0 ||
1203 api_subgroup_size == 8 ||
1204 api_subgroup_size == 16 ||
1205 api_subgroup_size == 32);
1206 }
1207
1208 elk_fs_visitor::elk_fs_visitor(const struct elk_compiler *compiler,
1209 const struct elk_compile_params *params,
1210 struct elk_gs_compile *c,
1211 struct elk_gs_prog_data *prog_data,
1212 const nir_shader *shader,
1213 bool needs_register_pressure,
1214 bool debug_enabled)
1215 : elk_backend_shader(compiler, params, shader, &prog_data->base.base,
1216 debug_enabled),
1217 key(&c->key.base), gs_compile(c),
1218 prog_data(&prog_data->base.base),
1219 live_analysis(this), regpressure_analysis(this),
1220 performance_analysis(this),
1221 needs_register_pressure(needs_register_pressure),
1222 dispatch_width(compiler->devinfo->ver >= 20 ? 16 : 8),
1223 max_polygons(0),
1224 api_subgroup_size(elk_nir_api_subgroup_size(shader, dispatch_width))
1225 {
1226 init();
1227 assert(api_subgroup_size == 0 ||
1228 api_subgroup_size == 8 ||
1229 api_subgroup_size == 16 ||
1230 api_subgroup_size == 32);
1231 }
1232
1233 void
1234 elk_fs_visitor::init()
1235 {
1236 if (key)
1237 this->key_tex = &key->tex;
1238 else
1239 this->key_tex = NULL;
1240
1241 this->max_dispatch_width = 32;
1242 this->prog_data = this->stage_prog_data;
1243
1244 this->failed = false;
1245 this->fail_msg = NULL;
1246
1247 this->payload_ = NULL;
1248 this->source_depth_to_render_target = false;
1249 this->runtime_check_aads_emit = false;
1250 this->first_non_payload_grf = 0;
1251 this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : ELK_MAX_GRF;
1252
1253 this->uniforms = 0;
1254 this->last_scratch = 0;
1255 this->push_constant_loc = NULL;
1256
1257 memset(&this->shader_stats, 0, sizeof(this->shader_stats));
1258
1259 this->grf_used = 0;
1260 this->spilled_any_registers = false;
1261 }
1262
1263 elk_fs_visitor::~elk_fs_visitor()
1264 {
1265 delete this->payload_;
1266 }
1267