1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_visitor.cpp
25 *
26 * This file supports generating the FS LIR from the GLSL IR. The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
29 */
30 #include "brw_fs.h"
31 #include "compiler/glsl_types.h"
32
33 using namespace brw;
34
35 /* Sample from the MCS surface attached to this multisample texture. */
36 fs_reg
37 fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
38 const fs_reg &texture,
39 const fs_reg &texture_handle)
40 {
41 const fs_reg dest = vgrf(glsl_type::uvec4_type);
42
43 fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
44 srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
45 srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
46 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
47 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
48 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
49 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
50
51 fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
52 ARRAY_SIZE(srcs));
53
54 /* We only care about one or two regs of response, but the sampler always
55 * writes 4/8.
56 */
57 inst->size_written = 4 * dest.component_size(inst->exec_size);
58
59 return dest;
60 }
61
62 /**
63 * Apply workarounds for Gfx6 gather with UINT/SINT
64 */
65 void
66 fs_visitor::emit_gfx6_gather_wa(uint8_t wa, fs_reg dst)
67 {
68 if (!wa)
69 return;
70
71 int width = (wa & WA_8BIT) ? 8 : 16;
72
73 for (int i = 0; i < 4; i++) {
74 fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
75 /* Convert from UNORM to UINT */
76 bld.MUL(dst_f, dst_f, brw_imm_f((1 << width) - 1));
77 bld.MOV(dst, dst_f);
78
79 if (wa & WA_SIGN) {
80 /* Reinterpret the UINT value as a signed INT value by
81 * shifting the sign bit into place, then shifting back
82 * preserving sign.
83 */
84 bld.SHL(dst, dst, brw_imm_d(32 - width));
85 bld.ASR(dst, dst, brw_imm_d(32 - width));
86 }
87
88 dst = offset(dst, bld, 1);
89 }
90 }
91
92 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
93 void
94 fs_visitor::emit_dummy_fs()
95 {
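   /* Number of message registers each color component occupies at this
    * dispatch width (8 floats per register).
    */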
96 int reg_width = dispatch_width / 8;
97
98 /* Everyone's favorite color. */
99 const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
100 for (int i = 0; i < 4; i++) {
101 bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F),
102 brw_imm_f(color[i]));
103 }
104
105 fs_inst *write;
106 write = bld.emit(FS_OPCODE_FB_WRITE);
107 write->eot = true;
108 write->last_rt = true;
109 if (devinfo->ver >= 6) {
110 write->base_mrf = 2;
111 write->mlen = 4 * reg_width;
112 } else {
113 write->header_size = 2;
114 write->base_mrf = 0;
115 write->mlen = 2 + 4 * reg_width;
116 }
117
118 /* Tell the SF we don't have any inputs. Gfx4-5 require at least one
119 * varying to avoid GPU hangs, so set that.
120 */
121 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
122 wm_prog_data->num_varying_inputs = devinfo->ver < 6 ? 1 : 0;
123 memset(wm_prog_data->urb_setup, -1,
124 sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
125 brw_compute_urb_setup_index(wm_prog_data);
126
127 /* We don't have any uniforms. */
128 stage_prog_data->nr_params = 0;
129 stage_prog_data->nr_pull_params = 0;
130 stage_prog_data->curb_read_length = 0;
131 stage_prog_data->dispatch_grf_start_reg = 2;
132 wm_prog_data->dispatch_grf_start_reg_16 = 2;
133 wm_prog_data->dispatch_grf_start_reg_32 = 2;
134 grf_used = 1; /* Gfx4-5 don't allow zero GRF blocks */
135
136 calculate_cfg();
137 }
138
139 /* The register location here is relative to the start of the URB
140 * data. It will get adjusted to be a real location before
141 * generate_code() time.
142 */
143 fs_reg
144 fs_visitor::interp_reg(int location, int channel)
145 {
146 assert(stage == MESA_SHADER_FRAGMENT);
147 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
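   /* Each URB setup slot holds a 4-component varying, so the register
    * number is slot * 4 plus the requested channel.
    */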
148 int regnr = prog_data->urb_setup[location] * 4 + channel;
149 assert(prog_data->urb_setup[location] != -1);
150
151 return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
152 }
153
154 /** Emits the interpolation for the varying inputs. */
155 void
156 fs_visitor::emit_interpolation_setup_gfx4()
157 {
158 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
159
160 fs_builder abld = bld.annotate("compute pixel centers");
161 this->pixel_x = vgrf(glsl_type::uint_type);
162 this->pixel_y = vgrf(glsl_type::uint_type);
163 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
164 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
165 abld.ADD(this->pixel_x,
166 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
167 fs_reg(brw_imm_v(0x10101010)));
168 abld.ADD(this->pixel_y,
169 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
170 fs_reg(brw_imm_v(0x11001100)));
171
172 abld = bld.annotate("compute pixel deltas from v0");
173
174 this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
175 vgrf(glsl_type::vec2_type);
176 const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
177 const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
178 const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
179
180 if (devinfo->has_pln) {
181 for (unsigned i = 0; i < dispatch_width / 8; i++) {
182 abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 0), i),
183 quarter(this->pixel_x, i), xstart);
184 abld.quarter(i).ADD(quarter(offset(delta_xy, abld, 1), i),
185 quarter(this->pixel_y, i), ystart);
186 }
187 } else {
188 abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
189 abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
190 }
191
192 this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
193
194 /* The SF program automatically handles doing the perspective correction or
195 * not based on wm_prog_data::interp_mode[] so we can use the same pixel
196 * offsets for both perspective and non-perspective.
197 */
198 this->delta_xy[BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL] =
199 this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
200
201 abld = bld.annotate("compute pos.w and 1/pos.w");
202 /* Compute wpos.w. It's always in our setup, since it's needed to
203 * interpolate the other attributes.
204 */
205 this->wpos_w = vgrf(glsl_type::float_type);
206 abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
207 component(interp_reg(VARYING_SLOT_POS, 3), 0));
208 /* Compute the pixel 1/W value from wpos.w. */
209 this->pixel_w = vgrf(glsl_type::float_type);
210 abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
211 }
212
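/* Translate the NIR float_controls execution mode bits into the
 * corresponding cr0 rounding/denorm mode bits. *mask is set to the cr0
 * bits the caller needs to update to apply the requested modes.
 */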
213 static unsigned
214 brw_rnd_mode_from_nir(unsigned mode, unsigned *mask)
215 {
216 unsigned brw_mode = 0;
217 *mask = 0;
218
219 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
220 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
221 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
222 mode) {
223 brw_mode |= BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT;
224 *mask |= BRW_CR0_RND_MODE_MASK;
225 }
226 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
227 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
228 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
229 mode) {
230 brw_mode |= BRW_RND_MODE_RTNE << BRW_CR0_RND_MODE_SHIFT;
231 *mask |= BRW_CR0_RND_MODE_MASK;
232 }
233 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) {
234 brw_mode |= BRW_CR0_FP16_DENORM_PRESERVE;
235 *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
236 }
237 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) {
238 brw_mode |= BRW_CR0_FP32_DENORM_PRESERVE;
239 *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
240 }
241 if (mode & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) {
242 brw_mode |= BRW_CR0_FP64_DENORM_PRESERVE;
243 *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
244 }
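   /* The flush-to-zero modes only require the corresponding PRESERVE bit to
    * be cleared, so they contribute to the mask but not to the mode bits.
    */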
245 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16)
246 *mask |= BRW_CR0_FP16_DENORM_PRESERVE;
247 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32)
248 *mask |= BRW_CR0_FP32_DENORM_PRESERVE;
249 if (mode & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64)
250 *mask |= BRW_CR0_FP64_DENORM_PRESERVE;
251 if (mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
252 *mask |= BRW_CR0_FP_MODE_MASK;
253
254 if (*mask != 0)
255 assert((*mask & brw_mode) == brw_mode);
256
257 return brw_mode;
258 }
259
260 void
261 fs_visitor::emit_shader_float_controls_execution_mode()
262 {
263 unsigned execution_mode = this->nir->info.float_controls_execution_mode;
264 if (execution_mode == FLOAT_CONTROLS_DEFAULT_FLOAT_CONTROL_MODE)
265 return;
266
267 fs_builder abld = bld.annotate("shader floats control execution mode");
268 unsigned mask, mode = brw_rnd_mode_from_nir(execution_mode, &mask);
269
270 if (mask == 0)
271 return;
272
273 abld.emit(SHADER_OPCODE_FLOAT_CONTROL_MODE, bld.null_reg_ud(),
274 brw_imm_d(mode), brw_imm_d(mask));
275 }
276
277 /** Emits the interpolation for the varying inputs. */
278 void
279 fs_visitor::emit_interpolation_setup_gfx6()
280 {
281 fs_builder abld = bld.annotate("compute pixel centers");
282
283 this->pixel_x = vgrf(glsl_type::float_type);
284 this->pixel_y = vgrf(glsl_type::float_type);
285
286 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
287
288 fs_reg int_pixel_offset_x, int_pixel_offset_y; /* Used on Gen12HP+ */
289 fs_reg int_pixel_offset_xy; /* Used on Gen8+ */
290 fs_reg half_int_pixel_offset_x, half_int_pixel_offset_y;
291 if (!wm_prog_data->per_coarse_pixel_dispatch) {
292 /* The thread payload only delivers subspan locations (ss0, ss1,
293 * ss2, ...). Since each subspan covers a 2x2 pixel block, we need to
294 * generate 4 pixel coordinates out of each subspan location. We do this
295 * by replicating a subspan coordinate 4 times and adding an offset of 1
296 * in each direction from the initial top left (tl) location to generate
297 * top right (tr = +1 in x), bottom left (bl = +1 in y) and bottom right
298 * (br = +1 in x, +1 in y).
299 *
300 * The locations we build look like this in SIMD8 :
301 *
302 * ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
303 *
304 * The value 0x11001010 is a vector of 8 half-byte (nibble) offsets. It
305 * adds the following to generate the 4 pixel coordinates out of subspan0:
306 *
307 * 0x
308 * 1 : ss0.y + 1 -> ss0.br.y
309 * 1 : ss0.y + 1 -> ss0.bl.y
310 * 0 : ss0.y + 0 -> ss0.tr.y
311 * 0 : ss0.y + 0 -> ss0.tl.y
312 * 1 : ss0.x + 1 -> ss0.br.x
313 * 0 : ss0.x + 0 -> ss0.bl.x
314 * 1 : ss0.x + 1 -> ss0.tr.x
315 * 0 : ss0.x + 0 -> ss0.tl.x
316 *
317 * By doing a SIMD16 add in a SIMD8 shader, we can generate the 8 pixel
318 * coordinates out of 2 subspan coordinates in a single ADD instruction
319 * (twice the operation above).
320 */
321 int_pixel_offset_xy = fs_reg(brw_imm_v(0x11001010));
322 half_int_pixel_offset_x = fs_reg(brw_imm_uw(0));
323 half_int_pixel_offset_y = fs_reg(brw_imm_uw(0));
324 /* On Gfx12.5, because of regioning restrictions, the interpolation code
325 * is slightly different and works off X & Y only inputs. The ordering
326 * of the half bytes here is a bit odd, with each subspan replicated
327 * twice and every other element is discarded :
328 *
329 * ss0.tl ss0.tl ss0.tr ss0.tr ss0.bl ss0.bl ss0.br ss0.br
330 * X offset: 0 0 1 0 0 0 1 0
331 * Y offset: 0 0 0 0 1 0 1 0
332 */
333 int_pixel_offset_x = fs_reg(brw_imm_v(0x01000100));
334 int_pixel_offset_y = fs_reg(brw_imm_v(0x01010000));
335 } else {
336 /* In coarse pixel dispatch we have to do the same ADD instruction that
337 * we do in normal per pixel dispatch, except this time we're not adding
338 * 1 in each direction, but instead the coarse pixel size.
339 *
340 * The coarse pixel size is delivered as 2 u8 in r1.0
341 */
342 struct brw_reg r1_0 = retype(brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, 1, 0), BRW_REGISTER_TYPE_UB);
343
344 const fs_builder dbld =
345 abld.exec_all().group(MIN2(16, dispatch_width) * 2, 0);
346
347 if (devinfo->verx10 >= 125) {
348 /* To build the array of half bytes we do an AND operation with the
349 * right mask in X.
350 */
351 int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
352 dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0f000f00));
353
354 /* And the right mask in Y. */
355 int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
356 dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0x0f0f0000));
357 } else {
358 /* To build the array of half bytes we do an AND operation with the
359 * right mask in X.
360 */
361 int_pixel_offset_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
362 dbld.AND(int_pixel_offset_x, byte_offset(r1_0, 0), brw_imm_v(0x0000f0f0));
363
364 /* And the right mask in Y. */
365 int_pixel_offset_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
366 dbld.AND(int_pixel_offset_y, byte_offset(r1_0, 1), brw_imm_v(0xff000000));
367
368 /* Finally OR the 2 registers. */
369 int_pixel_offset_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
370 dbld.OR(int_pixel_offset_xy, int_pixel_offset_x, int_pixel_offset_y);
371 }
372
373 /* Also compute the half pixel size used to center pixels. */
374 half_int_pixel_offset_x = bld.vgrf(BRW_REGISTER_TYPE_UW);
375 half_int_pixel_offset_y = bld.vgrf(BRW_REGISTER_TYPE_UW);
376
377 bld.SHR(half_int_pixel_offset_x, suboffset(r1_0, 0), brw_imm_ud(1));
378 bld.SHR(half_int_pixel_offset_y, suboffset(r1_0, 1), brw_imm_ud(1));
379 }
380
381 for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
382 const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
383 struct brw_reg gi_uw = retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UW);
384
385 if (devinfo->verx10 >= 125) {
386 const fs_builder dbld =
387 abld.exec_all().group(hbld.dispatch_width() * 2, 0);
388 const fs_reg int_pixel_x = dbld.vgrf(BRW_REGISTER_TYPE_UW);
389 const fs_reg int_pixel_y = dbld.vgrf(BRW_REGISTER_TYPE_UW);
390
391 dbld.ADD(int_pixel_x,
392 fs_reg(stride(suboffset(gi_uw, 4), 2, 8, 0)),
393 int_pixel_offset_x);
394 dbld.ADD(int_pixel_y,
395 fs_reg(stride(suboffset(gi_uw, 5), 2, 8, 0)),
396 int_pixel_offset_y);
397
398 if (wm_prog_data->per_coarse_pixel_dispatch) {
399 dbld.ADD(int_pixel_x, int_pixel_x,
400 horiz_stride(half_int_pixel_offset_x, 0));
401 dbld.ADD(int_pixel_y, int_pixel_y,
402 horiz_stride(half_int_pixel_offset_y, 0));
403 }
404
405 hbld.MOV(offset(pixel_x, hbld, i), horiz_stride(int_pixel_x, 2));
406 hbld.MOV(offset(pixel_y, hbld, i), horiz_stride(int_pixel_y, 2));
407
408 } else if (devinfo->ver >= 8 || dispatch_width == 8) {
409 /* The "Register Region Restrictions" page says for BDW (and newer,
410 * presumably):
411 *
412 * "When destination spans two registers, the source may be one or
413 * two registers. The destination elements must be evenly split
414 * between the two registers."
415 *
416 * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
417 * to compute our pixel centers.
418 */
419 const fs_builder dbld =
420 abld.exec_all().group(hbld.dispatch_width() * 2, 0);
421 fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
422
423 dbld.ADD(int_pixel_xy,
424 fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
425 int_pixel_offset_xy);
426
427 hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy,
428 horiz_stride(half_int_pixel_offset_x, 0));
429 hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy,
430 horiz_stride(half_int_pixel_offset_y, 0));
431 } else {
432 /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
433 *
434 * "When destination spans two registers, the source MUST span
435 * two registers."
436 *
437 * Since the GRF source of the ADD will only read a single register,
438 * we must do two separate ADDs in SIMD16.
439 */
440 const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW);
441 const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW);
442
443 hbld.ADD(int_pixel_x,
444 fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
445 fs_reg(brw_imm_v(0x10101010)));
446 hbld.ADD(int_pixel_y,
447 fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
448 fs_reg(brw_imm_v(0x11001100)));
449
450 /* As of gfx6, we can no longer mix float and int sources. We have
451 * to turn the integer pixel centers into floats for their actual
452 * use.
453 */
454 hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
455 hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
456 }
457 }
458
459 abld = bld.annotate("compute pos.z");
460 if (wm_prog_data->uses_depth_w_coefficients) {
461 assert(!wm_prog_data->uses_src_depth);
462 /* In coarse pixel mode, the HW doesn't interpolate the Z coordinate
463 * properly. In the same way we have to add the coarse pixel size to
464 * the pixel locations, here we recompute the Z value with 2 coefficients
465 * in the X & Y axes.
466 */
467 fs_reg coef_payload = fetch_payload_reg(abld, payload.depth_w_coef_reg, BRW_REGISTER_TYPE_F);
468 const fs_reg x_start = brw_vec1_grf(coef_payload.nr, 2);
469 const fs_reg y_start = brw_vec1_grf(coef_payload.nr, 6);
470 const fs_reg z_cx = brw_vec1_grf(coef_payload.nr, 1);
471 const fs_reg z_cy = brw_vec1_grf(coef_payload.nr, 0);
472 const fs_reg z_c0 = brw_vec1_grf(coef_payload.nr, 3);
473
474 const fs_reg float_pixel_x = abld.vgrf(BRW_REGISTER_TYPE_F);
475 const fs_reg float_pixel_y = abld.vgrf(BRW_REGISTER_TYPE_F);
476
477 abld.ADD(float_pixel_x, this->pixel_x, negate(x_start));
478 abld.ADD(float_pixel_y, this->pixel_y, negate(y_start));
479
480 /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
481 const fs_reg u8_cps_width = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
482 /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
483 const fs_reg u8_cps_height = byte_offset(u8_cps_width, 1);
484 const fs_reg u32_cps_width = abld.vgrf(BRW_REGISTER_TYPE_UD);
485 const fs_reg u32_cps_height = abld.vgrf(BRW_REGISTER_TYPE_UD);
486 abld.MOV(u32_cps_width, u8_cps_width);
487 abld.MOV(u32_cps_height, u8_cps_height);
488
489 const fs_reg f_cps_width = abld.vgrf(BRW_REGISTER_TYPE_F);
490 const fs_reg f_cps_height = abld.vgrf(BRW_REGISTER_TYPE_F);
491 abld.MOV(f_cps_width, u32_cps_width);
492 abld.MOV(f_cps_height, u32_cps_height);
493
494 /* Center in the middle of the coarse pixel. */
495 abld.MAD(float_pixel_x, float_pixel_x, brw_imm_f(0.5f), f_cps_width);
496 abld.MAD(float_pixel_y, float_pixel_y, brw_imm_f(0.5f), f_cps_height);
497
498 this->pixel_z = abld.vgrf(BRW_REGISTER_TYPE_F);
499 abld.MAD(this->pixel_z, z_c0, z_cx, float_pixel_x);
500 abld.MAD(this->pixel_z, this->pixel_z, z_cy, float_pixel_y);
501 }
502
503 if (wm_prog_data->uses_src_depth) {
504 assert(!wm_prog_data->uses_depth_w_coefficients);
505 this->pixel_z = fetch_payload_reg(bld, payload.source_depth_reg);
506 }
507
508 if (wm_prog_data->uses_src_w) {
509 abld = bld.annotate("compute pos.w");
510 this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
511 this->wpos_w = vgrf(glsl_type::float_type);
512 abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
513 }
514
515 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
516 this->delta_xy[i] = fetch_barycentric_reg(
517 bld, payload.barycentric_coord_reg[i]);
518 }
519
520 uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
521 (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
522 1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
523
524 if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
525 /* Get the pixel/sample mask into f0 so that we know which
526 * pixels are lit. Then, for each channel that is unlit,
527 * replace the centroid data with non-centroid data.
528 */
529 for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
530 bld.exec_all().group(1, 0)
531 .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW),
532 retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW));
533 }
534
535 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
536 if (!(centroid_modes & (1 << i)))
537 continue;
538
539 const fs_reg centroid_delta_xy = delta_xy[i];
540 const fs_reg &pixel_delta_xy = delta_xy[i - 1];
541
542 delta_xy[i] = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
543
544 for (unsigned c = 0; c < 2; c++) {
545 for (unsigned q = 0; q < dispatch_width / 8; q++) {
546 set_predicate(BRW_PREDICATE_NORMAL,
547 bld.quarter(q).SEL(
548 quarter(offset(delta_xy[i], bld, c), q),
549 quarter(offset(centroid_delta_xy, bld, c), q),
550 quarter(offset(pixel_delta_xy, bld, c), q)));
551 }
552 }
553 }
554 }
555 }
556
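/* Translate a GL alpha test function into the matching hardware conditional
 * modifier.
 */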
557 static enum brw_conditional_mod
558 cond_for_alpha_func(GLenum func)
559 {
560 switch(func) {
561 case GL_GREATER:
562 return BRW_CONDITIONAL_G;
563 case GL_GEQUAL:
564 return BRW_CONDITIONAL_GE;
565 case GL_LESS:
566 return BRW_CONDITIONAL_L;
567 case GL_LEQUAL:
568 return BRW_CONDITIONAL_LE;
569 case GL_EQUAL:
570 return BRW_CONDITIONAL_EQ;
571 case GL_NOTEQUAL:
572 return BRW_CONDITIONAL_NEQ;
573 default:
574 unreachable("Not reached");
575 }
576 }
577
578 /**
579 * Alpha test support for when we compile it into the shader instead
580 * of using the normal fixed-function alpha test.
581 */
582 void
583 fs_visitor::emit_alpha_test()
584 {
585 assert(stage == MESA_SHADER_FRAGMENT);
586 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
587 const fs_builder abld = bld.annotate("Alpha test");
588
589 fs_inst *cmp;
590 if (key->alpha_test_func == GL_ALWAYS)
591 return;
592
593 if (key->alpha_test_func == GL_NEVER) {
594 /* f0.1 = 0 */
595 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
596 BRW_REGISTER_TYPE_UW));
597 cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
598 BRW_CONDITIONAL_NEQ);
599 } else {
600 /* RT0 alpha */
601 fs_reg color = offset(outputs[0], bld, 3);
602
603 /* f0.1 &= func(color, ref) */
604 cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref),
605 cond_for_alpha_func(key->alpha_test_func));
606 }
607 cmp->predicate = BRW_PREDICATE_NORMAL;
608 cmp->flag_subreg = 1;
609 }
610
611 fs_inst *
612 fs_visitor::emit_single_fb_write(const fs_builder &bld,
613 fs_reg color0, fs_reg color1,
614 fs_reg src0_alpha, unsigned components)
615 {
616 assert(stage == MESA_SHADER_FRAGMENT);
617 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
618
619 /* Hand over gl_FragDepth or the payload depth. */
620 const fs_reg dst_depth = fetch_payload_reg(bld, payload.dest_depth_reg);
621 fs_reg src_depth, src_stencil;
622
623 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
624 src_depth = frag_depth;
625 } else if (source_depth_to_render_target) {
626 /* If we got here, we're in one of those strange Gen4-5 cases where
627 * we're forced to pass the source depth, unmodified, to the FB write.
628 * In this case, we don't want to use pixel_z because we may not have
629 * set up interpolation. It's also perfectly safe because it only
630 * happens on old hardware (no coarse interpolation) and this is
631 * explicitly the pass-through case.
632 */
633 assert(devinfo->ver <= 5);
634 src_depth = fetch_payload_reg(bld, payload.source_depth_reg);
635 }
636
637 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
638 src_stencil = frag_stencil;
639
640 const fs_reg sources[] = {
641 color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
642 (prog_data->uses_omask ? sample_mask : fs_reg()),
643 brw_imm_ud(components)
644 };
645 assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
646 fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
647 sources, ARRAY_SIZE(sources));
648
649 if (prog_data->uses_kill) {
650 write->predicate = BRW_PREDICATE_NORMAL;
651 write->flag_subreg = sample_mask_flag_subreg(this);
652 }
653
654 return write;
655 }
656
657 void
658 fs_visitor::emit_fb_writes()
659 {
660 assert(stage == MESA_SHADER_FRAGMENT);
661 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
662 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
663
664 fs_inst *inst = NULL;
665
666 if (source_depth_to_render_target && devinfo->ver == 6) {
667 /* For outputting oDepth on gfx6, SIMD8 writes have to be used. This
668 * would require SIMD8 moves of each half to message regs, e.g. by using
669 * the SIMD lowering pass. Unfortunately this is more difficult than it
670 * sounds because the SIMD8 single-source message lacks channel selects
671 * for the second and third subspans.
672 */
673 limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
674 }
675
676 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
677 /* From the 'Render Target Write message' section of the docs:
678 * "Output Stencil is not supported with SIMD16 Render Target Write
679 * Messages."
680 */
681 limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
682 "in SIMD16+ mode.\n");
683 }
684
685 /* ANV doesn't know about sample mask output during wm key creation,
686 * so we compute here whether we need to replicate alpha and emit the
687 * alpha-to-coverage workaround.
688 */
689 const bool replicate_alpha = key->alpha_test_replicate_alpha ||
690 (key->nr_color_regions > 1 && key->alpha_to_coverage &&
691 (sample_mask.file == BAD_FILE || devinfo->ver == 6));
692
693 for (int target = 0; target < key->nr_color_regions; target++) {
694 /* Skip over outputs that weren't written. */
695 if (this->outputs[target].file == BAD_FILE)
696 continue;
697
698 const fs_builder abld = bld.annotate(
699 ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
700
701 fs_reg src0_alpha;
702 if (devinfo->ver >= 6 && replicate_alpha && target != 0)
703 src0_alpha = offset(outputs[0], bld, 3);
704
705 inst = emit_single_fb_write(abld, this->outputs[target],
706 this->dual_src_output, src0_alpha, 4);
707 inst->target = target;
708 }
709
710 prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
711 this->outputs[0].file != BAD_FILE);
712 assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
713
714 if (inst == NULL) {
715 /* Even if there's no color buffers enabled, we still need to send
716 * alpha out the pipeline to our null renderbuffer to support
717 * alpha-testing, alpha-to-coverage, and so on.
718 */
719 /* FINISHME: Factor out this frequently recurring pattern into a
720 * helper function.
721 */
722 const fs_reg srcs[] = { reg_undef, reg_undef,
723 reg_undef, offset(this->outputs[0], bld, 3) };
724 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
725 bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
726
727 inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
728 inst->target = 0;
729 }
730
731 inst->last_rt = true;
732 inst->eot = true;
733
734 if (devinfo->ver >= 11 && devinfo->ver <= 12 &&
735 prog_data->dual_src_blend) {
736 /* The dual-source RT write messages fail to release the thread
737 * dependency on ICL and TGL with SIMD32 dispatch, leading to hangs.
738 *
739 * XXX - Emit an extra single-source NULL RT-write marked LastRT in
740 * order to release the thread dependency without disabling
741 * SIMD32.
742 *
743 * The dual-source RT write messages may lead to hangs with SIMD16
744 * dispatch on ICL due to some unknown reasons, see
745 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/2183
746 */
747 limit_dispatch_width(8, "Dual source blending unsupported "
748 "in SIMD16 and SIMD32 modes.\n");
749 }
750 }
751
752 void
753 fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
754 {
755 int slot, urb_offset, length;
756 int starting_urb_offset = 0;
757 const struct brw_vue_prog_data *vue_prog_data =
758 brw_vue_prog_data(this->prog_data);
759 const struct brw_vs_prog_key *vs_key =
760 (const struct brw_vs_prog_key *) this->key;
761 const GLbitfield64 psiz_mask =
762 VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
763 const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
764 bool flush;
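   /* Payload data queued up between URB write sends; at most 8 registers
    * (two VUE slots) are accumulated before a flush.
    */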
765 fs_reg sources[8];
766 fs_reg urb_handle;
767
768 if (stage == MESA_SHADER_TESS_EVAL)
769 urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD));
770 else
771 urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
772
773 opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
774 int header_size = 1;
775 fs_reg per_slot_offsets;
776
777 if (stage == MESA_SHADER_GEOMETRY) {
778 const struct brw_gs_prog_data *gs_prog_data =
779 brw_gs_prog_data(this->prog_data);
780
781 /* We need to increment the Global Offset to skip over the control data
782 * header and the extra "Vertex Count" field (1 HWord) at the beginning
783 * of the VUE. We're counting in OWords, so the units are doubled.
784 */
785 starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
786 if (gs_prog_data->static_vertex_count == -1)
787 starting_urb_offset += 2;
788
789 /* We also need to use per-slot offsets. The per-slot offset is the
790 * Vertex Count. SIMD8 mode processes 8 different primitives at a
791 * time; each may output a different number of vertices.
792 */
793 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
794 header_size++;
795
796 /* The URB offset is in 128-bit units, so we need to multiply by 2 */
797 const int output_vertex_size_owords =
798 gs_prog_data->output_vertex_size_hwords * 2;
799
800 if (gs_vertex_count.file == IMM) {
801 per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
802 gs_vertex_count.ud);
803 } else {
804 per_slot_offsets = vgrf(glsl_type::uint_type);
805 bld.MUL(per_slot_offsets, gs_vertex_count,
806 brw_imm_ud(output_vertex_size_owords));
807 }
808 }
809
810 length = 0;
811 urb_offset = starting_urb_offset;
812 flush = false;
813
814 /* SSO shaders can have VUE slots allocated which are never actually
815 * written to, so ignore them when looking for the last (written) slot.
816 */
817 int last_slot = vue_map->num_slots - 1;
818 while (last_slot > 0 &&
819 (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
820 outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
821 last_slot--;
822 }
823
824 bool urb_written = false;
825 for (slot = 0; slot < vue_map->num_slots; slot++) {
826 int varying = vue_map->slot_to_varying[slot];
827 switch (varying) {
828 case VARYING_SLOT_PSIZ: {
829 /* The point size varying slot is in the vue header and is always in the
830 * vue map. But often none of the special varyings that live there
831 * are written and in that case we can skip writing to the vue
832 * header, provided the corresponding state properly clamps the
833 * values further down the pipeline. */
834 if ((vue_map->slots_valid & psiz_mask) == 0) {
835 assert(length == 0);
836 urb_offset++;
837 break;
838 }
839
840 fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
841 bld.MOV(zero, brw_imm_ud(0u));
842
843 sources[length++] = zero;
844 if (vue_map->slots_valid & VARYING_BIT_LAYER)
845 sources[length++] = this->outputs[VARYING_SLOT_LAYER];
846 else
847 sources[length++] = zero;
848
849 if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
850 sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
851 else
852 sources[length++] = zero;
853
854 if (vue_map->slots_valid & VARYING_BIT_PSIZ)
855 sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
856 else
857 sources[length++] = zero;
858 break;
859 }
860 case BRW_VARYING_SLOT_NDC:
861 case VARYING_SLOT_EDGE:
862 unreachable("unexpected scalar vs output");
863 break;
864
865 default:
866 /* gl_Position is always in the vue map, but isn't always written by
867 * the shader. Other varyings (clip distances) get added to the vue
868 * map but don't always get written. In those cases, the
869 * corresponding this->output[] slot will be invalid and we can skip
870 * the urb write for the varying. If we've already queued up a vue
871 * slot for writing we flush a mlen 5 urb write, otherwise we just
872 * advance the urb_offset.
873 */
874 if (varying == BRW_VARYING_SLOT_PAD ||
875 this->outputs[varying].file == BAD_FILE) {
876 if (length > 0)
877 flush = true;
878 else
879 urb_offset++;
880 break;
881 }
882
883 if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
884 (varying == VARYING_SLOT_COL0 ||
885 varying == VARYING_SLOT_COL1 ||
886 varying == VARYING_SLOT_BFC0 ||
887 varying == VARYING_SLOT_BFC1)) {
888 /* We need to clamp these guys, so do a saturating MOV into a
889 * temp register and use that for the payload.
890 */
891 for (int i = 0; i < 4; i++) {
892 fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type);
893 fs_reg src = offset(this->outputs[varying], bld, i);
894 set_saturate(true, bld.MOV(reg, src));
895 sources[length++] = reg;
896 }
897 } else {
898 int slot_offset = 0;
899
900 /* When using Primitive Replication, there may be multiple slots
901 * assigned to POS.
902 */
903 if (varying == VARYING_SLOT_POS)
904 slot_offset = slot - vue_map->varying_to_slot[VARYING_SLOT_POS];
905
906 for (unsigned i = 0; i < 4; i++) {
907 sources[length++] = offset(this->outputs[varying], bld,
908 i + (slot_offset * 4));
909 }
910 }
911 break;
912 }
913
914 const fs_builder abld = bld.annotate("URB write");
915
916 /* If we've queued up 8 registers of payload (2 VUE slots), if this is
917 * the last slot or if we need to flush (see BAD_FILE varying case
918 * above), emit a URB write send now to flush out the data.
919 */
920 if (length == 8 || (length > 0 && slot == last_slot))
921 flush = true;
922 if (flush) {
923 fs_reg *payload_sources =
924 ralloc_array(mem_ctx, fs_reg, length + header_size);
925 fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
926 BRW_REGISTER_TYPE_F);
927 payload_sources[0] = urb_handle;
928
929 if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
930 payload_sources[1] = per_slot_offsets;
931
932 memcpy(&payload_sources[header_size], sources,
933 length * sizeof sources[0]);
934
935 abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
936 header_size);
937
938 fs_inst *inst = abld.emit(opcode, reg_undef, payload);
939
940 /* For ICL WA 1805992985 one needs an additional write at the end. */
941 if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL)
942 inst->eot = false;
943 else
944 inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;
945
946 inst->mlen = length + header_size;
947 inst->offset = urb_offset;
948 urb_offset = starting_urb_offset + slot + 1;
949 length = 0;
950 flush = false;
951 urb_written = true;
952 }
953 }
954
955 /* If we don't have any valid slots to write, just do a minimal urb write
956 * send to terminate the shader. This includes 1 slot of undefined data,
957 * because it's invalid to write 0 data:
958 *
959 * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
960 * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
961 * Write Data Payload:
962 *
963 * "The write data payload can be between 1 and 8 message phases long."
964 */
965 if (!urb_written) {
966 /* For GS, just turn EmitVertex() into a no-op. We don't want it to
967 * end the thread, and emit_gs_thread_end() already emits a SEND with
968 * EOT at the end of the program for us.
969 */
970 if (stage == MESA_SHADER_GEOMETRY)
971 return;
972
973 fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
974 bld.exec_all().MOV(payload, urb_handle);
975
976 fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
977 inst->eot = true;
978 inst->mlen = 2;
979 inst->offset = 1;
980 return;
981 }
982
983 /* ICL WA 1805992985:
984 *
985 * ICLLP GPU hangs on one of the tessellation vkcts tests with DS not done. The
986 * send cycle, which is a urb write with an eot, must be 4 phases long and
987 * all 8 lanes must be valid.
988 */
989 if (devinfo->ver == 11 && stage == MESA_SHADER_TESS_EVAL) {
990 fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD);
991
992 /* Workaround requires all 8 channels (lanes) to be valid. This is
993 * understood to mean they all need to be alive. First trick is to find
994 * a live channel and copy its urb handle for all the other channels to
995 * make sure all handles are valid.
996 */
997 bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle));
998
999 /* Second trick is to use masked URB write where one can tell the HW to
1000 * actually write data only for selected channels even though all are
1001 * active.
1002 * Third trick is to take advantage of the must-be-zero (MBZ) area in
1003 * the very beginning of the URB.
1004 *
1005 * One masks data to be written only for the first channel and uses
1006 * offset zero explicitly to land data to the MBZ area avoiding trashing
1007 * any other part of the URB.
1008 *
1009 * Since the WA says that the write needs to be 4 phases long, one uses
1010 * 4 slots of data. All are explicitly zeros in order to keep the MBZ
1011 * area written as zeros.
1012 */
1013 bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u));
1014 bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
1015 bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
1016 bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u));
1017 bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u));
1018
1019 fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
1020 reg_undef, payload);
1021 inst->eot = true;
1022 inst->mlen = 6;
1023 inst->offset = 0;
1024 }
1025 }
1026
1027 void
1028 fs_visitor::emit_cs_terminate()
1029 {
1030 assert(devinfo->ver >= 7);
1031
1032 /* We can't directly send from g0, since sends with EOT have to use
1033 * g112-127. So, copy it to a virtual register; the register allocator will
1034 * make sure it uses the appropriate register range.
1035 */
1036 struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
1037 fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1038 bld.group(8, 0).exec_all().MOV(payload, g0);
1039
1040 /* Send a message to the thread spawner to terminate the thread. */
1041 fs_inst *inst = bld.exec_all()
1042 .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
1043 inst->eot = true;
1044 }
1045
1046 void
1047 fs_visitor::emit_barrier()
1048 {
1049 /* We are getting the barrier ID from the compute shader header */
1050 assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
1051
1052 fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1053
1054 /* Clear the message payload */
1055 bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u));
1056
1057 if (devinfo->verx10 >= 125) {
1058 /* mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
1059 fs_reg m0_10ub = component(retype(payload, BRW_REGISTER_TYPE_UB), 10);
1060 fs_reg r0_11ub =
1061 stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11),
1062 0, 1, 0);
1063 bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub);
1064 } else {
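      /* The barrier id occupies a different set of bits in r0.2 on each
       * generation, so pick the matching mask.
       */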
1065 uint32_t barrier_id_mask;
1066 switch (devinfo->ver) {
1067 case 7:
1068 case 8:
1069 barrier_id_mask = 0x0f000000u; break;
1070 case 9:
1071 barrier_id_mask = 0x8f000000u; break;
1072 case 11:
1073 case 12:
1074 barrier_id_mask = 0x7f000000u; break;
1075 default:
1076 unreachable("barrier is only available on gen >= 7");
1077 }
1078
1079 /* Copy the barrier id from r0.2 to the message payload reg.2 */
1080 fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
1081 bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
1082 brw_imm_ud(barrier_id_mask));
1083 }
1084
1085 /* Emit a gateway "barrier" message using the payload we set up, followed
1086 * by a wait instruction.
1087 */
1088 bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
1089 }
1090
1091 fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
1092 void *mem_ctx,
1093 const brw_base_prog_key *key,
1094 struct brw_stage_prog_data *prog_data,
1095 const nir_shader *shader,
1096 unsigned dispatch_width,
1097 int shader_time_index,
1098 bool debug_enabled)
1099 : backend_shader(compiler, log_data, mem_ctx, shader, prog_data,
1100 debug_enabled),
1101 key(key), gs_compile(NULL), prog_data(prog_data),
1102 live_analysis(this), regpressure_analysis(this),
1103 performance_analysis(this),
1104 dispatch_width(dispatch_width),
1105 shader_time_index(shader_time_index),
1106 bld(fs_builder(this, dispatch_width).at_end())
1107 {
1108 init();
1109 }
1110
1111 fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
1112 void *mem_ctx,
1113 struct brw_gs_compile *c,
1114 struct brw_gs_prog_data *prog_data,
1115 const nir_shader *shader,
1116 int shader_time_index,
1117 bool debug_enabled)
1118 : backend_shader(compiler, log_data, mem_ctx, shader,
1119 &prog_data->base.base, debug_enabled),
1120 key(&c->key.base), gs_compile(c),
1121 prog_data(&prog_data->base.base),
1122 live_analysis(this), regpressure_analysis(this),
1123 performance_analysis(this),
1124 dispatch_width(8),
1125 shader_time_index(shader_time_index),
1126 bld(fs_builder(this, dispatch_width).at_end())
1127 {
1128 init();
1129 }
1130
1131
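/* Common state initialization shared by both fs_visitor constructors. */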
1132 void
1133 fs_visitor::init()
1134 {
1135 if (key)
1136 this->key_tex = &key->tex;
1137 else
1138 this->key_tex = NULL;
1139
1140 this->max_dispatch_width = 32;
1141 this->prog_data = this->stage_prog_data;
1142
1143 this->failed = false;
1144 this->fail_msg = NULL;
1145
1146 this->nir_locals = NULL;
1147 this->nir_ssa_values = NULL;
1148 this->nir_system_values = NULL;
1149
1150 memset(&this->payload, 0, sizeof(this->payload));
1151 this->source_depth_to_render_target = false;
1152 this->runtime_check_aads_emit = false;
1153 this->first_non_payload_grf = 0;
1154 this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;
1155
1156 this->uniforms = 0;
1157 this->last_scratch = 0;
1158 this->pull_constant_loc = NULL;
1159 this->push_constant_loc = NULL;
1160
1161 this->shader_stats.scheduler_mode = NULL;
1162 this->shader_stats.promoted_constants = 0;
1163
1164 this->grf_used = 0;
1165 this->spilled_any_registers = false;
1166 }
1167
1168 fs_visitor::~fs_visitor()
1169 {
1170 }
1171