1 /*
2 * Copyright © 2010, 2022 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * @file
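 * Lowering of logical message opcodes (URB reads and writes, framebuffer
 * writes and reads, sampler messages, and LSC/HDC memory accesses) into
 * hardware SHADER_OPCODE_SEND instructions with explicit SFIDs, message
 * descriptors, and payloads.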
26 */
27
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30 #include "brw_builder.h"
31
32 using namespace brw;
33
34 static void
35 lower_urb_read_logical_send(const brw_builder &bld, fs_inst *inst)
36 {
37 const intel_device_info *devinfo = bld.shader->devinfo;
38 const bool per_slot_present =
39 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
40
41 assert(inst->size_written % REG_SIZE == 0);
42 assert(inst->header_size == 0);
43
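/* The SIMD8 URB read payload consists only of a header: the URB handle,
 * optionally followed by the per-slot offsets. A read carries no data
 * payload.
 */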
44 brw_reg payload_sources[2];
45 unsigned header_size = 0;
46 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
47 if (per_slot_present)
48 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
49
50 brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(header_size),
51 BRW_TYPE_F);
52 bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);
53
54 inst->opcode = SHADER_OPCODE_SEND;
55 inst->header_size = header_size;
56
57 inst->sfid = BRW_SFID_URB;
58 inst->desc = brw_urb_desc(devinfo,
59 GFX8_URB_OPCODE_SIMD8_READ,
60 per_slot_present,
61 false,
62 inst->offset);
63
64 inst->mlen = header_size;
65 inst->ex_desc = 0;
66 inst->ex_mlen = 0;
67 inst->send_is_volatile = true;
68
69 inst->resize_sources(4);
70
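/* SHADER_OPCODE_SEND source layout: src[0] and src[1] hold any register
 * (non-immediate) portions of the descriptor and extended descriptor,
 * both fully immediate here, hence the zero placeholders; src[2] is the
 * payload and src[3] the optional second payload (unused for URB reads).
 */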
71 inst->src[0] = brw_imm_ud(0); /* desc */
72 inst->src[1] = brw_imm_ud(0); /* ex_desc */
73 inst->src[2] = payload;
74 inst->src[3] = brw_null_reg();
75 }
76
77 static void
78 lower_urb_read_logical_send_xe2(const brw_builder &bld, fs_inst *inst)
79 {
80 const intel_device_info *devinfo = bld.shader->devinfo;
81 assert(devinfo->has_lsc);
82
83 assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
84 assert(inst->header_size == 0);
85
86 /* Get the logical send arguments. */
87 const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
88
89 /* Calculate the total number of components of the payload. */
90 const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));
91
92 brw_reg payload = bld.vgrf(BRW_TYPE_UD);
93
94 bld.MOV(payload, handle);
95
96 /* The low 24 bits of the URB handle are a byte offset into the URB area.
97 * Add the (OWord) offset of the read to this value.
98 */
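/* For example, an offset of 4 OWords adds 4 * 16 = 64 bytes to the handle's
 * byte offset.
 */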
99 if (inst->offset) {
100 bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
101 inst->offset = 0;
102 }
103
104 brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
105 if (offsets.file != BAD_FILE) {
106 bld.ADD(payload, payload, offsets);
107 }
108
109 inst->sfid = BRW_SFID_URB;
110
111 assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);
112
113 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
114 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
115 LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
116 false /* transpose */,
117 LSC_CACHE(devinfo, LOAD, L1UC_L3UC));
118
119 /* Update the original instruction. */
120 inst->opcode = SHADER_OPCODE_SEND;
121 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
122 inst->ex_mlen = 0;
123 inst->header_size = 0;
124 inst->send_has_side_effects = true;
125 inst->send_is_volatile = false;
126
127 inst->resize_sources(4);
128
129 inst->src[0] = brw_imm_ud(0);
130 inst->src[1] = brw_imm_ud(0);
131
132 inst->src[2] = payload;
133 inst->src[3] = brw_null_reg();
134 }
135
136 static void
137 lower_urb_write_logical_send(const brw_builder &bld, fs_inst *inst)
138 {
139 const intel_device_info *devinfo = bld.shader->devinfo;
140 const bool per_slot_present =
141 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
142 const bool channel_mask_present =
143 inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;
144
145 assert(inst->header_size == 0);
146
147 const unsigned length = 1 + per_slot_present + channel_mask_present +
148 inst->components_read(URB_LOGICAL_SRC_DATA);
149
150 brw_reg *payload_sources = new brw_reg[length];
151 brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(length),
152 BRW_TYPE_F);
153
154 unsigned header_size = 0;
155 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
156 if (per_slot_present)
157 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
158
159 if (channel_mask_present)
160 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
161
162 for (unsigned i = header_size, j = 0; i < length; i++, j++)
163 payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);
164
165 bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);
166
167 delete [] payload_sources;
168
169 inst->opcode = SHADER_OPCODE_SEND;
170 inst->header_size = header_size;
171 inst->dst = brw_null_reg();
172
173 inst->sfid = BRW_SFID_URB;
174 inst->desc = brw_urb_desc(devinfo,
175 GFX8_URB_OPCODE_SIMD8_WRITE,
176 per_slot_present,
177 channel_mask_present,
178 inst->offset);
179
180 inst->mlen = length;
181 inst->ex_desc = 0;
182 inst->ex_mlen = 0;
183 inst->send_has_side_effects = true;
184
185 inst->resize_sources(4);
186
187 inst->src[0] = brw_imm_ud(0); /* desc */
188 inst->src[1] = brw_imm_ud(0); /* ex_desc */
189 inst->src[2] = payload;
190 inst->src[3] = brw_null_reg();
191 }
192
193 static void
194 lower_urb_write_logical_send_xe2(const brw_builder &bld, fs_inst *inst)
195 {
196 const intel_device_info *devinfo = bld.shader->devinfo;
197 assert(devinfo->has_lsc);
198
199 /* Get the logical send arguments. */
200 const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
201 const brw_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
202 inst->src[URB_LOGICAL_SRC_DATA] : brw_reg(brw_imm_ud(0));
203 assert(brw_type_size_bytes(src.type) == 4);
204
205 /* Calculate the total number of components of the payload. */
206 const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
207 const unsigned src_sz = brw_type_size_bytes(src.type);
208
209 brw_reg payload = bld.vgrf(BRW_TYPE_UD);
210
211 bld.MOV(payload, handle);
212
213 /* The low 24 bits of the URB handle are a byte offset into the URB area.
214 * Add the (OWord) offset of the write to this value.
215 */
216 if (inst->offset) {
217 bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
218 inst->offset = 0;
219 }
220
221 brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
222 if (offsets.file != BAD_FILE) {
223 bld.ADD(payload, payload, offsets);
224 }
225
226 const brw_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
227 unsigned mask = 0;
228
229 if (cmask.file != BAD_FILE) {
230 assert(cmask.file == IMM);
231 assert(cmask.type == BRW_TYPE_UD);
232 mask = cmask.ud >> 16;
233 }
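/* The logical channel-mask immediate stores the per-component write enables
 * in its upper 16 bits. A non-zero mask selects LSC_OP_STORE_CMASK below and
 * is passed directly as the message's component mask.
 */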
234
235 brw_reg payload2 = bld.move_to_vgrf(src, src_comps);
236 const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
237
238 inst->sfid = BRW_SFID_URB;
239
240 enum lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
241 inst->desc = lsc_msg_desc(devinfo, op,
242 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
243 LSC_DATA_SIZE_D32,
244 mask ? mask : src_comps /* num_channels */,
245 false /* transpose */,
246 LSC_CACHE(devinfo, STORE, L1UC_L3UC));
247
248
249 /* Update the original instruction. */
250 inst->opcode = SHADER_OPCODE_SEND;
251 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
252 inst->ex_mlen = ex_mlen;
253 inst->header_size = 0;
254 inst->send_has_side_effects = true;
255 inst->send_is_volatile = false;
256
257 inst->resize_sources(4);
258
259 inst->src[0] = brw_imm_ud(0);
260 inst->src[1] = brw_imm_ud(0);
261
262 inst->src[2] = payload;
263 inst->src[3] = payload2;
264 }
265
266 static void
267 setup_color_payload(const brw_builder &bld, const brw_wm_prog_key *key,
268 brw_reg *dst, brw_reg color, unsigned components)
269 {
270 if (key->clamp_fragment_color) {
271 brw_reg tmp = bld.vgrf(BRW_TYPE_F, 4);
272 assert(color.type == BRW_TYPE_F);
273
274 for (unsigned i = 0; i < components; i++)
275 set_saturate(true,
276 bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
277
278 color = tmp;
279 }
280
281 for (unsigned i = 0; i < components; i++)
282 dst[i] = offset(color, bld, i);
283 }
284
285 static void
286 lower_fb_write_logical_send(const brw_builder &bld, fs_inst *inst,
287 const struct brw_wm_prog_data *prog_data,
288 const brw_wm_prog_key *key,
289 const fs_thread_payload &fs_payload)
290 {
291 assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
292 assert(inst->src[FB_WRITE_LOGICAL_SRC_NULL_RT].file == IMM);
293 const intel_device_info *devinfo = bld.shader->devinfo;
294 const brw_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
295 const brw_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
296 const brw_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
297 const brw_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
298 const brw_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
299 const brw_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
300 brw_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
301 const unsigned components =
302 inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
303 const bool null_rt = inst->src[FB_WRITE_LOGICAL_SRC_NULL_RT].ud != 0;
304
305 assert(inst->target != 0 || src0_alpha.file == BAD_FILE);
306
307 brw_reg sources[15];
308 int header_size = 2, payload_header_size;
309 unsigned length = 0;
310
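/* The payload is assembled in message order: an optional two-GRF header
 * (needed pre-Gfx11 for dual-source or multi-RT writes), then the AA
 * alpha/stencil value, src0 alpha, the oMask, four components of color0,
 * optionally four components of color1, source depth, destination depth,
 * and finally stencil.
 */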
311 if (devinfo->ver < 11 &&
312 (color1.file != BAD_FILE || key->nr_color_regions > 1)) {
313
314 /* From the Sandy Bridge PRM, volume 4, page 198:
315 *
316 * "Dispatched Pixel Enables. One bit per pixel indicating
317 * which pixels were originally enabled when the thread was
318 * dispatched. This field is only required for the end-of-
319 * thread message and on all dual-source messages."
320 */
321 const brw_builder ubld = bld.exec_all().group(8, 0);
322
323 brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2);
324 if (bld.group() < 16) {
325 /* The header starts off as g0 and g1 for the first half */
326 ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
327 BRW_TYPE_UD));
328 } else {
329 /* The header starts off as g0 and g2 for the second half */
330 assert(bld.group() < 32);
331 const brw_reg header_sources[2] = {
332 retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
333 retype(brw_vec8_grf(2, 0), BRW_TYPE_UD),
334 };
335 ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
336
337 /* Gfx12 will require additional fix-ups if we ever hit this path. */
338 assert(devinfo->ver < 12);
339 }
340
341 uint32_t g00_bits = 0;
342
343 /* Set "Source0 Alpha Present to RenderTarget" bit in message
344 * header.
345 */
346 if (src0_alpha.file != BAD_FILE)
347 g00_bits |= 1 << 11;
348
349 /* Set computes stencil to render target */
350 if (prog_data->computed_stencil)
351 g00_bits |= 1 << 14;
352
353 if (g00_bits) {
354 /* OR extra bits into g0.0 */
355 ubld.group(1, 0).OR(component(header, 0),
356 retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
357 brw_imm_ud(g00_bits));
358 }
359
360 /* Set the render target index for choosing BLEND_STATE. */
361 if (inst->target > 0) {
362 ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
363 }
364
365 if (prog_data->uses_kill) {
366 ubld.group(1, 0).MOV(retype(component(header, 15), BRW_TYPE_UW),
367 brw_sample_mask_reg(bld));
368 }
369
370 assert(length == 0);
371 sources[0] = header;
372 sources[1] = horiz_offset(header, 8);
373 length = 2;
374 }
375 assert(length == 0 || length == 2);
376 header_size = length;
377
378 if (fs_payload.aa_dest_stencil_reg[0]) {
379 assert(inst->group < 16);
380 sources[length] = brw_vgrf(bld.shader->alloc.allocate(1), BRW_TYPE_F);
381 bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
382 .MOV(sources[length],
383 brw_reg(brw_vec8_grf(fs_payload.aa_dest_stencil_reg[0], 0)));
384 length++;
385 }
386
387 if (src0_alpha.file != BAD_FILE) {
388 for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
389 const brw_builder &ubld = bld.exec_all().group(8, i)
390 .annotate("FB write src0 alpha");
391 const brw_reg tmp = ubld.vgrf(BRW_TYPE_F);
392 ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
393 setup_color_payload(ubld, key, &sources[length], tmp, 1);
394 length++;
395 }
396 }
397
398 if (sample_mask.file != BAD_FILE) {
399 const brw_reg tmp = brw_vgrf(bld.shader->alloc.allocate(reg_unit(devinfo)),
400 BRW_TYPE_UD);
401
402 /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
403 * relevant. Since these are unsigned single words, one VGRF is always
404 * 16-wide, but only the lower or higher 8 channels will be used by the
405 * hardware when doing a SIMD8 write, depending on whether we have
406 * selected the subspans for the first or second half respectively.
407 */
408 assert(sample_mask.file != BAD_FILE &&
409 brw_type_size_bytes(sample_mask.type) == 4);
410 sample_mask.type = BRW_TYPE_UW;
411 sample_mask.stride *= 2;
412
413 bld.exec_all().annotate("FB write oMask")
414 .MOV(horiz_offset(retype(tmp, BRW_TYPE_UW),
415 inst->group % (16 * reg_unit(devinfo))),
416 sample_mask);
417
418 for (unsigned i = 0; i < reg_unit(devinfo); i++)
419 sources[length++] = byte_offset(tmp, REG_SIZE * i);
420 }
421
422 payload_header_size = length;
423
424 setup_color_payload(bld, key, &sources[length], color0, components);
425 length += 4;
426
427 if (color1.file != BAD_FILE) {
428 setup_color_payload(bld, key, &sources[length], color1, components);
429 length += 4;
430 }
431
432 if (src_depth.file != BAD_FILE) {
433 sources[length] = src_depth;
434 length++;
435 }
436
437 if (dst_depth.file != BAD_FILE) {
438 sources[length] = dst_depth;
439 length++;
440 }
441
442 if (src_stencil.file != BAD_FILE) {
443 assert(bld.dispatch_width() == 8 * reg_unit(devinfo));
444
445 /* XXX: src_stencil is only available on gfx9+. dst_depth is never
446 * available on gfx9+. As such it's impossible to have both enabled at the
447 * same time and therefore length cannot overrun the array.
448 */
449 assert(length < 15 * reg_unit(devinfo));
450
451 sources[length] = bld.vgrf(BRW_TYPE_UD);
452 bld.exec_all().annotate("FB write OS")
453 .MOV(retype(sources[length], BRW_TYPE_UB),
454 subscript(src_stencil, BRW_TYPE_UB, 0));
455 length++;
456 }
457
458 /* Send from the GRF */
459 brw_reg payload = brw_vgrf(-1, BRW_TYPE_F);
460 fs_inst *load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
461 payload.nr = bld.shader->alloc.allocate(regs_written(load));
462 load->dst = payload;
463
464 uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);
465
466 /* XXX - Bit 13 Per-sample PS enable */
467 inst->desc =
468 (inst->group / 16) << 11 | /* rt slot group */
469 brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
470 0 /* coarse_rt_write */);
471
472 brw_reg desc = brw_imm_ud(0);
473 if (prog_data->coarse_pixel_dispatch == INTEL_ALWAYS) {
474 inst->desc |= (1 << 18);
475 } else if (prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES) {
476 STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
477 const brw_builder &ubld = bld.exec_all().group(8, 0);
478 desc = ubld.vgrf(BRW_TYPE_UD);
479 ubld.AND(desc, brw_dynamic_msaa_flags(prog_data),
480 brw_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
481 desc = component(desc, 0);
482 }
483
484 uint32_t ex_desc = 0;
485 if (devinfo->ver >= 20) {
486 ex_desc = inst->target << 21 |
487 null_rt << 20 |
488 (src0_alpha.file != BAD_FILE) << 15 |
489 (src_stencil.file != BAD_FILE) << 14 |
490 (src_depth.file != BAD_FILE) << 13 |
491 (sample_mask.file != BAD_FILE) << 12;
492 } else if (devinfo->ver >= 11) {
493 /* Set the "Render Target Index" and "Src0 Alpha Present" fields
494 * in the extended message descriptor, in lieu of using a header.
495 */
496 ex_desc = inst->target << 12 |
497 null_rt << 20 |
498 (src0_alpha.file != BAD_FILE) << 15;
499 }
500 inst->ex_desc = ex_desc;
501
502 inst->opcode = SHADER_OPCODE_SEND;
503 inst->resize_sources(3);
504 inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
505 inst->src[0] = desc;
506 inst->src[1] = brw_imm_ud(0);
507 inst->src[2] = payload;
508 inst->mlen = regs_written(load);
509 inst->ex_mlen = 0;
510 inst->header_size = header_size;
511 inst->check_tdr = true;
512 inst->send_has_side_effects = true;
513 }
514
515 static void
516 lower_fb_read_logical_send(const brw_builder &bld, fs_inst *inst,
517 const struct brw_wm_prog_data *wm_prog_data)
518 {
519 const intel_device_info *devinfo = bld.shader->devinfo;
520 const brw_builder &ubld = bld.exec_all().group(8, 0);
521 const unsigned length = 2;
522 const brw_reg header = ubld.vgrf(BRW_TYPE_UD, length);
523
524 assert(devinfo->ver >= 9 && devinfo->ver < 20);
525
526 if (bld.group() < 16) {
527 ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
528 BRW_TYPE_UD));
529 } else {
530 assert(bld.group() < 32);
531 const brw_reg header_sources[] = {
532 retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
533 retype(brw_vec8_grf(2, 0), BRW_TYPE_UD)
534 };
535 ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
536
537 if (devinfo->ver >= 12) {
538 /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
539 * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
540 * target message header format was updated accordingly -- However
541 * the updated format only works for the lower 16 channels in a
542 * SIMD32 thread, since the higher 16 channels want the subspan data
543 * from r2 instead of r1, so we need to copy over the contents of
544 * r1.1 in order to fix things up.
545 */
546 ubld.group(1, 0).MOV(component(header, 9),
547 retype(brw_vec1_grf(1, 1), BRW_TYPE_UD));
548 }
549 }
550
551 /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
552 *
553 * "Must be zero for Render Target Read message."
554 *
555 * For bits :
556 * - 14 : Stencil Present to Render Target
557 * - 13 : Source Depth Present to Render Target
558 * - 12 : oMask to Render Target
559 * - 11 : Source0 Alpha Present to Render Target
560 */
561 ubld.group(1, 0).AND(component(header, 0),
562 component(header, 0),
563 brw_imm_ud(~INTEL_MASK(14, 11)));
564
565 inst->resize_sources(4);
566 inst->opcode = SHADER_OPCODE_SEND;
567 inst->src[0] = brw_imm_ud(0);
568 inst->src[1] = brw_imm_ud(0);
569 inst->src[2] = header;
570 inst->src[3] = brw_reg();
571 inst->mlen = length;
572 inst->header_size = length;
573 inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
574 inst->check_tdr = true;
575 inst->desc =
576 (inst->group / 16) << 11 | /* rt slot group */
577 brw_fb_read_desc(devinfo, inst->target,
578 0 /* msg_control */, inst->exec_size,
579 wm_prog_data->persample_dispatch);
580 }
581
582 static bool
583 is_high_sampler(const struct intel_device_info *devinfo, const brw_reg &sampler)
584 {
585 return sampler.file != IMM || sampler.ud >= 16;
586 }
587
588 static unsigned
589 sampler_msg_type(const intel_device_info *devinfo,
590 opcode opcode, bool shadow_compare,
591 bool lod_is_zero, bool has_min_lod)
592 {
593 switch (opcode) {
594 case SHADER_OPCODE_TEX_LOGICAL:
595 if (devinfo->ver >= 20 && has_min_lod) {
596 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
597 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
598 } else {
599 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
600 GFX5_SAMPLER_MESSAGE_SAMPLE;
601 }
602 case FS_OPCODE_TXB_LOGICAL:
603 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
604 GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
605 case SHADER_OPCODE_TXL_LOGICAL:
606 assert(!has_min_lod);
607 if (lod_is_zero) {
608 return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
609 GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
610 }
611 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
612 GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
613 case SHADER_OPCODE_TXS_LOGICAL:
614 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
615 assert(!has_min_lod);
616 return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
617 case SHADER_OPCODE_TXD_LOGICAL:
618 return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
619 GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
620 case SHADER_OPCODE_TXF_LOGICAL:
621 assert(!has_min_lod);
622 return lod_is_zero ? GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ :
623 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
624 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
625 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
626 assert(!has_min_lod);
627 return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
628 case SHADER_OPCODE_TXF_MCS_LOGICAL:
629 assert(!has_min_lod);
630 return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
631 case SHADER_OPCODE_LOD_LOGICAL:
632 assert(!has_min_lod);
633 return GFX5_SAMPLER_MESSAGE_LOD;
634 case SHADER_OPCODE_TG4_LOGICAL:
635 assert(!has_min_lod);
636 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
637 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
638 break;
639 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
640 assert(!has_min_lod);
641 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
642 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
643 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
644 assert(!has_min_lod);
645 assert(devinfo->ver >= 20);
646 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C:
647 XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
648 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
649 assert(!has_min_lod);
650 assert(devinfo->ver >= 20);
651 return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B;
652 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
653 assert(!has_min_lod);
654 assert(devinfo->ver >= 20);
655 return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B;
656 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
657 assert(!has_min_lod);
658 assert(devinfo->ver >= 20);
659 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C :
660 XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L;
661 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
662 assert(!has_min_lod);
663 assert(devinfo->ver >= 20);
664 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C :
665 XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I;
666 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
667 assert(!has_min_lod);
668 return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
669 default:
670 unreachable("not reached");
671 }
672 }
673
674 /**
675 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
676 * the given requested_alignment_sz.
677 */
678 static fs_inst *
679 emit_load_payload_with_padding(const brw_builder &bld, const brw_reg &dst,
680 const brw_reg *src, unsigned sources,
681 unsigned header_size,
682 unsigned requested_alignment_sz)
683 {
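/* For example, a 16-bit source at SIMD8 occupies only half of a 32-byte
 * GRF; when requested_alignment_sz is a full register, a null padding
 * source is inserted after it so that the following source starts on a
 * register boundary.
 */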
684 unsigned length = 0;
685 unsigned num_srcs =
686 sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
687 brw_reg *src_comps = new brw_reg[num_srcs];
688
689 for (unsigned i = 0; i < header_size; i++)
690 src_comps[length++] = src[i];
691
692 for (unsigned i = header_size; i < sources; i++) {
693 unsigned src_sz =
694 retype(dst, src[i].type).component_size(bld.dispatch_width());
695 const enum brw_reg_type padding_payload_type =
696 brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(src[i].type));
697
698 src_comps[length++] = src[i];
699
700 /* Expand the real sources if component of requested payload type is
701 * larger than real source component.
702 */
703 if (src_sz < requested_alignment_sz) {
704 for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
705 src_comps[length++] = retype(brw_reg(), padding_payload_type);
706 }
707 }
708 }
709
710 fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
711 delete[] src_comps;
712
713 return inst;
714 }
715
716 static bool
717 shader_opcode_needs_header(opcode op)
718 {
719 switch (op) {
720 case SHADER_OPCODE_TG4_LOGICAL:
721 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
722 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
723 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
724 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
725 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
726 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
727 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
728 return true;
729 default:
730 break;
731 }
732
733 return false;
734 }
735
736 static void
737 lower_sampler_logical_send(const brw_builder &bld, fs_inst *inst,
738 const brw_reg &coordinate,
739 const brw_reg &shadow_c,
740 brw_reg lod, const brw_reg &lod2,
741 const brw_reg &min_lod,
742 const brw_reg &sample_index,
743 const brw_reg &mcs,
744 const brw_reg &surface,
745 const brw_reg &sampler,
746 const brw_reg &surface_handle,
747 const brw_reg &sampler_handle,
748 const brw_reg &tg4_offset,
749 unsigned payload_type_bit_size,
750 unsigned coord_components,
751 unsigned grad_components,
752 bool residency)
753 {
754 /* We never generate EOT sampler messages */
755 assert(!inst->eot);
756
757 const brw_compiler *compiler = bld.shader->compiler;
758 const intel_device_info *devinfo = bld.shader->devinfo;
759 const enum brw_reg_type payload_type =
760 brw_type_with_size(BRW_TYPE_F, payload_type_bit_size);
761 const enum brw_reg_type payload_unsigned_type =
762 brw_type_with_size(BRW_TYPE_UD, payload_type_bit_size);
763 const enum brw_reg_type payload_signed_type =
764 brw_type_with_size(BRW_TYPE_D, payload_type_bit_size);
765 unsigned header_size = 0, length = 0;
766 opcode op = inst->opcode;
767 brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
768 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
769 sources[i] = bld.vgrf(payload_type);
770
771 /* We must have exactly one of surface/sampler and surface/sampler_handle */
772 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
773 assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
774
775 if (shader_opcode_needs_header(op) || inst->offset != 0 ||
776 sampler_handle.file != BAD_FILE ||
777 is_high_sampler(devinfo, sampler) ||
778 residency) {
779 /* For general texture offsets (no txf workaround), we need a header to
780 * put them in.
781 *
782 * TG4 needs to place its channel select in the header, for interaction
783 * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
784 * larger sampler numbers we need to offset the Sampler State Pointer in
785 * the header.
786 */
787 brw_reg header = retype(sources[0], BRW_TYPE_UD);
788 for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
789 sources[length++] = byte_offset(header, REG_SIZE * header_size);
790
791 /* If we're requesting fewer than four channels worth of response,
792 * and we have an explicit header, we need to set up the sampler
793 * writemask. It's reversed from normal: 1 means "don't write".
794 */
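/* For example, a SIMD8 fetch whose destination only uses the .x channel
 * writes a single register while each component occupies one register,
 * so mask = ~((1 << 1) - 1) & 0xf = 0xe and the sampler skips y/z/w.
 */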
795 unsigned comps_regs =
796 DIV_ROUND_UP(regs_written(inst) - reg_unit(devinfo) * residency,
797 reg_unit(devinfo));
798 unsigned comp_regs =
799 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size),
800 reg_unit(devinfo) * REG_SIZE);
801 if (comps_regs < 4 * comp_regs) {
802 assert(comps_regs % comp_regs == 0);
803 unsigned mask = ~((1 << (comps_regs / comp_regs)) - 1) & 0xf;
804 inst->offset |= mask << 12;
805 }
806
807 if (residency)
808 inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */
809
810 /* Build the actual header */
811 const brw_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
812 const brw_builder ubld1 = ubld.group(1, 0);
813 if (devinfo->ver >= 11)
814 ubld.MOV(header, brw_imm_ud(0));
815 else
816 ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
817 if (inst->offset) {
818 ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
819 } else if (devinfo->ver < 11 &&
820 bld.shader->stage != MESA_SHADER_VERTEX &&
821 bld.shader->stage != MESA_SHADER_FRAGMENT) {
822 /* The vertex and fragment stages have g0.2 set to 0, so
823 * header0.2 is 0 when g0 is copied. Other stages may not, so we
824 * must set it to 0 to avoid setting undesirable bits in the
825 * message.
826 */
827 ubld1.MOV(component(header, 2), brw_imm_ud(0));
828 }
829
830 if (sampler_handle.file != BAD_FILE) {
831 /* Bindless sampler handles aren't relative to the sampler state
832 * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
833 * Instead, it's an absolute pointer relative to dynamic state base
834 * address.
835 *
836 * Sampler states are 16 bytes each and the pointer we give here has
837 * to be 32-byte aligned. In order to avoid more indirect messages
838 * than required, we assume that all bindless sampler states are
839 * 32-byte aligned. This sacrifices a bit of general state base
840 * address space but means we can do something more efficient in the
841 * shader.
842 */
843 if (compiler->use_bindless_sampler_offset) {
844 assert(devinfo->ver >= 11);
845 ubld1.OR(component(header, 3), sampler_handle, brw_imm_ud(1));
846 } else {
847 ubld1.MOV(component(header, 3), sampler_handle);
848 }
849 } else if (is_high_sampler(devinfo, sampler)) {
850 brw_reg sampler_state_ptr =
851 retype(brw_vec1_grf(0, 3), BRW_TYPE_UD);
852
853 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
854 * with the ones included in g0.3 bits 4:0. Mask them out.
855 */
856 if (devinfo->ver >= 11) {
857 sampler_state_ptr = ubld1.vgrf(BRW_TYPE_UD);
858 ubld1.AND(sampler_state_ptr,
859 retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
860 brw_imm_ud(INTEL_MASK(31, 5)));
861 }
862
863 if (sampler.file == IMM) {
864 assert(sampler.ud >= 16);
865 const int sampler_state_size = 16; /* 16 bytes */
866
867 ubld1.ADD(component(header, 3), sampler_state_ptr,
868 brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
869 } else {
870 brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
871 ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
872 ubld1.SHL(tmp, tmp, brw_imm_ud(4));
873 ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
874 }
875 } else if (devinfo->ver >= 11) {
876 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
877 * with the ones included in g0.3 bits 4:0. Mask them out.
878 */
879 ubld1.AND(component(header, 3),
880 retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
881 brw_imm_ud(INTEL_MASK(31, 5)));
882 }
883 }
884
885 const bool lod_is_zero = lod.is_zero();
886
887 /* On Xe2 and newer platforms, min_lod is the first parameter specifically
888 * so that a bunch of other, possibly unused, parameters don't need to also
889 * be included.
890 */
891 const unsigned msg_type =
892 sampler_msg_type(devinfo, op, inst->shadow_compare, lod_is_zero,
893 min_lod.file != BAD_FILE);
894
895 const bool min_lod_is_first = devinfo->ver >= 20 &&
896 (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
897 msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);
898
899 if (min_lod_is_first) {
900 assert(min_lod.file != BAD_FILE);
901 bld.MOV(sources[length++], min_lod);
902 }
903
904 if (shadow_c.file != BAD_FILE) {
905 bld.MOV(sources[length], shadow_c);
906 length++;
907 }
908
909 bool coordinate_done = false;
910
911 /* Set up the LOD info */
912 switch (op) {
913 case SHADER_OPCODE_TXL_LOGICAL:
914 if (lod_is_zero)
915 break;
916 FALLTHROUGH;
917 case FS_OPCODE_TXB_LOGICAL:
918 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
919 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
920 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
921 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
922 bld.MOV(sources[length], lod);
923 length++;
924 break;
925 case SHADER_OPCODE_TXD_LOGICAL:
926 /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
927 * Xe2+).
928 */
929 assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));
930
931 /* Load dPdx and the coordinate together:
932 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
933 */
934 for (unsigned i = 0; i < coord_components; i++) {
935 bld.MOV(sources[length++], offset(coordinate, bld, i));
936
937 /* For cube map array, the coordinate is (u,v,r,ai) but there are
938 * only derivatives for (u, v, r).
939 */
940 if (i < grad_components) {
941 bld.MOV(sources[length++], offset(lod, bld, i));
942 bld.MOV(sources[length++], offset(lod2, bld, i));
943 }
944 }
945
946 coordinate_done = true;
947 break;
948 case SHADER_OPCODE_TXS_LOGICAL:
949 sources[length] = retype(sources[length], payload_unsigned_type);
950 bld.MOV(sources[length++], lod);
951 break;
952 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
953 /* We need an LOD; just use 0 */
954 sources[length] = retype(sources[length], payload_unsigned_type);
955 bld.MOV(sources[length++], brw_imm_ud(0));
956 break;
957 case SHADER_OPCODE_TXF_LOGICAL:
958 /* On Gfx9 the parameters are intermixed: they are u, v, lod, r. */
959 sources[length] = retype(sources[length], payload_signed_type);
960 bld.MOV(sources[length++], offset(coordinate, bld, 0));
961
962 if (coord_components >= 2) {
963 sources[length] = retype(sources[length], payload_signed_type);
964 bld.MOV(sources[length], offset(coordinate, bld, 1));
965 } else {
966 sources[length] = brw_imm_d(0);
967 }
968 length++;
969
970 if (!lod_is_zero) {
971 sources[length] = retype(sources[length], payload_signed_type);
972 bld.MOV(sources[length++], lod);
973 }
974
975 for (unsigned i = 2; i < coord_components; i++) {
976 sources[length] = retype(sources[length], payload_signed_type);
977 bld.MOV(sources[length++], offset(coordinate, bld, i));
978 }
979
980 coordinate_done = true;
981 break;
982
983 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
984 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
985 sources[length] = retype(sources[length], payload_unsigned_type);
986 bld.MOV(sources[length++], sample_index);
987
988 /* Data from the multisample control surface. */
989 for (unsigned i = 0; i < 2; ++i) {
990 /* The sampler always writes 4/8 registers worth of data, but for ld_mcs
991 * only the first two registers contain valid data. So with a 16-bit
992 * payload, we need to split the two 32-bit registers into four 16-bit
993 * payload components.
994 *
995 * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
996 * Shared Functions - 3D Sampler - Messages - Message Format:
997 *
998 * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r
999 */
1000 if (op == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
1001 brw_reg tmp = offset(mcs, bld, i);
1002 sources[length] = retype(sources[length], payload_unsigned_type);
1003 bld.MOV(sources[length++],
1004 mcs.file == IMM ? mcs :
1005 brw_reg(subscript(tmp, payload_unsigned_type, 0)));
1006
1007 sources[length] = retype(sources[length], payload_unsigned_type);
1008 bld.MOV(sources[length++],
1009 mcs.file == IMM ? mcs :
1010 brw_reg(subscript(tmp, payload_unsigned_type, 1)));
1011 } else {
1012 sources[length] = retype(sources[length], payload_unsigned_type);
1013 bld.MOV(sources[length++],
1014 mcs.file == IMM ? mcs : offset(mcs, bld, i));
1015 }
1016 }
1017 FALLTHROUGH;
1018
1019 case SHADER_OPCODE_TXF_MCS_LOGICAL:
1020 /* There is no offsetting for this message; just copy in the integer
1021 * texture coordinates.
1022 */
1023 for (unsigned i = 0; i < coord_components; i++) {
1024 sources[length] = retype(sources[length], payload_signed_type);
1025 bld.MOV(sources[length++], offset(coordinate, bld, i));
1026 }
1027
1028 coordinate_done = true;
1029 break;
1030 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1031 /* More crazy intermixing */
1032 for (unsigned i = 0; i < 2; i++) /* u, v */
1033 bld.MOV(sources[length++], offset(coordinate, bld, i));
1034
1035 for (unsigned i = 0; i < 2; i++) { /* offu, offv */
1036 sources[length] = retype(sources[length], payload_signed_type);
1037 bld.MOV(sources[length++], offset(tg4_offset, bld, i));
1038 }
1039
1040 if (coord_components == 3) /* r if present */
1041 bld.MOV(sources[length++], offset(coordinate, bld, 2));
1042
1043 coordinate_done = true;
1044 break;
1045 default:
1046 break;
1047 }
1048
1049 /* Set up the coordinate (except for cases where it was done above) */
1050 if (!coordinate_done) {
1051 for (unsigned i = 0; i < coord_components; i++)
1052 bld.MOV(retype(sources[length++], payload_type),
1053 offset(coordinate, bld, i));
1054 }
1055
1056 if (min_lod.file != BAD_FILE && !min_lod_is_first) {
1057 /* Account for all of the missing coordinate sources */
1058 if (op == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
1059 /* Bspec 64985:
1060 *
1061 * For sample_b sampler message format:
1062 *
1063 * SIMD16H/SIMD32H
1064 * Param Number 0 1 2 3 4 5
1065 * Param BIAS U V R Ai MLOD
1066 *
1067 * SIMD16/SIMD32
1068 * Param Number 0 1 2 3 4
1069 * Param BIAS_AI U V R MLOD
1070 */
1071 length += 3 - coord_components;
1072 } else if (op == SHADER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) {
1073 /* On DG2 and newer platforms, sample_d can only be used with 1D and
1074 * 2D surfaces, so the maximum number of gradient components is 2.
1075 * In spite of this limitation, the Bspec lists a mysterious R
1076 * component before the min_lod, so the maximum coordinate components
1077 * is 3.
1078 *
1079 * See bspec 45942, "Enable new message layout for cube array"
1080 */
1081 length += 3 - coord_components;
1082 length += (2 - grad_components) * 2;
1083 } else {
1084 length += 4 - coord_components;
1085 if (op == SHADER_OPCODE_TXD_LOGICAL)
1086 length += (3 - grad_components) * 2;
1087 }
1088
1089 bld.MOV(sources[length++], min_lod);
1090
1091 /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
1092 if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB_LOGICAL &&
1093 !inst->shadow_compare)
1094 bld.MOV(sources[length++], min_lod);
1095 }
1096
1097 const brw_reg src_payload =
1098 brw_vgrf(bld.shader->alloc.allocate(length * bld.dispatch_width() / 8),
1099 BRW_TYPE_F);
1100 /* In the case of a 16-bit payload, each component takes one full register
1101 * in both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
1102 * elements. In the SIMD8H case the hardware simply expects the components
1103 * to be padded (i.e., aligned on a reg boundary).
1104 */
1105 fs_inst *load_payload_inst =
1106 emit_load_payload_with_padding(bld, src_payload, sources, length,
1107 header_size, REG_SIZE * reg_unit(devinfo));
1108 unsigned mlen = load_payload_inst->size_written / REG_SIZE;
1109 unsigned simd_mode = 0;
1110 if (devinfo->ver < 20) {
1111 if (payload_type_bit_size == 16) {
1112 assert(devinfo->ver >= 11);
1113 simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
1114 GFX10_SAMPLER_SIMD_MODE_SIMD16H;
1115 } else {
1116 simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
1117 BRW_SAMPLER_SIMD_MODE_SIMD16;
1118 }
1119 } else {
1120 if (payload_type_bit_size == 16) {
1121 simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
1122 XE2_SAMPLER_SIMD_MODE_SIMD32H;
1123 } else {
1124 simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
1125 XE2_SAMPLER_SIMD_MODE_SIMD32;
1126 }
1127 }
1128
1129 /* Generate the SEND. */
1130 inst->opcode = SHADER_OPCODE_SEND;
1131 inst->mlen = mlen;
1132 inst->header_size = header_size;
1133 inst->sfid = BRW_SFID_SAMPLER;
1134 uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
1135 ? GFX8_SAMPLER_RETURN_FORMAT_16BITS
1136 : GFX8_SAMPLER_RETURN_FORMAT_32BITS;
1137 if (surface.file == IMM &&
1138 (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
1139 inst->desc = brw_sampler_desc(devinfo, surface.ud,
1140 sampler.file == IMM ? sampler.ud % 16 : 0,
1141 msg_type,
1142 simd_mode,
1143 sampler_ret_type);
1144 inst->src[0] = brw_imm_ud(0);
1145 inst->src[1] = brw_imm_ud(0);
1146 } else if (surface_handle.file != BAD_FILE) {
1147 /* Bindless surface */
1148 inst->desc = brw_sampler_desc(devinfo,
1149 GFX9_BTI_BINDLESS,
1150 sampler.file == IMM ? sampler.ud % 16 : 0,
1151 msg_type,
1152 simd_mode,
1153 sampler_ret_type);
1154
1155 /* For bindless samplers, the entire address is included in the message
1156 * header so we can leave the portion in the message descriptor 0.
1157 */
1158 if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
1159 inst->src[0] = brw_imm_ud(0);
1160 } else {
1161 const brw_builder ubld = bld.group(1, 0).exec_all();
1162 brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
1163 ubld.SHL(desc, sampler, brw_imm_ud(8));
1164 inst->src[0] = component(desc, 0);
1165 }
1166
1167 /* We assume that the driver provided the handle in the top 20 bits so
1168 * we can use the surface handle directly as the extended descriptor.
1169 */
1170 inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
1171 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1172 } else {
1173 /* Immediate portion of the descriptor */
1174 inst->desc = brw_sampler_desc(devinfo,
1175 0, /* surface */
1176 0, /* sampler */
1177 msg_type,
1178 simd_mode,
1179 sampler_ret_type);
1180 const brw_builder ubld = bld.group(1, 0).exec_all();
1181 brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
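/* The dynamic descriptor holds the binding table index in bits [7:0] and
 * the sampler index in bits [11:8]; multiplying by 0x101 replicates a
 * shared surface/sampler index into both fields with a single instruction.
 */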
1182 if (surface.equals(sampler)) {
1183 /* This case is common in GL */
1184 ubld.MUL(desc, surface, brw_imm_ud(0x101));
1185 } else {
1186 if (sampler_handle.file != BAD_FILE) {
1187 ubld.MOV(desc, surface);
1188 } else if (sampler.file == IMM) {
1189 ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
1190 } else {
1191 ubld.SHL(desc, sampler, brw_imm_ud(8));
1192 ubld.OR(desc, desc, surface);
1193 }
1194 }
1195 ubld.AND(desc, desc, brw_imm_ud(0xfff));
1196
1197 inst->src[0] = component(desc, 0);
1198 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1199 }
1200
1201 inst->ex_desc = 0;
1202
1203 inst->src[2] = src_payload;
1204 inst->resize_sources(3);
1205
1206 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
1207 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
1208 }
1209
1210 static unsigned
1211 get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
1212 const fs_inst *inst)
1213 {
1214 assert(inst);
1215 const brw_reg *src = inst->src;
1216 unsigned src_type_size = 0;
1217
1218 /* All sources need to have the same size, therefore seek the first valid
1219 * and take the size from there.
1220 */
1221 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1222 if (src[i].file != BAD_FILE) {
1223 src_type_size = brw_type_size_bytes(src[i].type);
1224 break;
1225 }
1226 }
1227
1228 assert(src_type_size == 2 || src_type_size == 4);
1229
1230 #ifndef NDEBUG
1231 /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
1232 * compressed multisampled surfaces. There the payload contains MCS data
1233 * which is already in 16-bits unlike the other parameters that need forced
1234 * conversion.
1235 */
1236 if (inst->opcode != SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
1237 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1238 assert(src[i].file == BAD_FILE ||
1239 brw_type_size_bytes(src[i].type) == src_type_size);
1240 }
1241 }
1242 #endif
1243
1244 if (devinfo->verx10 < 125)
1245 return src_type_size * 8;
1246
1247 /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
1248 * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
1249 * Format [GFX12:HAS:1209977870]
1250 *
1251 * ld2dms_w SIMD8H and SIMD16H Only
1252 * ld_mcs SIMD8H and SIMD16H Only
1253 * ld2dms REMOVEDBY(GEN:HAS:1406788836)
1254 */
1255 if (inst->opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL ||
1256 inst->opcode == SHADER_OPCODE_TXF_MCS_LOGICAL)
1257 src_type_size = 2;
1258
1259 return src_type_size * 8;
1260 }
1261
1262 static void
1263 lower_sampler_logical_send(const brw_builder &bld, fs_inst *inst)
1264 {
1265 const intel_device_info *devinfo = bld.shader->devinfo;
1266 const brw_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
1267 const brw_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
1268 const brw_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
1269 const brw_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
1270 const brw_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
1271 const brw_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
1272 const brw_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
1273 const brw_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
1274 const brw_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
1275 const brw_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
1276 const brw_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
1277 const brw_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
1278 assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
1279 const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
1280 assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
1281 const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
1282 assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1283 const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1284
1285 const unsigned msg_payload_type_bit_size =
1286 get_sampler_msg_payload_type_bit_size(devinfo, inst);
1287
1288 /* 16-bit payloads are available only on gfx11+ */
1289 assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
1290
1291 lower_sampler_logical_send(bld, inst, coordinate,
1292 shadow_c, lod, lod2, min_lod,
1293 sample_index,
1294 mcs, surface, sampler,
1295 surface_handle, sampler_handle,
1296 tg4_offset,
1297 msg_payload_type_bit_size,
1298 coord_components, grad_components,
1299 residency);
1300 }
1301
1302 /**
1303 * Predicate the specified instruction on the vector mask.
1304 */
1305 static void
1306 emit_predicate_on_vector_mask(const brw_builder &bld, fs_inst *inst)
1307 {
1308 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1309 bld.group() == inst->group &&
1310 bld.dispatch_width() == inst->exec_size);
1311
1312 const brw_builder ubld = bld.exec_all().group(1, 0);
1313
1314 const fs_visitor &s = *bld.shader;
1315 const brw_reg vector_mask = ubld.vgrf(BRW_TYPE_UW);
1316 ubld.UNDEF(vector_mask);
1317 ubld.emit(SHADER_OPCODE_READ_ARCH_REG, vector_mask, retype(brw_sr0_reg(3),
1318 BRW_TYPE_UD));
1319 const unsigned subreg = sample_mask_flag_subreg(s);
1320
1321 ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);
1322
1323 if (inst->predicate) {
1324 assert(inst->predicate == BRW_PREDICATE_NORMAL);
1325 assert(!inst->predicate_inverse);
1326 assert(inst->flag_subreg == 0);
1327 assert(s.devinfo->ver < 20);
1328 /* Combine the vector mask with the existing predicate by using a
1329 * vertical predication mode.
1330 */
1331 inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
1332 } else {
1333 inst->flag_subreg = subreg;
1334 inst->predicate = BRW_PREDICATE_NORMAL;
1335 inst->predicate_inverse = false;
1336 }
1337 }
1338
1339 static void
1340 setup_surface_descriptors(const brw_builder &bld, fs_inst *inst, uint32_t desc,
1341 const brw_reg &surface, const brw_reg &surface_handle)
1342 {
1343 const brw_compiler *compiler = bld.shader->compiler;
1344
1345 /* We must have exactly one of surface and surface_handle */
1346 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1347
1348 if (surface.file == IMM) {
1349 inst->desc = desc | (surface.ud & 0xff);
1350 inst->src[0] = brw_imm_ud(0);
1351 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1352 } else if (surface_handle.file != BAD_FILE) {
1353 /* Bindless surface */
1354 inst->desc = desc | GFX9_BTI_BINDLESS;
1355 inst->src[0] = brw_imm_ud(0);
1356
1357 /* We assume that the driver provided the handle in the top 20 bits so
1358 * we can use the surface handle directly as the extended descriptor.
1359 */
1360 inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
1361 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1362 } else {
1363 inst->desc = desc;
1364 const brw_builder ubld = bld.exec_all().group(1, 0);
1365 brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1366 ubld.AND(tmp, surface, brw_imm_ud(0xff));
1367 inst->src[0] = component(tmp, 0);
1368 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1369 }
1370 }
1371
1372 static void
1373 setup_lsc_surface_descriptors(const brw_builder &bld, fs_inst *inst,
1374 uint32_t desc, const brw_reg &surface)
1375 {
1376 const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1377 const brw_compiler *compiler = bld.shader->compiler;
1378
1379 inst->src[0] = brw_imm_ud(0); /* desc */
1380
1381 enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
1382 switch (surf_type) {
1383 case LSC_ADDR_SURFTYPE_BSS:
1384 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1385 /* fall-through */
1386 case LSC_ADDR_SURFTYPE_SS:
1387 assert(surface.file != BAD_FILE);
1388 /* We assume that the driver provided the handle in the top 20 bits so
1389 * we can use the surface handle directly as the extended descriptor.
1390 */
1391 inst->src[1] = retype(surface, BRW_TYPE_UD);
1392 /* Gfx20+ assumes ExBSO with UGM */
1393 if (devinfo->ver >= 20 && inst->sfid == GFX12_SFID_UGM)
1394 inst->send_ex_bso = true;
1395 break;
1396
1397 case LSC_ADDR_SURFTYPE_BTI:
1398 assert(surface.file != BAD_FILE);
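/* The binding table index lives in the top byte (bits [31:24]) of the LSC
 * extended descriptor: lsc_bti_ex_desc() packs it for immediate surfaces
 * and the SHL by 24 does the same for a dynamically indexed one.
 */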
1399 if (surface.file == IMM) {
1400 inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
1401 } else {
1402 const brw_builder ubld = bld.exec_all().group(1, 0);
1403 brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1404 ubld.SHL(tmp, surface, brw_imm_ud(24));
1405 inst->src[1] = component(tmp, 0);
1406 }
1407 break;
1408
1409 case LSC_ADDR_SURFTYPE_FLAT:
1410 inst->src[1] = brw_imm_ud(0);
1411 break;
1412
1413 default:
1414 unreachable("Invalid LSC surface address type");
1415 }
1416 }
1417
1418 static enum lsc_addr_size
1419 lsc_addr_size_for_type(enum brw_reg_type type)
1420 {
1421 switch (brw_type_size_bytes(type)) {
1422 case 2: return LSC_ADDR_SIZE_A16;
1423 case 4: return LSC_ADDR_SIZE_A32;
1424 case 8: return LSC_ADDR_SIZE_A64;
1425 default: unreachable("invalid type size");
1426 }
1427 }
1428
1429 static void
1430 lower_lsc_memory_logical_send(const brw_builder &bld, fs_inst *inst)
1431 {
1432 const intel_device_info *devinfo = bld.shader->devinfo;
1433 assert(devinfo->has_lsc);
1434
1435 assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1436 assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1437 assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1438 assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1439 assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1440 assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1441
1442 /* Get the logical send arguments. */
1443 const enum lsc_opcode op = (lsc_opcode) inst->src[MEMORY_LOGICAL_OPCODE].ud;
1444 const enum memory_logical_mode mode =
1445 (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1446 const enum lsc_addr_surface_type binding_type =
1447 (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1448 const brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1449 const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1450 const unsigned coord_components =
1451 inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1452 enum lsc_data_size data_size =
1453 (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1454 const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1455 const enum memory_flags flags =
1456 (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1457 const bool transpose = flags & MEMORY_FLAG_TRANSPOSE;
1458 const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1459 const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1460 const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1461 const bool has_side_effects = inst->has_side_effects();
1462
1463 const uint32_t data_size_B = lsc_data_size_bytes(data_size);
1464 const enum brw_reg_type data_type =
1465 brw_type_with_size(data0.type, data_size_B * 8);
1466
1467 const enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1468
1469 brw_reg payload = addr;
1470
1471 if (addr.file != VGRF || !addr.is_contiguous()) {
1472 if (inst->force_writemask_all) {
1473 const brw_builder dbld = bld.group(bld.shader->dispatch_width, 0);
1474 payload = dbld.move_to_vgrf(addr, coord_components);
1475 } else {
1476 payload = bld.move_to_vgrf(addr, coord_components);
1477 }
1478 }
1479
1480 unsigned ex_mlen = 0;
1481 brw_reg payload2;
1482 if (data0.file != BAD_FILE) {
1483 if (transpose) {
1484 assert(data1.file == BAD_FILE);
1485
1486 payload2 = data0;
1487 ex_mlen = DIV_ROUND_UP(components, 8);
1488 } else {
1489 brw_reg data[8];
1490 unsigned size = 0;
1491
1492 assert(components < 8);
1493
1494 for (unsigned i = 0; i < components; i++)
1495 data[size++] = offset(data0, bld, i);
1496
1497 if (data1.file != BAD_FILE) {
1498 for (unsigned i = 0; i < components; i++)
1499 data[size++] = offset(data1, bld, i);
1500 }
1501
1502 payload2 = bld.vgrf(data0.type, size);
1503 bld.LOAD_PAYLOAD(payload2, data, size, 0);
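/* ex_mlen is the data payload size in GRFs, e.g. a single 32-bit component
 * written at SIMD16 gives (1 * 4 * 16) / 32 = 2 registers.
 */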
1504 ex_mlen = (size * brw_type_size_bytes(data_type) * inst->exec_size) / REG_SIZE;
1505 }
1506 }
1507
1508 /* Bspec: Atomic instruction -> Cache section:
1509 *
1510 * Atomic messages are always forced to "un-cacheable" in the L1
1511 * cache.
1512 */
1513 unsigned cache_mode =
1514 lsc_opcode_is_atomic(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
1515 lsc_opcode_is_store(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) :
1516 (unsigned) LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS);
1517
1518 /* If we're a fragment shader, we have to predicate with the sample mask to
1519 * avoid helper invocations in instructions with side effects, unless they
1520 * are explicitly required. One exception is for scratch writes - even
1521 * though those have side effects, they represent operations that didn't
1522 * originally have any. We want to avoid accessing undefined values from
1523 * scratch, so we disable helper invocations entirely there.
1524 *
1525 * There are also special cases when we actually want to run on helpers
1526 * (ray queries).
1527 */
1528 if (bld.shader->stage == MESA_SHADER_FRAGMENT && !transpose) {
1529 if (include_helpers)
1530 emit_predicate_on_vector_mask(bld, inst);
1531 else if (has_side_effects && mode != MEMORY_MODE_SCRATCH)
1532 brw_emit_predicate_on_sample_mask(bld, inst);
1533 }
1534
1535 switch (mode) {
1536 case MEMORY_MODE_UNTYPED:
1537 case MEMORY_MODE_CONSTANT:
1538 case MEMORY_MODE_SCRATCH:
1539 inst->sfid = GFX12_SFID_UGM;
1540 break;
1541 case MEMORY_MODE_TYPED:
1542 inst->sfid = GFX12_SFID_TGM;
1543 break;
1544 case MEMORY_MODE_SHARED_LOCAL:
1545 inst->sfid = GFX12_SFID_SLM;
1546 break;
1547 }
1548 assert(inst->sfid);
1549
1550 /* Disable LSC data port L1 cache scheme for the TGM load/store for RT
1551 * shaders. (see HSD 18038444588)
1552 */
1553 if (devinfo->ver >= 20 && gl_shader_stage_is_rt(bld.shader->stage) &&
1554 inst->sfid == GFX12_SFID_TGM &&
1555 !lsc_opcode_is_atomic(op)) {
1556 if (lsc_opcode_is_store(op)) {
1557 cache_mode = (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB);
1558 } else {
1559 cache_mode = (unsigned) LSC_CACHE(devinfo, LOAD, L1UC_L3C);
1560 }
1561 }
1562
1563 inst->desc = lsc_msg_desc(devinfo, op, binding_type, addr_size, data_size,
1564 lsc_opcode_has_cmask(op) ?
1565 (1 << components) - 1 : components,
1566 transpose, cache_mode);
1567
1568 /* Set up extended descriptors, fills src[0] and src[1]. */
1569 setup_lsc_surface_descriptors(bld, inst, inst->desc, binding);
1570
1571 inst->opcode = SHADER_OPCODE_SEND;
1572 inst->mlen = lsc_msg_addr_len(devinfo, addr_size,
1573 inst->exec_size * coord_components);
1574 inst->ex_mlen = ex_mlen;
1575 inst->header_size = 0;
1576 inst->send_has_side_effects = has_side_effects;
1577 inst->send_is_volatile = !has_side_effects;
1578
1579 inst->resize_sources(4);
1580
1581 /* Finally, the payload */
1582 inst->src[2] = payload;
1583 inst->src[3] = payload2;
1584 }
1585
1586 static brw_reg
1587 emit_a64_oword_block_header(const brw_builder &bld, const brw_reg &addr)
1588 {
1589 const brw_builder ubld = bld.exec_all().group(8, 0);
1590
1591 assert(brw_type_size_bytes(addr.type) == 8 && addr.stride == 0);
1592
1593 brw_reg expanded_addr = addr;
1594 if (addr.file == UNIFORM) {
1595 /* We can't do stride 1 with the UNIFORM file; it requires stride 0 */
1596 brw_builder ubld1 = ubld.group(1, 0);
1597
1598 brw_reg tmp = ubld1.vgrf(BRW_TYPE_UQ);
1599 ubld1.UNDEF(tmp);
1600
1601 expanded_addr = component(tmp, 0);
1602 ubld1.MOV(expanded_addr, retype(addr, BRW_TYPE_UQ));
1603 }
1604
1605 brw_reg header = ubld.vgrf(BRW_TYPE_UD);
1606 ubld.MOV(header, brw_imm_ud(0));
1607
1608 /* Use a 2-wide MOV to fill out the address */
1609 brw_reg addr_vec2 = expanded_addr;
1610 addr_vec2.type = BRW_TYPE_UD;
1611 addr_vec2.stride = 1;
1612 ubld.group(2, 0).MOV(header, addr_vec2);
1613
1614 return header;
1615 }
1616
1617 static void
1618 lower_hdc_memory_logical_send(const brw_builder &bld, fs_inst *inst)
1619 {
1620 const intel_device_info *devinfo = bld.shader->devinfo;
1621 const brw_compiler *compiler = bld.shader->compiler;
1622
1623 assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1624 assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1625 assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1626 assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1627 assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1628 assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1629
1630 /* Get the logical send arguments. */
1631 const enum lsc_opcode op = (lsc_opcode)inst->src[MEMORY_LOGICAL_OPCODE].ud;
1632 const enum memory_logical_mode mode =
1633 (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1634 enum lsc_addr_surface_type binding_type =
1635 (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1636 brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1637 const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1638 const unsigned coord_components =
1639 inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1640 const unsigned alignment = inst->src[MEMORY_LOGICAL_ALIGNMENT].ud;
1641 const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1642 const enum memory_flags flags =
1643 (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1644 const bool block = flags & MEMORY_FLAG_TRANSPOSE;
1645 const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1646 const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1647 const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1648 const bool has_side_effects = inst->has_side_effects();
1649 const bool has_dest = inst->dst.file != BAD_FILE && !inst->dst.is_null();
1650
1651 /* Don't predicate scratch writes on the sample mask. Otherwise,
1652 * FS helper invocations would load undefined values from scratch memory.
1653 * Scratch loads and stores are produced from operations without side
1654 * effects, so they should not behave differently in helper
1655 * invocations.
1656 */
1657 bool allow_sample_mask = has_side_effects && mode != MEMORY_MODE_SCRATCH;
1658
1659 const enum lsc_data_size data_size =
1660 (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1661
1662 /* unpadded data size */
1663 const uint32_t data_bit_size =
1664 data_size == LSC_DATA_SIZE_D8U32 ? 8 :
1665 data_size == LSC_DATA_SIZE_D16U32 ? 16 :
1666 8 * lsc_data_size_bytes(data_size);
1667
1668 const bool byte_scattered =
1669 data_bit_size < 32 || (alignment != 0 && alignment < 4);
1670 const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH;
1671 const bool surface_access = !byte_scattered && !dword_scattered && !block;
1672
1673 /* SLM block reads must use the 16B-aligned OWord Block Read messages,
1674 * as the unaligned message doesn't exist for SLM.
1675 */
1676 const bool oword_aligned = block && mode == MEMORY_MODE_SHARED_LOCAL;
1677 assert(!oword_aligned || (alignment % 16) == 0);
1678 assert(!block || (alignment % 4) == 0);
1679
1680 enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1681 unsigned addr_size_B = coord_components * lsc_addr_size_bytes(addr_size);
1682
1683 brw_reg header;
1684 brw_builder ubld8 = bld.exec_all().group(8, 0);
1685 brw_builder ubld1 = ubld8.group(1, 0);
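   /* Scratch and block messages need a message header: scratch uses the
    * dedicated scratch header built from r0, while block messages carry the
    * offset in DWord 2 of the header (converted to OWords for the
    * 16B-aligned variant) or use a 64-bit address header in the A64 case.
    */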
1686 if (mode == MEMORY_MODE_SCRATCH) {
1687 header = ubld8.vgrf(BRW_TYPE_UD);
1688 ubld8.emit(SHADER_OPCODE_SCRATCH_HEADER, header, brw_ud8_grf(0, 0));
1689 } else if (block) {
1690 if (addr_size == LSC_ADDR_SIZE_A64) {
1691 header = emit_a64_oword_block_header(bld, addr);
1692 } else {
1693 header = ubld8.vgrf(BRW_TYPE_UD);
1694 ubld8.MOV(header, brw_imm_ud(0));
1695 if (oword_aligned)
1696 ubld1.SHR(component(header, 2), addr, brw_imm_ud(4));
1697 else
1698 ubld1.MOV(component(header, 2), addr);
1699 }
1700 }
1701
1702 /* If we're a fragment shader, we have to predicate with the sample mask
1703 * to avoid helper invocations in instructions with side effects, unless
1704 * they are explicitly required.
1705 *
1706 * There are also special cases when we actually want to run on helpers
1707 * (ray queries).
1708 */
1709 if (bld.shader->stage == MESA_SHADER_FRAGMENT) {
1710 if (include_helpers)
1711 emit_predicate_on_vector_mask(bld, inst);
1712 else if (allow_sample_mask &&
1713 (header.file == BAD_FILE || !surface_access))
1714 brw_emit_predicate_on_sample_mask(bld, inst);
1715 }
1716
1717 brw_reg payload, payload2;
1718 unsigned mlen, ex_mlen = 0;
1719
1720 if (!block) {
1721 brw_reg data[11];
1722 unsigned num_sources = 0;
1723 if (header.file != BAD_FILE)
1724 data[num_sources++] = header;
1725
1726 for (unsigned i = 0; i < coord_components; i++)
1727 data[num_sources++] = offset(addr, bld, i);
1728
1729 if (data0.file != BAD_FILE) {
1730 for (unsigned i = 0; i < components; i++)
1731 data[num_sources++] = offset(data0, bld, i);
1732 if (data1.file != BAD_FILE) {
1733 for (unsigned i = 0; i < components; i++)
1734 data[num_sources++] = offset(data1, bld, i);
1735 }
1736 }
1737
1738 assert(num_sources <= ARRAY_SIZE(data));
1739
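      /* The payload is laid out as: optional header, then the address
       * components, then the data values (if any), with each source padded
       * out to register granularity by emit_load_payload_with_padding().
       */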
1740 unsigned payload_size_UDs = (header.file != BAD_FILE ? 1 : 0) +
1741 (addr_size_B / 4) +
1742 (lsc_op_num_data_values(op) * components *
1743 lsc_data_size_bytes(data_size) / 4);
1744
1745 payload = bld.vgrf(BRW_TYPE_UD, payload_size_UDs);
1746 fs_inst *load_payload =
1747 emit_load_payload_with_padding(bld, payload, data, num_sources,
1748 header.file != BAD_FILE ? 1 : 0,
1749 REG_SIZE);
1750 mlen = load_payload->size_written / REG_SIZE;
1751 } else {
1752 assert(data1.file == BAD_FILE);
1753
1754 payload = header;
1755 mlen = 1;
1756
1757 if (data0.file != BAD_FILE) {
1758 payload2 = bld.move_to_vgrf(data0, components);
1759 ex_mlen = components * sizeof(uint32_t) / REG_SIZE;
1760 }
1761 }
1762
1763
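   /* SLM and scratch have no user-provided surface on the legacy data port;
    * they are addressed through fixed binding table indices instead.
    */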
1764 if (mode == MEMORY_MODE_SHARED_LOCAL) {
1765 binding_type = LSC_ADDR_SURFTYPE_BTI;
1766 binding = brw_imm_ud(GFX7_BTI_SLM);
1767 } else if (mode == MEMORY_MODE_SCRATCH) {
1768 binding_type = LSC_ADDR_SURFTYPE_BTI;
1769 binding = brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
1770 }
1771
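   /* Select the legacy SFID and message descriptor: typed and A64 untyped
    * accesses go through Data Cache 1, constant block loads use the constant
    * cache, and A32 untyped accesses use Data Cache 1 for untyped
    * surface/atomic messages or Data Cache 0 for byte/dword scattered ones.
    */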
1772 uint32_t sfid, desc;
1773 if (mode == MEMORY_MODE_TYPED) {
1774 assert(addr_size == LSC_ADDR_SIZE_A32);
1775 assert(!block);
1776
1777 sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1778
1779 if (lsc_opcode_is_atomic(op)) {
1780 desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1781 lsc_op_to_legacy_atomic(op),
1782 has_dest);
1783 } else {
1784 desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size,
1785 inst->group, components, !has_dest);
1786 }
1787 } else if (mode == MEMORY_MODE_CONSTANT) {
1788 assert(block); /* non-block loads not yet handled */
1789
1790 sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
1791 desc = brw_dp_oword_block_rw_desc(devinfo, false, components, !has_dest);
1792 } else if (addr_size == LSC_ADDR_SIZE_A64) {
1793 assert(binding_type == LSC_ADDR_SURFTYPE_FLAT);
1794 assert(!dword_scattered);
1795
1796 sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1797
1798 if (lsc_opcode_is_atomic(op)) {
1799 unsigned aop = lsc_op_to_legacy_atomic(op);
1800 if (lsc_opcode_is_atomic_float(op)) {
1801 desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
1802 data_bit_size, aop,
1803 has_dest);
1804 } else {
1805 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
1806 data_bit_size, aop,
1807 has_dest);
1808 }
1809 } else if (block) {
1810 desc = brw_dp_a64_oword_block_rw_desc(devinfo, oword_aligned,
1811 components, !has_dest);
1812 } else if (byte_scattered) {
1813 desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
1814 data_bit_size, !has_dest);
1815 } else {
1816 desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
1817 components, !has_dest);
1818 }
1819 } else {
1820 assert(binding_type != LSC_ADDR_SURFTYPE_FLAT);
1821
1822 sfid = surface_access ? HSW_SFID_DATAPORT_DATA_CACHE_1
1823 : GFX7_SFID_DATAPORT_DATA_CACHE;
1824
1825 if (lsc_opcode_is_atomic(op)) {
1826 unsigned aop = lsc_op_to_legacy_atomic(op);
1827 if (lsc_opcode_is_atomic_float(op)) {
1828 desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
1829 aop, has_dest);
1830 } else {
1831 desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
1832 aop, has_dest);
1833 }
1834 } else if (block) {
1835 desc = brw_dp_oword_block_rw_desc(devinfo, oword_aligned,
1836 components, !has_dest);
1837 } else if (byte_scattered) {
1838 desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1839 data_bit_size, !has_dest);
1840 } else if (dword_scattered) {
1841 desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1842 !has_dest);
1843 } else {
1844 desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1845 components, !has_dest);
1846 }
1847 }
1848
1849 assert(sfid);
1850
1851 /* Update the original instruction. */
1852 inst->opcode = SHADER_OPCODE_SEND;
1853 inst->sfid = sfid;
1854 inst->mlen = mlen;
1855 inst->ex_mlen = ex_mlen;
1856 inst->header_size = header.file != BAD_FILE ? 1 : 0;
1857 inst->send_has_side_effects = has_side_effects;
1858 inst->send_is_volatile = !has_side_effects;
1859
1860 if (block) {
1861 assert(inst->force_writemask_all);
1862 inst->exec_size = components > 8 ? 16 : 8;
1863 }
1864
1865 inst->resize_sources(4);
1866
1867 /* Set up descriptors */
1868 switch (binding_type) {
1869 case LSC_ADDR_SURFTYPE_FLAT:
1870 inst->src[0] = brw_imm_ud(0);
1871 inst->src[1] = brw_imm_ud(0);
1872 break;
1873 case LSC_ADDR_SURFTYPE_BSS:
1874 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1875 /* fall-through */
1876 case LSC_ADDR_SURFTYPE_SS:
1877 desc |= GFX9_BTI_BINDLESS;
1878
1879 /* We assume that the driver provided the handle in the top 20 bits so
1880 * we can use the surface handle directly as the extended descriptor.
1881 */
1882 inst->src[0] = brw_imm_ud(0);
1883 inst->src[1] = binding;
1884 break;
1885 case LSC_ADDR_SURFTYPE_BTI:
1886 if (binding.file == IMM) {
1887 desc |= binding.ud & 0xff;
1888 inst->src[0] = brw_imm_ud(0);
1889 inst->src[1] = brw_imm_ud(0);
1890 } else {
1891 brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
1892 ubld1.AND(tmp, binding, brw_imm_ud(0xff));
1893 inst->src[0] = component(tmp, 0);
1894 inst->src[1] = brw_imm_ud(0);
1895 }
1896 break;
1897 default:
1898 unreachable("Unknown surface type");
1899 }
1900
1901 inst->desc = desc;
1902
1903 /* Finally, the payloads */
1904 inst->src[2] = payload;
1905 inst->src[3] = payload2;
1906 }
1907
1908 static void
1909 lower_lsc_varying_pull_constant_logical_send(const brw_builder &bld,
1910 fs_inst *inst)
1911 {
1912 const intel_device_info *devinfo = bld.shader->devinfo;
1913 ASSERTED const brw_compiler *compiler = bld.shader->compiler;
1914
1915 brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1916 brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1917 brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1918 brw_reg alignment_B = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];
1919
1920 /* We are switching from an ALU-like instruction to a send-from-grf
1921 * instruction. Since sends can't handle strides or source modifiers,
1922 * we have to make a copy of the offset source.
1923 */
1924 brw_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);
1925
1926 enum lsc_addr_surface_type surf_type =
1927 surface_handle.file == BAD_FILE ?
1928 LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;
1929
1930 assert(alignment_B.file == IMM);
1931 unsigned alignment = alignment_B.ud;
1932
1933 inst->opcode = SHADER_OPCODE_SEND;
1934 inst->sfid = GFX12_SFID_UGM;
1935 inst->resize_sources(3);
1936 inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
1937 compiler->extended_bindless_surface_offset;
1938
1939 assert(!compiler->indirect_ubos_use_sampler);
1940
1941 inst->src[0] = brw_imm_ud(0);
1942 inst->src[2] = ubo_offset; /* payload */
1943
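   /* With at least dword alignment, a single 4-channel D32 load fetches the
    * whole vec4; otherwise fall back to four single-channel loads (see the
    * loop below).
    */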
1944 if (alignment >= 4) {
1945 inst->desc =
1946 lsc_msg_desc(devinfo, LSC_OP_LOAD,
1947 surf_type, LSC_ADDR_SIZE_A32,
1948 LSC_DATA_SIZE_D32,
1949 4 /* num_channels */,
1950 false /* transpose */,
1951 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1952 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1953
1954 setup_lsc_surface_descriptors(bld, inst, inst->desc,
1955 surface.file != BAD_FILE ?
1956 surface : surface_handle);
1957 } else {
1958 inst->desc =
1959 lsc_msg_desc(devinfo, LSC_OP_LOAD,
1960 surf_type, LSC_ADDR_SIZE_A32,
1961 LSC_DATA_SIZE_D32,
1962 1 /* num_channels */,
1963 false /* transpose */,
1964 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1965 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1966
1967 setup_lsc_surface_descriptors(bld, inst, inst->desc,
1968 surface.file != BAD_FILE ?
1969 surface : surface_handle);
1970
1971 /* The byte scattered messages can only read one dword at a time so
1972 * we have to duplicate the message 4 times to read the full vec4.
1973 * Hopefully, dead-code elimination will clean up the mess if some of
1974 * them aren't needed.
1975 */
1976 assert(inst->size_written == 16 * inst->exec_size);
1977 inst->size_written /= 4;
1978 for (unsigned c = 1; c < 4; c++) {
1979 /* Emit a copy of the instruction because we're about to modify
1980 * it. Because this loop starts at 1, we will emit copies for the
1981 * first 3 and the final one will be the modified instruction.
1982 */
1983 bld.emit(*inst);
1984
1985 /* Offset the source */
1986 inst->src[2] = bld.vgrf(BRW_TYPE_UD);
1987 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
1988
1989 /* Offset the destination */
1990 inst->dst = offset(inst->dst, bld, 1);
1991 }
1992 }
1993 }
1994
1995 static void
1996 lower_varying_pull_constant_logical_send(const brw_builder &bld, fs_inst *inst)
1997 {
1998 const intel_device_info *devinfo = bld.shader->devinfo;
1999 const brw_compiler *compiler = bld.shader->compiler;
2000
2001 brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
2002 brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
2003 brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
2004
2005 /* We are switching from an ALU-like instruction to a send-from-grf
2006 * instruction. Since sends can't handle strides or source modifiers,
2007 * we have to make a copy of the offset source.
2008 */
2009 brw_reg ubo_offset = bld.vgrf(BRW_TYPE_UD);
2010 bld.MOV(ubo_offset, offset_B);
2011
2012 assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == IMM);
2013 unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;
2014
2015 inst->opcode = SHADER_OPCODE_SEND;
2016 inst->mlen = inst->exec_size / 8;
2017 inst->resize_sources(3);
2018
2019 /* src[0] & src[1] are filled by setup_surface_descriptors() */
2020 inst->src[2] = ubo_offset; /* payload */
2021
2022 if (compiler->indirect_ubos_use_sampler) {
2023 const unsigned simd_mode =
2024 inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
2025 BRW_SAMPLER_SIMD_MODE_SIMD16;
2026 const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2027 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
2028 simd_mode, 0);
2029
2030 inst->sfid = BRW_SFID_SAMPLER;
2031 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2032 } else if (alignment >= 4) {
2033 const uint32_t desc =
2034 brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
2035 4, /* num_channels */
2036 false /* write */);
2037
2038 inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2039 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2040 } else {
2041 const uint32_t desc =
2042 brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
2043 32, /* bit_size */
2044 false /* write */);
2045
2046 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2047 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2048
2049 /* The byte scattered messages can only read one dword at a time so
2050 * we have to duplicate the message 4 times to read the full vec4.
2051 * Hopefully, dead-code elimination will clean up the mess if some of
2052 * them aren't needed.
2053 */
2054 assert(inst->size_written == 16 * inst->exec_size);
2055 inst->size_written /= 4;
2056 for (unsigned c = 1; c < 4; c++) {
2057 /* Emit a copy of the instruction because we're about to modify
2058 * it. Because this loop starts at 1, we will emit copies for the
2059 * first 3 and the final one will be the modified instruction.
2060 */
2061 bld.emit(*inst);
2062
2063 /* Offset the source */
2064 inst->src[2] = bld.vgrf(BRW_TYPE_UD);
2065 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
2066
2067 /* Offset the destination */
2068 inst->dst = offset(inst->dst, bld, 1);
2069 }
2070 }
2071 }
2072
2073 static void
2074 lower_interpolator_logical_send(const brw_builder &bld, fs_inst *inst,
2075 const struct brw_wm_prog_key *wm_prog_key,
2076 const struct brw_wm_prog_data *wm_prog_data)
2077 {
2078 const intel_device_info *devinfo = bld.shader->devinfo;
2079
2080 /* We have to send something */
2081 brw_reg payload = brw_vec8_grf(0, 0);
2082 unsigned mlen = 1;
2083
2084 unsigned mode;
2085 switch (inst->opcode) {
2086 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2087 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2088 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
2089 break;
2090
2091 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2092 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2093 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
2094 break;
2095
2096 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2097 payload = inst->src[INTERP_SRC_OFFSET];
2098 mlen = 2 * inst->exec_size / 8;
2099 mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
2100 break;
2101
2102 default:
2103 unreachable("Invalid interpolator instruction");
2104 }
2105
2106 const bool dynamic_mode =
2107 inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
2108
2109 brw_reg desc = inst->src[INTERP_SRC_MSG_DESC];
2110 uint32_t desc_imm =
2111 brw_pixel_interp_desc(devinfo,
2112 /* Leave the mode at 0 if persample_dispatch is
2113 * dynamic; it will be ORed in below.
2114 */
2115 dynamic_mode ? 0 : mode,
2116 inst->pi_noperspective,
2117 false /* coarse_pixel_rate */,
2118 inst->exec_size, inst->group);
2119
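   /* Coarse pixel dispatch is signalled by bit 15 of the descriptor. When
    * the state is only known at run time, OR in the corresponding dynamic
    * MSAA flag instead.
    */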
2120 if (wm_prog_data->coarse_pixel_dispatch == INTEL_ALWAYS) {
2121 desc_imm |= (1 << 15);
2122 } else if (wm_prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES) {
2123 STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
2124 brw_reg orig_desc = desc;
2125 const brw_builder &ubld = bld.exec_all().group(8, 0);
2126 desc = ubld.vgrf(BRW_TYPE_UD);
2127 ubld.AND(desc, brw_dynamic_msaa_flags(wm_prog_data),
2128 brw_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));
2129
2130 /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
2131 if (orig_desc.file == IMM) {
2132 desc_imm |= orig_desc.ud;
2133 } else {
2134 ubld.OR(desc, desc, orig_desc);
2135 }
2136 }
2137
2138 /* If persample_dispatch is dynamic, select the interpolation mode
2139 * dynamically and OR into the descriptor to complete the static part
2140 * generated by brw_pixel_interp_desc().
2141 *
2142 * Why does this work? If you look at the SKL PRMs, Volume 7:
2143 * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
2144 *
2145 * - "Per Message Offset" Message Descriptor
2146 * - "Sample Position Offset" Message Descriptor
2147 *
2148 * have different formats. Fortunately, a fragment shader dispatched at
2149 * pixel rate will have gl_SampleID = 0 and gl_NumSamples = 1. So the value
2150 * we pack in "Sample Position Offset" will be 0 and will cover the X/Y
2151 * components of "Per Message Offset", which gives us the pixel offset 0x0.
2152 */
2153 if (dynamic_mode) {
2154 brw_reg orig_desc = desc;
2155 const brw_builder &ubld = bld.exec_all().group(8, 0);
2156 desc = ubld.vgrf(BRW_TYPE_UD);
2157
2158 /* The predicate should have been built in brw_fs_nir.cpp when emitting
2159 * NIR code. This guarantees that we do not have incorrect interactions
2160 * with the flag register holding the predication result.
2161 */
2162 if (orig_desc.file == IMM) {
2163 /* Not using SEL here because we would generate an instruction with 2
2164 * immediate sources which is not supported by HW.
2165 */
2166 set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2167 ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2168 GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2169 set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2170 ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2171 GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2172 } else {
2173 set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2174 ubld.OR(desc, orig_desc,
2175 brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2176 set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2177 ubld.OR(desc, orig_desc,
2178 brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2179 }
2180 }
2181
2182 inst->opcode = SHADER_OPCODE_SEND;
2183 inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
2184 inst->desc = desc_imm;
2185 inst->ex_desc = 0;
2186 inst->mlen = mlen;
2187 inst->ex_mlen = 0;
2188 inst->send_has_side_effects = false;
2189 inst->send_is_volatile = false;
2190
2191 inst->resize_sources(3);
2192 inst->src[0] = component(desc, 0);
2193 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2194 inst->src[2] = payload;
2195 }
2196
2197 static void
2198 lower_btd_logical_send(const brw_builder &bld, fs_inst *inst)
2199 {
2200 const intel_device_info *devinfo = bld.shader->devinfo;
2201 brw_reg global_addr = inst->src[0];
2202 const brw_reg btd_record = inst->src[1];
2203
2204 const unsigned unit = reg_unit(devinfo);
2205 const unsigned mlen = 2 * unit;
2206 const brw_builder ubld = bld.exec_all();
2207 brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2 * unit);
2208
2209 ubld.MOV(header, brw_imm_ud(0));
2210 switch (inst->opcode) {
2211 case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2212 assert(brw_type_size_bytes(global_addr.type) == 8 &&
2213 global_addr.stride == 0);
2214 global_addr.type = BRW_TYPE_UD;
2215 global_addr.stride = 1;
2216 ubld.group(2, 0).MOV(header, global_addr);
2217
2218 /* XXX - There is a Registers Per Thread field in the BTD spawn
2219 * header starting on Xe3, it doesn't appear to be needed
2220 * by the hardware so we don't set it. If it's ever
2221 * needed though we will need some sort of reloc since
2222 * we'll have to initialize it based on the prog_data
2223 * structure of the callee.
2224 */
2225 break;
2226
2227 case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2228 /* The bottom bit is the Stack ID release bit */
2229 ubld.group(1, 0).MOV(header, brw_imm_ud(1));
2230 break;
2231
2232 default:
2233 unreachable("Invalid BTD message");
2234 }
2235
2236 /* Stack IDs are always in R1 regardless of whether we're coming from a
2237 * bindless shader or a regular compute shader.
2238 */
2239 brw_reg stack_ids = retype(offset(header, bld, 1), BRW_TYPE_UW);
2240 bld.exec_all().MOV(stack_ids, retype(brw_vec8_grf(1 * unit, 0),
2241 BRW_TYPE_UW));
2242
2243 unsigned ex_mlen = 0;
2244 brw_reg payload;
2245 if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
2246 ex_mlen = 2 * (inst->exec_size / 8);
2247 payload = bld.move_to_vgrf(btd_record, 1);
2248 } else {
2249 assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
2250 /* All these messages take a BTD and things complain if we don't provide
2251 * one for RETIRE. However, it shouldn't ever actually get used so fill
2252 * it with zero.
2253 */
2254 ex_mlen = 2 * (inst->exec_size / 8);
2255 payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
2256 }
2257
2258 /* Update the original instruction. */
2259 inst->opcode = SHADER_OPCODE_SEND;
2260 inst->mlen = mlen;
2261 inst->ex_mlen = ex_mlen;
2262 inst->header_size = 0; /* HW docs require has_header = false */
2263 inst->send_has_side_effects = true;
2264 inst->send_is_volatile = false;
2265
2266 /* Set up SFID and descriptors */
2267 inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
2268 inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
2269 GEN_RT_BTD_MESSAGE_SPAWN);
2270 inst->resize_sources(4);
2271 inst->src[0] = brw_imm_ud(0); /* desc */
2272 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2273 inst->src[2] = header;
2274 inst->src[3] = payload;
2275 }
2276
2277 static void
2278 lower_trace_ray_logical_send(const brw_builder &bld, fs_inst *inst)
2279 {
2280 const intel_device_info *devinfo = bld.shader->devinfo;
2281 /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
2282 * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use
2283 * UQ/Q types on Gfx12.5, we need to set the stride to 1 dword so that
2284 * the MOV operates on 2 components rather than twice on the same
2285 * component.
2286 */
2287 const brw_reg bvh_level =
2288 inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == IMM ?
2289 inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
2290 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
2291 inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
2292 const brw_reg trace_ray_control =
2293 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == IMM ?
2294 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
2295 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
2296 inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
2297 const brw_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
2298 assert(synchronous_src.file == IMM);
2299 const bool synchronous = synchronous_src.ud;
2300
2301 const unsigned unit = reg_unit(devinfo);
2302 const unsigned mlen = unit;
2303 const brw_builder ubld = bld.exec_all();
2304 brw_reg header = ubld.vgrf(BRW_TYPE_UD);
2305 ubld.MOV(header, brw_imm_ud(0));
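   /* Header layout: DWords 0-1 hold the 64-bit RT globals address and
    * DWord 4 holds the synchronous-traversal flag (set below).
    */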
2306
2307 const brw_reg globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
2308 if (globals_addr.file != UNIFORM) {
2309 brw_reg addr_ud = retype(globals_addr, BRW_TYPE_UD);
2310 addr_ud.stride = 1;
2311 ubld.group(2, 0).MOV(header, addr_ud);
2312 } else {
2313 /* If the globals address comes from a uniform, do not do the SIMD2
2314 * optimization. This occurs in many Vulkan CTS tests.
2315 *
2316 * Many places in the late compiler, including but not limited to an
2317 * assertion in fs_visitor::assign_curb_setup, assume that all uses of a
2318 * UNIFORM will be uniform (i.e., <0,1,0>). The clever SIMD2
2319 * optimization violates that assumption.
2320 */
2321 ubld.group(1, 0).MOV(byte_offset(header, 0),
2322 subscript(globals_addr, BRW_TYPE_UD, 0));
2323 ubld.group(1, 0).MOV(byte_offset(header, 4),
2324 subscript(globals_addr, BRW_TYPE_UD, 1));
2325 }
2326
2327 if (synchronous)
2328 ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
2329
2330 const unsigned ex_mlen = inst->exec_size / 8;
2331 brw_reg payload = bld.vgrf(BRW_TYPE_UD);
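   /* Payload DWord layout: bits 2:0 hold the BVH level and bits 9:8
    * (10:8 on Xe2+) hold the trace ray control value.
    */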
2332 if (bvh_level.file == IMM &&
2333 trace_ray_control.file == IMM) {
2334 uint32_t high = devinfo->ver >= 20 ? 10 : 9;
2335 bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, high, 8) |
2336 (bvh_level.ud & 0x7)));
2337 } else {
2338 bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
2339 bld.OR(payload, payload, bvh_level);
2340 }
2341
2342 /* When doing synchronous traversal, the HW implicitly computes the
2343 * stack_id using the following formula:
2344 *
2345 * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
2346 *
2347 * Only in the asynchronous case do we need to set the stack_id in the
2348 * payload register.
2349 */
2350 if (!synchronous) {
2351 bld.AND(subscript(payload, BRW_TYPE_UW, 1),
2352 retype(brw_vec8_grf(1 * unit, 0), BRW_TYPE_UW),
2353 brw_imm_uw(0x7ff));
2354 }
2355
2356 /* Update the original instruction. */
2357 inst->opcode = SHADER_OPCODE_SEND;
2358 inst->mlen = mlen;
2359 inst->ex_mlen = ex_mlen;
2360 inst->header_size = 0; /* HW docs require has_header = false */
2361 inst->send_has_side_effects = true;
2362 inst->send_is_volatile = false;
2363
2364 /* Set up SFID and descriptors */
2365 inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
2366 inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
2367 inst->resize_sources(4);
2368 inst->src[0] = brw_imm_ud(0); /* desc */
2369 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2370 inst->src[2] = header;
2371 inst->src[3] = payload;
2372 }
2373
2374 static void
2375 lower_get_buffer_size(const brw_builder &bld, fs_inst *inst)
2376 {
2377 const intel_device_info *devinfo = bld.shader->devinfo;
2378 /* Since we can only execute this instruction on uniform bti/surface
2379 * handles, brw_fs_nir.cpp should already have limited this to SIMD8 (SIMD16 on Xe2+).
2380 */
2381 assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
2382
2383 brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
2384 brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
2385 brw_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];
2386
2387 inst->opcode = SHADER_OPCODE_SEND;
2388 inst->mlen = inst->exec_size / 8;
2389 inst->resize_sources(3);
2390 inst->ex_mlen = 0;
2391 inst->ex_desc = 0;
2392
2393 /* src[0] & src[1] are filled by setup_surface_descriptors() */
2394 inst->src[2] = lod;
2395
2396 const uint32_t return_format = GFX8_SAMPLER_RETURN_FORMAT_32BITS;
2397
2398 const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2399 GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
2400 BRW_SAMPLER_SIMD_MODE_SIMD8,
2401 return_format);
2402
2403 inst->dst = retype(inst->dst, BRW_TYPE_UW);
2404 inst->sfid = BRW_SFID_SAMPLER;
2405 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2406 }
2407
2408 bool
2409 brw_lower_logical_sends(fs_visitor &s)
2410 {
2411 const intel_device_info *devinfo = s.devinfo;
2412 bool progress = false;
2413
2414 foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
2415 const brw_builder ibld(&s, block, inst);
2416
2417 switch (inst->opcode) {
2418 case FS_OPCODE_FB_WRITE_LOGICAL:
2419 assert(s.stage == MESA_SHADER_FRAGMENT);
2420 lower_fb_write_logical_send(ibld, inst,
2421 brw_wm_prog_data(s.prog_data),
2422 (const brw_wm_prog_key *)s.key,
2423 s.fs_payload());
2424 break;
2425
2426 case FS_OPCODE_FB_READ_LOGICAL:
2427 lower_fb_read_logical_send(ibld, inst, brw_wm_prog_data(s.prog_data));
2428 break;
2429
2430 case SHADER_OPCODE_TEX_LOGICAL:
2431 case SHADER_OPCODE_TXD_LOGICAL:
2432 case SHADER_OPCODE_TXF_LOGICAL:
2433 case SHADER_OPCODE_TXL_LOGICAL:
2434 case SHADER_OPCODE_TXS_LOGICAL:
2435 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
2436 case FS_OPCODE_TXB_LOGICAL:
2437 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
2438 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
2439 case SHADER_OPCODE_TXF_MCS_LOGICAL:
2440 case SHADER_OPCODE_LOD_LOGICAL:
2441 case SHADER_OPCODE_TG4_LOGICAL:
2442 case SHADER_OPCODE_TG4_BIAS_LOGICAL:
2443 case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
2444 case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
2445 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
2446 case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
2447 case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
2448 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
2449 lower_sampler_logical_send(ibld, inst);
2450 break;
2451
2452 case SHADER_OPCODE_GET_BUFFER_SIZE:
2453 lower_get_buffer_size(ibld, inst);
2454 break;
2455
2456 case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
2457 case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
2458 case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
2459 if (devinfo->ver >= 20 ||
2460 (devinfo->has_lsc &&
2461 inst->src[MEMORY_LOGICAL_MODE].ud != MEMORY_MODE_TYPED))
2462 lower_lsc_memory_logical_send(ibld, inst);
2463 else
2464 lower_hdc_memory_logical_send(ibld, inst);
2465 break;
2466
2467 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
2468 if (devinfo->has_lsc && !s.compiler->indirect_ubos_use_sampler)
2469 lower_lsc_varying_pull_constant_logical_send(ibld, inst);
2470 else
2471 lower_varying_pull_constant_logical_send(ibld, inst);
2472 break;
2473
2474 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2475 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2476 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2477 lower_interpolator_logical_send(ibld, inst,
2478 (const brw_wm_prog_key *)s.key,
2479 brw_wm_prog_data(s.prog_data));
2480 break;
2481
2482 case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2483 case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2484 lower_btd_logical_send(ibld, inst);
2485 break;
2486
2487 case RT_OPCODE_TRACE_RAY_LOGICAL:
2488 lower_trace_ray_logical_send(ibld, inst);
2489 break;
2490
2491 case SHADER_OPCODE_URB_READ_LOGICAL:
2492 if (devinfo->ver < 20)
2493 lower_urb_read_logical_send(ibld, inst);
2494 else
2495 lower_urb_read_logical_send_xe2(ibld, inst);
2496 break;
2497
2498 case SHADER_OPCODE_URB_WRITE_LOGICAL:
2499 if (devinfo->ver < 20)
2500 lower_urb_write_logical_send(ibld, inst);
2501 else
2502 lower_urb_write_logical_send_xe2(ibld, inst);
2503
2504 break;
2505
2506 default:
2507 continue;
2508 }
2509
2510 progress = true;
2511 }
2512
2513 if (progress)
2514 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2515
2516 return progress;
2517 }
2518
2519 /**
2520 * Turns the generic expression-style uniform pull constant load instruction
2521 * into a hardware-specific series of instructions for loading a pull
2522 * constant.
2523 *
2524 * The expression style allows the CSE pass before this to optimize out
2525 * repeated loads from the same offset, and gives the pre-register-allocation
2526 * scheduling full flexibility, while the conversion to native instructions
2527 * allows the post-register-allocation scheduler the best information
2528 * possible.
2529 *
2530 * Note that execution masking for setting up pull constant loads is special:
2531 * the channels that need to be written are unrelated to the current execution
2532 * mask, since a later instruction will use one of the result channels as a
2533 * source operand for all 8 or 16 of its channels.
2534 */
2535 bool
2536 brw_lower_uniform_pull_constant_loads(fs_visitor &s)
2537 {
2538 const intel_device_info *devinfo = s.devinfo;
2539 bool progress = false;
2540
2541 foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
2542 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2543 continue;
2544
2545 const brw_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
2546 const brw_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
2547 const brw_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
2548 const brw_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
2549 assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
2550 assert(offset_B.file == IMM);
2551 assert(size_B.file == IMM);
2552
2553 if (devinfo->has_lsc) {
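      /* With LSC, a uniform pull constant load is a single SIMD1 transposed
       * UGM load: one lane supplies the A32 offset and the message returns
       * size_written / 4 dwords.
       */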
2554 const brw_builder ubld =
2555 brw_builder(&s, block, inst).group(8, 0).exec_all();
2556
2557 const brw_reg payload = ubld.vgrf(BRW_TYPE_UD);
2558 ubld.MOV(payload, offset_B);
2559
2560 inst->sfid = GFX12_SFID_UGM;
2561 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
2562 surface_handle.file == BAD_FILE ?
2563 LSC_ADDR_SURFTYPE_BTI :
2564 LSC_ADDR_SURFTYPE_BSS,
2565 LSC_ADDR_SIZE_A32,
2566 LSC_DATA_SIZE_D32,
2567 inst->size_written / 4,
2568 true /* transpose */,
2569 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
2570
2571 /* Update the original instruction. */
2572 inst->opcode = SHADER_OPCODE_SEND;
2573 inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
2574 inst->send_ex_bso = surface_handle.file != BAD_FILE &&
2575 s.compiler->extended_bindless_surface_offset;
2576 inst->ex_mlen = 0;
2577 inst->header_size = 0;
2578 inst->send_has_side_effects = false;
2579 inst->send_is_volatile = true;
2580 inst->exec_size = 1;
2581
2582 /* Finally, the payload */
2583
2584 inst->resize_sources(3);
2585 setup_lsc_surface_descriptors(ubld, inst, inst->desc,
2586 surface.file != BAD_FILE ?
2587 surface : surface_handle);
2588 inst->src[2] = payload;
2589
2590 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2591 } else {
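      /* Without LSC, use an OWord block read from the constant cache, with
       * the OWord offset placed in DWord 2 of the r0-based header.
       */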
2592 const brw_builder ubld = brw_builder(&s, block, inst).exec_all();
2593 brw_reg header = brw_builder(&s, 8).exec_all().vgrf(BRW_TYPE_UD);
2594
2595 ubld.group(8, 0).MOV(header,
2596 retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
2597 ubld.group(1, 0).MOV(component(header, 2),
2598 brw_imm_ud(offset_B.ud / 16));
2599
2600 inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
2601 inst->opcode = SHADER_OPCODE_SEND;
2602 inst->header_size = 1;
2603 inst->mlen = 1;
2604
2605 uint32_t desc =
2606 brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
2607 size_B.ud / 4, false /* write */);
2608
2609 inst->resize_sources(4);
2610
2611 setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);
2612
2613 inst->src[2] = header;
2614 inst->src[3] = brw_reg(); /* unused for reads */
2615
2616 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2617 }
2618
2619 progress = true;
2620 }
2621
2622 return progress;
2623 }
2624
2625 bool
2626 brw_lower_send_descriptors(fs_visitor &s)
2627 {
2628 const intel_device_info *devinfo = s.devinfo;
2629 bool progress = false;
2630
2631 foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
2632 if (inst->opcode != SHADER_OPCODE_SEND &&
2633 inst->opcode != SHADER_OPCODE_SEND_GATHER)
2634 continue;
2635
2636 const brw_builder ubld = brw_builder(&s, block, inst).exec_all().group(1, 0);
2637
2638 /* Descriptor */
2639 const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
2640 unsigned mlen = inst->mlen;
2641 if (inst->opcode == SHADER_OPCODE_SEND_GATHER) {
2642 assert(inst->sources >= 3);
2643 mlen = (inst->sources - 3) * reg_unit(devinfo);
2644 }
2645
2646 uint32_t desc_imm = inst->desc |
2647 brw_message_desc(devinfo, mlen, rlen, inst->header_size);
2648
2649 assert(inst->src[0].file != BAD_FILE);
2650 assert(inst->src[1].file != BAD_FILE);
2651
2652 brw_reg desc = inst->src[0];
2653 if (desc.file == IMM) {
2654 inst->src[0] = brw_imm_ud(desc.ud | desc_imm);
2655 } else {
2656 brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
2657 BRW_ADDRESS_SUBREG_INDIRECT_DESC);
2658 ubld.OR(addr_reg, desc, brw_imm_ud(desc_imm));
2659 inst->src[0] = addr_reg;
2660 }
2661
2662 /* Extended descriptor */
2663 brw_reg ex_desc = inst->src[1];
2664 uint32_t ex_desc_imm = inst->ex_desc |
2665 brw_message_ex_desc(devinfo, inst->ex_mlen);
2666
2667 if (ex_desc.file == IMM)
2668 ex_desc_imm |= ex_desc.ud;
2669
2670 bool needs_addr_reg = false;
2671 if (ex_desc.file != IMM)
2672 needs_addr_reg = true;
2673 if (devinfo->ver < 12 && ex_desc.file == IMM &&
2674 (ex_desc_imm & INTEL_MASK(15, 12)) != 0)
2675 needs_addr_reg = true;
2676
2677 if (inst->send_ex_bso) {
2678 needs_addr_reg = true;
2679 /* When using the extended bindless offset, the whole extended
2680 * descriptor is the surface handle.
2681 */
2682 ex_desc_imm = 0;
2683 } else {
2684 if (needs_addr_reg)
2685 ex_desc_imm |= inst->sfid | inst->eot << 5;
2686 }
2687
2688 if (needs_addr_reg) {
2689 brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
2690 BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC);
2691 if (ex_desc.file == IMM)
2692 ubld.MOV(addr_reg, brw_imm_ud(ex_desc_imm));
2693 else if (ex_desc_imm == 0)
2694 ubld.MOV(addr_reg, ex_desc);
2695 else
2696 ubld.OR(addr_reg, ex_desc, brw_imm_ud(ex_desc_imm));
2697 inst->src[1] = addr_reg;
2698 } else {
2699 inst->src[1] = brw_imm_ud(ex_desc_imm);
2700 }
2701
2702 progress = true;
2703 s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2704 }
2705
2706 return progress;
2707 }
2708