/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file brw_lower_logical_sends.cpp
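 *
 * Lowers "logical" send-like virtual opcodes (URB reads/writes, framebuffer
 * writes/reads, sampler messages, and surface/LSC accesses) into the generic
 * SHADER_OPCODE_SEND form with explicit SFIDs, descriptors, and payloads.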
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

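/* Lower a logical URB read into a legacy SIMD8 URB read message, with the
 * URB handle (and optional per-slot offsets) passed as the message header.
 */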
static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   fs_reg payload_sources[2];
   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(header_size),
                           BRW_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->mlen = header_size;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

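/* Xe2+ variant: lower a logical URB read into an LSC load from a flat A32
 * address computed from the URB handle plus the constant and per-slot
 * offsets (both given in OWords).
 */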
static void
lower_urb_read_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
   assert(inst->header_size == 0);

   /* Get the logical send arguments. */
   const fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];

   /* Calculate the total number of components of the payload. */
   const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));

   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the read to this value, converted to bytes.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      fs_reg offsets_B = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.SHL(offsets_B, offsets, brw_imm_ud(4)); /* OWords -> Bytes */
      bld.ADD(payload, payload, offsets_B);
   }

   inst->sfid = BRW_SFID_URB;

   assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);

   inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             1 /* num_coordinates */,
                             LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, LOAD, L1UC_L3UC),
                             true /* has_dest */);

   /* Update the original instruction.  Like the non-LSC read path above,
    * this is a load: it is volatile but has no side effects.
    */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = 0;
   inst->header_size = 0;
   inst->send_has_side_effects = false;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

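/* Lower a logical URB write into a legacy SIMD8 URB write message, packing
 * the handle, the optional per-slot offsets and channel mask, and the data
 * into a single payload.
 */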
static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   const unsigned length = 1 + per_slot_present + channel_mask_present +
                           inst->components_read(URB_LOGICAL_SRC_DATA);

   fs_reg *payload_sources = new fs_reg[length];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(length),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < length; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);

   delete[] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = brw_null_reg();

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->mlen = length;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

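/* Xe2+ variant: lower a logical URB write into an LSC store (or store_cmask
 * when a channel mask is present) to a flat A32 address derived from the
 * URB handle.
 */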
static void
lower_urb_write_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
   const fs_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
      inst->src[URB_LOGICAL_SRC_DATA] : fs_reg(brw_imm_ud(0));
   assert(type_sz(src.type) == 4);

   /* Calculate the total number of components of the payload. */
   const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
   const unsigned src_sz = type_sz(src.type);

   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the write to this value, converted to bytes.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      fs_reg offsets_B = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.SHL(offsets_B, offsets, brw_imm_ud(4)); /* OWords -> Bytes */
      bld.ADD(payload, payload, offsets_B);
   }

   const fs_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
   unsigned mask = 0;

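   /* The logical channel-mask immediate is stored pre-shifted into bits
    * 16..31, the position the legacy URB write channel-mask payload DWord
    * uses, so shift it back down to recover the raw component mask for the
    * LSC store_cmask message.
    */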
   if (cmask.file != BAD_FILE) {
      assert(cmask.file == IMM);
      assert(cmask.type == BRW_REGISTER_TYPE_UD);
      mask = cmask.ud >> 16;
   }

   fs_reg payload2 = bld.move_to_vgrf(src, src_comps);
   const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;

   inst->sfid = BRW_SFID_URB;

   enum lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
   inst->desc = lsc_msg_desc_wcmask(devinfo, op, inst->exec_size,
                                    LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                                    1 /* num_coordinates */,
                                    LSC_DATA_SIZE_D32, src_comps /* num_channels */,
                                    false /* transpose */,
                                    LSC_CACHE(devinfo, STORE, L1UC_L3UC),
                                    false /* has_dest */, mask);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = payload2;
}

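/* Copy the per-component color sources into dst[], clamping them to [0, 1]
 * first when the key requires fragment color clamping.
 */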
static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    fs_reg *dst, fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
      assert(color.type == BRW_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

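/* Lower a logical framebuffer write into a render target write message,
 * assembling the optional header and the color/depth/stencil/oMask payload
 * in the layout the render cache data port expects.
 */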
static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_thread_payload &fs_payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const fs_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const fs_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const fs_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const fs_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const fs_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 11 &&
       (color1.file != BAD_FILE || key->nr_color_regions > 1)) {
      assert(devinfo->ver < 20);

      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       * "Dispatched Pixel Enables. One bit per pixel indicating
       * which pixels were originally enabled when the thread was
       * dispatched. This field is only required for the end-of-
       * thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                              BRW_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const fs_reg header_sources[2] = {
            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set the "Stencil Present to Render Target" bit when the shader
       * computes stencil.
       */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(brw_vec1_grf(0, 0),
                                    BRW_REGISTER_TYPE_UD),
                             brw_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     BRW_REGISTER_TYPE_UW),
                              brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (fs_payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              fs_reg(brw_vec8_grf(fs_payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                     .annotate("FB write src0 alpha");
         const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      const fs_reg tmp(VGRF, bld.shader->alloc.allocate(reg_unit(devinfo)),
                       BRW_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since the data is unsigned words, one VGRF is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write, depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = BRW_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(tmp, BRW_REGISTER_TYPE_UW),
                           inst->group % (16 * reg_unit(devinfo))),
              sample_mask);

      for (unsigned i = 0; i < reg_unit(devinfo); i++)
         sources[length++] = byte_offset(tmp, REG_SIZE * i);
   }

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(bld.dispatch_width() == 8 * reg_unit(devinfo));

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15 * reg_unit(devinfo));

      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
      length++;
   }

   /* Send from the GRF.  The payload VGRF number is allocated only after the
    * LOAD_PAYLOAD is emitted, once the number of registers it writes is
    * known.
    */
   fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
   fs_inst *load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
   payload.nr = bld.shader->alloc.allocate(regs_written(load));
   load->dst = payload;

   uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);

   inst->desc =
      (inst->group / 16) << 11 | /* rt slot group */
      brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                        0 /* coarse_rt_write */);

   fs_reg desc = brw_imm_ud(0);
   if (prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
      inst->desc |= (1 << 18);
   } else if (prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
      STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(desc, dynamic_msaa_flags(prog_data),
               brw_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
      desc = component(desc, 0);
   }

   uint32_t ex_desc = 0;
   if (devinfo->ver >= 11) {
      /* Set the "Render Target Index" and "Src0 Alpha Present" fields
       * in the extended message descriptor, in lieu of using a header.
       */
      ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;

      if (key->nr_color_regions == 0)
         ex_desc |= 1 << 20; /* Null Render Target */
   }
   inst->ex_desc = ex_desc;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->resize_sources(3);
   inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
   inst->src[0] = desc;
   inst->src[1] = brw_imm_ud(0);
   inst->src[2] = payload;
   inst->mlen = regs_written(load);
   inst->ex_mlen = 0;
   inst->header_size = header_size;
   inst->check_tdr = true;
   inst->send_has_side_effects = true;
}

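/* Lower a logical framebuffer read into a render target read message, which
 * always requires a two-register header.
 */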
static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                           BRW_REGISTER_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const fs_reg header_sources[] = {
         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly -- however
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+):
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits:
    * - 14 : Stencil Present to Render Target
    * - 13 : Source Depth Present to Render Target
    * - 12 : oMask to Render Target
    * - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        brw_imm_ud(~INTEL_MASK(14, 11)));

   inst->resize_sources(1);
   inst->src[0] = header;
   inst->opcode = FS_OPCODE_FB_READ;
   inst->mlen = length;
   inst->header_size = length;
}

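/* Sampler indices above 15 can't be encoded in the 4-bit "Sampler Index"
 * descriptor field, so they need the header's Sampler State Pointer
 * adjustment; non-immediate indices are conservatively treated the same way.
 */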
static bool
is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
{
   return sampler.file != IMM || sampler.ud >= 16;
}

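/* Map a logical texturing opcode to the hardware sampler message type,
 * taking shadow comparison and (on Xe2+) the presence of a min-LOD
 * parameter into account.
 */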
static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare, bool has_min_lod)
{
   switch (opcode) {
   case SHADER_OPCODE_TEX:
      if (devinfo->ver >= 20 && has_min_lod) {
         return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
                                 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
      } else {
         return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE;
      }
   case FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case SHADER_OPCODE_TXL:
      assert(!has_min_lod);
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case SHADER_OPCODE_TXL_LZ:
      assert(!has_min_lod);
      return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                              GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case SHADER_OPCODE_TXD:
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case SHADER_OPCODE_TXF:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_LZ:
      assert(!has_min_lod);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
   case SHADER_OPCODE_TXF_CMS_W:
      assert(!has_min_lod);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case SHADER_OPCODE_TXF_CMS:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
   case SHADER_OPCODE_TXF_UMS:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case SHADER_OPCODE_TXF_MCS:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case SHADER_OPCODE_LOD:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_LOD;
   case SHADER_OPCODE_TG4:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case SHADER_OPCODE_TG4_OFFSET_LOD:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
   case SHADER_OPCODE_TG4_OFFSET_BIAS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B;
   case SHADER_OPCODE_TG4_BIAS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B;
   case SHADER_OPCODE_TG4_EXPLICIT_LOD:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L;
   case SHADER_OPCODE_TG4_IMPLICIT_LOD:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I;
   case SHADER_OPCODE_SAMPLEINFO:
      assert(!has_min_lod);
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst,
                               const fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   fs_reg *src_comps = new fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum brw_reg_type padding_payload_type =
         brw_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    BRW_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Pad the real source with null components if the requested payload
       * component size is larger than the real source component size.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(fs_reg(), padding_payload_type);
         }
      }
   }

   fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}

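/* Texturing opcodes that always require a message header: the gather4
 * family needs it for the channel select (ARB_texture_swizzle interaction)
 * and sampleinfo always uses one.
 */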
static bool
shader_opcode_needs_header(opcode op)
{
   switch (op) {
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TG4_OFFSET_BIAS:
   case SHADER_OPCODE_TG4_OFFSET_LOD:
   case SHADER_OPCODE_TG4_BIAS:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD:
   case SHADER_OPCODE_SAMPLEINFO:
      return true;
   default:
      break;
   }

   return false;
}

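/* Lower a logical texturing/sampling opcode into a sampler SEND message,
 * building the optional header and the per-opcode parameter payload, and
 * filling out the message and extended descriptors.
 */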
static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
                           const fs_reg &coordinate,
                           const fs_reg &shadow_c,
                           fs_reg lod, const fs_reg &lod2,
                           const fs_reg &min_lod,
                           const fs_reg &sample_index,
                           const fs_reg &mcs,
                           const fs_reg &surface,
                           const fs_reg &sampler,
                           const fs_reg &surface_handle,
                           const fs_reg &sampler_handle,
                           const fs_reg &tg4_offset,
                           unsigned payload_type_bit_size,
                           unsigned coord_components,
                           unsigned grad_components,
                           bool residency)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum brw_reg_type payload_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
   const enum brw_reg_type payload_unsigned_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
   const enum brw_reg_type payload_signed_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   fs_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface and surface_handle, and exactly
    * one of sampler and sampler_handle.
    */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (shader_opcode_needs_header(op) || inst->offset != 0 || inst->eot ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler) ||
       residency) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
      for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
         sources[length++] = byte_offset(header, REG_SIZE * header_size);

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
      if (!inst->eot && reg_count < 4 * reg_width) {
         assert(reg_count % reg_width == 0);
         unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      if (residency)
         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied.  Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         if (compiler->use_bindless_sampler_offset) {
            assert(devinfo->ver >= 11);
            ubld1.OR(component(header, 3), sampler_handle, brw_imm_ud(1));
         } else {
            ubld1.MOV(component(header, 3), sampler_handle);
         }
      } else if (is_high_sampler(devinfo, sampler)) {
         fs_reg sampler_state_ptr =
            retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      brw_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == BRW_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(31, 5)));
      }
   }

   /* Change the opcode to account for LOD being zero before the
    * switch-statement that emits sources based on the opcode.
    */
   if (lod.is_zero()) {
      if (op == SHADER_OPCODE_TXL)
         op = SHADER_OPCODE_TXL_LZ;
      else if (op == SHADER_OPCODE_TXF)
         op = SHADER_OPCODE_TXF_LZ;
   }

   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
    */
   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare,
                       min_lod.file != BAD_FILE);

   const bool min_lod_is_first = devinfo->ver >= 20 &&
      (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
       msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);

   if (min_lod_is_first) {
      assert(min_lod.file != BAD_FILE);
      bld.MOV(sources[length++], min_lod);
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TG4_BIAS:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD:
   case SHADER_OPCODE_TG4_OFFSET_LOD:
   case SHADER_OPCODE_TG4_OFFSET_BIAS:
   case SHADER_OPCODE_TXL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* SIMD16 TXD should have been split into SIMD8 by this point (SIMD32
       * into SIMD16 on Xe2+).
       */
      assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], payload_unsigned_type), lod);
      length++;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
      length++;
      break;
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
      /* On Gfx9 the parameters are intermixed: they are u, v, lod, r. */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);

      if (coord_components >= 2) {
         bld.MOV(retype(sources[length], payload_signed_type),
                 offset(coordinate, bld, 1));
      } else {
         sources[length] = brw_imm_d(0);
      }
      length++;

      if (op != SHADER_OPCODE_TXF_LZ) {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }

      for (unsigned i = 2; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      if (op == SHADER_OPCODE_TXF_UMS ||
          op == SHADER_OPCODE_TXF_CMS ||
          op == SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2 mcs3  u  v  r
          */
         if (op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            /* The sampler always writes 4/8 registers worth of data but for
             * ld_mcs only the first two registers contain valid data.  So
             * with a 16-bit payload, each 32-bit MCS component needs to be
             * split into two 16-bit payload parameters.
             *
             * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
             * Shared Functions - 3D Sampler - Messages - Message Format:
             *
             *    ld2dms_w   si  mcs0 mcs1 mcs2 mcs3  u  v  r
             */
            if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W) {
               fs_reg tmp = offset(mcs, bld, i);
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs :
                       subscript(tmp, payload_unsigned_type, 0));
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs :
                       subscript(tmp, payload_unsigned_type, 1));
            } else {
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs : offset(mcs, bld, i));
            }
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE && !min_lod_is_first) {
      /* Account for all of the missing coordinate sources */
      if (op == FS_OPCODE_TXB && devinfo->ver >= 20 &&
          inst->has_packed_lod_ai_src) {
         /* Bspec 64985:
          *
          * For sample_b sampler message format:
          *
          * SIMD16H/SIMD32H
          * Param Number   0     1  2  3  4   5
          * Param          BIAS  U  V  R  Ai  MLOD
          *
          * SIMD16/SIMD32
          * Param Number   0        1  2  3  4
          * Param          BIAS_AI  U  V  R  MLOD
          */
         length += 3 - coord_components;
      } else if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * See bspec 45942, "Enable new message layout for cube array"
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == SHADER_OPCODE_TXD)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);

      /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
      if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB &&
          !inst->shadow_compare)
         bld.MOV(sources[length++], min_lod);
   }

   const fs_reg src_payload =
      fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
             BRW_REGISTER_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes.  In both cases one reg can hold 16
    * elements.  In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE * reg_unit(devinfo));
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   unsigned simd_mode = 0;
   if (devinfo->ver < 20) {
      if (payload_type_bit_size == 16) {
         assert(devinfo->ver >= 11);
         simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
                                            GFX10_SAMPLER_SIMD_MODE_SIMD16H;
      } else {
         simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                            BRW_SAMPLER_SIMD_MODE_SIMD16;
      }
   } else {
      if (payload_type_bit_size == 16) {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
                                             XE2_SAMPLER_SIMD_MODE_SIMD32H;
      } else {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
                                             XE2_SAMPLER_SIMD_MODE_SIMD32;
      }
   }

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   assert(msg_type == sampler_msg_type(devinfo, op, inst->shadow_compare,
                                       min_lod.file != BAD_FILE));

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      inst->desc = brw_sampler_desc(devinfo,
                                    GFX9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = component(desc, 0);
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->ex_desc = 0;

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
}

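/* Determine the bit size (16 or 32) of the sampler message payload, based
 * on the source types and the per-platform restrictions on which message
 * formats still accept 32-bit payloads.
 */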
static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      opcode op, const fs_inst *inst)
{
   assert(inst);
   const fs_reg *src = inst->src;
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore find the first valid
    * one and take the size from it.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = brw_reg_type_to_size(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree.  On gfx12 this doesn't hold when sampling
    * compressed multisampled surfaces, since there the payload contains MCS
    * data which is already 16-bit, unlike the other parameters that need
    * forced conversion.
    */
   if (devinfo->verx10 < 125 ||
       (op != SHADER_OPCODE_TXF_CMS_W &&
        op != SHADER_OPCODE_TXF_CMS)) {
      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
         assert(src[i].file == BAD_FILE ||
                brw_reg_type_to_size(src[i].type) == src_type_size);
      }
   }
#endif

   if (devinfo->verx10 < 125)
      return src_type_size * 8;

   /* Force conversion from 32-bit sources to 16-bit payload.  From the XeHP
    * Bspec: 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages -
    * Message Format [GFX12:HAS:1209977870]:
    *
    *    ld2dms_w   SIMD8H and SIMD16H Only
    *    ld_mcs     SIMD8H and SIMD16H Only
    *    ld2dms     REMOVEDBY(GEN:HAS:1406788836)
    */
   if (op == SHADER_OPCODE_TXF_CMS_W ||
       op == SHADER_OPCODE_TXF_CMS ||
       op == SHADER_OPCODE_TXF_UMS ||
       op == SHADER_OPCODE_TXF_MCS ||
       (op == FS_OPCODE_TXB && !inst->has_packed_lod_ai_src &&
        devinfo->ver >= 20))
      src_type_size = 2;

   return src_type_size * 8;
}

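/* Entry point for sampler message lowering: unpack the logical
 * instruction's sources and forward them to the main lowering routine
 * above.
 */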
static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const fs_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const fs_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const fs_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const fs_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const fs_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const fs_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const fs_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const fs_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const fs_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const fs_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
   const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;

   const unsigned msg_payload_type_bit_size =
      get_sampler_msg_payload_type_bit_size(devinfo, op, inst);

   /* 16-bit payloads are available only on gfx11+ */
   assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);

   lower_sampler_logical_send(bld, inst, op, coordinate,
                              shadow_c, lod, lod2, min_lod,
                              sample_index,
                              mcs, surface, sampler,
                              surface_handle, sampler_handle,
                              tg4_offset,
                              msg_payload_type_bit_size,
                              coord_components, grad_components,
                              residency);
}

/**
 * Predicate the specified instruction on the vector mask.
 */
static void
emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_builder ubld = bld.exec_all().group(1, 0);

   const fs_visitor &s = *bld.shader;
   const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
   ubld.UNDEF(vector_mask);
   ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
   const unsigned subreg = sample_mask_flag_subreg(s);

   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      assert(s.devinfo->ver < 20);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

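/* Fill out the SEND descriptor and descriptor sources for a legacy data
 * port message, handling the immediate BTI, bindless handle, and indirect
 * surface index cases.
 */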
static void
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
                          const fs_reg &surface, const fs_reg &surface_handle)
{
   const brw_compiler *compiler = bld.shader->compiler;

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   if (surface.file == IMM) {
      inst->desc = desc | (surface.ud & 0xff);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      inst->desc = desc | GFX9_BTI_BINDLESS;
      inst->src[0] = brw_imm_ud(0);

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
   } else {
      inst->desc = desc;
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, brw_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }
}

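/* LSC equivalent of setup_surface_descriptors(): fill out the descriptor
 * sources according to the LSC address surface type (BSS/SS/BTI/flat)
 * already encoded in the message descriptor.
 */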
static void
setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
                              uint32_t desc, const fs_reg &surface)
{
   const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
   const brw_compiler *compiler = bld.shader->compiler;

   inst->src[0] = brw_imm_ud(0); /* desc */

   enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
   switch (surf_type) {
   case LSC_ADDR_SURFTYPE_BSS:
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
      /* fall-through */
   case LSC_ADDR_SURFTYPE_SS:
      assert(surface.file != BAD_FILE);
      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface, BRW_REGISTER_TYPE_UD);
      break;

   case LSC_ADDR_SURFTYPE_BTI:
      assert(surface.file != BAD_FILE);
      if (surface.file == IMM) {
         inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
      } else {
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(tmp, surface, brw_imm_ud(24));
         inst->src[1] = component(tmp, 0);
      }
      break;

   case LSC_ADDR_SURFTYPE_FLAT:
      inst->src[1] = brw_imm_ud(0);
      break;

   default:
      unreachable("Invalid LSC surface address type");
   }
}

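/* Lower logical surface access opcodes (untyped/typed/byte/DWord scattered
 * reads, writes, and atomics) into legacy data port SEND messages.
 */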
static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool is_typed_access =
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;

   const bool is_surface_access = is_typed_access ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_ud(0xffffffff));

   fs_reg header;
   if (is_stateless) {
      assert(!is_surface_access);
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0;
   if (src.file == BAD_FILE || header.file == BAD_FILE) {
      /* We have split sends on gfx9 and above */
      if (header.file == BAD_FILE) {
         payload = bld.move_to_vgrf(addr, addr_sz);
         payload2 = bld.move_to_vgrf(src, src_sz);
         mlen = addr_sz * (inst->exec_size / 8);
         ex_mlen = src_sz * (inst->exec_size / 8);
      } else {
         assert(src.file == BAD_FILE);
         payload = header;
         payload2 = bld.move_to_vgrf(addr, addr_sz);
         mlen = header_sz;
         ex_mlen = addr_sz * (inst->exec_size / 8);
      }
   } else {
      /* Allocate space for the payload. */
      const unsigned sz = header_sz + addr_sz + src_sz;
      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
      fs_reg *const components = new fs_reg[sz];
      unsigned n = 0;

      /* Construct the payload. */
      if (header.file != BAD_FILE)
         components[n++] = header;

      for (unsigned i = 0; i < addr_sz; i++)
         components[n++] = offset(addr, bld, i);

      for (unsigned i = 0; i < src_sz; i++)
         components[n++] = offset(src, bld, i);

      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

      delete[] components;
   }

   /* Predicate the instruction on the sample mask if no header is
    * provided.
    */
   if ((header.file == BAD_FILE || !is_surface_access) &&
       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

   uint32_t sfid;
   switch (inst->opcode) {
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            false /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            true /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      if (lsc_opcode_is_atomic_float((enum lsc_opcode) arg.ud)) {
         desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                 lsc_op_to_legacy_atomic(arg.ud),
                                                 !inst->dst.is_null());
      } else {
         desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                           lsc_op_to_legacy_atomic(arg.ud),
                                           !inst->dst.is_null());
      }
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          false   /* write */);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          true    /* write */);
      break;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
                                      lsc_op_to_legacy_atomic(arg.ud),
                                      !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;
   inst->send_ex_bso = surface_handle.file != BAD_FILE &&
                       compiler->extended_bindless_surface_offset;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

1653 static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)1654 lsc_bits_to_data_size(unsigned bit_size)
1655 {
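   /* Sub-dword data sizes use the "U32" variants, where each element still
    * occupies a full dword in the register file.
    */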
   switch (bit_size / 8) {
   case 1: return LSC_DATA_SIZE_D8U32;
   case 2: return LSC_DATA_SIZE_D16U32;
   case 4: return LSC_DATA_SIZE_D32;
   case 8: return LSC_DATA_SIZE_D64;
   default:
      unreachable("Unsupported data size.");
   }
}

static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
   const unsigned src_sz = type_sz(src.type);
   const unsigned dst_sz = type_sz(inst->dst.type);

   const bool has_side_effects = inst->has_side_effects();

   unsigned ex_mlen = 0;
   fs_reg payload, payload2;
   payload = bld.move_to_vgrf(addr, addr_sz);
   if (src.file != BAD_FILE) {
      payload2 = bld.move_to_vgrf(src, src_comps);
      ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
   }

   /* Predicate the instruction on the sample mask if needed */
   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_ud(0xffffffff));
   if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

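   /* Shared local memory accesses have their own SFID on LSC platforms;
    * all other surface accesses are routed through the untyped global
    * memory (UGM) data port.
    */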
   if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      inst->sfid = GFX12_SFID_SLM;
   else
      inst->sfid = GFX12_SFID_UGM;

   /* We should have exactly one of surface and surface_handle. For scratch
    * messages generated by brw_fs_nir.cpp we also allow a special value to
    * know what heap base we should use in STATE_BASE_ADDRESS (SS = Surface
    * State Offset, or BSS = Bindless Surface State Offset).
    */
   bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS;
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) ||
          (non_bindless && surface_handle.file != BAD_FILE));

   enum lsc_addr_surface_type surf_type;
   if (surface_handle.file != BAD_FILE) {
      if (surface.file == BAD_FILE) {
         assert(!non_bindless);
         surf_type = LSC_ADDR_SURFTYPE_BSS;
      } else {
         assert(surface.file == IMM &&
                (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS));
         surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS;
      }
   } else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      surf_type = LSC_ADDR_SURFTYPE_FLAT;
   else
      surf_type = LSC_ADDR_SURFTYPE_BTI;

   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode = (enum lsc_opcode) arg.ud;

      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(dst_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1UC_L3WB),
                                !inst->dst.is_null());
      break;
   }
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

   inst->resize_sources(4);

   if (non_bindless) {
      inst->src[0] = brw_imm_ud(0);     /* desc */
      inst->src[1] = surface_handle;    /* ex_desc */
   } else {
      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);
   }

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_lsc_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);
   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

   fs_builder ubld = bld.exec_all().group(1, 0);
   fs_reg stateless_ex_desc;
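   /* Stateless (scratch) messages pass the surface state offset through the
    * extended descriptor; it is read from r0.5 and masked to bits 31:10.
    */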
   if (is_stateless) {
      stateless_ex_desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(stateless_ex_desc,
               retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
               brw_imm_ud(INTEL_MASK(31, 10)));
   }

   fs_reg data;
   if (write) {
      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
      data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
   }

   inst->opcode = SHADER_OPCODE_SEND;
   if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      inst->sfid = GFX12_SFID_SLM;
   else
      inst->sfid = GFX12_SFID_UGM;
   const enum lsc_addr_surface_type surf_type =
      inst->sfid == GFX12_SFID_SLM ?
      LSC_ADDR_SURFTYPE_FLAT :
      surface.file == BAD_FILE ?
      LSC_ADDR_SURFTYPE_BSS : LSC_ADDR_SURFTYPE_BTI;
   inst->desc = lsc_msg_desc(devinfo,
                             write ? LSC_OP_STORE : LSC_OP_LOAD,
                             1 /* exec_size */,
                             surf_type,
                             LSC_ADDR_SIZE_A32,
                             1 /* num_coordinates */,
                             LSC_DATA_SIZE_D32,
                             arg.ud /* num_channels */,
                             true /* transpose */,
                             LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                             !write /* has_dest */);

   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->size_written = lsc_msg_desc_dest_len(devinfo, inst->desc) * REG_SIZE;
   inst->exec_size = 1;
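   /* arg.ud is the block size in dwords; the data payload length is given
    * in 8-dword registers, hence the round-up.
    */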
   inst->ex_mlen = write ? DIV_ROUND_UP(arg.ud, 8) : 0;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

   inst->resize_sources(4);

   if (stateless_ex_desc.file != BAD_FILE) {
      inst->src[0] = brw_imm_ud(0);       /* desc */
      inst->src[1] = stateless_ex_desc;   /* ex_desc */
   } else {
      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);
   }
   inst->src[2] = addr;   /* payload */
   inst->src[3] = data;   /* payload2 */
}

static void
lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);
   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   const bool align_16B =
      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

   /* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */
   fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);

   if (is_stateless)
      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
   else
      ubld.MOV(header, brw_imm_d(0));

   /* Address in OWord units when aligned to OWords. */
   if (align_16B)
      ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
   else
      ubld.group(1, 0).MOV(component(header, 2), addr);

   fs_reg data;
   unsigned ex_mlen = 0;
   if (write) {
      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
      data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
      ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
   }

   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = 1;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 1;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;

   const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
                                                    arg.ud, write);
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   inst->src[2] = header;
   inst->src[3] = data;
}

static fs_reg
emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
{
   const fs_builder ubld = bld.exec_all().group(8, 0);

   assert(type_sz(addr.type) == 8 && addr.stride == 0);

   fs_reg expanded_addr = addr;
   if (addr.file == UNIFORM) {
      /* We can't do stride 1 with the UNIFORM file, it requires stride 0 */
      expanded_addr = ubld.vgrf(BRW_REGISTER_TYPE_UQ);
      expanded_addr.stride = 0;
      ubld.MOV(expanded_addr, retype(addr, BRW_REGISTER_TYPE_UQ));
   }

   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));

   /* Use a 2-wide MOV to fill out the address */
   fs_reg addr_vec2 = expanded_addr;
   addr_vec2.type = BRW_REGISTER_TYPE_UD;
   addr_vec2.stride = 1;
   ubld.group(2, 0).MOV(header, addr_vec2);
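   /* The SIMD2 MOV above writes the low and high dwords of the 64-bit
    * address into the first two components of the header.
    */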

   return header;
}

static void
emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
   const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;

   /* If we're a fragment shader, we have to predicate with the sample mask
    * to avoid helper invocations in instructions with side effects, unless
    * they are explicitly required.
    *
    * There are also special cases when we actually want to run on helpers
    * (ray queries).
    */
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
   if (enable_helpers)
      emit_predicate_on_vector_mask(bld, inst);
   else if (inst->has_side_effects())
      brw_emit_predicate_on_sample_mask(bld, inst);
}

static void
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_sz = type_sz(src.type);
   const unsigned dst_sz = type_sz(inst->dst.type);

   const unsigned src_comps = inst->components_read(A64_LOGICAL_SRC);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
   fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
                            BRW_REGISTER_TYPE_UD);
   unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;

   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode = (enum lsc_opcode) arg;
      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(dst_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1UC_L3WB),
                                !inst->dst.is_null());
      break;
   }
   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      inst->exec_size = 1;
      inst->desc = lsc_msg_desc(devinfo,
                                LSC_OP_LOAD,
                                1 /* exec_size */,
                                LSC_ADDR_SURFTYPE_FLAT,
                                LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                arg /* num_channels */,
                                true /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      inst->exec_size = 1;
      inst->desc = lsc_msg_desc(devinfo,
                                LSC_OP_STORE,
                                1 /* exec_size */,
                                LSC_ADDR_SURFTYPE_FLAT,
                                LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                arg /* num_channels */,
                                true /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   const fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_comps = inst->components_read(A64_LOGICAL_SRC);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0, header_size = 0;
   if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {

      /* OWORD messages only take a scalar address in a header */
      mlen = 1;
      header_size = 1;
      payload = emit_a64_oword_block_header(bld, addr);

      if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
         ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
         payload2 = retype(bld.move_to_vgrf(src, src_comps),
                           BRW_REGISTER_TYPE_UD);
      }
   } else {
      /* On Skylake and above, we have SENDS */
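      /* The A64 address payload is 64-bit, so it takes two GRFs per eight
       * channels.
       */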
      mlen = 2 * (inst->exec_size / 8);
      ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
      payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
      payload2 = retype(bld.move_to_vgrf(src, src_comps),
                        BRW_REGISTER_TYPE_UD);
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                false  /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                true   /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            false,   /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            true     /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               false  /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               true   /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      if (lsc_opcode_is_atomic_float((enum lsc_opcode) arg)) {
         desc =
            brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                 type_sz(inst->dst.type) * 8,
                                                 lsc_op_to_legacy_atomic(arg),
                                                 !inst->dst.is_null());
      } else {
         desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
                                               type_sz(inst->dst.type) * 8,
                                               lsc_op_to_legacy_atomic(arg),
                                               !inst->dst.is_null());
      }
      break;

   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_size;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
                                             fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   ASSERTED const brw_compiler *compiler = bld.shader->compiler;

   fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
   fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
   fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
   fs_reg alignment_B = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];

   /* We are switching the instruction from an ALU-like instruction to a
    * send-from-grf instruction. Since sends can't handle strides or
    * source modifiers, we have to make a copy of the offset source.
    */
   fs_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);

   enum lsc_addr_surface_type surf_type =
      surface_handle.file == BAD_FILE ?
      LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;

   assert(alignment_B.file == BRW_IMMEDIATE_VALUE);
   unsigned alignment = alignment_B.ud;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(3);
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

   assert(!compiler->indirect_ubos_use_sampler);

   inst->src[0] = brw_imm_ud(0);
   inst->src[2] = ubo_offset; /* payload */

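   /* Dword-aligned loads can fetch a whole vec4 with a single four-channel
    * LOAD_CMASK; lower alignments fall back to four single-dword loads.
    */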
   if (alignment >= 4) {
      inst->desc =
         lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                      surf_type, LSC_ADDR_SIZE_A32,
                      1 /* num_coordinates */,
                      LSC_DATA_SIZE_D32,
                      4 /* num_channels */,
                      false /* transpose */,
                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                      true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);

      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);
   } else {
      inst->desc =
         lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                      surf_type, LSC_ADDR_SIZE_A32,
                      1 /* num_coordinates */,
                      LSC_DATA_SIZE_D32,
                      1 /* num_channels */,
                      false /* transpose */,
                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                      true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);

      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);

      /* The byte scattered messages can only read one dword at a time so
       * we have to duplicate the message 4 times to read the full vec4.
       * Hopefully, dead code will clean up the mess if some of them aren't
       * needed.
       */
      assert(inst->size_written == 16 * inst->exec_size);
      inst->size_written /= 4;
      for (unsigned c = 1; c < 4; c++) {
         /* Emit a copy of the instruction because we're about to modify
          * it. Because this loop starts at 1, we will emit copies for the
          * first 3 and the final one will be the modified instruction.
          */
         bld.emit(*inst);

         /* Offset the source */
         inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

         /* Offset the destination */
         inst->dst = offset(inst->dst, bld, 1);
      }
   }
}

static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_compiler *compiler = bld.shader->compiler;

   fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
   fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
   fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];

   /* We are switching the instruction from an ALU-like instruction to a
    * send-from-grf instruction. Since sends can't handle strides or
    * source modifiers, we have to make a copy of the offset source.
    */
   fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.MOV(ubo_offset, offset_B);

   assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == BRW_IMMEDIATE_VALUE);
   unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = inst->exec_size / 8;
   inst->resize_sources(3);

   /* src[0] & src[1] are filled by setup_surface_descriptors() */
   inst->src[2] = ubo_offset; /* payload */

   if (compiler->indirect_ubos_use_sampler) {
      const unsigned simd_mode =
         inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                BRW_SAMPLER_SIMD_MODE_SIMD16;
      const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
                                             GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
                                             simd_mode, 0);

      inst->sfid = BRW_SFID_SAMPLER;
      setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
   } else if (alignment >= 4) {
      const uint32_t desc =
         brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                        4,     /* num_channels */
                                        false  /* write */);

      inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
   } else {
      const uint32_t desc =
         brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                       32,    /* bit_size */
                                       false  /* write */);

      inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

      /* The byte scattered messages can only read one dword at a time so
       * we have to duplicate the message 4 times to read the full vec4.
       * Hopefully, dead code will clean up the mess if some of them aren't
       * needed.
       */
      assert(inst->size_written == 16 * inst->exec_size);
      inst->size_written /= 4;
      for (unsigned c = 1; c < 4; c++) {
         /* Emit a copy of the instruction because we're about to modify
          * it. Because this loop starts at 1, we will emit copies for the
          * first 3 and the final one will be the modified instruction.
          */
         bld.emit(*inst);

         /* Offset the source */
         inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

         /* Offset the destination */
         inst->dst = offset(inst->dst, bld, 1);
      }
   }
}

static void
lower_interpolator_logical_send(const fs_builder &bld, fs_inst *inst,
                                const struct brw_wm_prog_key *wm_prog_key,
                                const struct brw_wm_prog_data *wm_prog_data)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* We have to send something */
   fs_reg payload = brw_vec8_grf(0, 0);
   unsigned mlen = 1;

   unsigned mode;
   switch (inst->opcode) {
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
      mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
      break;

   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
      mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
      break;

   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      payload = inst->src[INTERP_SRC_OFFSET];
      mlen = 2 * inst->exec_size / 8;
      mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
      break;

   default:
      unreachable("Invalid interpolator instruction");
   }

   const bool dynamic_mode =
      inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;

   fs_reg desc = inst->src[INTERP_SRC_MSG_DESC];
   uint32_t desc_imm =
      brw_pixel_interp_desc(devinfo,
                            /* Leave the mode at 0 if persample_dispatch is
                             * dynamic, it will be ORed in below.
                             */
                            dynamic_mode ? 0 : mode,
                            inst->pi_noperspective,
                            false /* coarse_pixel_rate */,
                            inst->exec_size, inst->group);

   if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
      desc_imm |= (1 << 15);
   } else if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
      STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
      fs_reg orig_desc = desc;
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(desc, dynamic_msaa_flags(wm_prog_data),
               brw_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));

      /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
      if (orig_desc.file == IMM) {
         desc_imm |= orig_desc.ud;
      } else {
         ubld.OR(desc, desc, orig_desc);
      }
   }

   /* If persample_dispatch is dynamic, select the interpolation mode
    * dynamically and OR into the descriptor to complete the static part
    * generated by brw_pixel_interp_desc().
    *
    * Why does this work? If you look at the SKL PRMs, Volume 7:
    * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
    *
    *   - "Per Message Offset" Message Descriptor
    *   - "Sample Position Offset" Message Descriptor
    *
    * have different formats. Fortunately, a fragment shader dispatched at
    * pixel rate will have gl_SampleID = 0 and gl_NumSamples = 1. So the
    * value we pack in "Sample Position Offset" will be 0 and will cover
    * the X/Y components of "Per Message Offset", which will give us the
    * pixel offset 0x0.
    */
   if (dynamic_mode) {
      fs_reg orig_desc = desc;
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);

      /* The predicate should have been built in brw_fs_nir.cpp when emitting
       * NIR code. This guarantees that we do not have incorrect interactions
       * with the flag register holding the predication result.
       */
      if (orig_desc.file == IMM) {
         /* Not using SEL here because we would generate an instruction with 2
          * immediate sources which is not supported by HW.
          */
         set_predicate_inv(BRW_PREDICATE_NORMAL, false,
                           ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
                                    GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                           ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
                                    GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
      } else {
         set_predicate_inv(BRW_PREDICATE_NORMAL, false,
                           ubld.OR(desc, orig_desc,
                                   brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                           ubld.OR(desc, orig_desc,
                                   brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
      }
   }

   inst->opcode = SHADER_OPCODE_SEND;
   inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
   inst->desc = desc_imm;
   inst->ex_desc = 0;
   inst->mlen = mlen;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = false;
   inst->send_is_volatile = false;

   inst->resize_sources(3);
   inst->src[0] = component(desc, 0);
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
}

static void
lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   fs_reg global_addr = inst->src[0];
   const fs_reg btd_record = inst->src[1];

   const unsigned unit = reg_unit(devinfo);
   const unsigned mlen = 2 * unit;
   const fs_builder ubld = bld.exec_all();
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2 * unit);

   ubld.MOV(header, brw_imm_ud(0));
   switch (inst->opcode) {
   case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
      global_addr.type = BRW_REGISTER_TYPE_UD;
      global_addr.stride = 1;
      ubld.group(2, 0).MOV(header, global_addr);
      break;

   case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
      /* The bottom bit is the Stack ID release bit */
      ubld.group(1, 0).MOV(header, brw_imm_ud(1));
      break;

   default:
      unreachable("Invalid BTD message");
   }

   /* Stack IDs are always in R1 regardless of whether we're coming from a
    * bindless shader or a regular compute shader.
    */
   fs_reg stack_ids = retype(offset(header, bld, 1), BRW_REGISTER_TYPE_UW);
   bld.exec_all().MOV(stack_ids, retype(brw_vec8_grf(1 * unit, 0),
                                        BRW_REGISTER_TYPE_UW));

   unsigned ex_mlen = 0;
   fs_reg payload;
   if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
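      /* The BTD record is a 64-bit pointer, so the payload takes two GRFs
       * per eight lanes.
       */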
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(btd_record, 1);
   } else {
      assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
      /* All these messages take a BTD and things complain if we don't provide
       * one for RETIRE. However, it shouldn't ever actually get used so fill
       * it with zero.
       */
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
   inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
                                   GEN_RT_BTD_MESSAGE_SPAWN);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

static void
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
    * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use
    * UQ/Q types on Gfx12.5, we need to tweak the stride with a value of
    * 1 dword so that the MOV operates on 2 components rather than twice
    * the same component.
    */
   fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD);
   globals_addr.stride = 1;
   const fs_reg bvh_level =
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
                       inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
   const fs_reg trace_ray_control =
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
                       inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
   const fs_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
   assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
   const bool synchronous = synchronous_src.ud;

   const unsigned unit = reg_unit(devinfo);
   const unsigned mlen = unit;
   const fs_builder ubld = bld.exec_all();
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));
   ubld.group(2, 0).MOV(header, globals_addr);
   if (synchronous)
      ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));

   const unsigned ex_mlen = inst->exec_size / 8;
   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
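   /* Pack the trace ray control in bits 9:8 and the BVH level in bits 2:0
    * of the payload dword.
    */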
   if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
       trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
      bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
                                  (bvh_level.ud & 0x7)));
   } else {
      bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
      bld.OR(payload, payload, bvh_level);
   }

   /* When doing synchronous traversal, the HW implicitly computes the
    * stack_id using the following formula:
    *
    *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
    *
    * Only in the asynchronous case do we need to set the stack_id given
    * from the payload register.
    */
   if (!synchronous) {
      bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
              retype(brw_vec8_grf(1 * unit, 0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x7ff));
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
   inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

static void
lower_get_buffer_size(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   /* Since we can only execute this instruction on uniform bti/surface
    * handles, brw_fs_nir.cpp should already have limited this to SIMD8
    * (SIMD16 on Xe2).
    */
   assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));

   fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
   fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
   fs_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];

   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = inst->exec_size / 8;
   inst->resize_sources(3);
   inst->ex_mlen = 0;
   inst->ex_desc = 0;

   /* src[0] & src[1] are filled by setup_surface_descriptors() */
   inst->src[2] = lod;

   const uint32_t return_format = GFX8_SAMPLER_RETURN_FORMAT_32BITS;

   const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
                                          GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
                                          BRW_SAMPLER_SIMD_MODE_SIMD8,
                                          return_format);

   inst->dst = retype(inst->dst, BRW_REGISTER_TYPE_UW);
   inst->sfid = BRW_SFID_SAMPLER;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
}

bool
brw_fs_lower_logical_sends(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(s.stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     brw_wm_prog_data(s.prog_data),
                                     (const brw_wm_prog_key *)s.key,
                                     s.fs_payload());
         break;

      case FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         lower_sampler_logical_send(ibld, inst,
                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
      case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_BIAS_LOGICAL:
         assert(devinfo->ver >= 20);
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_BIAS);
         break;

      case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
         assert(devinfo->ver >= 20);
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_EXPLICIT_LOD);
         break;

      case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
         assert(devinfo->ver >= 20);
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_IMPLICIT_LOD);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET_LOD);
         break;

      case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET_BIAS);
         break;

      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
         break;

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         lower_get_buffer_size(ibld, inst);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_surface_logical_send(ibld, inst);
            break;
         }
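         /* Otherwise fall through to the legacy surface lowering below. */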
      case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_block_logical_send(ibld, inst);
            break;
         }
         lower_surface_block_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_a64_logical_send(ibld, inst);
            break;
         }
         lower_a64_logical_send(ibld, inst);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         if (devinfo->has_lsc && !s.compiler->indirect_ubos_use_sampler)
            lower_lsc_varying_pull_constant_logical_send(ibld, inst);
         else
            lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         lower_interpolator_logical_send(ibld, inst,
                                         (const brw_wm_prog_key *)s.key,
                                         brw_wm_prog_data(s.prog_data));
         break;

      case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
         lower_btd_logical_send(ibld, inst);
         break;

      case RT_OPCODE_TRACE_RAY_LOGICAL:
         lower_trace_ray_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_URB_READ_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_read_logical_send(ibld, inst);
         else
            lower_urb_read_logical_send_xe2(ibld, inst);
         break;

      case SHADER_OPCODE_URB_WRITE_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_write_logical_send(ibld, inst);
         else
            lower_urb_write_logical_send_xe2(ibld, inst);
         break;

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
bool
brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      const fs_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
      const fs_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
      const fs_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
      const fs_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
      assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
      assert(offset_B.file == IMM);
      assert(size_B.file == IMM);

      if (devinfo->has_lsc) {
         const fs_builder ubld =
            fs_builder(&s, block, inst).group(8, 0).exec_all();

         const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.MOV(payload, offset_B);

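         /* This is an LSC transposed (block) load: a SIMD1 message whose
          * channel count covers the whole destination.
          */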
         inst->sfid = GFX12_SFID_UGM;
         inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                   1 /* simd_size */,
                                   surface_handle.file == BAD_FILE ?
                                   LSC_ADDR_SURFTYPE_BTI :
                                   LSC_ADDR_SURFTYPE_BSS,
                                   LSC_ADDR_SIZE_A32,
                                   1 /* num_coordinates */,
                                   LSC_DATA_SIZE_D32,
                                   inst->size_written / 4,
                                   true /* transpose */,
                                   LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                   true /* has_dest */);

         /* Update the original instruction. */
         inst->opcode = SHADER_OPCODE_SEND;
         inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
         inst->send_ex_bso = surface_handle.file != BAD_FILE &&
                             s.compiler->extended_bindless_surface_offset;
         inst->ex_mlen = 0;
         inst->header_size = 0;
         inst->send_has_side_effects = false;
         inst->send_is_volatile = true;
         inst->exec_size = 1;

         /* Finally, the payload */

         inst->resize_sources(3);
         setup_lsc_surface_descriptors(ubld, inst, inst->desc,
                                       surface.file != BAD_FILE ?
                                       surface : surface_handle);
         inst->src[2] = payload;

         s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      } else {
         const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
         fs_reg header = fs_builder(&s, 8).exec_all().vgrf(BRW_REGISTER_TYPE_UD);

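         /* The legacy constant cache addresses in owords (16 bytes); store
          * the oword offset in the third dword of an otherwise g0-based
          * header.
          */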
         ubld.group(8, 0).MOV(header,
                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         ubld.group(1, 0).MOV(component(header, 2),
                              brw_imm_ud(offset_B.ud / 16));

         inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
         inst->opcode = SHADER_OPCODE_SEND;
         inst->header_size = 1;
         inst->mlen = 1;

         uint32_t desc =
            brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
                                       size_B.ud / 4, false /* write */);

         inst->resize_sources(4);

         setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);

         inst->src[2] = header;
         inst->src[3] = fs_reg(); /* unused for reads */

         s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      }

      progress = true;
   }

   return progress;
}