1 /*
2 * Copyright © 2010, 2022 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * @file brw_lower_logical_sends.cpp
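 *
 * Lowering of the various *_LOGICAL opcodes (URB reads/writes, framebuffer
 * writes/reads, sampler messages, surface/memory accesses) into hardware
 * SEND messages with explicit SFIDs, descriptors and payloads.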
26 */
27
28 #include "brw_eu.h"
29 #include "brw_fs.h"
30
31 using namespace brw;
32
33 static void
34 lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
35 {
36 const intel_device_info *devinfo = bld.shader->devinfo;
37 const bool per_slot_present =
38 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
39
40 assert(inst->size_written % REG_SIZE == 0);
41 assert(inst->header_size == 0);
42
43 fs_reg *payload_sources = new fs_reg[inst->mlen];
44 fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
45 BRW_REGISTER_TYPE_F);
46
47 unsigned header_size = 0;
48 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
49 if (per_slot_present)
50 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
51
52 bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);
53
54 delete [] payload_sources;
55
56 inst->opcode = SHADER_OPCODE_SEND;
57 inst->header_size = header_size;
58
59 inst->sfid = BRW_SFID_URB;
60 inst->desc = brw_urb_desc(devinfo,
61 GFX8_URB_OPCODE_SIMD8_READ,
62 per_slot_present,
63 false,
64 inst->offset);
65
66 inst->ex_desc = 0;
67 inst->ex_mlen = 0;
68 inst->send_is_volatile = true;
69
70 inst->resize_sources(4);
71
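/* SHADER_OPCODE_SEND takes four sources: the (possibly indirect) message
 * descriptor, the extended descriptor, and two payload registers; the
 * second payload is unused for URB reads, so it is set to null below.
 */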
72 inst->src[0] = brw_imm_ud(0); /* desc */
73 inst->src[1] = brw_imm_ud(0); /* ex_desc */
74 inst->src[2] = payload;
75 inst->src[3] = brw_null_reg();
76 }
77
78 static void
79 lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
80 {
81 const intel_device_info *devinfo = bld.shader->devinfo;
82 const bool per_slot_present =
83 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
84 const bool channel_mask_present =
85 inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;
86
87 assert(inst->header_size == 0);
88
89 fs_reg *payload_sources = new fs_reg[inst->mlen];
90 fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
91 BRW_REGISTER_TYPE_F);
92
93 unsigned header_size = 0;
94 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
95 if (per_slot_present)
96 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
97
98 if (channel_mask_present)
99 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
100
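/* Everything after the header sources is the per-channel data to be
 * written into the URB.
 */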
101 for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++)
102 payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);
103
104 bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);
105
106 delete [] payload_sources;
107
108 inst->opcode = SHADER_OPCODE_SEND;
109 inst->header_size = header_size;
110 inst->dst = brw_null_reg();
111
112 inst->sfid = BRW_SFID_URB;
113 inst->desc = brw_urb_desc(devinfo,
114 GFX8_URB_OPCODE_SIMD8_WRITE,
115 per_slot_present,
116 channel_mask_present,
117 inst->offset);
118
119 inst->ex_desc = 0;
120 inst->ex_mlen = 0;
121 inst->send_has_side_effects = true;
122
123 inst->resize_sources(4);
124
125 inst->src[0] = brw_imm_ud(0); /* desc */
126 inst->src[1] = brw_imm_ud(0); /* ex_desc */
127 inst->src[2] = payload;
128 inst->src[3] = brw_null_reg();
129 }
130
131 static void
132 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
133 fs_reg *dst, fs_reg color, unsigned components)
134 {
135 if (key->clamp_fragment_color) {
136 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
137 assert(color.type == BRW_REGISTER_TYPE_F);
138
139 for (unsigned i = 0; i < components; i++)
140 set_saturate(true,
141 bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
142
143 color = tmp;
144 }
145
146 for (unsigned i = 0; i < components; i++)
147 dst[i] = offset(color, bld, i);
148 }
149
150 static void
151 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
152 const struct brw_wm_prog_data *prog_data,
153 const brw_wm_prog_key *key,
154 const fs_visitor::thread_payload &payload)
155 {
156 assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
157 const intel_device_info *devinfo = bld.shader->devinfo;
158 const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
159 const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
160 const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
161 const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
162 const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
163 const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
164 fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
165 const unsigned components =
166 inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
167
168 assert(inst->target != 0 || src0_alpha.file == BAD_FILE);
169
170 /* We can potentially have a message length of up to 15, so we have to set
171 * base_mrf to either 0 or 1 in order to fit in m0..m15.
172 */
173 fs_reg sources[15];
174 int header_size = 2, payload_header_size;
175 unsigned length = 0;
176
177 if (devinfo->ver < 6) {
178 /* TODO: Support SIMD32 on gfx4-5 */
179 assert(bld.group() < 16);
180
181 /* For gfx4-5, we always have a header consisting of g0 and g1. We have
182 * an implied MOV from g0,g1 to the start of the message. The MOV from
183 * g0 is handled by the hardware and the MOV from g1 is provided by the
184 * generator. This is required because, on gfx4-5, the generator may
185 * generate two write messages with different message lengths in order
186 * to handle AA data properly.
187 *
188 * Also, since the pixel mask goes in the g0 portion of the message and
189 * since render target writes are the last thing in the shader, we write
190 * the pixel mask directly into g0 and it will get copied as part of the
191 * implied write.
192 */
193 if (prog_data->uses_kill) {
194 bld.exec_all().group(1, 0)
195 .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
196 brw_sample_mask_reg(bld));
197 }
198
199 assert(length == 0);
200 length = 2;
201 } else if ((devinfo->verx10 <= 70 &&
202 prog_data->uses_kill) ||
203 (devinfo->ver < 11 &&
204 (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
205 /* From the Sandy Bridge PRM, volume 4, page 198:
206 *
207 * "Dispatched Pixel Enables. One bit per pixel indicating
208 * which pixels were originally enabled when the thread was
209 * dispatched. This field is only required for the end-of-
210 * thread message and on all dual-source messages."
211 */
212 const fs_builder ubld = bld.exec_all().group(8, 0);
213
214 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
215 if (bld.group() < 16) {
216 /* The header starts off as g0 and g1 for the first half */
217 ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
218 BRW_REGISTER_TYPE_UD));
219 } else {
220 /* The header starts off as g0 and g2 for the second half */
221 assert(bld.group() < 32);
222 const fs_reg header_sources[2] = {
223 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
224 retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
225 };
226 ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
227
228 /* Gfx12 will require additional fix-ups if we ever hit this path. */
229 assert(devinfo->ver < 12);
230 }
231
232 uint32_t g00_bits = 0;
233
234 /* Set "Source0 Alpha Present to RenderTarget" bit in message
235 * header.
236 */
237 if (src0_alpha.file != BAD_FILE)
238 g00_bits |= 1 << 11;
239
240 /* Set the "Stencil Present to Render Target" bit when the shader computes stencil. */
241 if (prog_data->computed_stencil)
242 g00_bits |= 1 << 14;
243
244 if (g00_bits) {
245 /* OR extra bits into g0.0 */
246 ubld.group(1, 0).OR(component(header, 0),
247 retype(brw_vec1_grf(0, 0),
248 BRW_REGISTER_TYPE_UD),
249 brw_imm_ud(g00_bits));
250 }
251
252 /* Set the render target index for choosing BLEND_STATE. */
253 if (inst->target > 0) {
254 ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
255 }
256
257 if (prog_data->uses_kill) {
258 ubld.group(1, 0).MOV(retype(component(header, 15),
259 BRW_REGISTER_TYPE_UW),
260 brw_sample_mask_reg(bld));
261 }
262
263 assert(length == 0);
264 sources[0] = header;
265 sources[1] = horiz_offset(header, 8);
266 length = 2;
267 }
268 assert(length == 0 || length == 2);
269 header_size = length;
270
271 if (payload.aa_dest_stencil_reg[0]) {
272 assert(inst->group < 16);
273 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
274 bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
275 .MOV(sources[length],
276 fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
277 length++;
278 }
279
280 if (src0_alpha.file != BAD_FILE) {
281 for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
282 const fs_builder &ubld = bld.exec_all().group(8, i)
283 .annotate("FB write src0 alpha");
284 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
285 ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
286 setup_color_payload(ubld, key, &sources[length], tmp, 1);
287 length++;
288 }
289 }
290
291 if (sample_mask.file != BAD_FILE) {
292 sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
293 BRW_REGISTER_TYPE_UD);
294
295 /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
296 * relevant. Since these are unsigned single words, one VGRF is always
297 * 16-wide, but only the lower or higher 8 channels will be used by the
298 * hardware when doing a SIMD8 write, depending on whether we have
299 * selected the subspans for the first or second half respectively.
300 */
301 assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
302 sample_mask.type = BRW_REGISTER_TYPE_UW;
303 sample_mask.stride *= 2;
304
305 bld.exec_all().annotate("FB write oMask")
306 .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
307 inst->group % 16),
308 sample_mask);
309 length++;
310 }
311
312 payload_header_size = length;
313
314 setup_color_payload(bld, key, &sources[length], color0, components);
315 length += 4;
316
317 if (color1.file != BAD_FILE) {
318 setup_color_payload(bld, key, &sources[length], color1, components);
319 length += 4;
320 }
321
322 if (src_depth.file != BAD_FILE) {
323 sources[length] = src_depth;
324 length++;
325 }
326
327 if (dst_depth.file != BAD_FILE) {
328 sources[length] = dst_depth;
329 length++;
330 }
331
332 if (src_stencil.file != BAD_FILE) {
333 assert(devinfo->ver >= 9);
334 assert(bld.dispatch_width() == 8);
335
336 /* XXX: src_stencil is only available on gfx9+. dst_depth is never
337 * available on gfx9+. As such it's impossible to have both enabled at the
338 * same time and therefore length cannot overrun the array.
339 */
340 assert(length < 15);
341
342 sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
343 bld.exec_all().annotate("FB write OS")
344 .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
345 subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
346 length++;
347 }
348
349 fs_inst *load;
350 if (devinfo->ver >= 7) {
351 /* Send from the GRF */
352 fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
353 load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
354 payload.nr = bld.shader->alloc.allocate(regs_written(load));
355 load->dst = payload;
356
357 uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);
358
359 inst->desc =
360 (inst->group / 16) << 11 | /* rt slot group */
361 brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
362 prog_data->per_coarse_pixel_dispatch);
363
364 uint32_t ex_desc = 0;
365 if (devinfo->ver >= 11) {
366 /* Set the "Render Target Index" and "Src0 Alpha Present" fields
367 * in the extended message descriptor, in lieu of using a header.
368 */
369 ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;
370
371 if (key->nr_color_regions == 0)
372 ex_desc |= 1 << 20; /* Null Render Target */
373 }
374 inst->ex_desc = ex_desc;
375
376 inst->opcode = SHADER_OPCODE_SEND;
377 inst->resize_sources(3);
378 inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
379 inst->src[0] = brw_imm_ud(0);
380 inst->src[1] = brw_imm_ud(0);
381 inst->src[2] = payload;
382 inst->mlen = regs_written(load);
383 inst->ex_mlen = 0;
384 inst->header_size = header_size;
385 inst->check_tdr = true;
386 inst->send_has_side_effects = true;
387 } else {
388 /* Send from the MRF */
389 load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
390 sources, length, payload_header_size);
391
392 /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
393 * will do this for us if we just give it a COMPR4 destination.
394 */
395 if (devinfo->ver < 6 && bld.dispatch_width() == 16)
396 load->dst.nr |= BRW_MRF_COMPR4;
397
398 if (devinfo->ver < 6) {
399 /* Set up src[0] for the implied MOV from grf0-1 */
400 inst->resize_sources(1);
401 inst->src[0] = brw_vec8_grf(0, 0);
402 } else {
403 inst->resize_sources(0);
404 }
405 inst->base_mrf = 1;
406 inst->opcode = FS_OPCODE_FB_WRITE;
407 inst->mlen = regs_written(load);
408 inst->header_size = header_size;
409 }
410 }
411
412 static void
413 lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
414 {
415 const intel_device_info *devinfo = bld.shader->devinfo;
416 const fs_builder &ubld = bld.exec_all().group(8, 0);
417 const unsigned length = 2;
418 const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
419
420 if (bld.group() < 16) {
421 ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
422 BRW_REGISTER_TYPE_UD));
423 } else {
424 assert(bld.group() < 32);
425 const fs_reg header_sources[] = {
426 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
427 retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
428 };
429 ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
430
431 if (devinfo->ver >= 12) {
432 /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
433 * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
434 * target message header format was updated accordingly -- However
435 * the updated format only works for the lower 16 channels in a
436 * SIMD32 thread, since the higher 16 channels want the subspan data
437 * from r2 instead of r1, so we need to copy over the contents of
438 * r1.1 in order to fix things up.
439 */
440 ubld.group(1, 0).MOV(component(header, 9),
441 retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
442 }
443 }
444
445 /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
446 *
447 * "Must be zero for Render Target Read message."
448 *
449 * For bits :
450 * - 14 : Stencil Present to Render Target
451 * - 13 : Source Depth Present to Render Target
452 * - 12 : oMask to Render Target
453 * - 11 : Source0 Alpha Present to Render Target
454 */
455 ubld.group(1, 0).AND(component(header, 0),
456 component(header, 0),
457 brw_imm_ud(~INTEL_MASK(14, 11)));
458
459 inst->resize_sources(1);
460 inst->src[0] = header;
461 inst->opcode = FS_OPCODE_FB_READ;
462 inst->mlen = length;
463 inst->header_size = length;
464 }
465
466 static void
467 lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,
468 const fs_reg &coordinate,
469 const fs_reg &shadow_c,
470 const fs_reg &lod, const fs_reg &lod2,
471 const fs_reg &surface,
472 const fs_reg &sampler,
473 unsigned coord_components,
474 unsigned grad_components)
475 {
476 const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
477 op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
478 fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
479 fs_reg msg_end = msg_begin;
480
481 /* g0 header. */
482 msg_end = offset(msg_end, bld.group(8, 0), 1);
483
484 for (unsigned i = 0; i < coord_components; i++)
485 bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
486 offset(coordinate, bld, i));
487
488 msg_end = offset(msg_end, bld, coord_components);
489
490 /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
491 * require all three components to be present and zero if they are unused.
492 */
493 if (coord_components > 0 &&
494 (has_lod || shadow_c.file != BAD_FILE ||
495 (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
496 assert(coord_components <= 3);
497 for (unsigned i = 0; i < 3 - coord_components; i++)
498 bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
499
500 msg_end = offset(msg_end, bld, 3 - coord_components);
501 }
502
503 if (op == SHADER_OPCODE_TXD) {
504 /* TXD unsupported in SIMD16 mode. */
505 assert(bld.dispatch_width() == 8);
506
507 /* the slots for u and v are always present, but r is optional */
508 if (coord_components < 2)
509 msg_end = offset(msg_end, bld, 2 - coord_components);
510
511 /* P = u, v, r
512 * dPdx = dudx, dvdx, drdx
513 * dPdy = dudy, dvdy, drdy
514 *
515 * 1-arg: Does not exist.
516 *
517 * 2-arg: dudx dvdx dudy dvdy
518 * dPdx.x dPdx.y dPdy.x dPdy.y
519 * m4 m5 m6 m7
520 *
521 * 3-arg: dudx dvdx drdx dudy dvdy drdy
522 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
523 * m5 m6 m7 m8 m9 m10
524 */
525 for (unsigned i = 0; i < grad_components; i++)
526 bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
527
528 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
529
530 for (unsigned i = 0; i < grad_components; i++)
531 bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
532
533 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
534 }
535
536 if (has_lod) {
537 /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
538 * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
539 */
540 assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
541 bld.dispatch_width() == 16);
542
543 const brw_reg_type type =
544 (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
545 BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
546 bld.MOV(retype(msg_end, type), lod);
547 msg_end = offset(msg_end, bld, 1);
548 }
549
550 if (shadow_c.file != BAD_FILE) {
551 if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
552 /* There's no plain shadow compare message, so we use shadow
553 * compare with a bias of 0.0.
554 */
555 bld.MOV(msg_end, brw_imm_f(0.0f));
556 msg_end = offset(msg_end, bld, 1);
557 }
558
559 bld.MOV(msg_end, shadow_c);
560 msg_end = offset(msg_end, bld, 1);
561 }
562
563 inst->opcode = op;
564 inst->src[0] = reg_undef;
565 inst->src[1] = surface;
566 inst->src[2] = sampler;
567 inst->resize_sources(3);
568 inst->base_mrf = msg_begin.nr;
569 inst->mlen = msg_end.nr - msg_begin.nr;
570 inst->header_size = 1;
571 }
572
573 static void
574 lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,
575 const fs_reg &coordinate,
576 const fs_reg &shadow_c,
577 const fs_reg &lod, const fs_reg &lod2,
578 const fs_reg &sample_index,
579 const fs_reg &surface,
580 const fs_reg &sampler,
581 unsigned coord_components,
582 unsigned grad_components)
583 {
584 fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
585 fs_reg msg_coords = message;
586 unsigned header_size = 0;
587
588 if (inst->offset != 0) {
589 /* The offsets set up by the visitor are in the m1 header, so we can't
590 * go headerless.
591 */
592 header_size = 1;
593 message.nr--;
594 }
595
596 for (unsigned i = 0; i < coord_components; i++)
597 bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
598 offset(coordinate, bld, i));
599
600 fs_reg msg_end = offset(msg_coords, bld, coord_components);
601 fs_reg msg_lod = offset(msg_coords, bld, 4);
602
603 if (shadow_c.file != BAD_FILE) {
604 fs_reg msg_shadow = msg_lod;
605 bld.MOV(msg_shadow, shadow_c);
606 msg_lod = offset(msg_shadow, bld, 1);
607 msg_end = msg_lod;
608 }
609
610 switch (op) {
611 case SHADER_OPCODE_TXL:
612 case FS_OPCODE_TXB:
613 bld.MOV(msg_lod, lod);
614 msg_end = offset(msg_lod, bld, 1);
615 break;
616 case SHADER_OPCODE_TXD:
617 /**
618 * P = u, v, r
619 * dPdx = dudx, dvdx, drdx
620 * dPdy = dudy, dvdy, drdy
621 *
622 * Load up these values:
623 * - dudx dudy dvdx dvdy drdx drdy
624 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
625 */
626 msg_end = msg_lod;
627 for (unsigned i = 0; i < grad_components; i++) {
628 bld.MOV(msg_end, offset(lod, bld, i));
629 msg_end = offset(msg_end, bld, 1);
630
631 bld.MOV(msg_end, offset(lod2, bld, i));
632 msg_end = offset(msg_end, bld, 1);
633 }
634 break;
635 case SHADER_OPCODE_TXS:
636 msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
637 bld.MOV(msg_lod, lod);
638 msg_end = offset(msg_lod, bld, 1);
639 break;
640 case SHADER_OPCODE_TXF:
641 msg_lod = offset(msg_coords, bld, 3);
642 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
643 msg_end = offset(msg_lod, bld, 1);
644 break;
645 case SHADER_OPCODE_TXF_CMS:
646 msg_lod = offset(msg_coords, bld, 3);
647 /* lod */
648 bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
649 /* sample index */
650 bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
651 msg_end = offset(msg_lod, bld, 2);
652 break;
653 default:
654 break;
655 }
656
657 inst->opcode = op;
658 inst->src[0] = reg_undef;
659 inst->src[1] = surface;
660 inst->src[2] = sampler;
661 inst->resize_sources(3);
662 inst->base_mrf = message.nr;
663 inst->mlen = msg_end.nr - message.nr;
664 inst->header_size = header_size;
665
666 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
667 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
668 }
669
670 static bool
671 is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
672 {
673 if (devinfo->verx10 <= 70)
674 return false;
675
676 return sampler.file != IMM || sampler.ud >= 16;
677 }
678
679 static unsigned
680 sampler_msg_type(const intel_device_info *devinfo,
681 opcode opcode, bool shadow_compare)
682 {
683 assert(devinfo->ver >= 5);
684 switch (opcode) {
685 case SHADER_OPCODE_TEX:
686 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
687 GFX5_SAMPLER_MESSAGE_SAMPLE;
688 case FS_OPCODE_TXB:
689 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
690 GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
691 case SHADER_OPCODE_TXL:
692 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
693 GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
694 case SHADER_OPCODE_TXL_LZ:
695 return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
696 GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
697 case SHADER_OPCODE_TXS:
698 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
699 return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
700 case SHADER_OPCODE_TXD:
701 assert(!shadow_compare || devinfo->verx10 >= 75);
702 return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
703 GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
704 case SHADER_OPCODE_TXF:
705 return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
706 case SHADER_OPCODE_TXF_LZ:
707 assert(devinfo->ver >= 9);
708 return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
709 case SHADER_OPCODE_TXF_CMS_W:
710 assert(devinfo->ver >= 9);
711 return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
712 case SHADER_OPCODE_TXF_CMS:
713 return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
714 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
715 case SHADER_OPCODE_TXF_UMS:
716 assert(devinfo->ver >= 7);
717 return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
718 case SHADER_OPCODE_TXF_MCS:
719 assert(devinfo->ver >= 7);
720 return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
721 case SHADER_OPCODE_LOD:
722 return GFX5_SAMPLER_MESSAGE_LOD;
723 case SHADER_OPCODE_TG4:
724 assert(devinfo->ver >= 7);
725 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
726 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
727 break;
728 case SHADER_OPCODE_TG4_OFFSET:
729 assert(devinfo->ver >= 7);
730 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
731 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
732 case SHADER_OPCODE_SAMPLEINFO:
733 return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
734 default:
735 unreachable("not reached");
736 }
737 }
738
739 /**
740 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
741 * the given requested_alignment_sz.
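 *
 * When a source component is smaller than requested_alignment_sz (e.g.
 * 16-bit sources padded out to a full register), null padding sources are
 * interleaved so that each component starts on the requested boundary.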
742 */
743 static fs_inst *
744 emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst,
745 const fs_reg *src, unsigned sources,
746 unsigned header_size,
747 unsigned requested_alignment_sz)
748 {
749 unsigned length = 0;
750 unsigned num_srcs =
751 sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
752 fs_reg *src_comps = new fs_reg[num_srcs];
753
754 for (unsigned i = 0; i < header_size; i++)
755 src_comps[length++] = src[i];
756
757 for (unsigned i = header_size; i < sources; i++) {
758 unsigned src_sz =
759 retype(dst, src[i].type).component_size(bld.dispatch_width());
760 const enum brw_reg_type padding_payload_type =
761 brw_reg_type_from_bit_size(type_sz(src[i].type) * 8,
762 BRW_REGISTER_TYPE_UD);
763
764 src_comps[length++] = src[i];
765
766 /* Expand the real sources if component of requested payload type is
767 * larger than real source component.
768 */
769 if (src_sz < requested_alignment_sz) {
770 for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
771 src_comps[length++] = retype(fs_reg(), padding_payload_type);
772 }
773 }
774 }
775
776 fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
777 delete[] src_comps;
778
779 return inst;
780 }
781
782 static void
783 lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
784 const fs_reg &coordinate,
785 const fs_reg &shadow_c,
786 fs_reg lod, const fs_reg &lod2,
787 const fs_reg &min_lod,
788 const fs_reg &sample_index,
789 const fs_reg &mcs,
790 const fs_reg &surface,
791 const fs_reg &sampler,
792 const fs_reg &surface_handle,
793 const fs_reg &sampler_handle,
794 const fs_reg &tg4_offset,
795 unsigned payload_type_bit_size,
796 unsigned coord_components,
797 unsigned grad_components)
798 {
799 const intel_device_info *devinfo = bld.shader->devinfo;
800 const enum brw_reg_type payload_type =
801 brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
802 const enum brw_reg_type payload_unsigned_type =
803 brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
804 const enum brw_reg_type payload_signed_type =
805 brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
806 unsigned reg_width = bld.dispatch_width() / 8;
807 unsigned header_size = 0, length = 0;
808 fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
809 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
810 sources[i] = bld.vgrf(payload_type);
811
812 /* We must have exactly one of surface/sampler and surface/sampler_handle */
813 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
814 assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
815
816 if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
817 inst->offset != 0 || inst->eot ||
818 op == SHADER_OPCODE_SAMPLEINFO ||
819 sampler_handle.file != BAD_FILE ||
820 is_high_sampler(devinfo, sampler)) {
821 /* For general texture offsets (no txf workaround), we need a header to
822 * put them in.
823 *
824 * TG4 needs to place its channel select in the header, for interaction
825 * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
826 * larger sampler numbers we need to offset the Sampler State Pointer in
827 * the header.
828 */
829 fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
830 header_size = 1;
831 length++;
832
833 /* If we're requesting fewer than four channels worth of response,
834 * and we have an explicit header, we need to set up the sampler
835 * writemask. It's reversed from normal: 1 means "don't write".
836 */
837 if (!inst->eot && regs_written(inst) != 4 * reg_width) {
838 assert(regs_written(inst) % reg_width == 0);
839 unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
840 inst->offset |= mask << 12;
841 }
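/* For example, a shader reading only .xy has regs_written == 2 * reg_width,
 * so the mask computed above is 0xc: the z and w channels are flagged
 * "don't write" in bits 15:12 of the offset placed into the header.
 */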
842
843 /* Build the actual header */
844 const fs_builder ubld = bld.exec_all().group(8, 0);
845 const fs_builder ubld1 = ubld.group(1, 0);
846 ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
847 if (inst->offset) {
848 ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
849 } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
850 bld.shader->stage != MESA_SHADER_FRAGMENT) {
851 /* The vertex and fragment stages have g0.2 set to 0, so
852 * header0.2 is 0 when g0 is copied. Other stages may not, so we
853 * must set it to 0 to avoid setting undesirable bits in the
854 * message.
855 */
856 ubld1.MOV(component(header, 2), brw_imm_ud(0));
857 }
858
859 if (sampler_handle.file != BAD_FILE) {
860 /* Bindless sampler handles aren't relative to the sampler state
861 * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
862 * Instead, it's an absolute pointer relative to dynamic state base
863 * address.
864 *
865 * Sampler states are 16 bytes each and the pointer we give here has
866 * to be 32-byte aligned. In order to avoid more indirect messages
867 * than required, we assume that all bindless sampler states are
868 * 32-byte aligned. This sacrifices a bit of general state base
869 * address space but means we can do something more efficient in the
870 * shader.
871 */
872 ubld1.MOV(component(header, 3), sampler_handle);
873 } else if (is_high_sampler(devinfo, sampler)) {
874 fs_reg sampler_state_ptr =
875 retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
876
877 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
878 * with the ones included in g0.3 bits 4:0. Mask them out.
879 */
880 if (devinfo->ver >= 11) {
881 sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
882 ubld1.AND(sampler_state_ptr,
883 retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
884 brw_imm_ud(INTEL_MASK(31, 5)));
885 }
886
887 if (sampler.file == BRW_IMMEDIATE_VALUE) {
888 assert(sampler.ud >= 16);
889 const int sampler_state_size = 16; /* 16 bytes */
890
891 ubld1.ADD(component(header, 3), sampler_state_ptr,
892 brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
893 } else {
894 fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
895 ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
896 ubld1.SHL(tmp, tmp, brw_imm_ud(4));
897 ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
898 }
899 } else if (devinfo->ver >= 11) {
900 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
901 * with the ones included in g0.3 bits 4:0. Mask them out.
902 */
903 ubld1.AND(component(header, 3),
904 retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
905 brw_imm_ud(INTEL_MASK(31, 5)));
906 }
907 }
908
909 if (shadow_c.file != BAD_FILE) {
910 bld.MOV(sources[length], shadow_c);
911 length++;
912 }
913
914 bool coordinate_done = false;
915
916 /* Set up the LOD info */
917 switch (op) {
918 case FS_OPCODE_TXB:
919 case SHADER_OPCODE_TXL:
920 if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
921 op = SHADER_OPCODE_TXL_LZ;
922 break;
923 }
924 bld.MOV(sources[length], lod);
925 length++;
926 break;
927 case SHADER_OPCODE_TXD:
928 /* TXD should have been lowered in SIMD16 mode. */
929 assert(bld.dispatch_width() == 8);
930
931 /* Load dPdx and the coordinate together:
932 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
933 */
934 for (unsigned i = 0; i < coord_components; i++) {
935 bld.MOV(sources[length++], offset(coordinate, bld, i));
936
937 /* For cube map array, the coordinate is (u,v,r,ai) but there are
938 * only derivatives for (u, v, r).
939 */
940 if (i < grad_components) {
941 bld.MOV(sources[length++], offset(lod, bld, i));
942 bld.MOV(sources[length++], offset(lod2, bld, i));
943 }
944 }
945
946 coordinate_done = true;
947 break;
948 case SHADER_OPCODE_TXS:
949 bld.MOV(retype(sources[length], payload_unsigned_type), lod);
950 length++;
951 break;
952 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
953 /* We need an LOD; just use 0 */
954 bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
955 length++;
956 break;
957 case SHADER_OPCODE_TXF:
958 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
959 * On Gfx9 they are u, v, lod, r
960 */
961 bld.MOV(retype(sources[length++], payload_signed_type), coordinate);
962
963 if (devinfo->ver >= 9) {
964 if (coord_components >= 2) {
965 bld.MOV(retype(sources[length], payload_signed_type),
966 offset(coordinate, bld, 1));
967 } else {
968 sources[length] = brw_imm_d(0);
969 }
970 length++;
971 }
972
973 if (devinfo->ver >= 9 && lod.is_zero()) {
974 op = SHADER_OPCODE_TXF_LZ;
975 } else {
976 bld.MOV(retype(sources[length], payload_signed_type), lod);
977 length++;
978 }
979
980 for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
981 bld.MOV(retype(sources[length++], payload_signed_type),
982 offset(coordinate, bld, i));
983
984 coordinate_done = true;
985 break;
986
987 case SHADER_OPCODE_TXF_CMS:
988 case SHADER_OPCODE_TXF_CMS_W:
989 case SHADER_OPCODE_TXF_UMS:
990 case SHADER_OPCODE_TXF_MCS:
991 if (op == SHADER_OPCODE_TXF_UMS ||
992 op == SHADER_OPCODE_TXF_CMS ||
993 op == SHADER_OPCODE_TXF_CMS_W) {
994 bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
995 }
996
997 /* Data from the multisample control surface. */
998 if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
999 unsigned num_mcs_components = 1;
1000
1001 /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
1002 * Shared Functions - 3D Sampler - Messages - Message Format:
1003 *
1004 * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r
1005 */
1006 if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W)
1007 num_mcs_components = 4;
1008 else if (op == SHADER_OPCODE_TXF_CMS_W)
1009 num_mcs_components = 2;
1010
1011 for (unsigned i = 0; i < num_mcs_components; ++i) {
1012 bld.MOV(retype(sources[length++], payload_unsigned_type),
1013 mcs.file == IMM ? mcs : offset(mcs, bld, i));
1014 }
1015 }
1016
1017 /* There is no offsetting for this message; just copy in the integer
1018 * texture coordinates.
1019 */
1020 for (unsigned i = 0; i < coord_components; i++)
1021 bld.MOV(retype(sources[length++], payload_signed_type),
1022 offset(coordinate, bld, i));
1023
1024 coordinate_done = true;
1025 break;
1026 case SHADER_OPCODE_TG4_OFFSET:
1027 /* More crazy intermixing */
1028 for (unsigned i = 0; i < 2; i++) /* u, v */
1029 bld.MOV(sources[length++], offset(coordinate, bld, i));
1030
1031 for (unsigned i = 0; i < 2; i++) /* offu, offv */
1032 bld.MOV(retype(sources[length++], payload_signed_type),
1033 offset(tg4_offset, bld, i));
1034
1035 if (coord_components == 3) /* r if present */
1036 bld.MOV(sources[length++], offset(coordinate, bld, 2));
1037
1038 coordinate_done = true;
1039 break;
1040 default:
1041 break;
1042 }
1043
1044 /* Set up the coordinate (except for cases where it was done above) */
1045 if (!coordinate_done) {
1046 for (unsigned i = 0; i < coord_components; i++)
1047 bld.MOV(retype(sources[length++], payload_type),
1048 offset(coordinate, bld, i));
1049 }
1050
1051 if (min_lod.file != BAD_FILE) {
1052 /* Account for all of the missing coordinate sources */
1053 if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
1054 /* On DG2 and newer platforms, sample_d can only be used with 1D and
1055 * 2D surfaces, so the maximum number of gradient components is 2.
1056 * In spite of this limitation, the Bspec lists a mysterious R
1057 * component before the min_lod, so the maximum coordinate components
1058 * is 3.
1059 *
1060 * Wa_1209978020
1061 */
1062 length += 3 - coord_components;
1063 length += (2 - grad_components) * 2;
1064 } else {
1065 length += 4 - coord_components;
1066 if (op == SHADER_OPCODE_TXD)
1067 length += (3 - grad_components) * 2;
1068 }
1069
1070 bld.MOV(sources[length++], min_lod);
1071 }
1072
1073 const fs_reg src_payload =
1074 fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
1075 BRW_REGISTER_TYPE_F);
1076 /* In case of 16-bit payload each component takes one full register in
1077 * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
1078 * elements. In SIMD8H case hardware simply expects the components to be
1079 * padded (i.e., aligned on reg boundary).
1080 */
1081 fs_inst *load_payload_inst =
1082 emit_load_payload_with_padding(bld, src_payload, sources, length,
1083 header_size, REG_SIZE);
1084 unsigned mlen = load_payload_inst->size_written / REG_SIZE;
1085 unsigned simd_mode = 0;
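/* The "H" SIMD modes below indicate a 16-bit (half-precision/word) payload
 * to the sampler; the plain SIMD8/SIMD16 encodings are used for the usual
 * 32-bit parameters.
 */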
1086 if (payload_type_bit_size == 16) {
1087 assert(devinfo->ver >= 11);
1088 simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
1089 GFX10_SAMPLER_SIMD_MODE_SIMD16H;
1090 } else {
1091 simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
1092 BRW_SAMPLER_SIMD_MODE_SIMD16;
1093 }
1094
1095 /* Generate the SEND. */
1096 inst->opcode = SHADER_OPCODE_SEND;
1097 inst->mlen = mlen;
1098 inst->header_size = header_size;
1099
1100 const unsigned msg_type =
1101 sampler_msg_type(devinfo, op, inst->shadow_compare);
1102
1103 inst->sfid = BRW_SFID_SAMPLER;
1104 if (surface.file == IMM &&
1105 (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
1106 inst->desc = brw_sampler_desc(devinfo, surface.ud,
1107 sampler.file == IMM ? sampler.ud % 16 : 0,
1108 msg_type,
1109 simd_mode,
1110 0 /* return_format unused on gfx7+ */);
1111 inst->src[0] = brw_imm_ud(0);
1112 inst->src[1] = brw_imm_ud(0);
1113 } else if (surface_handle.file != BAD_FILE) {
1114 /* Bindless surface */
1115 assert(devinfo->ver >= 9);
1116 inst->desc = brw_sampler_desc(devinfo,
1117 GFX9_BTI_BINDLESS,
1118 sampler.file == IMM ? sampler.ud % 16 : 0,
1119 msg_type,
1120 simd_mode,
1121 0 /* return_format unused on gfx7+ */);
1122
1123 /* For bindless samplers, the entire address is included in the message
1124 * header so we can leave the portion in the message descriptor 0.
1125 */
1126 if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
1127 inst->src[0] = brw_imm_ud(0);
1128 } else {
1129 const fs_builder ubld = bld.group(1, 0).exec_all();
1130 fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1131 ubld.SHL(desc, sampler, brw_imm_ud(8));
1132 inst->src[0] = desc;
1133 }
1134
1135 /* We assume that the driver provided the handle in the top 20 bits so
1136 * we can use the surface handle directly as the extended descriptor.
1137 */
1138 inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
1139 } else {
1140 /* Immediate portion of the descriptor */
1141 inst->desc = brw_sampler_desc(devinfo,
1142 0, /* surface */
1143 0, /* sampler */
1144 msg_type,
1145 simd_mode,
1146 0 /* return_format unused on gfx7+ */);
1147 const fs_builder ubld = bld.group(1, 0).exec_all();
1148 fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
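/* The binding table index occupies bits 7:0 of the sampler message
 * descriptor and the sampler index bits 11:8, which is why a shared
 * surface/sampler index can be broadcast to both fields with a single
 * multiply by 0x101 below.
 */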
1149 if (surface.equals(sampler)) {
1150 /* This case is common in GL */
1151 ubld.MUL(desc, surface, brw_imm_ud(0x101));
1152 } else {
1153 if (sampler_handle.file != BAD_FILE) {
1154 ubld.MOV(desc, surface);
1155 } else if (sampler.file == IMM) {
1156 ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
1157 } else {
1158 ubld.SHL(desc, sampler, brw_imm_ud(8));
1159 ubld.OR(desc, desc, surface);
1160 }
1161 }
1162 ubld.AND(desc, desc, brw_imm_ud(0xfff));
1163
1164 inst->src[0] = component(desc, 0);
1165 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1166 }
1167
1168 inst->ex_desc = 0;
1169
1170 inst->src[2] = src_payload;
1171 inst->resize_sources(3);
1172
1173 if (inst->eot) {
1174 /* EOT sampler messages don't make sense to split because it would
1175 * involve ending half of the thread early.
1176 */
1177 assert(inst->group == 0);
1178 /* We need to use SENDC for EOT sampler messages */
1179 inst->check_tdr = true;
1180 inst->send_has_side_effects = true;
1181 }
1182
1183 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
1184 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
1185 }
1186
1187 static unsigned
1188 get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
1189 opcode op, const fs_reg *src)
1190 {
1191 unsigned src_type_size = 0;
1192
1193 /* All sources need to have the same size, therefore seek the first valid
1194 * and take the size from there.
1195 */
1196 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1197 if (src[i].file != BAD_FILE) {
1198 src_type_size = brw_reg_type_to_size(src[i].type);
1199 break;
1200 }
1201 }
1202
1203 assert(src_type_size == 2 || src_type_size == 4);
1204
1205 #ifndef NDEBUG
1206 /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
1207 * compressed multisampled surfaces. There the payload contains MCS data
1208 * which is already in 16-bits unlike the other parameters that need forced
1209 * conversion.
1210 */
1211 if (devinfo->verx10 < 125 ||
1212 (op != SHADER_OPCODE_TXF_CMS_W &&
1213 op != SHADER_OPCODE_TXF_CMS)) {
1214 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1215 assert(src[i].file == BAD_FILE ||
1216 brw_reg_type_to_size(src[i].type) == src_type_size);
1217 }
1218 }
1219 #endif
1220
1221 if (devinfo->verx10 < 125)
1222 return src_type_size * 8;
1223
1224 /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
1225 * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
1226 * Format [GFX12:HAS:1209977870] *
1227 *
1228 * ld2dms_w SIMD8H and SIMD16H Only
1229 * ld_mcs SIMD8H and SIMD16H Only
1230 * ld2dms REMOVEDBY(GEN:HAS:1406788836)
1231 */
1232
1233 if (op == SHADER_OPCODE_TXF_CMS_W ||
1234 op == SHADER_OPCODE_TXF_CMS ||
1235 op == SHADER_OPCODE_TXF_UMS ||
1236 op == SHADER_OPCODE_TXF_MCS)
1237 src_type_size = 2;
1238
1239 return src_type_size * 8;
1240 }
1241
1242 static void
1243 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
1244 {
1245 const intel_device_info *devinfo = bld.shader->devinfo;
1246 const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
1247 const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
1248 const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
1249 const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
1250 const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
1251 const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
1252 const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
1253 const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
1254 const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
1255 const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
1256 const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
1257 const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
1258 assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
1259 const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
1260 assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
1261 const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
1262
1263 if (devinfo->ver >= 7) {
1264 const unsigned msg_payload_type_bit_size =
1265 get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
1266
1267 /* 16-bit payloads are available only on gfx11+ */
1268 assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
1269
1270 lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
1271 shadow_c, lod, lod2, min_lod,
1272 sample_index,
1273 mcs, surface, sampler,
1274 surface_handle, sampler_handle,
1275 tg4_offset,
1276 msg_payload_type_bit_size,
1277 coord_components, grad_components);
1278 } else if (devinfo->ver >= 5) {
1279 lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
1280 shadow_c, lod, lod2, sample_index,
1281 surface, sampler,
1282 coord_components, grad_components);
1283 } else {
1284 lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
1285 shadow_c, lod, lod2,
1286 surface, sampler,
1287 coord_components, grad_components);
1288 }
1289 }
1290
1291 /**
1292 * Predicate the specified instruction on the vector mask.
1293 */
1294 static void
1295 emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
1296 {
1297 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1298 bld.group() == inst->group &&
1299 bld.dispatch_width() == inst->exec_size);
1300
1301 const fs_builder ubld = bld.exec_all().group(1, 0);
1302
1303 const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
1304 const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
1305 ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
1306 const unsigned subreg = sample_mask_flag_subreg(v);
1307
1308 ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);
1309
1310 if (inst->predicate) {
1311 assert(inst->predicate == BRW_PREDICATE_NORMAL);
1312 assert(!inst->predicate_inverse);
1313 assert(inst->flag_subreg == 0);
1314 /* Combine the vector mask with the existing predicate by using a
1315 * vertical predication mode.
1316 */
1317 inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
1318 } else {
1319 inst->flag_subreg = subreg;
1320 inst->predicate = BRW_PREDICATE_NORMAL;
1321 inst->predicate_inverse = false;
1322 }
1323 }
1324
1325 static void
1326 setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
1327 const fs_reg &surface, const fs_reg &surface_handle)
1328 {
1329 const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1330
1331 /* We must have exactly one of surface and surface_handle */
1332 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1333
1334 if (surface.file == IMM) {
1335 inst->desc = desc | (surface.ud & 0xff);
1336 inst->src[0] = brw_imm_ud(0);
1337 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1338 } else if (surface_handle.file != BAD_FILE) {
1339 /* Bindless surface */
1340 assert(devinfo->ver >= 9);
1341 inst->desc = desc | GFX9_BTI_BINDLESS;
1342 inst->src[0] = brw_imm_ud(0);
1343
1344 /* We assume that the driver provided the handle in the top 20 bits so
1345 * we can use the surface handle directly as the extended descriptor.
1346 */
1347 inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
1348 } else {
1349 inst->desc = desc;
1350 const fs_builder ubld = bld.exec_all().group(1, 0);
1351 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1352 ubld.AND(tmp, surface, brw_imm_ud(0xff));
1353 inst->src[0] = component(tmp, 0);
1354 inst->src[1] = brw_imm_ud(0); /* ex_desc */
1355 }
1356 }
1357
1358 static void
1359 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
1360 {
1361 const intel_device_info *devinfo = bld.shader->devinfo;
1362
1363 /* Get the logical send arguments. */
1364 const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1365 const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1366 const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1367 const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1368 const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
1369 const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1370 const fs_reg &allow_sample_mask =
1371 inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
1372 assert(arg.file == IMM);
1373 assert(allow_sample_mask.file == IMM);
1374
1375 /* Calculate the total number of components of the payload. */
1376 const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
1377 const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1378
1379 const bool is_typed_access =
1380 inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
1381 inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
1382 inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
1383
1384 const bool is_surface_access = is_typed_access ||
1385 inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
1386 inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
1387 inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
1388
1389 const bool is_stateless =
1390 surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
1391 surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
1392
1393 const bool has_side_effects = inst->has_side_effects();
1394
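/* Accesses that are allowed to honour the fragment sample mask fetch the
 * live mask via brw_sample_mask_reg(); everything else uses an all-ones
 * immediate so no channels are masked off.
 */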
1395 fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
1396 fs_reg(brw_imm_d(0xffff));
1397
1398 /* From the BDW PRM Volume 7, page 147:
1399 *
1400 * "For the Data Cache Data Port*, the header must be present for the
1401 * following message types: [...] Typed read/write/atomics"
1402 *
1403 * Earlier generations have a similar wording. Because of this restriction
1404 * we don't attempt to implement sample masks via predication for such
1405 * messages prior to Gfx9, since we have to provide a header anyway. On
1406 * Gfx11+ the header has been removed so we can only use predication.
1407 *
1408 * For all stateless A32 messages, we also need a header
1409 */
1410 fs_reg header;
1411 if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
1412 fs_builder ubld = bld.exec_all().group(8, 0);
1413 header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1414 if (is_stateless) {
1415 assert(!is_surface_access);
1416 ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
1417 } else {
1418 ubld.MOV(header, brw_imm_d(0));
1419 if (is_surface_access)
1420 ubld.group(1, 0).MOV(component(header, 7), sample_mask);
1421 }
1422 }
1423 const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
1424
1425 fs_reg payload, payload2;
1426 unsigned mlen, ex_mlen = 0;
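/* Gfx9+ supports split sends, which take the message payload as two
 * separate register ranges (mlen and ex_mlen GRFs). Using them avoids
 * copying the address and data into one contiguous payload.
 */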
1427 if (devinfo->ver >= 9 &&
1428 (src.file == BAD_FILE || header.file == BAD_FILE)) {
1429 /* We have split sends on gfx9 and above */
1430 if (header.file == BAD_FILE) {
1431 payload = bld.move_to_vgrf(addr, addr_sz);
1432 payload2 = bld.move_to_vgrf(src, src_sz);
1433 mlen = addr_sz * (inst->exec_size / 8);
1434 ex_mlen = src_sz * (inst->exec_size / 8);
1435 } else {
1436 assert(src.file == BAD_FILE);
1437 payload = header;
1438 payload2 = bld.move_to_vgrf(addr, addr_sz);
1439 mlen = header_sz;
1440 ex_mlen = addr_sz * (inst->exec_size / 8);
1441 }
1442 } else {
1443 /* Allocate space for the payload. */
1444 const unsigned sz = header_sz + addr_sz + src_sz;
1445 payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
1446 fs_reg *const components = new fs_reg[sz];
1447 unsigned n = 0;
1448
1449 /* Construct the payload. */
1450 if (header.file != BAD_FILE)
1451 components[n++] = header;
1452
1453 for (unsigned i = 0; i < addr_sz; i++)
1454 components[n++] = offset(addr, bld, i);
1455
1456 for (unsigned i = 0; i < src_sz; i++)
1457 components[n++] = offset(src, bld, i);
1458
1459 bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
1460 mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
1461
1462 delete[] components;
1463 }
1464
1465 /* Predicate the instruction on the sample mask if no header is
1466 * provided.
1467 */
1468 if ((header.file == BAD_FILE || !is_surface_access) &&
1469 sample_mask.file != BAD_FILE && sample_mask.file != IMM)
1470 brw_emit_predicate_on_sample_mask(bld, inst);
1471
1472 uint32_t sfid;
1473 switch (inst->opcode) {
1474 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1475 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1476 /* Byte scattered opcodes go through the normal data cache */
1477 sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1478 break;
1479
1480 case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1481 case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1482 sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
1483 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
1484 BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
1485 break;
1486
1487 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1488 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1489 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1490 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
1491 /* Untyped Surface messages go through the data cache but the SFID value
1492 * changed on Haswell.
1493 */
1494 sfid = (devinfo->verx10 >= 75 ?
1495 HSW_SFID_DATAPORT_DATA_CACHE_1 :
1496 GFX7_SFID_DATAPORT_DATA_CACHE);
1497 break;
1498
1499 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1500 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1501 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1502 /* Typed surface messages go through the render cache on IVB and the
1503 * data cache on HSW+.
1504 */
1505 sfid = (devinfo->verx10 >= 75 ?
1506 HSW_SFID_DATAPORT_DATA_CACHE_1 :
1507 GFX6_SFID_DATAPORT_RENDER_CACHE);
1508 break;
1509
1510 default:
1511 unreachable("Unsupported surface opcode");
1512 }
1513
1514 uint32_t desc;
1515 switch (inst->opcode) {
1516 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1517 desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1518 arg.ud, /* num_channels */
1519 false /* write */);
1520 break;
1521
1522 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1523 desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1524 arg.ud, /* num_channels */
1525 true /* write */);
1526 break;
1527
1528 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1529 desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1530 arg.ud, /* bit_size */
1531 false /* write */);
1532 break;
1533
1534 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1535 desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1536 arg.ud, /* bit_size */
1537 true /* write */);
1538 break;
1539
1540 case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1541 assert(arg.ud == 32); /* bit_size */
1542 desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1543 false /* write */);
1544 break;
1545
1546 case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1547 assert(arg.ud == 32); /* bit_size */
1548 desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1549 true /* write */);
1550 break;
1551
1552 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1553 desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
1554 arg.ud, /* atomic_op */
1555 !inst->dst.is_null());
1556 break;
1557
1558 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
1559 desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
1560 arg.ud, /* atomic_op */
1561 !inst->dst.is_null());
1562 break;
1563
1564 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1565 desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1566 arg.ud, /* num_channels */
1567 false /* write */);
1568 break;
1569
1570 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1571 desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1572 arg.ud, /* num_channels */
1573 true /* write */);
1574 break;
1575
1576 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1577 desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1578 arg.ud, /* atomic_op */
1579 !inst->dst.is_null());
1580 break;
1581
1582 default:
1583 unreachable("Unknown surface logical instruction");
1584 }
1585
1586 /* Update the original instruction. */
1587 inst->opcode = SHADER_OPCODE_SEND;
1588 inst->mlen = mlen;
1589 inst->ex_mlen = ex_mlen;
1590 inst->header_size = header_sz;
1591 inst->send_has_side_effects = has_side_effects;
1592 inst->send_is_volatile = !has_side_effects;
1593
1594 /* Set up SFID and descriptors */
1595 inst->sfid = sfid;
1596 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1597
1598 inst->resize_sources(4);
1599
1600 /* Finally, the payload */
1601 inst->src[2] = payload;
1602 inst->src[3] = payload2;
1603 }
1604
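/* Map a legacy dataport atomic opcode (BRW_AOP_*) to the equivalent LSC
 * atomic opcode.  Note that BRW_AOP_MOV becomes a plain atomic store and
 * BRW_AOP_CMPWR becomes a compare-exchange.
 */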
1605 static enum lsc_opcode
1606 brw_atomic_op_to_lsc_atomic_op(unsigned op)
1607 {
1608 switch (op) {
1609 case BRW_AOP_AND:
1610 return LSC_OP_ATOMIC_AND;
1611 case BRW_AOP_OR:
1612 return LSC_OP_ATOMIC_OR;
1613 case BRW_AOP_XOR:
1614 return LSC_OP_ATOMIC_XOR;
1615 case BRW_AOP_MOV:
1616 return LSC_OP_ATOMIC_STORE;
1617 case BRW_AOP_INC:
1618 return LSC_OP_ATOMIC_INC;
1619 case BRW_AOP_DEC:
1620 return LSC_OP_ATOMIC_DEC;
1621 case BRW_AOP_ADD:
1622 return LSC_OP_ATOMIC_ADD;
1623 case BRW_AOP_SUB:
1624 return LSC_OP_ATOMIC_SUB;
1625 case BRW_AOP_IMAX:
1626 return LSC_OP_ATOMIC_MAX;
1627 case BRW_AOP_IMIN:
1628 return LSC_OP_ATOMIC_MIN;
1629 case BRW_AOP_UMAX:
1630 return LSC_OP_ATOMIC_UMAX;
1631 case BRW_AOP_UMIN:
1632 return LSC_OP_ATOMIC_UMIN;
1633 case BRW_AOP_CMPWR:
1634 return LSC_OP_ATOMIC_CMPXCHG;
1635 default:
1636 assert(false);
1637 unreachable("invalid atomic opcode");
1638 }
1639 }
1640
1641 static enum lsc_opcode
1642 brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
1643 {
1644 switch (aop) {
1645 case BRW_AOP_FMAX:
1646 return LSC_OP_ATOMIC_FMAX;
1647 case BRW_AOP_FMIN:
1648 return LSC_OP_ATOMIC_FMIN;
1649 case BRW_AOP_FCMPWR:
1650 return LSC_OP_ATOMIC_FCMPXCHG;
1651 case BRW_AOP_FADD:
1652 return LSC_OP_ATOMIC_FADD;
1653 default:
1654 unreachable("Unsupported float atomic opcode");
1655 }
1656 }
1657
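/* Map a data bit size to the LSC data-size encoding.  Judging by the enum
 * names, sub-dword data uses the D8U32/D16U32 encodings, i.e. each element
 * still occupies a full 32-bit slot in the message payload.
 */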
1658 static enum lsc_data_size
1659 lsc_bits_to_data_size(unsigned bit_size)
1660 {
1661 switch (bit_size / 8) {
1662 case 1: return LSC_DATA_SIZE_D8U32;
1663 case 2: return LSC_DATA_SIZE_D16U32;
1664 case 4: return LSC_DATA_SIZE_D32;
1665 case 8: return LSC_DATA_SIZE_D64;
1666 default:
1667 unreachable("Unsupported data size.");
1668 }
1669 }
1670
1671 static void
1672 lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
1673 {
1674 const intel_device_info *devinfo = bld.shader->devinfo;
1675 assert(devinfo->has_lsc);
1676
1677 /* Get the logical send arguments. */
1678 const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1679 const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1680 const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1681 const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1682 const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
1683 const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1684 const fs_reg allow_sample_mask =
1685 inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
1686 assert(arg.file == IMM);
1687 assert(allow_sample_mask.file == IMM);
1688
1689 /* Calculate the total number of components of the payload. */
1690 const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
1691 const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1692 const unsigned src_sz = type_sz(src.type);
1693
1694 const bool has_side_effects = inst->has_side_effects();
1695
1696 unsigned ex_mlen = 0;
1697 fs_reg payload, payload2;
1698 payload = bld.move_to_vgrf(addr, addr_sz);
1699 if (src.file != BAD_FILE) {
1700 payload2 = bld.move_to_vgrf(src, src_comps);
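      /* ex_mlen is the length of the data payload in GRFs.  For example, a
       * SIMD16 store of 4 dword components gives (4 * 4 * 16) / 32 = 8,
       * assuming the usual 32-byte REG_SIZE.
       */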
1701 ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
1702 }
1703
1704 /* Predicate the instruction on the sample mask if needed */
1705 fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
1706 fs_reg(brw_imm_d(0xffff));
1707 if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
1708 brw_emit_predicate_on_sample_mask(bld, inst);
1709
1710 if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
1711 inst->sfid = GFX12_SFID_SLM;
1712 else
1713 inst->sfid = GFX12_SFID_UGM;
1714
1715 /* We must have exactly one of surface and surface_handle */
1716 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1717
1718 enum lsc_addr_surface_type surf_type;
1719 if (surface_handle.file != BAD_FILE)
1720 surf_type = LSC_ADDR_SURFTYPE_BSS;
1721 else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
1722 surf_type = LSC_ADDR_SURFTYPE_FLAT;
1723 else
1724 surf_type = LSC_ADDR_SURFTYPE_BTI;
1725
1726 switch (inst->opcode) {
1727 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1728 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
1729 surf_type, LSC_ADDR_SIZE_A32,
1730 1 /* num_coordinates */,
1731 LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
1732 false /* transpose */,
1733 LSC_CACHE_LOAD_L1STATE_L3MOCS,
1734 true /* has_dest */);
1735 break;
1736 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1737 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
1738 surf_type, LSC_ADDR_SIZE_A32,
1739 1 /* num_coordinates */,
1740 LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
1741 false /* transpose */,
1742 LSC_CACHE_STORE_L1STATE_L3MOCS,
1743 false /* has_dest */);
1744 break;
1745 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1746 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
1747 /* Bspec: Atomic instruction -> Cache section:
1748 *
1749 * Atomic messages are always forced to "un-cacheable" in the L1
1750 * cache.
1751 */
1752 enum lsc_opcode opcode =
1753 inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ?
1754 brw_atomic_op_to_lsc_fatomic_op(arg.ud) :
1755 brw_atomic_op_to_lsc_atomic_op(arg.ud);
1756 inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
1757 surf_type, LSC_ADDR_SIZE_A32,
1758 1 /* num_coordinates */,
1759 lsc_bits_to_data_size(src_sz * 8),
1760 1 /* num_channels */,
1761 false /* transpose */,
1762 LSC_CACHE_STORE_L1UC_L3WB,
1763 !inst->dst.is_null());
1764 break;
1765 }
1766 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1767 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
1768 surf_type, LSC_ADDR_SIZE_A32,
1769 1 /* num_coordinates */,
1770 lsc_bits_to_data_size(arg.ud),
1771 1 /* num_channels */,
1772 false /* transpose */,
1773 LSC_CACHE_LOAD_L1STATE_L3MOCS,
1774 true /* has_dest */);
1775 break;
1776 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1777 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
1778 surf_type, LSC_ADDR_SIZE_A32,
1779 1 /* num_coordinates */,
1780 lsc_bits_to_data_size(arg.ud),
1781 1 /* num_channels */,
1782 false /* transpose */,
1783 LSC_CACHE_STORE_L1STATE_L3MOCS,
1784 false /* has_dest */);
1785 break;
1786 default:
1787 unreachable("Unknown surface logical instruction");
1788 }
1789
1790 inst->src[0] = brw_imm_ud(0);
1791
1792 /* Set up extended descriptors */
1793 switch (surf_type) {
1794 case LSC_ADDR_SURFTYPE_FLAT:
1795 inst->src[1] = brw_imm_ud(0);
1796 break;
1797 case LSC_ADDR_SURFTYPE_BSS:
1798 /* We assume that the driver provided the handle in the top 20 bits so
1799 * we can use the surface handle directly as the extended descriptor.
1800 */
1801 inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
1802 break;
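   /* For BTI surfaces the binding table index presumably lives in the top
    * byte of the extended descriptor, which is what lsc_bti_ex_desc() (for
    * immediates) and the SHL by 24 below produce.
    */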
1803 case LSC_ADDR_SURFTYPE_BTI:
1804 if (surface.file == IMM) {
1805 inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
1806 } else {
1807 const fs_builder ubld = bld.exec_all().group(1, 0);
1808 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1809 ubld.SHL(tmp, surface, brw_imm_ud(24));
1810 inst->src[1] = component(tmp, 0);
1811 }
1812 break;
1813 default:
1814 unreachable("Unknown surface type");
1815 }
1816
1817 /* Update the original instruction. */
1818 inst->opcode = SHADER_OPCODE_SEND;
1819 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
1820 inst->ex_mlen = ex_mlen;
1821 inst->header_size = 0;
1822 inst->send_has_side_effects = has_side_effects;
1823 inst->send_is_volatile = !has_side_effects;
1824
1825 inst->resize_sources(4);
1826
1827 /* Finally, the payload */
1828 inst->src[2] = payload;
1829 inst->src[3] = payload2;
1830 }
1831
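/* Lower OWord block read/write logical opcodes into SENDs whose single
 * header GRF carries the (scalar) address, with a separate data payload
 * for writes.
 */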
1832 static void
1833 lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
1834 {
1835 const intel_device_info *devinfo = bld.shader->devinfo;
1836 assert(devinfo->ver >= 9);
1837
1838 /* Get the logical send arguments. */
1839 const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1840 const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1841 const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1842 const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1843 const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1844 assert(arg.file == IMM);
1845 assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
1846 assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
1847
1848 const bool is_stateless =
1849 surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
1850 surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
1851
1852 const bool has_side_effects = inst->has_side_effects();
1853
1854 const bool align_16B =
1855 inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
1856
1857 const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
1858
1859 /* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */
1860 fs_builder ubld = bld.exec_all().group(8, 0);
1861 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1862
1863 if (is_stateless)
1864 ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
1865 else
1866 ubld.MOV(header, brw_imm_d(0));
1867
1868 /* Address in OWord units when aligned to OWords. */
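   /* One OWord is 16 bytes, hence the shift by 4; e.g. byte address 0x130
    * becomes OWord offset 0x13.
    */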
1869 if (align_16B)
1870 ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
1871 else
1872 ubld.group(1, 0).MOV(component(header, 2), addr);
1873
1874 fs_reg data;
1875 unsigned ex_mlen = 0;
1876 if (write) {
1877 const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1878 data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
1879 ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
1880 }
1881
1882 inst->opcode = SHADER_OPCODE_SEND;
1883 inst->mlen = 1;
1884 inst->ex_mlen = ex_mlen;
1885 inst->header_size = 1;
1886 inst->send_has_side_effects = has_side_effects;
1887 inst->send_is_volatile = !has_side_effects;
1888
1889 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1890
1891 const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
1892 arg.ud, write);
1893 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1894
1895 inst->resize_sources(4);
1896
1897 inst->src[2] = header;
1898 inst->src[3] = data;
1899 }
1900
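/* Build the single-GRF header used by A64 OWord block messages: the header
 * is zeroed and its first two dwords are filled with the 64-bit address by
 * the 2-wide MOV below.
 */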
1901 static fs_reg
1902 emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
1903 {
1904 const fs_builder ubld = bld.exec_all().group(8, 0);
1905
1906 assert(type_sz(addr.type) == 8 && addr.stride == 0);
1907
1908 fs_reg expanded_addr = addr;
1909 if (addr.file == UNIFORM) {
1910 /* We can't do stride 1 with the UNIFORM file, it requires stride 0 */
1911 expanded_addr = ubld.vgrf(BRW_REGISTER_TYPE_UQ);
1912 expanded_addr.stride = 0;
1913 ubld.MOV(expanded_addr, retype(addr, BRW_REGISTER_TYPE_UQ));
1914 }
1915
1916 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
1917 ubld.MOV(header, brw_imm_ud(0));
1918
1919 /* Use a 2-wide MOV to fill out the address */
1920 fs_reg addr_vec2 = expanded_addr;
1921 addr_vec2.type = BRW_REGISTER_TYPE_UD;
1922 addr_vec2.stride = 1;
1923 ubld.group(2, 0).MOV(header, addr_vec2);
1924
1925 return header;
1926 }
1927
1928 static void
1929 emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
1930 {
1931 assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
1932 const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;
1933
1934 /* If we're a fragment shader, we have to predicate with the sample mask
1935 * to avoid helper invocations in instructions with side effects, unless
1936 * they are explicitly required.
1937 *
1938 * There are also special cases when we actually want to run on helpers
1939 * (ray queries).
1940 */
1941 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
1942 if (enable_helpers)
1943 emit_predicate_on_vector_mask(bld, inst);
1944 else if (inst->has_side_effects())
1945 brw_emit_predicate_on_sample_mask(bld, inst);
1946 }
1947
1948 static void
1949 lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
1950 {
1951 const intel_device_info *devinfo = bld.shader->devinfo;
1952
1953 /* Get the logical send arguments. */
1954 const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
1955 const fs_reg &src = inst->src[A64_LOGICAL_SRC];
1956 const unsigned src_sz = type_sz(src.type);
1957
1958 const unsigned src_comps = inst->components_read(1);
1959 assert(inst->src[A64_LOGICAL_ARG].file == IMM);
1960 const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
1961 const bool has_side_effects = inst->has_side_effects();
1962
1963 fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
1964 fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
1965 BRW_REGISTER_TYPE_UD);
1966 unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;
1967
1968 switch (inst->opcode) {
1969 case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
1970 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
1971 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
1972 1 /* num_coordinates */,
1973 LSC_DATA_SIZE_D32, arg /* num_channels */,
1974 false /* transpose */,
1975 LSC_CACHE_LOAD_L1STATE_L3MOCS,
1976 true /* has_dest */);
1977 break;
1978 case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
1979 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
1980 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
1981 1 /* num_coordinates */,
1982 LSC_DATA_SIZE_D32, arg /* num_channels */,
1983 false /* transpose */,
1984 LSC_CACHE_STORE_L1STATE_L3MOCS,
1985 false /* has_dest */);
1986 break;
1987 case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
1988 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
1989 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
1990 1 /* num_coordinates */,
1991 lsc_bits_to_data_size(arg),
1992 1 /* num_channels */,
1993 false /* transpose */,
1994 LSC_CACHE_LOAD_L1STATE_L3MOCS,
1995 true /* has_dest */);
1996 break;
1997 case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
1998 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
1999 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2000 1 /* num_coordinates */,
2001 lsc_bits_to_data_size(arg),
2002 1 /* num_channels */,
2003 false /* transpose */,
2004 LSC_CACHE_STORE_L1STATE_L3MOCS,
2005 false /* has_dest */);
2006 break;
2007 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
2008 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
2009 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
2010 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
2011 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
2012 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: {
2013 /* Bspec: Atomic instruction -> Cache section:
2014 *
2015 * Atomic messages are always forced to "un-cacheable" in the L1
2016 * cache.
2017 */
2018 enum lsc_opcode opcode =
2019 (inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||
2020 inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||
2021 inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?
2022 brw_atomic_op_to_lsc_atomic_op(arg) :
2023 brw_atomic_op_to_lsc_fatomic_op(arg);
2024 inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
2025 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2026 1 /* num_coordinates */,
2027 lsc_bits_to_data_size(src_sz * 8),
2028 1 /* num_channels */,
2029 false /* transpose */,
2030 LSC_CACHE_STORE_L1UC_L3WB,
2031 !inst->dst.is_null());
2032 break;
2033 }
2034 default:
2035 unreachable("Unknown A64 logical instruction");
2036 }
2037
2038 if (bld.shader->stage == MESA_SHADER_FRAGMENT)
2039 emit_fragment_mask(bld, inst);
2040
2041 /* Update the original instruction. */
2042 inst->opcode = SHADER_OPCODE_SEND;
2043 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2044 inst->ex_mlen = ex_mlen;
2045 inst->header_size = 0;
2046 inst->send_has_side_effects = has_side_effects;
2047 inst->send_is_volatile = !has_side_effects;
2048
2049 /* Set up SFID and descriptors */
2050 inst->sfid = GFX12_SFID_UGM;
2051 inst->resize_sources(4);
2052 inst->src[0] = brw_imm_ud(0); /* desc */
2053 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2054 inst->src[2] = payload;
2055 inst->src[3] = payload2;
2056 }
2057
2058 static void
2059 lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
2060 {
2061 const intel_device_info *devinfo = bld.shader->devinfo;
2062
2063 const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
2064 const fs_reg &src = inst->src[A64_LOGICAL_SRC];
2065 const unsigned src_comps = inst->components_read(1);
2066 assert(inst->src[A64_LOGICAL_ARG].file == IMM);
2067 const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
2068 const bool has_side_effects = inst->has_side_effects();
2069
2070 fs_reg payload, payload2;
2071 unsigned mlen, ex_mlen = 0, header_size = 0;
2072 if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
2073 inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
2074 inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
2075 assert(devinfo->ver >= 9);
2076
2077 /* OWORD messages only take a scalar address in a header */
2078 mlen = 1;
2079 header_size = 1;
2080 payload = emit_a64_oword_block_header(bld, addr);
2081
2082 if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
2083 ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
2084 payload2 = retype(bld.move_to_vgrf(src, src_comps),
2085 BRW_REGISTER_TYPE_UD);
2086 }
2087 } else if (devinfo->ver >= 9) {
2088 /* On Skylake and above, we have SENDS */
2089 mlen = 2 * (inst->exec_size / 8);
2090 ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
2091 payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
2092 payload2 = retype(bld.move_to_vgrf(src, src_comps),
2093 BRW_REGISTER_TYPE_UD);
2094 } else {
2095 /* Add two because the address is 64-bit */
2096 const unsigned dwords = 2 + src_comps;
2097 mlen = dwords * (inst->exec_size / 8);
2098
2099 fs_reg sources[5];
2100
2101 sources[0] = addr;
2102
2103 for (unsigned i = 0; i < src_comps; i++)
2104 sources[1 + i] = offset(src, bld, i);
2105
2106 payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
2107 bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
2108 }
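   /* At this point the payload takes one of three shapes: a single header
    * GRF holding the scalar address (OWord block messages), a split
    * address/data pair for SENDS on Gfx9+, or one combined address+data
    * payload built with LOAD_PAYLOAD on older platforms.
    */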
2109
2110 uint32_t desc;
2111 switch (inst->opcode) {
2112 case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
2113 desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
2114 arg, /* num_channels */
2115 false /* write */);
2116 break;
2117
2118 case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
2119 desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
2120 arg, /* num_channels */
2121 true /* write */);
2122 break;
2123
2124 case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
2125 desc = brw_dp_a64_oword_block_rw_desc(devinfo,
2126 true, /* align_16B */
2127 arg, /* num_dwords */
2128 false /* write */);
2129 break;
2130
2131 case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
2132 desc = brw_dp_a64_oword_block_rw_desc(devinfo,
2133 false, /* align_16B */
2134 arg, /* num_dwords */
2135 false /* write */);
2136 break;
2137
2138 case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
2139 desc = brw_dp_a64_oword_block_rw_desc(devinfo,
2140 true, /* align_16B */
2141 arg, /* num_dwords */
2142 true /* write */);
2143 break;
2144
2145 case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
2146 desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
2147 arg, /* bit_size */
2148 false /* write */);
2149 break;
2150
2151 case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
2152 desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
2153 arg, /* bit_size */
2154 true /* write */);
2155 break;
2156
2157 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
2158 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
2159 arg, /* atomic_op */
2160 !inst->dst.is_null());
2161 break;
2162
2163 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
2164 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16,
2165 arg, /* atomic_op */
2166 !inst->dst.is_null());
2167 break;
2168
2169 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
2170 desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
2171 arg, /* atomic_op */
2172 !inst->dst.is_null());
2173 break;
2174
2175 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
2176 desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
2177 16, /* bit_size */
2178 arg, /* atomic_op */
2179 !inst->dst.is_null());
2180 break;
2181
2182 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
2183 desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
2184 32, /* bit_size */
2185 arg, /* atomic_op */
2186 !inst->dst.is_null());
2187 break;
2188
2189 default:
2190 unreachable("Unknown A64 logical instruction");
2191 }
2192
2193 if (bld.shader->stage == MESA_SHADER_FRAGMENT)
2194 emit_fragment_mask(bld, inst);
2195
2196 /* Update the original instruction. */
2197 inst->opcode = SHADER_OPCODE_SEND;
2198 inst->mlen = mlen;
2199 inst->ex_mlen = ex_mlen;
2200 inst->header_size = header_size;
2201 inst->send_has_side_effects = has_side_effects;
2202 inst->send_is_volatile = !has_side_effects;
2203
2204 /* Set up SFID and descriptors */
2205 inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2206 inst->desc = desc;
2207 inst->resize_sources(4);
2208 inst->src[0] = brw_imm_ud(0); /* desc */
2209 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2210 inst->src[2] = payload;
2211 inst->src[3] = payload2;
2212 }
2213
2214 static void
2215 lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
2216 fs_inst *inst)
2217 {
2218 const intel_device_info *devinfo = bld.shader->devinfo;
2219 ASSERTED const brw_compiler *compiler = bld.shader->compiler;
2220
2221 fs_reg index = inst->src[0];
2222
2223 /* We are switching the instruction from an ALU-like instruction to a
2224 * send-from-grf instruction. Since sends can't handle strides or
2225 * source modifiers, we have to make a copy of the offset source.
2226 */
2227 fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1);
2228
2229 assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
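   /* src[2] carries the known alignment of the offset in bytes: with at
    * least dword alignment we can use a single vec4 load below, otherwise
    * we fall back to four per-dword loads.
    */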
2230 unsigned alignment = inst->src[2].ud;
2231
2232 inst->opcode = SHADER_OPCODE_SEND;
2233 inst->sfid = GFX12_SFID_UGM;
2234 inst->resize_sources(3);
2235 inst->src[0] = brw_imm_ud(0);
2236
2237 if (index.file == IMM) {
2238 inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud));
2239 } else {
2240 const fs_builder ubld = bld.exec_all().group(1, 0);
2241 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
2242 ubld.SHL(tmp, index, brw_imm_ud(24));
2243 inst->src[1] = component(tmp, 0);
2244 }
2245
2246 assert(!compiler->indirect_ubos_use_sampler);
2247
2248 inst->src[2] = ubo_offset; /* payload */
2249 if (alignment >= 4) {
2250 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
2251 LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
2252 1 /* num_coordinates */,
2253 LSC_DATA_SIZE_D32,
2254 4 /* num_channels */,
2255 false /* transpose */,
2256 LSC_CACHE_LOAD_L1STATE_L3MOCS,
2257 true /* has_dest */);
2258 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2259 } else {
2260 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
2261 LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
2262 1 /* num_coordinates */,
2263 LSC_DATA_SIZE_D32,
2264 1 /* num_channels */,
2265 false /* transpose */,
2266 LSC_CACHE_LOAD_L1STATE_L3MOCS,
2267 true /* has_dest */);
2268 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2269 /* The byte scattered messages can only read one dword at a time so
2270 * we have to duplicate the message 4 times to read the full vec4.
2271 * Hopefully, dead code will clean up the mess if some of them aren't
2272 * needed.
2273 */
2274 assert(inst->size_written == 16 * inst->exec_size);
2275 inst->size_written /= 4;
2276 for (unsigned c = 1; c < 4; c++) {
2277 /* Emit a copy of the instruction because we're about to modify
2278 * it. Because this loop starts at 1, we will emit copies for the
2279 * first 3 and the final one will be the modified instruction.
2280 */
2281 bld.emit(*inst);
2282
2283 /* Offset the source */
2284 inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
2285 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
2286
2287 /* Offset the destination */
2288 inst->dst = offset(inst->dst, bld, 1);
2289 }
2290 }
2291 }
2292
2293 static void
2294 lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
2295 {
2296 const intel_device_info *devinfo = bld.shader->devinfo;
2297 const brw_compiler *compiler = bld.shader->compiler;
2298
2299 if (devinfo->ver >= 7) {
2300 fs_reg index = inst->src[0];
2301 /* We are switching the instruction from an ALU-like instruction to a
2302 * send-from-grf instruction. Since sends can't handle strides or
2303 * source modifiers, we have to make a copy of the offset source.
2304 */
2305 fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
2306 bld.MOV(ubo_offset, inst->src[1]);
2307
2308 assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
2309 unsigned alignment = inst->src[2].ud;
2310
2311 inst->opcode = SHADER_OPCODE_SEND;
2312 inst->mlen = inst->exec_size / 8;
2313 inst->resize_sources(3);
2314
2315 if (index.file == IMM) {
2316 inst->desc = index.ud & 0xff;
2317 inst->src[0] = brw_imm_ud(0);
2318 } else {
2319 inst->desc = 0;
2320 const fs_builder ubld = bld.exec_all().group(1, 0);
2321 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
2322 ubld.AND(tmp, index, brw_imm_ud(0xff));
2323 inst->src[0] = component(tmp, 0);
2324 }
2325 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2326 inst->src[2] = ubo_offset; /* payload */
2327
2328 if (compiler->indirect_ubos_use_sampler) {
2329 const unsigned simd_mode =
2330 inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
2331 BRW_SAMPLER_SIMD_MODE_SIMD16;
2332
2333 inst->sfid = BRW_SFID_SAMPLER;
2334 inst->desc |= brw_sampler_desc(devinfo, 0, 0,
2335 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
2336 simd_mode, 0);
2337 } else if (alignment >= 4) {
2338 inst->sfid = (devinfo->verx10 >= 75 ?
2339 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2340 GFX7_SFID_DATAPORT_DATA_CACHE);
2341 inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
2342 4, /* num_channels */
2343 false /* write */);
2344 } else {
2345 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2346 inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
2347 32, /* bit_size */
2348 false /* write */);
2349 /* The byte scattered messages can only read one dword at a time so
2350 * we have to duplicate the message 4 times to read the full vec4.
2351 * Hopefully, dead code will clean up the mess if some of them aren't
2352 * needed.
2353 */
2354 assert(inst->size_written == 16 * inst->exec_size);
2355 inst->size_written /= 4;
2356 for (unsigned c = 1; c < 4; c++) {
2357 /* Emit a copy of the instruction because we're about to modify
2358 * it. Because this loop starts at 1, we will emit copies for the
2359 * first 3 and the final one will be the modified instruction.
2360 */
2361 bld.emit(*inst);
2362
2363 /* Offset the source */
2364 inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
2365 bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
2366
2367 /* Offset the destination */
2368 inst->dst = offset(inst->dst, bld, 1);
2369 }
2370 }
2371 } else {
2372 const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
2373 BRW_REGISTER_TYPE_UD);
2374
2375 bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
2376
2377 inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
2378 inst->resize_sources(1);
2379 inst->base_mrf = payload.nr;
2380 inst->header_size = 1;
2381 inst->mlen = 1 + inst->exec_size / 8;
2382 }
2383 }
2384
2385 static void
2386 lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
2387 {
2388 assert(bld.shader->devinfo->ver < 6);
2389
2390 inst->base_mrf = 2;
2391 inst->mlen = inst->sources * inst->exec_size / 8;
2392
2393 if (inst->sources > 1) {
2394 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
2395 * "Message Payload":
2396 *
2397 * "Operand0[7]. For the INT DIV functions, this operand is the
2398 * denominator."
2399 * ...
2400 * "Operand1[7]. For the INT DIV functions, this operand is the
2401 * numerator."
2402 */
2403 const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
2404 const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
2405 const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
2406
2407 inst->resize_sources(1);
2408 inst->src[0] = src0;
2409
2410 assert(inst->exec_size == 8);
2411 bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
2412 }
2413 }
2414
2415 static void
2416 lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
2417 {
2418 const intel_device_info *devinfo = bld.shader->devinfo;
2419 fs_reg global_addr = inst->src[0];
2420 const fs_reg &btd_record = inst->src[1];
2421
2422 const unsigned mlen = 2;
2423 const fs_builder ubld = bld.exec_all().group(8, 0);
2424 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2425
2426 ubld.MOV(header, brw_imm_ud(0));
2427 switch (inst->opcode) {
2428 case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2429 assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
2430 global_addr.type = BRW_REGISTER_TYPE_UD;
2431 global_addr.stride = 1;
2432 ubld.group(2, 0).MOV(header, global_addr);
2433 break;
2434
2435 case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2436 /* The bottom bit is the Stack ID release bit */
2437 ubld.group(1, 0).MOV(header, brw_imm_ud(1));
2438 break;
2439
2440 default:
2441 unreachable("Invalid BTD message");
2442 }
2443
2444 /* Stack IDs are always in R1 regardless of whether we're coming from a
2445 * bindless shader or a regular compute shader.
2446 */
2447 fs_reg stack_ids =
2448 retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW);
2449 bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
2450
2451 unsigned ex_mlen = 0;
2452 fs_reg payload;
2453 if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
2454 ex_mlen = 2 * (inst->exec_size / 8);
2455 payload = bld.move_to_vgrf(btd_record, 1);
2456 } else {
2457 assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
2458 /* All these messages take a BTD and things complain if we don't provide
2459 * one for RETIRE. However, it shouldn't ever actually get used so fill
2460 * it with zero.
2461 */
2462 ex_mlen = 2 * (inst->exec_size / 8);
2463 payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
2464 }
2465
2466 /* Update the original instruction. */
2467 inst->opcode = SHADER_OPCODE_SEND;
2468 inst->mlen = mlen;
2469 inst->ex_mlen = ex_mlen;
2470 inst->header_size = 0; /* HW docs require has_header = false */
2471 inst->send_has_side_effects = true;
2472 inst->send_is_volatile = false;
2473
2474 /* Set up SFID and descriptors */
2475 inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
2476 inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
2477 GEN_RT_BTD_MESSAGE_SPAWN);
2478 inst->resize_sources(4);
2479 inst->src[0] = brw_imm_ud(0); /* desc */
2480 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2481 inst->src[2] = header;
2482 inst->src[3] = payload;
2483 }
2484
2485 static void
2486 lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
2487 {
2488 const intel_device_info *devinfo = bld.shader->devinfo;
2489 /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
2490 * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q
2491 * types on Gfx12.5, we need to tweak the stride with a value of 1 dword
2492 * so that the MOV operates on 2 components rather than twice the same
2493 * component.
2494 */
2495 fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD);
2496 globals_addr.stride = 1;
2497 const fs_reg &bvh_level =
2498 inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
2499 inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
2500 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
2501 inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
2502 const fs_reg &trace_ray_control =
2503 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
2504 inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
2505 bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
2506 inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
2507 const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
2508 assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
2509 const bool synchronous = synchronous_src.ud;
2510
2511 const unsigned mlen = 1;
2512 const fs_builder ubld = bld.exec_all().group(8, 0);
2513 fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
2514 ubld.MOV(header, brw_imm_ud(0));
2515 ubld.group(2, 0).MOV(header, globals_addr);
2516 if (synchronous)
2517 ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
2518
2519 const unsigned ex_mlen = inst->exec_size / 8;
2520 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
2521 if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
2522 trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
2523 bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
2524 (bvh_level.ud & 0x7)));
2525 } else {
2526 bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
2527 bld.OR(payload, payload, bvh_level);
2528 }
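   /* Either way the payload dword holds the BVH level in its low bits and
    * the trace ray control value starting at bit 8, matching the layout of
    * the immediate case above.
    */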
2529
2530 /* When doing synchronous traversal, the HW implicitly computes the
2531 * stack_id using the following formula:
2532 *
2533 * EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
2534 *
2535 * Only in the asynchronous case do we need to provide the stack_id in the
2536 * payload register.
2537 */
2538 if (!synchronous) {
2539 bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
2540 retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
2541 brw_imm_uw(0x7ff));
2542 }
2543
2544 /* Update the original instruction. */
2545 inst->opcode = SHADER_OPCODE_SEND;
2546 inst->mlen = mlen;
2547 inst->ex_mlen = ex_mlen;
2548 inst->header_size = 0; /* HW docs require has_header = false */
2549 inst->send_has_side_effects = true;
2550 inst->send_is_volatile = false;
2551
2552 /* Set up SFID and descriptors */
2553 inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
2554 inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
2555 inst->resize_sources(4);
2556 inst->src[0] = brw_imm_ud(0); /* desc */
2557 inst->src[1] = brw_imm_ud(0); /* ex_desc */
2558 inst->src[2] = header;
2559 inst->src[3] = payload;
2560 }
2561
2562 bool
2563 fs_visitor::lower_logical_sends()
2564 {
2565 bool progress = false;
2566
2567 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2568 const fs_builder ibld(this, block, inst);
2569
2570 switch (inst->opcode) {
2571 case FS_OPCODE_FB_WRITE_LOGICAL:
2572 assert(stage == MESA_SHADER_FRAGMENT);
2573 lower_fb_write_logical_send(ibld, inst,
2574 brw_wm_prog_data(prog_data),
2575 (const brw_wm_prog_key *)key,
2576 payload);
2577 break;
2578
2579 case FS_OPCODE_FB_READ_LOGICAL:
2580 lower_fb_read_logical_send(ibld, inst);
2581 break;
2582
2583 case SHADER_OPCODE_TEX_LOGICAL:
2584 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
2585 break;
2586
2587 case SHADER_OPCODE_TXD_LOGICAL:
2588 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
2589 break;
2590
2591 case SHADER_OPCODE_TXF_LOGICAL:
2592 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
2593 break;
2594
2595 case SHADER_OPCODE_TXL_LOGICAL:
2596 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
2597 break;
2598
2599 case SHADER_OPCODE_TXS_LOGICAL:
2600 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
2601 break;
2602
2603 case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
2604 lower_sampler_logical_send(ibld, inst,
2605 SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
2606 break;
2607
2608 case FS_OPCODE_TXB_LOGICAL:
2609 lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
2610 break;
2611
2612 case SHADER_OPCODE_TXF_CMS_LOGICAL:
2613 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
2614 break;
2615
2616 case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
2617 case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
2618 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
2619 break;
2620
2621 case SHADER_OPCODE_TXF_UMS_LOGICAL:
2622 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
2623 break;
2624
2625 case SHADER_OPCODE_TXF_MCS_LOGICAL:
2626 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
2627 break;
2628
2629 case SHADER_OPCODE_LOD_LOGICAL:
2630 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
2631 break;
2632
2633 case SHADER_OPCODE_TG4_LOGICAL:
2634 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
2635 break;
2636
2637 case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
2638 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
2639 break;
2640
2641 case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
2642 lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
2643 break;
2644
2645 case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
2646 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
2647 case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
2648 case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
2649 case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
2650 case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
2651 if (devinfo->has_lsc) {
2652 lower_lsc_surface_logical_send(ibld, inst);
2653 break;
2654 }
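      /* Without LSC, fall through to the legacy surface lowering below. */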
2655 case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
2656 case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
2657 case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
2658 case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
2659 case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
2660 lower_surface_logical_send(ibld, inst);
2661 break;
2662
2663 case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
2664 case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
2665 case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
2666 lower_surface_block_logical_send(ibld, inst);
2667 break;
2668
2669 case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
2670 case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
2671 case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
2672 case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
2673 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
2674 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
2675 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
2676 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
2677 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
2678 case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
2679 if (devinfo->has_lsc) {
2680 lower_lsc_a64_logical_send(ibld, inst);
2681 break;
2682 }
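      /* Without LSC, fall through to the legacy A64 lowering below. */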
2683 case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
2684 case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
2685 case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
2686 lower_a64_logical_send(ibld, inst);
2687 break;
2688
2689 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
2690 if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
2691 lower_lsc_varying_pull_constant_logical_send(ibld, inst);
2692 else
2693 lower_varying_pull_constant_logical_send(ibld, inst);
2694 break;
2695
2696 case SHADER_OPCODE_RCP:
2697 case SHADER_OPCODE_RSQ:
2698 case SHADER_OPCODE_SQRT:
2699 case SHADER_OPCODE_EXP2:
2700 case SHADER_OPCODE_LOG2:
2701 case SHADER_OPCODE_SIN:
2702 case SHADER_OPCODE_COS:
2703 case SHADER_OPCODE_POW:
2704 case SHADER_OPCODE_INT_QUOTIENT:
2705 case SHADER_OPCODE_INT_REMAINDER:
2706 /* The math opcodes are overloaded for the send-like and
2707 * expression-like instructions which seems kind of icky. Gfx6+ has
2708 * a native (but rather quirky) MATH instruction so we don't need to
2709 * do anything here. On Gfx4-5 we'll have to lower the Gfx6-like
2710 * logical instructions (which we can easily recognize because they
2711 * have mlen = 0) into send-like virtual instructions.
2712 */
2713 if (devinfo->ver < 6 && inst->mlen == 0) {
2714 lower_math_logical_send(ibld, inst);
2715 break;
2716
2717 } else {
2718 continue;
2719 }
2720
2721 case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2722 case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2723 lower_btd_logical_send(ibld, inst);
2724 break;
2725
2726 case RT_OPCODE_TRACE_RAY_LOGICAL:
2727 lower_trace_ray_logical_send(ibld, inst);
2728 break;
2729
2730 case SHADER_OPCODE_URB_READ_LOGICAL:
2731 lower_urb_read_logical_send(ibld, inst);
2732 break;
2733
2734 case SHADER_OPCODE_URB_WRITE_LOGICAL:
2735 lower_urb_write_logical_send(ibld, inst);
2736 break;
2737
2738 default:
2739 continue;
2740 }
2741
2742 progress = true;
2743 }
2744
2745 if (progress)
2746 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2747
2748 return progress;
2749 }
2750