1 /*
2 * Copyright © 2010, 2022 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /**
25 * @file elk_lower_logical_sends.cpp
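*
* Lowering of logical message opcodes (URB reads/writes, framebuffer
* reads/writes, sampler and surface messages) into hardware SEND
* instructions with explicit descriptors and payloads.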
26 */
27
28 #include "elk_eu.h"
29 #include "elk_fs.h"
30 #include "elk_fs_builder.h"
31
32 using namespace elk;
33
34 static void
35 lower_urb_read_logical_send(const fs_builder &bld, elk_fs_inst *inst)
36 {
37 const intel_device_info *devinfo = bld.shader->devinfo;
38 const bool per_slot_present =
39 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
40
41 assert(inst->size_written % REG_SIZE == 0);
42 assert(inst->header_size == 0);
43
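/* Build the read payload: the URB handle goes first, optionally followed
* by the per-slot offsets; both are counted as message header.
*/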
44 elk_fs_reg payload_sources[2];
45 unsigned header_size = 0;
46 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
47 if (per_slot_present)
48 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
49
50 elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(header_size),
51 ELK_REGISTER_TYPE_F);
52 bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);
53
54 inst->opcode = ELK_SHADER_OPCODE_SEND;
55 inst->header_size = header_size;
56
57 inst->sfid = ELK_SFID_URB;
58 inst->desc = elk_urb_desc(devinfo,
59 GFX8_URB_OPCODE_SIMD8_READ,
60 per_slot_present,
61 false,
62 inst->offset);
63
64 inst->mlen = header_size;
65 inst->ex_desc = 0;
66 inst->ex_mlen = 0;
67 inst->send_is_volatile = true;
68
69 inst->resize_sources(4);
70
71 inst->src[0] = elk_imm_ud(0); /* desc */
72 inst->src[1] = elk_imm_ud(0); /* ex_desc */
73 inst->src[2] = payload;
74 inst->src[3] = elk_null_reg();
75 }
76
77 static void
78 lower_urb_read_logical_send_xe2(const fs_builder &bld, elk_fs_inst *inst)
79 {
80 const intel_device_info *devinfo = bld.shader->devinfo;
81 assert(devinfo->has_lsc);
82
83 assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
84 assert(inst->header_size == 0);
85
86 /* Get the logical send arguments. */
87 const elk_fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
88
89 /* Calculate the total number of components of the payload. */
90 const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));
91
92 elk_fs_reg payload = bld.vgrf(ELK_REGISTER_TYPE_UD);
93
94 bld.MOV(payload, handle);
95
96 /* The low 24 bits of the URB handle are a byte offset into the URB area.
97 * Add the (OWord) offset of the read, converted to bytes, to this value.
98 */
99 if (inst->offset) {
100 bld.ADD(payload, payload, elk_imm_ud(inst->offset * 16));
101 inst->offset = 0;
102 }
103
104 elk_fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
105 if (offsets.file != BAD_FILE) {
106 elk_fs_reg offsets_B = bld.vgrf(ELK_REGISTER_TYPE_UD);
107 bld.SHL(offsets_B, offsets, elk_imm_ud(4)); /* OWords -> Bytes */
108 bld.ADD(payload, payload, offsets_B);
109 }
110
111 inst->sfid = ELK_SFID_URB;
112
113 assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);
114
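/* Ask the LSC for a flat-address (A32) load of dst_comps 32-bit channels
* per lane; the URB handle computed above already encodes the byte offset.
*/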
115 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
116 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
117 1 /* num_coordinates */,
118 LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
119 false /* transpose */,
120 LSC_CACHE(devinfo, STORE, L1UC_L3UC),
121 false /* has_dest */);
122
123
124 /* Update the original instruction. */
125 inst->opcode = ELK_SHADER_OPCODE_SEND;
126 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
127 inst->ex_mlen = 0;
128 inst->header_size = 0;
129 inst->send_has_side_effects = true;
130 inst->send_is_volatile = false;
131
132 inst->resize_sources(4);
133
134 inst->src[0] = elk_imm_ud(0);
135 inst->src[1] = elk_imm_ud(0);
136
137 inst->src[2] = payload;
138 inst->src[3] = elk_null_reg();
139 }
140
141 static void
142 lower_urb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst)
143 {
144 const intel_device_info *devinfo = bld.shader->devinfo;
145 const bool per_slot_present =
146 inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
147 const bool channel_mask_present =
148 inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;
149
150 assert(inst->header_size == 0);
151
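/* Message layout: URB handle, then the optional per-slot offsets and
* channel mask (all counted as header), followed by the data to write.
*/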
152 const unsigned length = 1 + per_slot_present + channel_mask_present +
153 inst->components_read(URB_LOGICAL_SRC_DATA);
154
155 elk_fs_reg *payload_sources = new elk_fs_reg[length];
156 elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(length),
157 ELK_REGISTER_TYPE_F);
158
159 unsigned header_size = 0;
160 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
161 if (per_slot_present)
162 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
163
164 if (channel_mask_present)
165 payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
166
167 for (unsigned i = header_size, j = 0; i < length; i++, j++)
168 payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);
169
170 bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);
171
172 delete [] payload_sources;
173
174 inst->opcode = ELK_SHADER_OPCODE_SEND;
175 inst->header_size = header_size;
176 inst->dst = elk_null_reg();
177
178 inst->sfid = ELK_SFID_URB;
179 inst->desc = elk_urb_desc(devinfo,
180 GFX8_URB_OPCODE_SIMD8_WRITE,
181 per_slot_present,
182 channel_mask_present,
183 inst->offset);
184
185 inst->mlen = length;
186 inst->ex_desc = 0;
187 inst->ex_mlen = 0;
188 inst->send_has_side_effects = true;
189
190 inst->resize_sources(4);
191
192 inst->src[0] = elk_imm_ud(0); /* desc */
193 inst->src[1] = elk_imm_ud(0); /* ex_desc */
194 inst->src[2] = payload;
195 inst->src[3] = elk_null_reg();
196 }
197
198 static void
199 lower_urb_write_logical_send_xe2(const fs_builder &bld, elk_fs_inst *inst)
200 {
201 const intel_device_info *devinfo = bld.shader->devinfo;
202 assert(devinfo->has_lsc);
203
204 /* Get the logical send arguments. */
205 const elk_fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
206 const elk_fs_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
207 inst->src[URB_LOGICAL_SRC_DATA] : elk_fs_reg(elk_imm_ud(0));
208 assert(type_sz(src.type) == 4);
209
210 /* Calculate the total number of components of the payload. */
211 const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
212 const unsigned src_sz = type_sz(src.type);
213
214 elk_fs_reg payload = bld.vgrf(ELK_REGISTER_TYPE_UD);
215
216 bld.MOV(payload, handle);
217
218 /* The low 24 bits of the URB handle are a byte offset into the URB area.
219 * Add the (OWord) offset of the write, converted to bytes, to this value.
220 */
221 if (inst->offset) {
222 bld.ADD(payload, payload, elk_imm_ud(inst->offset * 16));
223 inst->offset = 0;
224 }
225
226 elk_fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
227 if (offsets.file != BAD_FILE) {
228 elk_fs_reg offsets_B = bld.vgrf(ELK_REGISTER_TYPE_UD);
229 bld.SHL(offsets_B, offsets, elk_imm_ud(4)); /* OWords -> Bytes */
230 bld.ADD(payload, payload, offsets_B);
231 }
232
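/* The channel mask source keeps the enables in its upper 16 bits; shift
* them down to the raw mask form that LSC_OP_STORE_CMASK expects.
*/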
233 const elk_fs_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
234 unsigned mask = 0;
235
236 if (cmask.file != BAD_FILE) {
237 assert(cmask.file == IMM);
238 assert(cmask.type == ELK_REGISTER_TYPE_UD);
239 mask = cmask.ud >> 16;
240 }
241
242 elk_fs_reg payload2 = bld.move_to_vgrf(src, src_comps);
243 const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
244
245 inst->sfid = ELK_SFID_URB;
246
247 enum elk_lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
248 inst->desc = lsc_msg_desc_wcmask(devinfo, op, inst->exec_size,
249 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
250 1 /* num_coordinates */,
251 LSC_DATA_SIZE_D32, src_comps /* num_channels */,
252 false /* transpose */,
253 LSC_CACHE(devinfo, STORE, L1UC_L3UC),
254 false /* has_dest */, mask);
255
256
257 /* Update the original instruction. */
258 inst->opcode = ELK_SHADER_OPCODE_SEND;
259 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
260 inst->ex_mlen = ex_mlen;
261 inst->header_size = 0;
262 inst->send_has_side_effects = true;
263 inst->send_is_volatile = false;
264
265 inst->resize_sources(4);
266
267 inst->src[0] = elk_imm_ud(0);
268 inst->src[1] = elk_imm_ud(0);
269
270 inst->src[2] = payload;
271 inst->src[3] = payload2;
272 }
273
274 static void
275 setup_color_payload(const fs_builder &bld, const elk_wm_prog_key *key,
276 elk_fs_reg *dst, elk_fs_reg color, unsigned components)
277 {
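/* If the key requires it, clamp each color component to [0, 1] with
* saturating MOVs before fanning the components out into dst[].
*/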
278 if (key->clamp_fragment_color) {
279 elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
280 assert(color.type == ELK_REGISTER_TYPE_F);
281
282 for (unsigned i = 0; i < components; i++)
283 set_saturate(true,
284 bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
285
286 color = tmp;
287 }
288
289 for (unsigned i = 0; i < components; i++)
290 dst[i] = offset(color, bld, i);
291 }
292
293 static void
294 lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
295 const struct elk_wm_prog_data *prog_data,
296 const elk_wm_prog_key *key,
297 const elk_fs_thread_payload &payload)
298 {
299 assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
300 const intel_device_info *devinfo = bld.shader->devinfo;
301 const elk_fs_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
302 const elk_fs_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
303 const elk_fs_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
304 const elk_fs_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
305 const elk_fs_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
306 const elk_fs_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
307 elk_fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
308 const unsigned components =
309 inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
310
311 assert(inst->target != 0 || src0_alpha.file == BAD_FILE);
312
313 /* We can potentially have a message length of up to 15, so we have to set
314 * base_mrf to either 0 or 1 in order to fit in m0..m15.
315 */
316 elk_fs_reg sources[15];
317 int header_size = 2, payload_header_size;
318 unsigned length = 0;
319
320 if (devinfo->ver < 6) {
321 /* TODO: Support SIMD32 on gfx4-5 */
322 assert(bld.group() < 16);
323
324 /* For gfx4-5, we always have a header consisting of g0 and g1. We have
325 * an implied MOV from g0,g1 to the start of the message. The MOV from
326 * g0 is handled by the hardware and the MOV from g1 is provided by the
327 * generator. This is required because, on gfx4-5, the generator may
328 * generate two write messages with different message lengths in order
329 * to handle AA data properly.
330 *
331 * Also, since the pixel mask goes in the g0 portion of the message and
332 * since render target writes are the last thing in the shader, we write
333 * the pixel mask directly into g0 and it will get copied as part of the
334 * implied write.
335 */
336 if (prog_data->uses_kill) {
337 bld.exec_all().group(1, 0)
338 .MOV(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW),
339 elk_sample_mask_reg(bld));
340 }
341
342 assert(length == 0);
343 length = 2;
344 } else if ((devinfo->verx10 <= 70 &&
345 prog_data->uses_kill) ||
346 (devinfo->ver < 11 &&
347 (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
348 assert(devinfo->ver < 20);
349
350 /* From the Sandy Bridge PRM, volume 4, page 198:
351 *
352 * "Dispatched Pixel Enables. One bit per pixel indicating
353 * which pixels were originally enabled when the thread was
354 * dispatched. This field is only required for the end-of-
355 * thread message and on all dual-source messages."
356 */
357 const fs_builder ubld = bld.exec_all().group(8, 0);
358
359 elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
360 if (bld.group() < 16) {
361 /* The header starts off as g0 and g1 for the first half */
362 ubld.group(16, 0).MOV(header, retype(elk_vec8_grf(0, 0),
363 ELK_REGISTER_TYPE_UD));
364 } else {
365 /* The header starts off as g0 and g2 for the second half */
366 assert(bld.group() < 32);
367 const elk_fs_reg header_sources[2] = {
368 retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD),
369 retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD),
370 };
371 ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
372
373 /* Gfx12 will require additional fix-ups if we ever hit this path. */
374 assert(devinfo->ver < 12);
375 }
376
377 uint32_t g00_bits = 0;
378
379 /* Set "Source0 Alpha Present to RenderTarget" bit in message
380 * header.
381 */
382 if (src0_alpha.file != BAD_FILE)
383 g00_bits |= 1 << 11;
384
385 /* Set computes stencil to render target */
386 if (prog_data->computed_stencil)
387 g00_bits |= 1 << 14;
388
389 if (g00_bits) {
390 /* OR extra bits into g0.0 */
391 ubld.group(1, 0).OR(component(header, 0),
392 retype(elk_vec1_grf(0, 0),
393 ELK_REGISTER_TYPE_UD),
394 elk_imm_ud(g00_bits));
395 }
396
397 /* Set the render target index for choosing BLEND_STATE. */
398 if (inst->target > 0) {
399 ubld.group(1, 0).MOV(component(header, 2), elk_imm_ud(inst->target));
400 }
401
402 if (prog_data->uses_kill) {
403 ubld.group(1, 0).MOV(retype(component(header, 15),
404 ELK_REGISTER_TYPE_UW),
405 elk_sample_mask_reg(bld));
406 }
407
408 assert(length == 0);
409 sources[0] = header;
410 sources[1] = horiz_offset(header, 8);
411 length = 2;
412 }
413 assert(length == 0 || length == 2);
414 header_size = length;
415
416 if (payload.aa_dest_stencil_reg[0]) {
417 assert(inst->group < 16);
418 sources[length] = elk_fs_reg(VGRF, bld.shader->alloc.allocate(1));
419 bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
420 .MOV(sources[length],
421 elk_fs_reg(elk_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
422 length++;
423 }
424
425 if (src0_alpha.file != BAD_FILE) {
426 for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
427 const fs_builder &ubld = bld.exec_all().group(8, i)
428 .annotate("FB write src0 alpha");
429 const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_F);
430 ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
431 setup_color_payload(ubld, key, &sources[length], tmp, 1);
432 length++;
433 }
434 }
435
436 if (sample_mask.file != BAD_FILE) {
437 const elk_fs_reg tmp(VGRF, bld.shader->alloc.allocate(reg_unit(devinfo)),
438 ELK_REGISTER_TYPE_UD);
439
440 /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
441 * relevant. Since these are unsigned words, one VGRF is always
442 * 16-wide, but only the lower or higher 8 channels will be used by the
443 * hardware when doing a SIMD8 write, depending on whether we have
444 * selected the subspans for the first or second half respectively.
445 */
446 assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
447 sample_mask.type = ELK_REGISTER_TYPE_UW;
448 sample_mask.stride *= 2;
449
450 bld.exec_all().annotate("FB write oMask")
451 .MOV(horiz_offset(retype(tmp, ELK_REGISTER_TYPE_UW),
452 inst->group % (16 * reg_unit(devinfo))),
453 sample_mask);
454
455 for (unsigned i = 0; i < reg_unit(devinfo); i++)
456 sources[length++] = byte_offset(tmp, REG_SIZE * i);
457 }
458
459 payload_header_size = length;
460
461 setup_color_payload(bld, key, &sources[length], color0, components);
462 length += 4;
463
464 if (color1.file != BAD_FILE) {
465 setup_color_payload(bld, key, &sources[length], color1, components);
466 length += 4;
467 }
468
469 if (src_depth.file != BAD_FILE) {
470 sources[length] = src_depth;
471 length++;
472 }
473
474 if (dst_depth.file != BAD_FILE) {
475 sources[length] = dst_depth;
476 length++;
477 }
478
479 if (src_stencil.file != BAD_FILE) {
480 assert(devinfo->ver >= 9);
481 assert(bld.dispatch_width() == 8 * reg_unit(devinfo));
482
483 /* XXX: src_stencil is only available on gfx9+. dst_depth is never
484 * available on gfx9+. As such it's impossible to have both enabled at the
485 * same time and therefore length cannot overrun the array.
486 */
487 assert(length < 15 * reg_unit(devinfo));
488
489 sources[length] = bld.vgrf(ELK_REGISTER_TYPE_UD);
490 bld.exec_all().annotate("FB write OS")
491 .MOV(retype(sources[length], ELK_REGISTER_TYPE_UB),
492 subscript(src_stencil, ELK_REGISTER_TYPE_UB, 0));
493 length++;
494 }
495
496 elk_fs_inst *load;
497 if (devinfo->ver >= 7) {
498 /* Send from the GRF */
499 elk_fs_reg payload = elk_fs_reg(VGRF, -1, ELK_REGISTER_TYPE_F);
500 load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
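/* Only now do we know how many registers the payload occupies, so
* allocate the VGRF to match and patch it into the LOAD_PAYLOAD.
*/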
501 payload.nr = bld.shader->alloc.allocate(regs_written(load));
502 load->dst = payload;
503
504 uint32_t msg_ctl = elk_fb_write_msg_control(inst, prog_data);
505
506 inst->desc =
507 (inst->group / 16) << 11 | /* rt slot group */
508 elk_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
509 0 /* coarse_rt_write */);
510
511 elk_fs_reg desc = elk_imm_ud(0);
512 if (prog_data->coarse_pixel_dispatch == ELK_ALWAYS) {
513 inst->desc |= (1 << 18);
514 } else if (prog_data->coarse_pixel_dispatch == ELK_SOMETIMES) {
515 STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
516 const fs_builder &ubld = bld.exec_all().group(8, 0);
517 desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
518 ubld.AND(desc, dynamic_msaa_flags(prog_data),
519 elk_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
520 desc = component(desc, 0);
521 }
522
523 uint32_t ex_desc = 0;
524 if (devinfo->ver >= 11) {
525 /* Set the "Render Target Index" and "Src0 Alpha Present" fields
526 * in the extended message descriptor, in lieu of using a header.
527 */
528 ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;
529
530 if (key->nr_color_regions == 0)
531 ex_desc |= 1 << 20; /* Null Render Target */
532 }
533 inst->ex_desc = ex_desc;
534
535 inst->opcode = ELK_SHADER_OPCODE_SEND;
536 inst->resize_sources(3);
537 inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
538 inst->src[0] = desc;
539 inst->src[1] = elk_imm_ud(0);
540 inst->src[2] = payload;
541 inst->mlen = regs_written(load);
542 inst->ex_mlen = 0;
543 inst->header_size = header_size;
544 inst->check_tdr = true;
545 inst->send_has_side_effects = true;
546 } else {
547 /* Send from the MRF */
548 load = bld.LOAD_PAYLOAD(elk_fs_reg(MRF, 1, ELK_REGISTER_TYPE_F),
549 sources, length, payload_header_size);
550
551 /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
552 * will do this for us if we just give it a COMPR4 destination.
553 */
554 if (devinfo->ver < 6 && bld.dispatch_width() == 16)
555 load->dst.nr |= ELK_MRF_COMPR4;
556
557 if (devinfo->ver < 6) {
558 /* Set up src[0] for the implied MOV from grf0-1 */
559 inst->resize_sources(1);
560 inst->src[0] = elk_vec8_grf(0, 0);
561 } else {
562 inst->resize_sources(0);
563 }
564 inst->base_mrf = 1;
565 inst->opcode = ELK_FS_OPCODE_FB_WRITE;
566 inst->mlen = regs_written(load);
567 inst->header_size = header_size;
568 }
569 }
570
571 static void
572 lower_fb_read_logical_send(const fs_builder &bld, elk_fs_inst *inst)
573 {
574 const intel_device_info *devinfo = bld.shader->devinfo;
575 const fs_builder &ubld = bld.exec_all().group(8, 0);
576 const unsigned length = 2;
577 const elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD, length);
578
579 if (bld.group() < 16) {
580 ubld.group(16, 0).MOV(header, retype(elk_vec8_grf(0, 0),
581 ELK_REGISTER_TYPE_UD));
582 } else {
583 assert(bld.group() < 32);
584 const elk_fs_reg header_sources[] = {
585 retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD),
586 retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD)
587 };
588 ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
589
590 if (devinfo->ver >= 12) {
591 /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
592 * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
593 * target message header format was updated accordingly -- However
594 * the updated format only works for the lower 16 channels in a
595 * SIMD32 thread, since the higher 16 channels want the subspan data
596 * from r2 instead of r1, so we need to copy over the contents of
597 * r1.1 in order to fix things up.
598 */
599 ubld.group(1, 0).MOV(component(header, 9),
600 retype(elk_vec1_grf(1, 1), ELK_REGISTER_TYPE_UD));
601 }
602 }
603
604 /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
605 *
606 * "Must be zero for Render Target Read message."
607 *
608 * For bits :
609 * - 14 : Stencil Present to Render Target
610 * - 13 : Source Depth Present to Render Target
611 * - 12 : oMask to Render Target
612 * - 11 : Source0 Alpha Present to Render Target
613 */
614 ubld.group(1, 0).AND(component(header, 0),
615 component(header, 0),
616 elk_imm_ud(~INTEL_MASK(14, 11)));
617
618 inst->resize_sources(1);
619 inst->src[0] = header;
620 inst->opcode = ELK_FS_OPCODE_FB_READ;
621 inst->mlen = length;
622 inst->header_size = length;
623 }
624
625 static void
626 lower_sampler_logical_send_gfx4(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
627 const elk_fs_reg &coordinate,
628 const elk_fs_reg &shadow_c,
629 const elk_fs_reg &lod, const elk_fs_reg &lod2,
630 const elk_fs_reg &surface,
631 const elk_fs_reg &sampler,
632 unsigned coord_components,
633 unsigned grad_components)
634 {
635 const bool has_lod = (op == ELK_SHADER_OPCODE_TXL || op == ELK_FS_OPCODE_TXB ||
636 op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS);
637 elk_fs_reg msg_begin(MRF, 1, ELK_REGISTER_TYPE_F);
638 elk_fs_reg msg_end = msg_begin;
639
640 /* g0 header. */
641 msg_end = offset(msg_end, bld.group(8, 0), 1);
642
643 for (unsigned i = 0; i < coord_components; i++)
644 bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
645 offset(coordinate, bld, i));
646
647 msg_end = offset(msg_end, bld, coord_components);
648
649 /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
650 * require all three components to be present and zero if they are unused.
651 */
652 if (coord_components > 0 &&
653 (has_lod || shadow_c.file != BAD_FILE ||
654 (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
655 assert(coord_components <= 3);
656 for (unsigned i = 0; i < 3 - coord_components; i++)
657 bld.MOV(offset(msg_end, bld, i), elk_imm_f(0.0f));
658
659 msg_end = offset(msg_end, bld, 3 - coord_components);
660 }
661
662 if (op == ELK_SHADER_OPCODE_TXD) {
663 /* TXD unsupported in SIMD16 mode. */
664 assert(bld.dispatch_width() == 8);
665
666 /* the slots for u and v are always present, but r is optional */
667 if (coord_components < 2)
668 msg_end = offset(msg_end, bld, 2 - coord_components);
669
670 /* P = u, v, r
671 * dPdx = dudx, dvdx, drdx
672 * dPdy = dudy, dvdy, drdy
673 *
674 * 1-arg: Does not exist.
675 *
676 * 2-arg: dudx dvdx dudy dvdy
677 * dPdx.x dPdx.y dPdy.x dPdy.y
678 * m4 m5 m6 m7
679 *
680 * 3-arg: dudx dvdx drdx dudy dvdy drdy
681 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
682 * m5 m6 m7 m8 m9 m10
683 */
684 for (unsigned i = 0; i < grad_components; i++)
685 bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
686
687 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
688
689 for (unsigned i = 0; i < grad_components; i++)
690 bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
691
692 msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
693 }
694
695 if (has_lod) {
696 /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
697 * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
698 */
699 assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
700 bld.dispatch_width() == 16);
701
702 const elk_reg_type type =
703 (op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS ?
704 ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_F);
705 bld.MOV(retype(msg_end, type), lod);
706 msg_end = offset(msg_end, bld, 1);
707 }
708
709 if (shadow_c.file != BAD_FILE) {
710 if (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
711 /* There's no plain shadow compare message, so we use shadow
712 * compare with a bias of 0.0.
713 */
714 bld.MOV(msg_end, elk_imm_f(0.0f));
715 msg_end = offset(msg_end, bld, 1);
716 }
717
718 bld.MOV(msg_end, shadow_c);
719 msg_end = offset(msg_end, bld, 1);
720 }
721
722 inst->opcode = op;
723 inst->src[0] = reg_undef;
724 inst->src[1] = surface;
725 inst->src[2] = sampler;
726 inst->resize_sources(3);
727 inst->base_mrf = msg_begin.nr;
728 inst->mlen = msg_end.nr - msg_begin.nr;
729 inst->header_size = 1;
730 }
731
732 static void
733 lower_sampler_logical_send_gfx5(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
734 const elk_fs_reg &coordinate,
735 const elk_fs_reg &shadow_c,
736 const elk_fs_reg &lod, const elk_fs_reg &lod2,
737 const elk_fs_reg &sample_index,
738 const elk_fs_reg &surface,
739 const elk_fs_reg &sampler,
740 unsigned coord_components,
741 unsigned grad_components)
742 {
743 elk_fs_reg message(MRF, 2, ELK_REGISTER_TYPE_F);
744 elk_fs_reg msg_coords = message;
745 unsigned header_size = 0;
746
747 if (inst->offset != 0) {
748 /* The offsets set up by the visitor are in the m1 header, so we can't
749 * go headerless.
750 */
751 header_size = 1;
752 message.nr--;
753 }
754
755 for (unsigned i = 0; i < coord_components; i++)
756 bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
757 offset(coordinate, bld, i));
758
759 elk_fs_reg msg_end = offset(msg_coords, bld, coord_components);
760 elk_fs_reg msg_lod = offset(msg_coords, bld, 4);
761
762 if (shadow_c.file != BAD_FILE) {
763 elk_fs_reg msg_shadow = msg_lod;
764 bld.MOV(msg_shadow, shadow_c);
765 msg_lod = offset(msg_shadow, bld, 1);
766 msg_end = msg_lod;
767 }
768
769 switch (op) {
770 case ELK_SHADER_OPCODE_TXL:
771 case ELK_FS_OPCODE_TXB:
772 bld.MOV(msg_lod, lod);
773 msg_end = offset(msg_lod, bld, 1);
774 break;
775 case ELK_SHADER_OPCODE_TXD:
776 /**
777 * P = u, v, r
778 * dPdx = dudx, dvdx, drdx
779 * dPdy = dudy, dvdy, drdy
780 *
781 * Load up these values:
782 * - dudx dudy dvdx dvdy drdx drdy
783 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
784 */
785 msg_end = msg_lod;
786 for (unsigned i = 0; i < grad_components; i++) {
787 bld.MOV(msg_end, offset(lod, bld, i));
788 msg_end = offset(msg_end, bld, 1);
789
790 bld.MOV(msg_end, offset(lod2, bld, i));
791 msg_end = offset(msg_end, bld, 1);
792 }
793 break;
794 case ELK_SHADER_OPCODE_TXS:
795 msg_lod = retype(msg_end, ELK_REGISTER_TYPE_UD);
796 bld.MOV(msg_lod, lod);
797 msg_end = offset(msg_lod, bld, 1);
798 break;
799 case ELK_SHADER_OPCODE_TXF:
800 msg_lod = offset(msg_coords, bld, 3);
801 bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), lod);
802 msg_end = offset(msg_lod, bld, 1);
803 break;
804 case ELK_SHADER_OPCODE_TXF_CMS:
805 msg_lod = offset(msg_coords, bld, 3);
806 /* lod */
807 bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u));
808 /* sample index */
809 bld.MOV(retype(offset(msg_lod, bld, 1), ELK_REGISTER_TYPE_UD), sample_index);
810 msg_end = offset(msg_lod, bld, 2);
811 break;
812 default:
813 break;
814 }
815
816 inst->opcode = op;
817 inst->src[0] = reg_undef;
818 inst->src[1] = surface;
819 inst->src[2] = sampler;
820 inst->resize_sources(3);
821 inst->base_mrf = message.nr;
822 inst->mlen = msg_end.nr - message.nr;
823 inst->header_size = header_size;
824
825 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
826 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
827 }
828
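/**
 * Check whether a message header is needed to select a "high" sampler,
 * i.e. one whose index either isn't known at compile time or doesn't fit
 * in the 4-bit Sampler Index field of the descriptor.  Not applicable
 * before Haswell.
 */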
829 static bool
830 is_high_sampler(const struct intel_device_info *devinfo, const elk_fs_reg &sampler)
831 {
832 if (devinfo->verx10 <= 70)
833 return false;
834
835 return sampler.file != IMM || sampler.ud >= 16;
836 }
837
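/**
 * Translate a logical texturing opcode into the sampler shared function
 * message type encoding for the target hardware generation.
 */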
838 static unsigned
839 sampler_msg_type(const intel_device_info *devinfo,
840 elk_opcode opcode, bool shadow_compare, bool has_min_lod)
841 {
842 assert(devinfo->ver >= 5);
843 switch (opcode) {
844 case ELK_SHADER_OPCODE_TEX:
845 if (devinfo->ver >= 20 && has_min_lod) {
846 return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
847 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
848 } else {
849 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
850 GFX5_SAMPLER_MESSAGE_SAMPLE;
851 }
852 case ELK_FS_OPCODE_TXB:
853 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
854 GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
855 case ELK_SHADER_OPCODE_TXL:
856 assert(!has_min_lod);
857 return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
858 GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
859 case ELK_SHADER_OPCODE_TXL_LZ:
860 assert(!has_min_lod);
861 return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
862 GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
863 case ELK_SHADER_OPCODE_TXS:
864 case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
865 assert(!has_min_lod);
866 return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
867 case ELK_SHADER_OPCODE_TXD:
868 assert(!shadow_compare || devinfo->verx10 >= 75);
869 return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
870 GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
871 case ELK_SHADER_OPCODE_TXF:
872 assert(!has_min_lod);
873 return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
874 case ELK_SHADER_OPCODE_TXF_LZ:
875 assert(!has_min_lod);
876 assert(devinfo->ver >= 9);
877 return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
878 case ELK_SHADER_OPCODE_TXF_CMS_W:
879 assert(!has_min_lod);
880 assert(devinfo->ver >= 9);
881 return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
882 case ELK_SHADER_OPCODE_TXF_CMS:
883 assert(!has_min_lod);
884 return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
885 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
886 case ELK_SHADER_OPCODE_TXF_UMS:
887 assert(!has_min_lod);
888 assert(devinfo->ver >= 7);
889 return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
890 case ELK_SHADER_OPCODE_TXF_MCS:
891 assert(!has_min_lod);
892 assert(devinfo->ver >= 7);
893 return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
894 case ELK_SHADER_OPCODE_LOD:
895 assert(!has_min_lod);
896 return GFX5_SAMPLER_MESSAGE_LOD;
897 case ELK_SHADER_OPCODE_TG4:
898 assert(!has_min_lod);
899 assert(devinfo->ver >= 7);
900 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
901 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
902 break;
903 case ELK_SHADER_OPCODE_TG4_OFFSET:
904 assert(!has_min_lod);
905 assert(devinfo->ver >= 7);
906 return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
907 GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
908 case ELK_SHADER_OPCODE_SAMPLEINFO:
909 assert(!has_min_lod);
910 return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
911 default:
912 unreachable("not reached");
913 }
914 }
915
916 /**
917 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
918 * the given requested_alignment_sz.
919 */
920 static elk_fs_inst *
921 emit_load_payload_with_padding(const fs_builder &bld, const elk_fs_reg &dst,
922 const elk_fs_reg *src, unsigned sources,
923 unsigned header_size,
924 unsigned requested_alignment_sz)
925 {
926 unsigned length = 0;
927 unsigned num_srcs =
928 sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
929 elk_fs_reg *src_comps = new elk_fs_reg[num_srcs];
930
931 for (unsigned i = 0; i < header_size; i++)
932 src_comps[length++] = src[i];
933
934 for (unsigned i = header_size; i < sources; i++) {
935 unsigned src_sz =
936 retype(dst, src[i].type).component_size(bld.dispatch_width());
937 const enum elk_reg_type padding_payload_type =
938 elk_reg_type_from_bit_size(type_sz(src[i].type) * 8,
939 ELK_REGISTER_TYPE_UD);
940
941 src_comps[length++] = src[i];
942
943 /* Expand the real sources if component of requested payload type is
944 * larger than real source component.
945 */
946 if (src_sz < requested_alignment_sz) {
947 for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
948 src_comps[length++] = retype(elk_fs_reg(), padding_payload_type);
949 }
950 }
951 }
952
953 elk_fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
954 delete[] src_comps;
955
956 return inst;
957 }
958
959 static void
960 lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
961 const elk_fs_reg &coordinate,
962 const elk_fs_reg &shadow_c,
963 elk_fs_reg lod, const elk_fs_reg &lod2,
964 const elk_fs_reg &min_lod,
965 const elk_fs_reg &sample_index,
966 const elk_fs_reg &mcs,
967 const elk_fs_reg &surface,
968 const elk_fs_reg &sampler,
969 const elk_fs_reg &surface_handle,
970 const elk_fs_reg &sampler_handle,
971 const elk_fs_reg &tg4_offset,
972 unsigned payload_type_bit_size,
973 unsigned coord_components,
974 unsigned grad_components,
975 bool residency)
976 {
977 const elk_compiler *compiler = bld.shader->compiler;
978 const intel_device_info *devinfo = bld.shader->devinfo;
979 const enum elk_reg_type payload_type =
980 elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_F);
981 const enum elk_reg_type payload_unsigned_type =
982 elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_UD);
983 const enum elk_reg_type payload_signed_type =
984 elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_D);
985 unsigned reg_width = bld.dispatch_width() / 8;
986 unsigned header_size = 0, length = 0;
987 elk_fs_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
988 for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
989 sources[i] = bld.vgrf(payload_type);
990
991 /* We must have exactly one of surface/sampler and surface/sampler_handle */
992 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
993 assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
994
995 if (op == ELK_SHADER_OPCODE_TG4 || op == ELK_SHADER_OPCODE_TG4_OFFSET ||
996 inst->offset != 0 || inst->eot ||
997 op == ELK_SHADER_OPCODE_SAMPLEINFO ||
998 sampler_handle.file != BAD_FILE ||
999 is_high_sampler(devinfo, sampler) ||
1000 residency) {
1001 /* For general texture offsets (no txf workaround), we need a header to
1002 * put them in.
1003 *
1004 * TG4 needs to place its channel select in the header, for interaction
1005 * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
1006 * larger sampler numbers we need to offset the Sampler State Pointer in
1007 * the header.
1008 */
1009 elk_fs_reg header = retype(sources[0], ELK_REGISTER_TYPE_UD);
1010 for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
1011 sources[length++] = byte_offset(header, REG_SIZE * header_size);
1012
1013 /* If we're requesting fewer than four channels worth of response,
1014 * and we have an explicit header, we need to set up the sampler
1015 * writemask. It's reversed from normal: 1 means "don't write".
1016 */
1017 unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
1018 if (!inst->eot && reg_count < 4 * reg_width) {
1019 assert(reg_count % reg_width == 0);
1020 unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
1021 inst->offset |= mask << 12;
1022 }
1023
1024 if (residency)
1025 inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */
1026
1027 /* Build the actual header */
1028 const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
1029 const fs_builder ubld1 = ubld.group(1, 0);
1030 ubld.MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
1031 if (inst->offset) {
1032 ubld1.MOV(component(header, 2), elk_imm_ud(inst->offset));
1033 } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
1034 bld.shader->stage != MESA_SHADER_FRAGMENT) {
1035 /* The vertex and fragment stages have g0.2 set to 0, so
1036 * header0.2 is 0 when g0 is copied. Other stages may not, so we
1037 * must set it to 0 to avoid setting undesirable bits in the
1038 * message.
1039 */
1040 ubld1.MOV(component(header, 2), elk_imm_ud(0));
1041 }
1042
1043 if (sampler_handle.file != BAD_FILE) {
1044 /* Bindless sampler handles aren't relative to the sampler state
1045 * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
1046 * Instead, it's an absolute pointer relative to dynamic state base
1047 * address.
1048 *
1049 * Sampler states are 16 bytes each and the pointer we give here has
1050 * to be 32-byte aligned. In order to avoid more indirect messages
1051 * than required, we assume that all bindless sampler states are
1052 * 32-byte aligned. This sacrifices a bit of general state base
1053 * address space but means we can do something more efficient in the
1054 * shader.
1055 */
1056 if (compiler->use_bindless_sampler_offset) {
1057 assert(devinfo->ver >= 11);
1058 ubld1.OR(component(header, 3), sampler_handle, elk_imm_ud(1));
1059 } else {
1060 ubld1.MOV(component(header, 3), sampler_handle);
1061 }
1062 } else if (is_high_sampler(devinfo, sampler)) {
1063 elk_fs_reg sampler_state_ptr =
1064 retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD);
1065
1066 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
1067 * with the ones included in g0.3 bits 4:0. Mask them out.
1068 */
1069 if (devinfo->ver >= 11) {
1070 sampler_state_ptr = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
1071 ubld1.AND(sampler_state_ptr,
1072 retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
1073 elk_imm_ud(INTEL_MASK(31, 5)));
1074 }
1075
1076 if (sampler.file == ELK_IMMEDIATE_VALUE) {
1077 assert(sampler.ud >= 16);
1078 const int sampler_state_size = 16; /* 16 bytes */
1079
1080 ubld1.ADD(component(header, 3), sampler_state_ptr,
1081 elk_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
1082 } else {
1083 elk_fs_reg tmp = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
1084 ubld1.AND(tmp, sampler, elk_imm_ud(0x0f0));
1085 ubld1.SHL(tmp, tmp, elk_imm_ud(4));
1086 ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
1087 }
1088 } else if (devinfo->ver >= 11) {
1089 /* Gfx11+ sampler message headers include bits in 4:0 which conflict
1090 * with the ones included in g0.3 bits 4:0. Mask them out.
1091 */
1092 ubld1.AND(component(header, 3),
1093 retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
1094 elk_imm_ud(INTEL_MASK(31, 5)));
1095 }
1096 }
1097
1098 /* Change the opcode to account for LOD being zero before the
1099 * switch-statement that emits sources based on the opcode.
1100 */
1101 if (devinfo->ver >= 9 && lod.is_zero()) {
1102 if (op == ELK_SHADER_OPCODE_TXL)
1103 op = ELK_SHADER_OPCODE_TXL_LZ;
1104 else if (op == ELK_SHADER_OPCODE_TXF)
1105 op = ELK_SHADER_OPCODE_TXF_LZ;
1106 }
1107
1108 /* On Xe2 and newer platforms, min_lod is the first parameter specifically
1109 * so that a bunch of other, possibly unused, parameters don't need to also
1110 * be included.
1111 */
1112 const unsigned msg_type =
1113 sampler_msg_type(devinfo, op, inst->shadow_compare,
1114 min_lod.file != BAD_FILE);
1115
1116 const bool min_lod_is_first = devinfo->ver >= 20 &&
1117 (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
1118 msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);
1119
1120 if (min_lod_is_first) {
1121 assert(min_lod.file != BAD_FILE);
1122 bld.MOV(sources[length++], min_lod);
1123 }
1124
1125 if (shadow_c.file != BAD_FILE) {
1126 bld.MOV(sources[length], shadow_c);
1127 length++;
1128 }
1129
1130 bool coordinate_done = false;
1131
1132 /* Set up the LOD info */
1133 switch (op) {
1134 case ELK_FS_OPCODE_TXB:
1135 case ELK_SHADER_OPCODE_TXL:
1136 bld.MOV(sources[length], lod);
1137 length++;
1138 break;
1139 case ELK_SHADER_OPCODE_TXD:
1140 /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
1141 * Xe2+).
1142 */
1143 assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));
1144
1145 /* Load dPdx and the coordinate together:
1146 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1147 */
1148 for (unsigned i = 0; i < coord_components; i++) {
1149 bld.MOV(sources[length++], offset(coordinate, bld, i));
1150
1151 /* For cube map array, the coordinate is (u,v,r,ai) but there are
1152 * only derivatives for (u, v, r).
1153 */
1154 if (i < grad_components) {
1155 bld.MOV(sources[length++], offset(lod, bld, i));
1156 bld.MOV(sources[length++], offset(lod2, bld, i));
1157 }
1158 }
1159
1160 coordinate_done = true;
1161 break;
1162 case ELK_SHADER_OPCODE_TXS:
1163 bld.MOV(retype(sources[length], payload_unsigned_type), lod);
1164 length++;
1165 break;
1166 case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
1167 /* We need an LOD; just use 0 */
1168 bld.MOV(retype(sources[length], payload_unsigned_type), elk_imm_ud(0));
1169 length++;
1170 break;
1171 case ELK_SHADER_OPCODE_TXF:
1172 case ELK_SHADER_OPCODE_TXF_LZ:
1173 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
1174 * On Gfx9 they are u, v, lod, r
1175 */
1176 bld.MOV(retype(sources[length++], payload_signed_type), coordinate);
1177
1178 if (devinfo->ver >= 9) {
1179 if (coord_components >= 2) {
1180 bld.MOV(retype(sources[length], payload_signed_type),
1181 offset(coordinate, bld, 1));
1182 } else {
1183 sources[length] = elk_imm_d(0);
1184 }
1185 length++;
1186 }
1187
1188 if (op != ELK_SHADER_OPCODE_TXF_LZ) {
1189 bld.MOV(retype(sources[length], payload_signed_type), lod);
1190 length++;
1191 }
1192
1193 for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
1194 bld.MOV(retype(sources[length++], payload_signed_type),
1195 offset(coordinate, bld, i));
1196
1197 coordinate_done = true;
1198 break;
1199
1200 case ELK_SHADER_OPCODE_TXF_CMS:
1201 case ELK_SHADER_OPCODE_TXF_CMS_W:
1202 case ELK_SHADER_OPCODE_TXF_UMS:
1203 case ELK_SHADER_OPCODE_TXF_MCS:
1204 if (op == ELK_SHADER_OPCODE_TXF_UMS ||
1205 op == ELK_SHADER_OPCODE_TXF_CMS ||
1206 op == ELK_SHADER_OPCODE_TXF_CMS_W) {
1207 bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
1208 }
1209
1210 /* Data from the multisample control surface. */
1211 if (op == ELK_SHADER_OPCODE_TXF_CMS || op == ELK_SHADER_OPCODE_TXF_CMS_W) {
1212 unsigned num_mcs_components = 1;
1213
1214 /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
1215 * Shared Functions - 3D Sampler - Messages - Message Format:
1216 *
1217 * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r
1218 */
1219 if (op == ELK_SHADER_OPCODE_TXF_CMS_W)
1220 num_mcs_components = 2;
1221
1222 for (unsigned i = 0; i < num_mcs_components; ++i) {
1223 /* The sampler always writes 4/8 registers' worth of data, but for
1224 * ld_mcs only the first two registers contain valid data. So with a
1225 * 16-bit payload we need to split the two 32-bit registers into four
1226 * 16-bit payload components.
1227 *
1228 * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
1229 * Shared Functions - 3D Sampler - Messages - Message Format:
1230 *
1231 * ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r
1232 */
1233 if (devinfo->verx10 >= 125 && op == ELK_SHADER_OPCODE_TXF_CMS_W) {
1234 elk_fs_reg tmp = offset(mcs, bld, i);
1235 bld.MOV(retype(sources[length++], payload_unsigned_type),
1236 mcs.file == IMM ? mcs :
1237 subscript(tmp, payload_unsigned_type, 0));
1238 bld.MOV(retype(sources[length++], payload_unsigned_type),
1239 mcs.file == IMM ? mcs :
1240 subscript(tmp, payload_unsigned_type, 1));
1241 } else {
1242 bld.MOV(retype(sources[length++], payload_unsigned_type),
1243 mcs.file == IMM ? mcs : offset(mcs, bld, i));
1244 }
1245 }
1246 }
1247
1248 /* There is no offsetting for this message; just copy in the integer
1249 * texture coordinates.
1250 */
1251 for (unsigned i = 0; i < coord_components; i++)
1252 bld.MOV(retype(sources[length++], payload_signed_type),
1253 offset(coordinate, bld, i));
1254
1255 coordinate_done = true;
1256 break;
1257 case ELK_SHADER_OPCODE_TG4_OFFSET:
1258 /* More crazy intermixing */
1259 for (unsigned i = 0; i < 2; i++) /* u, v */
1260 bld.MOV(sources[length++], offset(coordinate, bld, i));
1261
1262 for (unsigned i = 0; i < 2; i++) /* offu, offv */
1263 bld.MOV(retype(sources[length++], payload_signed_type),
1264 offset(tg4_offset, bld, i));
1265
1266 if (coord_components == 3) /* r if present */
1267 bld.MOV(sources[length++], offset(coordinate, bld, 2));
1268
1269 coordinate_done = true;
1270 break;
1271 default:
1272 break;
1273 }
1274
1275 /* Set up the coordinate (except for cases where it was done above) */
1276 if (!coordinate_done) {
1277 for (unsigned i = 0; i < coord_components; i++)
1278 bld.MOV(retype(sources[length++], payload_type),
1279 offset(coordinate, bld, i));
1280 }
1281
1282 if (min_lod.file != BAD_FILE && !min_lod_is_first) {
1283 /* Account for all of the missing coordinate sources */
1284 if (op == ELK_SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
1285 /* On DG2 and newer platforms, sample_d can only be used with 1D and
1286 * 2D surfaces, so the maximum number of gradient components is 2.
1287 * In spite of this limitation, the Bspec lists a mysterious R
1288 * component before the min_lod, so the maximum coordinate components
1289 * is 3.
1290 *
1291 * See bspec 45942, "Enable new message layout for cube array"
1292 */
1293 length += 3 - coord_components;
1294 length += (2 - grad_components) * 2;
1295 } else {
1296 length += 4 - coord_components;
1297 if (op == ELK_SHADER_OPCODE_TXD)
1298 length += (3 - grad_components) * 2;
1299 }
1300
1301 bld.MOV(sources[length++], min_lod);
1302
1303 /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
1304 if (devinfo->verx10 == 125 && op == ELK_FS_OPCODE_TXB &&
1305 !inst->shadow_compare)
1306 bld.MOV(sources[length++], min_lod);
1307 }
1308
1309 const elk_fs_reg src_payload =
1310 elk_fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
1311 ELK_REGISTER_TYPE_F);
1312 /* In case of 16-bit payload each component takes one full register in
1313 * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
1314 * elements. In SIMD8H case hardware simply expects the components to be
1315 * padded (i.e., aligned on reg boundary).
1316 */
1317 elk_fs_inst *load_payload_inst =
1318 emit_load_payload_with_padding(bld, src_payload, sources, length,
1319 header_size, REG_SIZE * reg_unit(devinfo));
1320 unsigned mlen = load_payload_inst->size_written / REG_SIZE;
1321 unsigned simd_mode = 0;
1322 if (devinfo->ver < 20) {
1323 if (payload_type_bit_size == 16) {
1324 assert(devinfo->ver >= 11);
1325 simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
1326 GFX10_SAMPLER_SIMD_MODE_SIMD16H;
1327 } else {
1328 simd_mode = inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
1329 ELK_SAMPLER_SIMD_MODE_SIMD16;
1330 }
1331 } else {
1332 if (payload_type_bit_size == 16) {
1333 simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
1334 XE2_SAMPLER_SIMD_MODE_SIMD32H;
1335 } else {
1336 simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
1337 XE2_SAMPLER_SIMD_MODE_SIMD32;
1338 }
1339 }
1340
1341 /* Generate the SEND. */
1342 inst->opcode = ELK_SHADER_OPCODE_SEND;
1343 inst->mlen = mlen;
1344 inst->header_size = header_size;
1345
1346 assert(msg_type == sampler_msg_type(devinfo, op, inst->shadow_compare,
1347 min_lod.file != BAD_FILE));
1348
1349 inst->sfid = ELK_SFID_SAMPLER;
1350 if (surface.file == IMM &&
1351 (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
1352 inst->desc = elk_sampler_desc(devinfo, surface.ud,
1353 sampler.file == IMM ? sampler.ud % 16 : 0,
1354 msg_type,
1355 simd_mode,
1356 0 /* return_format unused on gfx7+ */);
1357 inst->src[0] = elk_imm_ud(0);
1358 inst->src[1] = elk_imm_ud(0);
1359 } else if (surface_handle.file != BAD_FILE) {
1360 /* Bindless surface */
1361 assert(devinfo->ver >= 9);
1362 inst->desc = elk_sampler_desc(devinfo,
1363 GFX9_BTI_BINDLESS,
1364 sampler.file == IMM ? sampler.ud % 16 : 0,
1365 msg_type,
1366 simd_mode,
1367 0 /* return_format unused on gfx7+ */);
1368
1369 /* For bindless samplers, the entire address is included in the message
1370 * header so we can leave the portion in the message descriptor 0.
1371 */
1372 if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
1373 inst->src[0] = elk_imm_ud(0);
1374 } else {
1375 const fs_builder ubld = bld.group(1, 0).exec_all();
1376 elk_fs_reg desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1377 ubld.SHL(desc, sampler, elk_imm_ud(8));
1378 inst->src[0] = component(desc, 0);
1379 }
1380
1381 /* We assume that the driver provided the handle in the top 20 bits so
1382 * we can use the surface handle directly as the extended descriptor.
1383 */
1384 inst->src[1] = retype(surface_handle, ELK_REGISTER_TYPE_UD);
1385 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1386 } else {
1387 /* Immediate portion of the descriptor */
1388 inst->desc = elk_sampler_desc(devinfo,
1389 0, /* surface */
1390 0, /* sampler */
1391 msg_type,
1392 simd_mode,
1393 0 /* return_format unused on gfx7+ */);
1394 const fs_builder ubld = bld.group(1, 0).exec_all();
1395 elk_fs_reg desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1396 if (surface.equals(sampler)) {
1397 /* This case is common in GL */
1398 ubld.MUL(desc, surface, elk_imm_ud(0x101));
1399 } else {
1400 if (sampler_handle.file != BAD_FILE) {
1401 ubld.MOV(desc, surface);
1402 } else if (sampler.file == IMM) {
1403 ubld.OR(desc, surface, elk_imm_ud(sampler.ud << 8));
1404 } else {
1405 ubld.SHL(desc, sampler, elk_imm_ud(8));
1406 ubld.OR(desc, desc, surface);
1407 }
1408 }
1409 ubld.AND(desc, desc, elk_imm_ud(0xfff));
1410
1411 inst->src[0] = component(desc, 0);
1412 inst->src[1] = elk_imm_ud(0); /* ex_desc */
1413 }
1414
1415 inst->ex_desc = 0;
1416
1417 inst->src[2] = src_payload;
1418 inst->resize_sources(3);
1419
1420 if (inst->eot) {
1421 /* EOT sampler messages don't make sense to split because it would
1422 * involve ending half of the thread early.
1423 */
1424 assert(inst->group == 0);
1425 /* We need to use SENDC for EOT sampler messages */
1426 inst->check_tdr = true;
1427 inst->send_has_side_effects = true;
1428 }
1429
1430 /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
1431 assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
1432 }
1433
1434 static unsigned
1435 get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
1436 elk_opcode op, const elk_fs_reg *src)
1437 {
1438 unsigned src_type_size = 0;
1439
1440 /* All sources need to have the same size, therefore seek the first valid
1441 * and take the size from there.
1442 */
1443 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1444 if (src[i].file != BAD_FILE) {
1445 src_type_size = elk_reg_type_to_size(src[i].type);
1446 break;
1447 }
1448 }
1449
1450 assert(src_type_size == 2 || src_type_size == 4);
1451
1452 #ifndef NDEBUG
1453 /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
1454 * compressed multisampled surfaces. There the payload contains MCS data
1455 * which is already in 16-bits unlike the other parameters that need forced
1456 * conversion.
1457 */
1458 if (devinfo->verx10 < 125 ||
1459 (op != ELK_SHADER_OPCODE_TXF_CMS_W &&
1460 op != ELK_SHADER_OPCODE_TXF_CMS)) {
1461 for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1462 assert(src[i].file == BAD_FILE ||
1463 elk_reg_type_to_size(src[i].type) == src_type_size);
1464 }
1465 }
1466 #endif
1467
1468 if (devinfo->verx10 < 125)
1469 return src_type_size * 8;
1470
1471 /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
1472 * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
1473 * Format [GFX12:HAS:1209977870] *
1474 *
1475 * ld2dms_w SIMD8H and SIMD16H Only
1476 * ld_mcs SIMD8H and SIMD16H Only
1477 * ld2dms REMOVEDBY(GEN:HAS:1406788836)
1478 */
1479
1480 if (op == ELK_SHADER_OPCODE_TXF_CMS_W ||
1481 op == ELK_SHADER_OPCODE_TXF_CMS ||
1482 op == ELK_SHADER_OPCODE_TXF_UMS ||
1483 op == ELK_SHADER_OPCODE_TXF_MCS)
1484 src_type_size = 2;
1485
1486 return src_type_size * 8;
1487 }
1488
1489 static void
1490 lower_sampler_logical_send(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op)
1491 {
1492 const intel_device_info *devinfo = bld.shader->devinfo;
1493 const elk_fs_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
1494 const elk_fs_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
1495 const elk_fs_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
1496 const elk_fs_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
1497 const elk_fs_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
1498 const elk_fs_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
1499 const elk_fs_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
1500 const elk_fs_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
1501 const elk_fs_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
1502 const elk_fs_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
1503 const elk_fs_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
1504 const elk_fs_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
1505 assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
1506 const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
1507 assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
1508 const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
1509 assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1510 const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1511 /* residency is only supported on Gfx8+ */
1512 assert(!residency || devinfo->ver >= 8);
1513
1514 if (devinfo->ver >= 7) {
1515 const unsigned msg_payload_type_bit_size =
1516 get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
1517
1518 /* 16-bit payloads are available only on gfx11+ */
1519 assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
1520
1521 lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
1522 shadow_c, lod, lod2, min_lod,
1523 sample_index,
1524 mcs, surface, sampler,
1525 surface_handle, sampler_handle,
1526 tg4_offset,
1527 msg_payload_type_bit_size,
1528 coord_components, grad_components,
1529 residency);
1530 } else if (devinfo->ver >= 5) {
1531 lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
1532 shadow_c, lod, lod2, sample_index,
1533 surface, sampler,
1534 coord_components, grad_components);
1535 } else {
1536 lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
1537 shadow_c, lod, lod2,
1538 surface, sampler,
1539 coord_components, grad_components);
1540 }
1541 }
1542
1543 /**
1544 * Predicate the specified instruction on the vector mask.
1545 */
1546 static void
1547 emit_predicate_on_vector_mask(const fs_builder &bld, elk_fs_inst *inst)
1548 {
1549 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1550 bld.group() == inst->group &&
1551 bld.dispatch_width() == inst->exec_size);
1552
1553 const fs_builder ubld = bld.exec_all().group(1, 0);
1554
1555 const elk_fs_visitor &s = *bld.shader;
1556 const elk_fs_reg vector_mask = ubld.vgrf(ELK_REGISTER_TYPE_UW);
1557 ubld.UNDEF(vector_mask);
1558 ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, vector_mask, elk_imm_ud(3));
1559 const unsigned subreg = sample_mask_flag_subreg(s);
1560
1561 ubld.MOV(elk_flag_subreg(subreg + inst->group / 16), vector_mask);
1562
1563 if (inst->predicate) {
1564 assert(inst->predicate == ELK_PREDICATE_NORMAL);
1565 assert(!inst->predicate_inverse);
1566 assert(inst->flag_subreg == 0);
1567 assert(s.devinfo->ver < 20);
1568 /* Combine the vector mask with the existing predicate by using a
1569 * vertical predication mode.
1570 */
1571 inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
1572 } else {
1573 inst->flag_subreg = subreg;
1574 inst->predicate = ELK_PREDICATE_NORMAL;
1575 inst->predicate_inverse = false;
1576 }
1577 }
1578
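/* A short summary of the descriptor plumbing below (derived from the code in
 * setup_surface_descriptors()): exactly one of `surface`/`surface_handle`
 * must be present, and there are three cases:
 *
 *  - immediate BTI:        the binding table index is folded straight into
 *                          the immediate descriptor;
 *  - bindless handle:      GFX9_BTI_BINDLESS is set in the descriptor and the
 *                          handle becomes the extended descriptor (src[1]);
 *  - dynamic BTI register: the low 8 bits are masked into src[0], the dynamic
 *                          descriptor source, so they can be ORed with
 *                          inst->desc when the SEND is emitted.
 */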
1579 static void
1580 setup_surface_descriptors(const fs_builder &bld, elk_fs_inst *inst, uint32_t desc,
1581 const elk_fs_reg &surface, const elk_fs_reg &surface_handle)
1582 {
1583 const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1584 const elk_compiler *compiler = bld.shader->compiler;
1585
1586 /* We must have exactly one of surface and surface_handle */
1587 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1588
1589 if (surface.file == IMM) {
1590 inst->desc = desc | (surface.ud & 0xff);
1591 inst->src[0] = elk_imm_ud(0);
1592 inst->src[1] = elk_imm_ud(0); /* ex_desc */
1593 } else if (surface_handle.file != BAD_FILE) {
1594 /* Bindless surface */
1595 assert(devinfo->ver >= 9);
1596 inst->desc = desc | GFX9_BTI_BINDLESS;
1597 inst->src[0] = elk_imm_ud(0);
1598
1599 /* We assume that the driver provided the handle in the top 20 bits so
1600 * we can use the surface handle directly as the extended descriptor.
1601 */
1602 inst->src[1] = retype(surface_handle, ELK_REGISTER_TYPE_UD);
1603 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1604 } else {
1605 inst->desc = desc;
1606 const fs_builder ubld = bld.exec_all().group(1, 0);
1607 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1608 ubld.AND(tmp, surface, elk_imm_ud(0xff));
1609 inst->src[0] = component(tmp, 0);
1610 inst->src[1] = elk_imm_ud(0); /* ex_desc */
1611 }
1612 }
1613
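/* LSC counterpart of setup_surface_descriptors(): pick the extended
 * descriptor (src[1]) from the LSC address surface type encoded in `desc`.
 * SS/BSS use the surface-state handle directly, BTI shifts the binding table
 * index into the top byte of the extended descriptor (bits 31:24, see the
 * SHL by 24 below), and FLAT needs no extended descriptor at all.
 */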
1614 static void
1615 setup_lsc_surface_descriptors(const fs_builder &bld, elk_fs_inst *inst,
1616 uint32_t desc, const elk_fs_reg &surface)
1617 {
1618 const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1619 const elk_compiler *compiler = bld.shader->compiler;
1620
1621 inst->src[0] = elk_imm_ud(0); /* desc */
1622
1623 enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
1624 switch (surf_type) {
1625 case LSC_ADDR_SURFTYPE_BSS:
1626 inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1627 /* fall-through */
1628 case LSC_ADDR_SURFTYPE_SS:
1629 assert(surface.file != BAD_FILE);
1630 /* We assume that the driver provided the handle in the top 20 bits so
1631 * we can use the surface handle directly as the extended descriptor.
1632 */
1633 inst->src[1] = retype(surface, ELK_REGISTER_TYPE_UD);
1634 break;
1635
1636 case LSC_ADDR_SURFTYPE_BTI:
1637 assert(surface.file != BAD_FILE);
1638 if (surface.file == IMM) {
1639 inst->src[1] = elk_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
1640 } else {
1641 const fs_builder ubld = bld.exec_all().group(1, 0);
1642 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1643 ubld.SHL(tmp, surface, elk_imm_ud(24));
1644 inst->src[1] = component(tmp, 0);
1645 }
1646 break;
1647
1648 case LSC_ADDR_SURFTYPE_FLAT:
1649 inst->src[1] = elk_imm_ud(0);
1650 break;
1651
1652 default:
1653 unreachable("Invalid LSC surface address type");
1654 }
1655 }
1656
1657 static void
1658 lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1659 {
1660 const elk_compiler *compiler = bld.shader->compiler;
1661 const intel_device_info *devinfo = bld.shader->devinfo;
1662
1663 /* Get the logical send arguments. */
1664 const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1665 const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1666 const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1667 const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1668 const UNUSED elk_fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
1669 const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1670 const elk_fs_reg allow_sample_mask =
1671 inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
1672 assert(arg.file == IMM);
1673 assert(allow_sample_mask.file == IMM);
1674
1675 /* Calculate the total number of components of the payload. */
1676 const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
1677 const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1678
1679 const bool is_typed_access =
1680 inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
1681 inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
1682 inst->opcode == ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
1683
1684 const bool is_surface_access = is_typed_access ||
1685 inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
1686 inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
1687 inst->opcode == ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
1688
1689 const bool is_stateless =
1690 surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
1691 surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
1692
1693 const bool has_side_effects = inst->has_side_effects();
1694
1695 elk_fs_reg sample_mask = allow_sample_mask.ud ? elk_sample_mask_reg(bld) :
1696 elk_fs_reg(elk_imm_ud(0xffffffff));
1697
1698 /* From the BDW PRM Volume 7, page 147:
1699 *
1700 * "For the Data Cache Data Port*, the header must be present for the
1701 * following message types: [...] Typed read/write/atomics"
1702 *
1703 * Earlier generations have a similar wording. Because of this restriction
1704 * we don't attempt to implement sample masks via predication for such
1705 * messages prior to Gfx9, since we have to provide a header anyway. On
1706 * Gfx11+ the header has been removed so we can only use predication.
1707 *
1708 * For all stateless A32 messages, we also need a header
1709 */
1710 elk_fs_reg header;
1711 if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
1712 fs_builder ubld = bld.exec_all().group(8, 0);
1713 header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1714 if (is_stateless) {
1715 assert(!is_surface_access);
1716 ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
1717 } else {
1718 ubld.MOV(header, elk_imm_d(0));
1719 if (is_surface_access)
1720 ubld.group(1, 0).MOV(component(header, 7), sample_mask);
1721 }
1722 }
1723 const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
1724
1725 elk_fs_reg payload, payload2;
1726 unsigned mlen, ex_mlen = 0;
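/* Note: mlen/ex_mlen are counted in 32-byte GRFs, so each 32-bit payload
 * component takes exec_size / 8 registers (one GRF for SIMD8, two for
 * SIMD16), which is where the factors below come from.
 */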
1727 if (devinfo->ver >= 9 &&
1728 (src.file == BAD_FILE || header.file == BAD_FILE)) {
1729 /* We have split sends on gfx9 and above */
1730 if (header.file == BAD_FILE) {
1731 payload = bld.move_to_vgrf(addr, addr_sz);
1732 payload2 = bld.move_to_vgrf(src, src_sz);
1733 mlen = addr_sz * (inst->exec_size / 8);
1734 ex_mlen = src_sz * (inst->exec_size / 8);
1735 } else {
1736 assert(src.file == BAD_FILE);
1737 payload = header;
1738 payload2 = bld.move_to_vgrf(addr, addr_sz);
1739 mlen = header_sz;
1740 ex_mlen = addr_sz * (inst->exec_size / 8);
1741 }
1742 } else {
1743 /* Allocate space for the payload. */
1744 const unsigned sz = header_sz + addr_sz + src_sz;
1745 payload = bld.vgrf(ELK_REGISTER_TYPE_UD, sz);
1746 elk_fs_reg *const components = new elk_fs_reg[sz];
1747 unsigned n = 0;
1748
1749 /* Construct the payload. */
1750 if (header.file != BAD_FILE)
1751 components[n++] = header;
1752
1753 for (unsigned i = 0; i < addr_sz; i++)
1754 components[n++] = offset(addr, bld, i);
1755
1756 for (unsigned i = 0; i < src_sz; i++)
1757 components[n++] = offset(src, bld, i);
1758
1759 bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
1760 mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
1761
1762 delete[] components;
1763 }
1764
1765 /* Predicate the instruction on the sample mask if no header is
1766 * provided.
1767 */
1768 if ((header.file == BAD_FILE || !is_surface_access) &&
1769 sample_mask.file != BAD_FILE && sample_mask.file != IMM)
1770 elk_emit_predicate_on_sample_mask(bld, inst);
1771
1772 uint32_t sfid;
1773 switch (inst->opcode) {
1774 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1775 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1776 /* Byte scattered opcodes go through the normal data cache */
1777 sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1778 break;
1779
1780 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1781 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1782 sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
1783 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
1784 ELK_DATAPORT_READ_TARGET_RENDER_CACHE;
1785 break;
1786
1787 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1788 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1789 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1790 /* Untyped Surface messages go through the data cache but the SFID value
1791 * changed on Haswell.
1792 */
1793 sfid = (devinfo->verx10 >= 75 ?
1794 HSW_SFID_DATAPORT_DATA_CACHE_1 :
1795 GFX7_SFID_DATAPORT_DATA_CACHE);
1796 break;
1797
1798 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1799 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1800 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1801 /* Typed surface messages go through the render cache on IVB and the
1802 * data cache on HSW+.
1803 */
1804 sfid = (devinfo->verx10 >= 75 ?
1805 HSW_SFID_DATAPORT_DATA_CACHE_1 :
1806 GFX6_SFID_DATAPORT_RENDER_CACHE);
1807 break;
1808
1809 default:
1810 unreachable("Unsupported surface opcode");
1811 }
1812
1813 uint32_t desc;
1814 switch (inst->opcode) {
1815 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1816 desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1817 arg.ud, /* num_channels */
1818 false /* write */);
1819 break;
1820
1821 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1822 desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1823 arg.ud, /* num_channels */
1824 true /* write */);
1825 break;
1826
1827 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1828 desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1829 arg.ud, /* bit_size */
1830 false /* write */);
1831 break;
1832
1833 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1834 desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1835 arg.ud, /* bit_size */
1836 true /* write */);
1837 break;
1838
1839 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1840 assert(arg.ud == 32); /* bit_size */
1841 desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1842 false /* write */);
1843 break;
1844
1845 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1846 assert(arg.ud == 32); /* bit_size */
1847 desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1848 true /* write */);
1849 break;
1850
1851 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1852 if (elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg.ud)) {
1853 desc = elk_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
1854 lsc_op_to_legacy_atomic(arg.ud),
1855 !inst->dst.is_null());
1856 } else {
1857 desc = elk_dp_untyped_atomic_desc(devinfo, inst->exec_size,
1858 lsc_op_to_legacy_atomic(arg.ud),
1859 !inst->dst.is_null());
1860 }
1861 break;
1862
1863 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1864 desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1865 arg.ud, /* num_channels */
1866 false /* write */);
1867 break;
1868
1869 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1870 desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1871 arg.ud, /* num_channels */
1872 true /* write */);
1873 break;
1874
1875 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1876 desc = elk_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1877 lsc_op_to_legacy_atomic(arg.ud),
1878 !inst->dst.is_null());
1879 break;
1880
1881 default:
1882 unreachable("Unknown surface logical instruction");
1883 }
1884
1885 /* Update the original instruction. */
1886 inst->opcode = ELK_SHADER_OPCODE_SEND;
1887 inst->mlen = mlen;
1888 inst->ex_mlen = ex_mlen;
1889 inst->header_size = header_sz;
1890 inst->send_has_side_effects = has_side_effects;
1891 inst->send_is_volatile = !has_side_effects;
1892 inst->send_ex_bso = surface_handle.file != BAD_FILE &&
1893 compiler->extended_bindless_surface_offset;
1894
1895 /* Set up SFID and descriptors */
1896 inst->sfid = sfid;
1897 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1898
1899 inst->resize_sources(4);
1900
1901 /* Finally, the payload */
1902 inst->src[2] = payload;
1903 inst->src[3] = payload2;
1904 }
1905
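/* Map a data bit size to the LSC data-size encoding; sub-dword sizes use the
 * "expanded to 32 bits in the register file" variants, e.g.
 * lsc_bits_to_data_size(16) == LSC_DATA_SIZE_D16U32 while
 * lsc_bits_to_data_size(32) == LSC_DATA_SIZE_D32.
 */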
1906 static enum lsc_data_size
1907 lsc_bits_to_data_size(unsigned bit_size)
1908 {
1909 switch (bit_size / 8) {
1910 case 1: return LSC_DATA_SIZE_D8U32;
1911 case 2: return LSC_DATA_SIZE_D16U32;
1912 case 4: return LSC_DATA_SIZE_D32;
1913 case 8: return LSC_DATA_SIZE_D64;
1914 default:
1915 unreachable("Unsupported data size.");
1916 }
1917 }
1918
1919 static void
1920 lower_lsc_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1921 {
1922 const elk_compiler *compiler = bld.shader->compiler;
1923 const intel_device_info *devinfo = bld.shader->devinfo;
1924 assert(devinfo->has_lsc);
1925
1926 /* Get the logical send arguments. */
1927 const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1928 const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1929 const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1930 const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1931 const UNUSED elk_fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
1932 const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1933 const elk_fs_reg allow_sample_mask =
1934 inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
1935 assert(arg.file == IMM);
1936 assert(allow_sample_mask.file == IMM);
1937
1938 /* Calculate the total number of components of the payload. */
1939 const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
1940 const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1941 const unsigned src_sz = type_sz(src.type);
1942 const unsigned dst_sz = type_sz(inst->dst.type);
1943
1944 const bool has_side_effects = inst->has_side_effects();
1945
1946 unsigned ex_mlen = 0;
1947 elk_fs_reg payload, payload2;
1948 payload = bld.move_to_vgrf(addr, addr_sz);
1949 if (src.file != BAD_FILE) {
1950 payload2 = bld.move_to_vgrf(src, src_comps);
1951 ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
1952 }
1953
1954 /* Predicate the instruction on the sample mask if needed */
1955 elk_fs_reg sample_mask = allow_sample_mask.ud ? elk_sample_mask_reg(bld) :
1956 elk_fs_reg(elk_imm_ud(0xffffffff));
1957 if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
1958 elk_emit_predicate_on_sample_mask(bld, inst);
1959
1960 if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
1961 inst->sfid = GFX12_SFID_SLM;
1962 else
1963 inst->sfid = GFX12_SFID_UGM;
1964
1965 /* We should have exactly one of surface and surface_handle. For scratch
1966 * messages generated by elk_fs_nir.cpp we also allow a special value to
1967 * know what heap base we should use in STATE_BASE_ADDRESS (SS = Surface
1968 * State Offset, or BSS = Bindless Surface State Offset).
1969 */
1970 bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS;
1971 assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) ||
1972 (non_bindless && surface_handle.file != BAD_FILE));
1973
1974 enum lsc_addr_surface_type surf_type;
1975 if (surface_handle.file != BAD_FILE) {
1976 if (surface.file == BAD_FILE) {
1977 assert(!non_bindless);
1978 surf_type = LSC_ADDR_SURFTYPE_BSS;
1979 } else {
1980 assert(surface.file == IMM &&
1981 (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS));
1982 surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS;
1983 }
1984 } else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
1985 surf_type = LSC_ADDR_SURFTYPE_FLAT;
1986 else
1987 surf_type = LSC_ADDR_SURFTYPE_BTI;
1988
1989 switch (inst->opcode) {
1990 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1991 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
1992 surf_type, LSC_ADDR_SIZE_A32,
1993 1 /* num_coordinates */,
1994 LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
1995 false /* transpose */,
1996 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
1997 true /* has_dest */);
1998 break;
1999 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
2000 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
2001 surf_type, LSC_ADDR_SIZE_A32,
2002 1 /* num_coordinates */,
2003 LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
2004 false /* transpose */,
2005 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2006 false /* has_dest */);
2007 break;
2008 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: {
2009 /* Bspec: Atomic instruction -> Cache section:
2010 *
2011 * Atomic messages are always forced to "un-cacheable" in the L1
2012 * cache.
2013 */
2014 enum elk_lsc_opcode opcode = (enum elk_lsc_opcode) arg.ud;
2015
2016 inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
2017 surf_type, LSC_ADDR_SIZE_A32,
2018 1 /* num_coordinates */,
2019 lsc_bits_to_data_size(dst_sz * 8),
2020 1 /* num_channels */,
2021 false /* transpose */,
2022 LSC_CACHE(devinfo, STORE, L1UC_L3WB),
2023 !inst->dst.is_null());
2024 break;
2025 }
2026 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
2027 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
2028 surf_type, LSC_ADDR_SIZE_A32,
2029 1 /* num_coordinates */,
2030 lsc_bits_to_data_size(arg.ud),
2031 1 /* num_channels */,
2032 false /* transpose */,
2033 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2034 true /* has_dest */);
2035 break;
2036 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
2037 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
2038 surf_type, LSC_ADDR_SIZE_A32,
2039 1 /* num_coordinates */,
2040 lsc_bits_to_data_size(arg.ud),
2041 1 /* num_channels */,
2042 false /* transpose */,
2043 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2044 false /* has_dest */);
2045 break;
2046 default:
2047 unreachable("Unknown surface logical instruction");
2048 }
2049
2050 /* Update the original instruction. */
2051 inst->opcode = ELK_SHADER_OPCODE_SEND;
2052 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2053 inst->ex_mlen = ex_mlen;
2054 inst->header_size = 0;
2055 inst->send_has_side_effects = has_side_effects;
2056 inst->send_is_volatile = !has_side_effects;
2057 inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
2058 compiler->extended_bindless_surface_offset;
2059
2060 inst->resize_sources(4);
2061
2062 if (non_bindless) {
2063 inst->src[0] = elk_imm_ud(0); /* desc */
2064 inst->src[1] = surface_handle; /* ex_desc */
2065 } else {
2066 setup_lsc_surface_descriptors(bld, inst, inst->desc,
2067 surface.file != BAD_FILE ?
2068 surface : surface_handle);
2069 }
2070
2071 /* Finally, the payload */
2072 inst->src[2] = payload;
2073 inst->src[3] = payload2;
2074 }
2075
2076 static void
2077 lower_lsc_block_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2078 {
2079 const elk_compiler *compiler = bld.shader->compiler;
2080 const intel_device_info *devinfo = bld.shader->devinfo;
2081 assert(devinfo->has_lsc);
2082
2083 /* Get the logical send arguments. */
2084 const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
2085 const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
2086 const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
2087 const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
2088 const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
2089 assert(arg.file == IMM);
2090 assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
2091 assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
2092
2093 const bool is_stateless =
2094 surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
2095 surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
2096
2097 const bool has_side_effects = inst->has_side_effects();
2098
2099 const bool write = inst->opcode == ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
2100
2101 fs_builder ubld = bld.exec_all().group(1, 0);
2102 elk_fs_reg stateless_ex_desc;
2103 if (is_stateless) {
2104 stateless_ex_desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2105 ubld.AND(stateless_ex_desc,
2106 retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2107 elk_imm_ud(INTEL_MASK(31, 10)));
2108 }
2109
2110 elk_fs_reg data;
2111 if (write) {
2112 const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
2113 data = retype(bld.move_to_vgrf(src, src_sz), ELK_REGISTER_TYPE_UD);
2114 }
2115
2116 inst->opcode = ELK_SHADER_OPCODE_SEND;
2117 if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
2118 inst->sfid = GFX12_SFID_SLM;
2119 else
2120 inst->sfid = GFX12_SFID_UGM;
2121 const enum lsc_addr_surface_type surf_type =
2122 inst->sfid == GFX12_SFID_SLM ?
2123 LSC_ADDR_SURFTYPE_FLAT :
2124 surface.file == BAD_FILE ?
2125 LSC_ADDR_SURFTYPE_BSS : LSC_ADDR_SURFTYPE_BTI;
2126 inst->desc = lsc_msg_desc(devinfo,
2127 write ? LSC_OP_STORE : LSC_OP_LOAD,
2128 1 /* exec_size */,
2129 surf_type,
2130 LSC_ADDR_SIZE_A32,
2131 1 /* num_coordinates */,
2132 LSC_DATA_SIZE_D32,
2133 arg.ud /* num_channels */,
2134 true /* transpose */,
2135 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2136 !write /* has_dest */);
2137
2138 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2139 inst->size_written = lsc_msg_desc_dest_len(devinfo, inst->desc) * REG_SIZE;
2140 inst->exec_size = 1;
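/* arg.ud is the block size in dwords; a transposed (block) store packs
 * eight dwords per 32-byte GRF, hence the DIV_ROUND_UP for ex_mlen below.
 */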
2141 inst->ex_mlen = write ? DIV_ROUND_UP(arg.ud, 8) : 0;
2142 inst->header_size = 0;
2143 inst->send_has_side_effects = has_side_effects;
2144 inst->send_is_volatile = !has_side_effects;
2145 inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
2146 compiler->extended_bindless_surface_offset;
2147
2148 inst->resize_sources(4);
2149
2150 if (stateless_ex_desc.file != BAD_FILE) {
2151 inst->src[0] = elk_imm_ud(0); /* desc */
2152 inst->src[1] = stateless_ex_desc; /* ex_desc */
2153 } else {
2154 setup_lsc_surface_descriptors(bld, inst, inst->desc,
2155 surface.file != BAD_FILE ?
2156 surface : surface_handle);
2157 }
2158 inst->src[2] = addr; /* payload */
2159 inst->src[3] = data; /* payload2 */
2160 }
2161
2162 static void
2163 lower_surface_block_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2164 {
2165 const intel_device_info *devinfo = bld.shader->devinfo;
2166 assert(devinfo->ver >= 9);
2167
2168 /* Get the logical send arguments. */
2169 const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
2170 const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
2171 const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
2172 const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
2173 const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
2174 assert(arg.file == IMM);
2175 assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
2176 assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
2177
2178 const bool is_stateless =
2179 surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
2180 surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
2181
2182 const bool has_side_effects = inst->has_side_effects();
2183
2184 const bool align_16B =
2185 inst->opcode != ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
2186
2187 const bool write = inst->opcode == ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
2188
2189 /* The address is stored in the header. See MH_A32_GO and MH_BTS_GO. */
2190 fs_builder ubld = bld.exec_all().group(8, 0);
2191 elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2192
2193 if (is_stateless)
2194 ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
2195 else
2196 ubld.MOV(header, elk_imm_d(0));
2197
2198 /* Address in OWord units when aligned to OWords. */
2199 if (align_16B)
2200 ubld.group(1, 0).SHR(component(header, 2), addr, elk_imm_ud(4));
2201 else
2202 ubld.group(1, 0).MOV(component(header, 2), addr);
2203
2204 elk_fs_reg data;
2205 unsigned ex_mlen = 0;
2206 if (write) {
2207 const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
2208 data = retype(bld.move_to_vgrf(src, src_sz), ELK_REGISTER_TYPE_UD);
2209 ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
2210 }
2211
2212 inst->opcode = ELK_SHADER_OPCODE_SEND;
2213 inst->mlen = 1;
2214 inst->ex_mlen = ex_mlen;
2215 inst->header_size = 1;
2216 inst->send_has_side_effects = has_side_effects;
2217 inst->send_is_volatile = !has_side_effects;
2218
2219 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2220
2221 const uint32_t desc = elk_dp_oword_block_rw_desc(devinfo, align_16B,
2222 arg.ud, write);
2223 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2224
2225 inst->resize_sources(4);
2226
2227 inst->src[2] = header;
2228 inst->src[3] = data;
2229 }
2230
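/* Build the single-GRF header used by the A64 oword block messages: zero the
 * header, then copy the 64-bit address into its first two dwords with a
 * 2-wide MOV (UNIFORM addresses are first copied into a VGRF).
 */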
2231 static elk_fs_reg
2232 emit_a64_oword_block_header(const fs_builder &bld, const elk_fs_reg &addr)
2233 {
2234 const fs_builder ubld = bld.exec_all().group(8, 0);
2235
2236 assert(type_sz(addr.type) == 8 && addr.stride == 0);
2237
2238 elk_fs_reg expanded_addr = addr;
2239 if (addr.file == UNIFORM) {
2240 /* We can't do stride 1 with the UNIFORM file; it requires stride 0 */
2241 expanded_addr = ubld.vgrf(ELK_REGISTER_TYPE_UQ);
2242 expanded_addr.stride = 0;
2243 ubld.MOV(expanded_addr, retype(addr, ELK_REGISTER_TYPE_UQ));
2244 }
2245
2246 elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2247 ubld.MOV(header, elk_imm_ud(0));
2248
2249 /* Use a 2-wide MOV to fill out the address */
2250 elk_fs_reg addr_vec2 = expanded_addr;
2251 addr_vec2.type = ELK_REGISTER_TYPE_UD;
2252 addr_vec2.stride = 1;
2253 ubld.group(2, 0).MOV(header, addr_vec2);
2254
2255 return header;
2256 }
2257
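/* Decide how an A64 message with side effects gets masked in a fragment
 * shader: if helper invocations are explicitly enabled (e.g. for ray
 * queries), predicate on the vector mask so helpers participate; otherwise
 * predicate on the sample mask so helpers are discarded.
 */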
2258 static void
2259 emit_fragment_mask(const fs_builder &bld, elk_fs_inst *inst)
2260 {
2261 assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
2262 const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;
2263
2264 /* If we're a fragment shader, we have to predicate with the sample mask to
2265 * avoid helper invocations in instructions with side effects, unless they
2266 * are explicitly required.
2267 *
2268 * There are also special cases when we actually want to run on helpers
2269 * (ray queries).
2270 */
2271 assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
2272 if (enable_helpers)
2273 emit_predicate_on_vector_mask(bld, inst);
2274 else if (inst->has_side_effects())
2275 elk_emit_predicate_on_sample_mask(bld, inst);
2276 }
2277
2278 static void
2279 lower_lsc_a64_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2280 {
2281 const intel_device_info *devinfo = bld.shader->devinfo;
2282
2283 /* Get the logical send arguments. */
2284 const elk_fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
2285 const elk_fs_reg src = inst->src[A64_LOGICAL_SRC];
2286 const unsigned src_sz = type_sz(src.type);
2287 const unsigned dst_sz = type_sz(inst->dst.type);
2288
2289 const unsigned src_comps = inst->components_read(1);
2290 assert(inst->src[A64_LOGICAL_ARG].file == IMM);
2291 const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
2292 const bool has_side_effects = inst->has_side_effects();
2293
2294 elk_fs_reg payload = retype(bld.move_to_vgrf(addr, 1), ELK_REGISTER_TYPE_UD);
2295 elk_fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
2296 ELK_REGISTER_TYPE_UD);
2297 unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;
2298
2299 switch (inst->opcode) {
2300 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
2301 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
2302 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2303 1 /* num_coordinates */,
2304 LSC_DATA_SIZE_D32, arg /* num_channels */,
2305 false /* transpose */,
2306 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2307 true /* has_dest */);
2308 break;
2309 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
2310 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
2311 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2312 1 /* num_coordinates */,
2313 LSC_DATA_SIZE_D32, arg /* num_channels */,
2314 false /* transpose */,
2315 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2316 false /* has_dest */);
2317 break;
2318 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
2319 inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
2320 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2321 1 /* num_coordinates */,
2322 lsc_bits_to_data_size(arg),
2323 1 /* num_channels */,
2324 false /* transpose */,
2325 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2326 true /* has_dest */);
2327 break;
2328 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
2329 inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
2330 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2331 1 /* num_coordinates */,
2332 lsc_bits_to_data_size(arg),
2333 1 /* num_channels */,
2334 false /* transpose */,
2335 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2336 false /* has_dest */);
2337 break;
2338 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: {
2339 /* Bspec: Atomic instruction -> Cache section:
2340 *
2341 * Atomic messages are always forced to "un-cacheable" in the L1
2342 * cache.
2343 */
2344 enum elk_lsc_opcode opcode = (enum elk_lsc_opcode) arg;
2345 inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
2346 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2347 1 /* num_coordinates */,
2348 lsc_bits_to_data_size(dst_sz * 8),
2349 1 /* num_channels */,
2350 false /* transpose */,
2351 LSC_CACHE(devinfo, STORE, L1UC_L3WB),
2352 !inst->dst.is_null());
2353 break;
2354 }
2355 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
2356 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
2357 inst->exec_size = 1;
2358 inst->desc = lsc_msg_desc(devinfo,
2359 LSC_OP_LOAD,
2360 1 /* exec_size */,
2361 LSC_ADDR_SURFTYPE_FLAT,
2362 LSC_ADDR_SIZE_A64,
2363 1 /* num_coordinates */,
2364 LSC_DATA_SIZE_D32,
2365 arg /* num_channels */,
2366 true /* transpose */,
2367 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2368 true /* has_dest */);
2369 break;
2370 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
2371 inst->exec_size = 1;
2372 inst->desc = lsc_msg_desc(devinfo,
2373 LSC_OP_STORE,
2374 1 /* exec_size */,
2375 LSC_ADDR_SURFTYPE_FLAT,
2376 LSC_ADDR_SIZE_A64,
2377 1 /* num_coordinates */,
2378 LSC_DATA_SIZE_D32,
2379 arg /* num_channels */,
2380 true /* transpose */,
2381 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2382 false /* has_dest */);
2383
2384 break;
2385 default:
2386 unreachable("Unknown A64 logical instruction");
2387 }
2388
2389 if (bld.shader->stage == MESA_SHADER_FRAGMENT)
2390 emit_fragment_mask(bld, inst);
2391
2392 /* Update the original instruction. */
2393 inst->opcode = ELK_SHADER_OPCODE_SEND;
2394 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2395 inst->ex_mlen = ex_mlen;
2396 inst->header_size = 0;
2397 inst->send_has_side_effects = has_side_effects;
2398 inst->send_is_volatile = !has_side_effects;
2399
2400 /* Set up SFID and descriptors */
2401 inst->sfid = GFX12_SFID_UGM;
2402 inst->resize_sources(4);
2403 inst->src[0] = elk_imm_ud(0); /* desc */
2404 inst->src[1] = elk_imm_ud(0); /* ex_desc */
2405 inst->src[2] = payload;
2406 inst->src[3] = payload2;
2407 }
2408
2409 static void
2410 lower_a64_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2411 {
2412 const intel_device_info *devinfo = bld.shader->devinfo;
2413
2414 const elk_fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
2415 const elk_fs_reg src = inst->src[A64_LOGICAL_SRC];
2416 const unsigned src_comps = inst->components_read(1);
2417 assert(inst->src[A64_LOGICAL_ARG].file == IMM);
2418 const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
2419 const bool has_side_effects = inst->has_side_effects();
2420
2421 elk_fs_reg payload, payload2;
2422 unsigned mlen, ex_mlen = 0, header_size = 0;
2423 if (inst->opcode == ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
2424 inst->opcode == ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
2425 inst->opcode == ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
2426 assert(devinfo->ver >= 9);
2427
2428 /* OWORD messages only take a scalar address in a header */
2429 mlen = 1;
2430 header_size = 1;
2431 payload = emit_a64_oword_block_header(bld, addr);
2432
2433 if (inst->opcode == ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
2434 ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
2435 payload2 = retype(bld.move_to_vgrf(src, src_comps),
2436 ELK_REGISTER_TYPE_UD);
2437 }
2438 } else if (devinfo->ver >= 9) {
2439 /* On Skylake and above, we have SENDS */
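/* The primary payload carries the 64-bit addresses (two GRFs per eight
 * channels, hence the mlen below); the source data goes in the extended
 * payload.
 */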
2440 mlen = 2 * (inst->exec_size / 8);
2441 ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
2442 payload = retype(bld.move_to_vgrf(addr, 1), ELK_REGISTER_TYPE_UD);
2443 payload2 = retype(bld.move_to_vgrf(src, src_comps),
2444 ELK_REGISTER_TYPE_UD);
2445 } else {
2446 /* Add two because the address is 64-bit */
2447 const unsigned dwords = 2 + src_comps;
2448 mlen = dwords * (inst->exec_size / 8);
2449
2450 elk_fs_reg sources[5];
2451
2452 sources[0] = addr;
2453
2454 for (unsigned i = 0; i < src_comps; i++)
2455 sources[1 + i] = offset(src, bld, i);
2456
2457 payload = bld.vgrf(ELK_REGISTER_TYPE_UD, dwords);
2458 bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
2459 }
2460
2461 uint32_t desc;
2462 switch (inst->opcode) {
2463 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
2464 desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
2465 arg, /* num_channels */
2466 false /* write */);
2467 break;
2468
2469 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
2470 desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
2471 arg, /* num_channels */
2472 true /* write */);
2473 break;
2474
2475 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
2476 desc = elk_dp_a64_oword_block_rw_desc(devinfo,
2477 true, /* align_16B */
2478 arg, /* num_dwords */
2479 false /* write */);
2480 break;
2481
2482 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
2483 desc = elk_dp_a64_oword_block_rw_desc(devinfo,
2484 false, /* align_16B */
2485 arg, /* num_dwords */
2486 false /* write */);
2487 break;
2488
2489 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
2490 desc = elk_dp_a64_oword_block_rw_desc(devinfo,
2491 true, /* align_16B */
2492 arg, /* num_dwords */
2493 true /* write */);
2494 break;
2495
2496 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
2497 desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
2498 arg, /* bit_size */
2499 false /* write */);
2500 break;
2501
2502 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
2503 desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
2504 arg, /* bit_size */
2505 true /* write */);
2506 break;
2507
2508 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
2509 if (elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg)) {
2510 desc =
2511 elk_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
2512 type_sz(inst->dst.type) * 8,
2513 lsc_op_to_legacy_atomic(arg),
2514 !inst->dst.is_null());
2515 } else {
2516 desc = elk_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
2517 type_sz(inst->dst.type) * 8,
2518 lsc_op_to_legacy_atomic(arg),
2519 !inst->dst.is_null());
2520 }
2521 break;
2522
2523 default:
2524 unreachable("Unknown A64 logical instruction");
2525 }
2526
2527 if (bld.shader->stage == MESA_SHADER_FRAGMENT)
2528 emit_fragment_mask(bld, inst);
2529
2530 /* Update the original instruction. */
2531 inst->opcode = ELK_SHADER_OPCODE_SEND;
2532 inst->mlen = mlen;
2533 inst->ex_mlen = ex_mlen;
2534 inst->header_size = header_size;
2535 inst->send_has_side_effects = has_side_effects;
2536 inst->send_is_volatile = !has_side_effects;
2537
2538 /* Set up SFID and descriptors */
2539 inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2540 inst->desc = desc;
2541 inst->resize_sources(4);
2542 inst->src[0] = elk_imm_ud(0); /* desc */
2543 inst->src[1] = elk_imm_ud(0); /* ex_desc */
2544 inst->src[2] = payload;
2545 inst->src[3] = payload2;
2546 }
2547
2548 static void
2549 lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
2550 elk_fs_inst *inst)
2551 {
2552 const intel_device_info *devinfo = bld.shader->devinfo;
2553 ASSERTED const elk_compiler *compiler = bld.shader->compiler;
2554
2555 elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
2556 elk_fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
2557 elk_fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
2558 elk_fs_reg alignment_B = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];
2559
2560 /* We are switching the instruction from an ALU-like instruction to a
2561 * send-from-grf instruction. Since sends can't handle strides or
2562 * source modifiers, we have to make a copy of the offset source.
2563 */
2564 elk_fs_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);
2565
2566 enum lsc_addr_surface_type surf_type =
2567 surface_handle.file == BAD_FILE ?
2568 LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;
2569
2570 assert(alignment_B.file == ELK_IMMEDIATE_VALUE);
2571 unsigned alignment = alignment_B.ud;
2572
2573 inst->opcode = ELK_SHADER_OPCODE_SEND;
2574 inst->sfid = GFX12_SFID_UGM;
2575 inst->resize_sources(3);
2576 inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
2577 compiler->extended_bindless_surface_offset;
2578
2579 assert(!compiler->indirect_ubos_use_sampler);
2580
2581 inst->src[0] = elk_imm_ud(0);
2582 inst->src[2] = ubo_offset; /* payload */
2583
2584 if (alignment >= 4) {
2585 inst->desc =
2586 lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
2587 surf_type, LSC_ADDR_SIZE_A32,
2588 1 /* num_coordinates */,
2589 LSC_DATA_SIZE_D32,
2590 4 /* num_channels */,
2591 false /* transpose */,
2592 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2593 true /* has_dest */);
2594 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2595
2596 setup_lsc_surface_descriptors(bld, inst, inst->desc,
2597 surface.file != BAD_FILE ?
2598 surface : surface_handle);
2599 } else {
2600 inst->desc =
2601 lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
2602 surf_type, LSC_ADDR_SIZE_A32,
2603 1 /* num_coordinates */,
2604 LSC_DATA_SIZE_D32,
2605 1 /* num_channels */,
2606 false /* transpose */,
2607 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2608 true /* has_dest */);
2609 inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2610
2611 setup_lsc_surface_descriptors(bld, inst, inst->desc,
2612 surface.file != BAD_FILE ?
2613 surface : surface_handle);
2614
2615 /* The byte scattered messages can only read one dword at a time so
2616 * we have to duplicate the message 4 times to read the full vec4.
2617 * Hopefully, dead code elimination will clean up the mess if some of them
2618 * aren't needed.
2619 */
2620 assert(inst->size_written == 16 * inst->exec_size);
2621 inst->size_written /= 4;
2622 for (unsigned c = 1; c < 4; c++) {
2623 /* Emit a copy of the instruction because we're about to modify
2624 * it. Because this loop starts at 1, we will emit copies for the
2625 * first 3 and the final one will be the modified instruction.
2626 */
2627 bld.emit(*inst);
2628
2629 /* Offset the source */
2630 inst->src[2] = bld.vgrf(ELK_REGISTER_TYPE_UD);
2631 bld.ADD(inst->src[2], ubo_offset, elk_imm_ud(c * 4));
2632
2633 /* Offset the destination */
2634 inst->dst = offset(inst->dst, bld, 1);
2635 }
2636 }
2637 }
2638
2639 static void
2640 lower_varying_pull_constant_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2641 {
2642 const intel_device_info *devinfo = bld.shader->devinfo;
2643 const elk_compiler *compiler = bld.shader->compiler;
2644
2645 if (devinfo->ver >= 7) {
2646 elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
2647 elk_fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
2648 elk_fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
2649
2650 /* We are switching the instruction from an ALU-like instruction to a
2651 * send-from-grf instruction. Since sends can't handle strides or
2652 * source modifiers, we have to make a copy of the offset source.
2653 */
2654 elk_fs_reg ubo_offset = bld.vgrf(ELK_REGISTER_TYPE_UD);
2655 bld.MOV(ubo_offset, offset_B);
2656
2657 assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == ELK_IMMEDIATE_VALUE);
2658 unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;
2659
2660 inst->opcode = ELK_SHADER_OPCODE_SEND;
2661 inst->mlen = inst->exec_size / 8;
2662 inst->resize_sources(3);
2663
2664 /* src[0] & src[1] are filled by setup_surface_descriptors() */
2665 inst->src[2] = ubo_offset; /* payload */
2666
2667 if (compiler->indirect_ubos_use_sampler) {
2668 const unsigned simd_mode =
2669 inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
2670 ELK_SAMPLER_SIMD_MODE_SIMD16;
2671 const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
2672 GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
2673 simd_mode, 0);
2674
2675 inst->sfid = ELK_SFID_SAMPLER;
2676 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2677 } else if (alignment >= 4) {
2678 const uint32_t desc =
2679 elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
2680 4, /* num_channels */
2681 false /* write */);
2682
2683 inst->sfid = (devinfo->verx10 >= 75 ?
2684 HSW_SFID_DATAPORT_DATA_CACHE_1 :
2685 GFX7_SFID_DATAPORT_DATA_CACHE);
2686 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2687 } else {
2688 const uint32_t desc =
2689 elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
2690 32, /* bit_size */
2691 false /* write */);
2692
2693 inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2694 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2695
2696 /* The byte scattered messages can only read one dword at a time so
2697 * we have to duplicate the message 4 times to read the full vec4.
2698 * Hopefully, dead code elimination will clean up the mess if some of them
2699 * aren't needed.
2700 */
2701 assert(inst->size_written == 16 * inst->exec_size);
2702 inst->size_written /= 4;
2703 for (unsigned c = 1; c < 4; c++) {
2704 /* Emit a copy of the instruction because we're about to modify
2705 * it. Because this loop starts at 1, we will emit copies for the
2706 * first 3 and the final one will be the modified instruction.
2707 */
2708 bld.emit(*inst);
2709
2710 /* Offset the source */
2711 inst->src[2] = bld.vgrf(ELK_REGISTER_TYPE_UD);
2712 bld.ADD(inst->src[2], ubo_offset, elk_imm_ud(c * 4));
2713
2714 /* Offset the destination */
2715 inst->dst = offset(inst->dst, bld, 1);
2716 }
2717 }
2718 } else {
2719 elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
2720 elk_fs_reg offset = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
2721 assert(inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE].file == BAD_FILE);
2722
2723 const elk_fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
2724 ELK_REGISTER_TYPE_UD);
2725
2726 bld.MOV(byte_offset(payload, REG_SIZE), offset);
2727
2728 inst->opcode = ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
2729 inst->base_mrf = payload.nr;
2730 inst->header_size = 1;
2731 inst->mlen = 1 + inst->exec_size / 8;
2732
2733 inst->resize_sources(1);
2734 inst->src[0] = surface;
2735 }
2736 }
2737
2738 static void
2739 lower_math_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2740 {
2741 assert(bld.shader->devinfo->ver < 6);
2742
2743 inst->base_mrf = 2;
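/* One GRF per source per eight channels; e.g. a two-source SIMD8 POW or
 * INT DIV message has mlen == 2.
 */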
2744 inst->mlen = inst->sources * inst->exec_size / 8;
2745
2746 if (inst->sources > 1) {
2747 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
2748 * "Message Payload":
2749 *
2750 * "Operand0[7]. For the INT DIV functions, this operand is the
2751 * denominator."
2752 * ...
2753 * "Operand1[7]. For the INT DIV functions, this operand is the
2754 * numerator."
2755 */
2756 const bool is_int_div = inst->opcode != ELK_SHADER_OPCODE_POW;
2757 const elk_fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
2758 const elk_fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
2759
2760 inst->resize_sources(1);
2761 inst->src[0] = src0;
2762
2763 assert(inst->exec_size == 8);
2764 bld.MOV(elk_fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
2765 }
2766 }
2767
2768 static void
2769 lower_interpolator_logical_send(const fs_builder &bld, elk_fs_inst *inst,
2770 const struct elk_wm_prog_key *wm_prog_key,
2771 const struct elk_wm_prog_data *wm_prog_data)
2772 {
2773 const intel_device_info *devinfo = bld.shader->devinfo;
2774
2775 /* We have to send something */
2776 elk_fs_reg payload = elk_vec8_grf(0, 0);
2777 unsigned mlen = 1;
2778
2779 unsigned mode;
2780 switch (inst->opcode) {
2781 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2782 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2783 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
2784 break;
2785
2786 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2787 assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2788 mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
2789 break;
2790
2791 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2792 payload = inst->src[INTERP_SRC_OFFSET];
2793 mlen = 2 * inst->exec_size / 8;
2794 mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
2795 break;
2796
2797 default:
2798 unreachable("Invalid interpolator instruction");
2799 }
2800
2801 const bool dynamic_mode =
2802 inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
2803
2804 elk_fs_reg desc = inst->src[INTERP_SRC_MSG_DESC];
2805 uint32_t desc_imm =
2806 elk_pixel_interp_desc(devinfo,
2807 /* Leave the mode at 0 if persample_dispatch is
2808 * dynamic; it will be ORed in below.
2809 */
2810 dynamic_mode ? 0 : mode,
2811 inst->pi_noperspective,
2812 false /* coarse_pixel_rate */,
2813 inst->exec_size, inst->group);
2814
2815 if (wm_prog_data->coarse_pixel_dispatch == ELK_ALWAYS) {
2816 desc_imm |= (1 << 15);
2817 } else if (wm_prog_data->coarse_pixel_dispatch == ELK_SOMETIMES) {
2818 STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
2819 elk_fs_reg orig_desc = desc;
2820 const fs_builder &ubld = bld.exec_all().group(8, 0);
2821 desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2822 ubld.AND(desc, dynamic_msaa_flags(wm_prog_data),
2823 elk_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));
2824
2825 /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
2826 if (orig_desc.file == IMM) {
2827 desc_imm |= orig_desc.ud;
2828 } else {
2829 ubld.OR(desc, desc, orig_desc);
2830 }
2831 }
2832
2833 /* If persample_dispatch is dynamic, select the interpolation mode
2834 * dynamically and OR into the descriptor to complete the static part
2835 * generated by elk_pixel_interp_desc().
2836 *
2837 * Why does this work? If you look at the SKL PRMs, Volume 7:
2838 * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
2839 *
2840 * - "Per Message Offset" Message Descriptor
2841 * - "Sample Position Offset" Message Descriptor
2842 *
2843 * have different formats. Fortunately, a fragment shader dispatched at
2844 * pixel rate will have gl_SampleID = 0 & gl_NumSamples = 1. So the value
2845 * we pack in "Sample Position Offset" will be a 0 and will cover the X/Y
2846 * components of "Per Message Offset", which will give us the pixel offset 0x0.
2847 */
2848 if (dynamic_mode) {
2849 elk_fs_reg orig_desc = desc;
2850 const fs_builder &ubld = bld.exec_all().group(8, 0);
2851 desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2852
2853 /* The predicate should have been built in elk_fs_nir.cpp when emitting
2854 * NIR code. This guarantees that we do not have incorrect interactions
2855 * with the flag register holding the predication result.
2856 */
2857 if (orig_desc.file == IMM) {
2858 /* Not using SEL here because we would generate an instruction with 2
2859 * immediate sources which is not supported by HW.
2860 */
2861 set_predicate_inv(ELK_PREDICATE_NORMAL, false,
2862 ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
2863 GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2864 set_predicate_inv(ELK_PREDICATE_NORMAL, true,
2865 ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
2866 GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2867 } else {
2868 set_predicate_inv(ELK_PREDICATE_NORMAL, false,
2869 ubld.OR(desc, orig_desc,
2870 elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2871 set_predicate_inv(ELK_PREDICATE_NORMAL, true,
2872 ubld.OR(desc, orig_desc,
2873 elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2874 }
2875 }
2876
2877 assert(bld.shader->devinfo->ver >= 7);
2878 inst->opcode = ELK_SHADER_OPCODE_SEND;
2879 inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
2880 inst->desc = desc_imm;
2881 inst->ex_desc = 0;
2882 inst->mlen = mlen;
2883 inst->ex_mlen = 0;
2884 inst->send_has_side_effects = false;
2885 inst->send_is_volatile = false;
2886
2887 inst->resize_sources(3);
2888 inst->src[0] = component(desc, 0);
2889 inst->src[1] = elk_imm_ud(0); /* ex_desc */
2890 inst->src[2] = payload;
2891 }
2892
2893 static void
2894 lower_get_buffer_size(const fs_builder &bld, elk_fs_inst *inst)
2895 {
2896 const intel_device_info *devinfo = bld.shader->devinfo;
2897 assert(devinfo->ver >= 7);
2898 /* Since we can only execute this instruction on uniform bti/surface
2899 * handles, elk_fs_nir.cpp should already have limited this to SIMD8.
2900 */
2901 assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
2902
2903 elk_fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
2904 elk_fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
2905 elk_fs_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];
2906
2907 inst->opcode = ELK_SHADER_OPCODE_SEND;
2908 inst->mlen = inst->exec_size / 8;
2909 inst->resize_sources(3);
2910 inst->ex_mlen = 0;
2911 inst->ex_desc = 0;
2912
2913 /* src[0] & src[1] are filled by setup_surface_descriptors() */
2914 inst->src[2] = lod;
2915
2916 const uint32_t return_format = devinfo->ver >= 8 ?
2917 GFX8_SAMPLER_RETURN_FORMAT_32BITS : ELK_SAMPLER_RETURN_FORMAT_SINT32;
2918
2919 const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
2920 GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
2921 ELK_SAMPLER_SIMD_MODE_SIMD8,
2922 return_format);
2923
2924 inst->dst = retype(inst->dst, ELK_REGISTER_TYPE_UW);
2925 inst->sfid = ELK_SFID_SAMPLER;
2926 setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2927 }
2928
2929 bool
2930 elk_fs_visitor::lower_logical_sends()
2931 {
2932 bool progress = false;
2933
2934 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2935 const fs_builder ibld(this, block, inst);
2936
2937 switch (inst->opcode) {
2938 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
2939 assert(stage == MESA_SHADER_FRAGMENT);
2940 lower_fb_write_logical_send(ibld, inst,
2941 elk_wm_prog_data(prog_data),
2942 (const elk_wm_prog_key *)key,
2943 fs_payload());
2944 break;
2945
2946 case ELK_FS_OPCODE_FB_READ_LOGICAL:
2947 lower_fb_read_logical_send(ibld, inst);
2948 break;
2949
2950 case ELK_SHADER_OPCODE_TEX_LOGICAL:
2951 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TEX);
2952 break;
2953
2954 case ELK_SHADER_OPCODE_TXD_LOGICAL:
2955 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXD);
2956 break;
2957
2958 case ELK_SHADER_OPCODE_TXF_LOGICAL:
2959 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF);
2960 break;
2961
2962 case ELK_SHADER_OPCODE_TXL_LOGICAL:
2963 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXL);
2964 break;
2965
2966 case ELK_SHADER_OPCODE_TXS_LOGICAL:
2967 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXS);
2968 break;
2969
2970 case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
2971 lower_sampler_logical_send(ibld, inst,
2972 ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
2973 break;
2974
2975 case ELK_FS_OPCODE_TXB_LOGICAL:
2976 lower_sampler_logical_send(ibld, inst, ELK_FS_OPCODE_TXB);
2977 break;
2978
2979 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
2980 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS);
2981 break;
2982
2983 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
2984 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
2985 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS_W);
2986 break;
2987
2988 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
2989 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_UMS);
2990 break;
2991
2992 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
2993 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_MCS);
2994 break;
2995
2996 case ELK_SHADER_OPCODE_LOD_LOGICAL:
2997 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_LOD);
2998 break;
2999
3000 case ELK_SHADER_OPCODE_TG4_LOGICAL:
3001 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4);
3002 break;
3003
3004 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
3005 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4_OFFSET);
3006 break;
3007
3008 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
3009 lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_SAMPLEINFO);
3010 break;
3011
3012 case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
3013 lower_get_buffer_size(ibld, inst);
3014 break;
3015
3016 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
3017 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
3018 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
3019 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
3020 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
3021 if (devinfo->has_lsc) {
3022 lower_lsc_surface_logical_send(ibld, inst);
3023 break;
3024 }
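/* fallthrough: non-LSC platforms share the legacy surface lowering below */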
      case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_block_logical_send(ibld, inst);
            break;
         }
         lower_surface_block_logical_send(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_a64_logical_send(ibld, inst);
            break;
         }
         lower_a64_logical_send(ibld, inst);
         break;

      case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
            lower_lsc_varying_pull_constant_logical_send(ibld, inst);
         else
            lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_RCP:
      case ELK_SHADER_OPCODE_RSQ:
      case ELK_SHADER_OPCODE_SQRT:
      case ELK_SHADER_OPCODE_EXP2:
      case ELK_SHADER_OPCODE_LOG2:
      case ELK_SHADER_OPCODE_SIN:
      case ELK_SHADER_OPCODE_COS:
      case ELK_SHADER_OPCODE_POW:
      case ELK_SHADER_OPCODE_INT_QUOTIENT:
      case ELK_SHADER_OPCODE_INT_REMAINDER:
         /* The math opcodes are overloaded for the send-like and
          * expression-like instructions which seems kind of icky. Gfx6+ has
          * a native (but rather quirky) MATH instruction so we don't need to
          * do anything here. On Gfx4-5 we'll have to lower the Gfx6-like
          * logical instructions (which we can easily recognize because they
          * have mlen = 0) into send-like virtual instructions.
          */
         if (devinfo->ver < 6 && inst->mlen == 0) {
            lower_math_logical_send(ibld, inst);
            break;

         } else {
            continue;
         }

      case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         lower_interpolator_logical_send(ibld, inst,
                                         (const elk_wm_prog_key *)key,
                                         elk_wm_prog_data(prog_data));
         break;

      case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_read_logical_send(ibld, inst);
         else
            lower_urb_read_logical_send_xe2(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_write_logical_send(ibld, inst);
         else
            lower_urb_write_logical_send_xe2(ibld, inst);
         break;

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
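 *
 * As a rough sketch (illustrative IR, not exact syntax), a logical load such
 * as
 *
 *    dst = UNIFORM_PULL_CONSTANT_LOAD surface, offset_B, size_B
 *
 * becomes, on Gfx7+, a message header setup followed by an OWord block read
 * through the constant cache:
 *
 *    dst = SEND <GFX6_SFID_DATAPORT_CONSTANT_CACHE, OWord block read>, header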
 */
bool
elk_fs_visitor::lower_uniform_pull_constant_loads()
{
   bool progress = false;

   foreach_block_and_inst (block, elk_fs_inst, inst, cfg) {
      if (inst->opcode != ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      const elk_fs_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
      const elk_fs_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
      const elk_fs_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
      const elk_fs_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
      assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
      assert(offset_B.file == IMM);
      assert(size_B.file == IMM);

      if (devinfo->has_lsc) {
         const fs_builder ubld =
            fs_builder(this, block, inst).group(8, 0).exec_all();

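         /* Emit an LSC transposed (block) load: a single-lane A32 message
          * whose address payload holds the byte offset into the buffer and
          * which returns size_written / 4 dwords of constant data.
          */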
         const elk_fs_reg payload = ubld.vgrf(ELK_REGISTER_TYPE_UD);
         ubld.MOV(payload, offset_B);

         inst->sfid = GFX12_SFID_UGM;
         inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                   1 /* simd_size */,
                                   surface_handle.file == BAD_FILE ?
                                   LSC_ADDR_SURFTYPE_BTI :
                                   LSC_ADDR_SURFTYPE_BSS,
                                   LSC_ADDR_SIZE_A32,
                                   1 /* num_coordinates */,
                                   LSC_DATA_SIZE_D32,
                                   inst->size_written / 4,
                                   true /* transpose */,
                                   LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                   true /* has_dest */);

         /* Update the original instruction. */
         inst->opcode = ELK_SHADER_OPCODE_SEND;
         inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
         inst->send_ex_bso = surface_handle.file != BAD_FILE &&
                             compiler->extended_bindless_surface_offset;
         inst->ex_mlen = 0;
         inst->header_size = 0;
         inst->send_has_side_effects = false;
         inst->send_is_volatile = true;
         inst->exec_size = 1;

         /* Finally, the payload */
         inst->resize_sources(3);
         setup_lsc_surface_descriptors(ubld, inst, inst->desc,
                                       surface.file != BAD_FILE ?
                                       surface : surface_handle);
         inst->src[2] = payload;

         invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      } else if (devinfo->ver >= 7) {
         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
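         /* The message header is a copy of g0 with the read offset, in
          * OWords (bytes / 16), stored in DWord 2, which is where the OWord
          * block read message takes its global offset.
          */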
         elk_fs_reg header = fs_builder(this, 8).exec_all().vgrf(ELK_REGISTER_TYPE_UD);

         ubld.group(8, 0).MOV(header,
                              retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
         ubld.group(1, 0).MOV(component(header, 2),
                              elk_imm_ud(offset_B.ud / 16));

         inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
         inst->opcode = ELK_SHADER_OPCODE_SEND;
         inst->header_size = 1;
         inst->mlen = 1;

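         /* An aligned OWord block read covering size_B bytes (size_B / 4
          * DWords) of the constant buffer.
          */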
         uint32_t desc =
            elk_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
                                       size_B.ud / 4, false /* write */);

         inst->resize_sources(4);

         setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);

         inst->src[2] = header;
         inst->src[3] = elk_fs_reg(); /* unused for reads */

         invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      } else {
         assert(surface_handle.file == BAD_FILE);
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use. We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
         inst->mlen = 1;
      }

      progress = true;
   }

   return progress;
}