/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_builder.h"

using namespace brw;

static void
lower_urb_read_logical_send(const brw_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   brw_reg payload_sources[2];
   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(header_size),
                              BRW_TYPE_F);
   bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->mlen = header_size;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_read_logical_send_xe2(const brw_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
   assert(inst->header_size == 0);

   /* Get the logical send arguments. */
   const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];

   /* Calculate the total number of components of the payload. */
   const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));

   brw_reg payload = bld.vgrf(BRW_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the read to this value.
    */
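   /* Illustrative example: inst->offset == 3 OWords adds 3 * 16 = 48 bytes
    * to the handle's byte offset below.
    */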
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      bld.ADD(payload, payload, offsets);
   }

   inst->sfid = BRW_SFID_URB;

   assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);

   inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, LOAD, L1UC_L3UC));

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
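   /* Message length: just the per-lane A32 address payload. */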
   inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
   inst->ex_mlen = 0;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_write_logical_send(const brw_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   const unsigned length = 1 + per_slot_present + channel_mask_present +
                           inst->components_read(URB_LOGICAL_SRC_DATA);

   brw_reg *payload_sources = new brw_reg[length];
   brw_reg payload = brw_vgrf(bld.shader->alloc.allocate(length),
                              BRW_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < length; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = brw_null_reg();

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->mlen = length;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_write_logical_send_xe2(const brw_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const brw_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
   const brw_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
      inst->src[URB_LOGICAL_SRC_DATA] : brw_reg(brw_imm_ud(0));
   assert(brw_type_size_bytes(src.type) == 4);

   /* Calculate the total number of components of the payload. */
   const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
   const unsigned src_sz = brw_type_size_bytes(src.type);

   brw_reg payload = bld.vgrf(BRW_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the write to this value.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   brw_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      bld.ADD(payload, payload, offsets);
   }

   const brw_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
   unsigned mask = 0;

   if (cmask.file != BAD_FILE) {
      assert(cmask.file == IMM);
      assert(cmask.type == BRW_TYPE_UD);
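      /* The channel-mask immediate carries the per-channel enables
       * pre-shifted into bits 31:16 (matching the legacy URB header layout),
       * so shift them back down for the LSC cmask.
       */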
      mask = cmask.ud >> 16;
   }

   brw_reg payload2 = bld.move_to_vgrf(src, src_comps);
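   /* Illustrative arithmetic: a SIMD16 store of 4 dword components takes
    * 4 * 4 bytes * 16 lanes / REG_SIZE = 8 payload units (with REG_SIZE
    * == 32).
    */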
   const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;

   inst->sfid = BRW_SFID_URB;

   enum lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
   inst->desc = lsc_msg_desc(devinfo, op,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             LSC_DATA_SIZE_D32,
                             mask ? mask : src_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, STORE, L1UC_L3UC));

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
setup_color_payload(const brw_builder &bld, const brw_wm_prog_key *key,
                    brw_reg *dst, brw_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      brw_reg tmp = bld.vgrf(BRW_TYPE_F, 4);
      assert(color.type == BRW_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

static void
lower_fb_write_logical_send(const brw_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_thread_payload &fs_payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   assert(inst->src[FB_WRITE_LOGICAL_SRC_NULL_RT].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const brw_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const brw_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const brw_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const brw_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const brw_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   brw_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
   const bool null_rt = inst->src[FB_WRITE_LOGICAL_SRC_NULL_RT].ud != 0;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   brw_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 11 &&
      (color1.file != BAD_FILE || key->nr_color_regions > 1)) {

      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const brw_builder ubld = bld.exec_all().group(8, 0);

      brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                              BRW_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const brw_reg header_sources[2] = {
            retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
            retype(brw_vec8_grf(2, 0), BRW_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set computes stencil to render target */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
                             brw_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15), BRW_TYPE_UW),
                              brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (fs_payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = brw_vgrf(bld.shader->alloc.allocate(1), BRW_TYPE_F);
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              brw_reg(brw_vec8_grf(fs_payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const brw_builder &ubld = bld.exec_all().group(8, i)
                                      .annotate("FB write src0 alpha");
         const brw_reg tmp = ubld.vgrf(BRW_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      const brw_reg tmp = brw_vgrf(bld.shader->alloc.allocate(reg_unit(devinfo)),
                                   BRW_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words, one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write, depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE &&
             brw_type_size_bytes(sample_mask.type) == 4);
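      /* Retype to UW and double the stride so each lane reads only the low
       * 16-bit word of its 32-bit channel.
       */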
      sample_mask.type = BRW_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(tmp, BRW_TYPE_UW),
                           inst->group % (16 * reg_unit(devinfo))),
              sample_mask);

      for (unsigned i = 0; i < reg_unit(devinfo); i++)
         sources[length++] = byte_offset(tmp, REG_SIZE * i);
   }

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(bld.dispatch_width() == 8 * reg_unit(devinfo));

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15 * reg_unit(devinfo));

      sources[length] = bld.vgrf(BRW_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], BRW_TYPE_UB),
              subscript(src_stencil, BRW_TYPE_UB, 0));
      length++;
   }

   /* Send from the GRF */
   brw_reg payload = brw_vgrf(-1, BRW_TYPE_F);
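   /* Allocate the destination VGRF only after emitting LOAD_PAYLOAD, once
    * its size is known from regs_written(load).
    */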
   fs_inst *load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
   payload.nr = bld.shader->alloc.allocate(regs_written(load));
   load->dst = payload;

   uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);

   /* XXX - Bit 13 Per-sample PS enable */
   inst->desc =
      (inst->group / 16) << 11 | /* rt slot group */
      brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                        0 /* coarse_rt_write */);

   brw_reg desc = brw_imm_ud(0);
   if (prog_data->coarse_pixel_dispatch == INTEL_ALWAYS) {
      inst->desc |= (1 << 18);
   } else if (prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES) {
      STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
      const brw_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(BRW_TYPE_UD);
      ubld.AND(desc, brw_dynamic_msaa_flags(prog_data),
               brw_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
      desc = component(desc, 0);
   }

   uint32_t ex_desc = 0;
   if (devinfo->ver >= 20) {
      ex_desc = inst->target << 21 |
                null_rt << 20 |
                (src0_alpha.file != BAD_FILE) << 15 |
                (src_stencil.file != BAD_FILE) << 14 |
                (src_depth.file != BAD_FILE) << 13 |
                (sample_mask.file != BAD_FILE) << 12;
   } else if (devinfo->ver >= 11) {
      /* Set the "Render Target Index" and "Src0 Alpha Present" fields
       * in the extended message descriptor, in lieu of using a header.
       */
      ex_desc = inst->target << 12 |
                null_rt << 20 |
                (src0_alpha.file != BAD_FILE) << 15;
   }
   inst->ex_desc = ex_desc;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->resize_sources(3);
   inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
   inst->src[0] = desc;
   inst->src[1] = brw_imm_ud(0);
   inst->src[2] = payload;
   inst->mlen = regs_written(load);
   inst->ex_mlen = 0;
   inst->header_size = header_size;
   inst->check_tdr = true;
   inst->send_has_side_effects = true;
}

static void
lower_fb_read_logical_send(const brw_builder &bld, fs_inst *inst,
                           const struct brw_wm_prog_data *wm_prog_data)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const brw_reg header = ubld.vgrf(BRW_TYPE_UD, length);

   assert(devinfo->ver >= 9 && devinfo->ver < 20);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                           BRW_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const brw_reg header_sources[] = {
         retype(brw_vec8_grf(0, 0), BRW_TYPE_UD),
         retype(brw_vec8_grf(2, 0), BRW_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly -- However
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(brw_vec1_grf(1, 1), BRW_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits :
    *   - 14 : Stencil Present to Render Target
    *   - 13 : Source Depth Present to Render Target
    *   - 12 : oMask to Render Target
    *   - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        brw_imm_ud(~INTEL_MASK(14, 11)));

   inst->resize_sources(4);
   inst->opcode = SHADER_OPCODE_SEND;
   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);
   inst->src[2] = header;
   inst->src[3] = brw_reg();
   inst->mlen = length;
   inst->header_size = length;
   inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
   inst->check_tdr = true;
   inst->desc =
      (inst->group / 16) << 11 | /* rt slot group */
      brw_fb_read_desc(devinfo, inst->target,
                       0 /* msg_control */, inst->exec_size,
                       wm_prog_data->persample_dispatch);
}

static bool
is_high_sampler(const struct intel_device_info *devinfo, const brw_reg &sampler)
{
   return sampler.file != IMM || sampler.ud >= 16;
}

static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare,
                 bool lod_is_zero, bool has_min_lod)
{
   switch (opcode) {
   case SHADER_OPCODE_TEX_LOGICAL:
      if (devinfo->ver >= 20 && has_min_lod) {
         return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
                                 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
      } else {
         return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE;
      }
   case FS_OPCODE_TXB_LOGICAL:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case SHADER_OPCODE_TXL_LOGICAL:
      assert(!has_min_lod);
      if (lod_is_zero) {
         return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                                 GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
      }
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case SHADER_OPCODE_TXD_LOGICAL:
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case SHADER_OPCODE_TXF_LOGICAL:
      assert(!has_min_lod);
      return lod_is_zero ? GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ :
                           GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
      assert(!has_min_lod);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case SHADER_OPCODE_LOD_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_LOD;
   case SHADER_OPCODE_TG4_LOGICAL:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B;
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B;
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L;
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I;
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(!has_min_lod);
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static fs_inst *
emit_load_payload_with_padding(const brw_builder &bld, const brw_reg &dst,
                               const brw_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   brw_reg *src_comps = new brw_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum brw_reg_type padding_payload_type =
         brw_type_with_size(BRW_TYPE_UD, brw_type_size_bits(src[i].type));

      src_comps[length++] = src[i];

      /* Pad out each real source if a component of the requested payload
       * type is larger than the real source component.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(brw_reg(), padding_payload_type);
         }
      }
   }

   fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}

static bool
shader_opcode_needs_header(opcode op)
{
   switch (op) {
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      return true;
   default:
      break;
   }

   return false;
}

static void
lower_sampler_logical_send(const brw_builder &bld, fs_inst *inst,
                           const brw_reg &coordinate,
                           const brw_reg &shadow_c,
                           brw_reg lod, const brw_reg &lod2,
                           const brw_reg &min_lod,
                           const brw_reg &sample_index,
                           const brw_reg &mcs,
                           const brw_reg &surface,
                           const brw_reg &sampler,
                           const brw_reg &surface_handle,
                           const brw_reg &sampler_handle,
                           const brw_reg &tg4_offset,
                           unsigned payload_type_bit_size,
                           unsigned coord_components,
                           unsigned grad_components,
                           bool residency)
{
   /* We never generate EOT sampler messages */
   assert(!inst->eot);

   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum brw_reg_type payload_type =
      brw_type_with_size(BRW_TYPE_F, payload_type_bit_size);
   const enum brw_reg_type payload_unsigned_type =
      brw_type_with_size(BRW_TYPE_UD, payload_type_bit_size);
   const enum brw_reg_type payload_signed_type =
      brw_type_with_size(BRW_TYPE_D, payload_type_bit_size);
   unsigned header_size = 0, length = 0;
   opcode op = inst->opcode;
   brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (shader_opcode_needs_header(op) || inst->offset != 0 ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler) ||
       residency) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      brw_reg header = retype(sources[0], BRW_TYPE_UD);
      for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
         sources[length++] = byte_offset(header, REG_SIZE * header_size);

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      unsigned comps_regs =
         DIV_ROUND_UP(regs_written(inst) - reg_unit(devinfo) * residency,
                      reg_unit(devinfo));
      unsigned comp_regs =
         DIV_ROUND_UP(inst->dst.component_size(inst->exec_size),
                      reg_unit(devinfo) * REG_SIZE);
      if (comps_regs < 4 * comp_regs) {
         assert(comps_regs % comp_regs == 0);
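         /* Illustrative example: with comp_regs == 1 and comps_regs == 2
          * (two components actually written), mask = ~0x3 & 0xf = 0xc,
          * i.e. "don't write" the z and w channels.
          */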
         unsigned mask = ~((1 << (comps_regs / comp_regs)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      if (residency)
         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */

      /* Build the actual header */
      const brw_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
      const brw_builder ubld1 = ubld.group(1, 0);
      if (devinfo->ver >= 11)
         ubld.MOV(header, brw_imm_ud(0));
      else
         ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (devinfo->ver < 11 &&
                 bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         if (compiler->use_bindless_sampler_offset) {
            assert(devinfo->ver >= 11);
            ubld1.OR(component(header, 3), sampler_handle, brw_imm_ud(1));
         } else {
            ubld1.MOV(component(header, 3), sampler_handle);
         }
      } else if (is_high_sampler(devinfo, sampler)) {
         brw_reg sampler_state_ptr =
            retype(brw_vec1_grf(0, 3), BRW_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(BRW_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
                      brw_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == IMM) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

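            /* e.g., sampler.ud == 20 selects the second group of 16 samplers:
             * 16 * (20 / 16) * 16 bytes = 256 byte offset from the base.
             */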
            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(brw_vec1_grf(0, 3), BRW_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(31, 5)));
      }
   }

   const bool lod_is_zero = lod.is_zero();

   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
    */
   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare, lod_is_zero,
                       min_lod.file != BAD_FILE);

   const bool min_lod_is_first = devinfo->ver >= 20 &&
      (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
       msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);

   if (min_lod_is_first) {
      assert(min_lod.file != BAD_FILE);
      bld.MOV(sources[length++], min_lod);
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case SHADER_OPCODE_TXL_LOGICAL:
      if (lod_is_zero)
         break;
      FALLTHROUGH;
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD_LOGICAL:
      /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
       * Xe2+).
       */
      assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS_LOGICAL:
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], lod);
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], brw_imm_ud(0));
      break;
   case SHADER_OPCODE_TXF_LOGICAL:
      /* On Gfx9 the parameters are intermixed: they are u, v, lod, r. */
      sources[length] = retype(sources[length], payload_signed_type);
      bld.MOV(sources[length++], offset(coordinate, bld, 0));

      if (coord_components >= 2) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length], offset(coordinate, bld, 1));
      } else {
         sources[length] = brw_imm_d(0);
      }
      length++;

      if (!lod_is_zero) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], lod);
      }

      for (unsigned i = 2; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
      sources[length] = retype(sources[length], payload_unsigned_type);
      bld.MOV(sources[length++], sample_index);

      /* Data from the multisample control surface. */
      for (unsigned i = 0; i < 2; ++i) {
         /* The sampler always writes 4/8 registers' worth of data but for
          * ld_mcs only the first two registers contain valid data, so with
          * a 16-bit payload we need to split the two 32-bit registers into
          * four 16-bit payload components.
          *
          * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (op == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
            brw_reg tmp = offset(mcs, bld, i);
            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs :
                    brw_reg(subscript(tmp, payload_unsigned_type, 0)));

            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs :
                    brw_reg(subscript(tmp, payload_unsigned_type, 1)));
         } else {
            sources[length] = retype(sources[length], payload_unsigned_type);
            bld.MOV(sources[length++],
                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
         }
      }
      FALLTHROUGH;

   case SHADER_OPCODE_TXF_MCS_LOGICAL:
      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++) {
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(coordinate, bld, i));
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
         sources[length] = retype(sources[length], payload_signed_type);
         bld.MOV(sources[length++], offset(tg4_offset, bld, i));
      }

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE && !min_lod_is_first) {
      /* Account for all of the missing coordinate sources */
      if (op == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
         /* Bspec 64985:
          *
          * For sample_b sampler message format:
          *
          * SIMD16H/SIMD32H
          * Param Number   0     1  2  3  4  5
          * Param          BIAS  U  V  R  Ai MLOD
          *
          * SIMD16/SIMD32
          * Param Number   0        1  2  3  4
          * Param          BIAS_AI  U  V  R  MLOD
          */
         length += 3 - coord_components;
      } else if (op == SHADER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * See bspec 45942, "Enable new message layout for cube array"
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == SHADER_OPCODE_TXD_LOGICAL)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);

      /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
      if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB_LOGICAL &&
          !inst->shadow_compare)
         bld.MOV(sources[length++], min_lod);
   }

   const brw_reg src_payload =
      brw_vgrf(bld.shader->alloc.allocate(length * bld.dispatch_width() / 8),
               BRW_TYPE_F);
   /* In the case of a 16-bit payload, each component takes one full register
    * in both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
    * elements. In the SIMD8H case the hardware simply expects the components
    * to be padded (i.e., aligned on a reg boundary).
    */
   fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE * reg_unit(devinfo));
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   unsigned simd_mode = 0;
   if (devinfo->ver < 20) {
      if (payload_type_bit_size == 16) {
         assert(devinfo->ver >= 11);
         simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
            GFX10_SAMPLER_SIMD_MODE_SIMD16H;
      } else {
         simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
            BRW_SAMPLER_SIMD_MODE_SIMD16;
      }
   } else {
      if (payload_type_bit_size == 16) {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
            XE2_SAMPLER_SIMD_MODE_SIMD32H;
      } else {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
            XE2_SAMPLER_SIMD_MODE_SIMD32;
      }
   }

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;
   inst->sfid = BRW_SFID_SAMPLER;
   uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
      ? GFX8_SAMPLER_RETURN_FORMAT_16BITS
      : GFX8_SAMPLER_RETURN_FORMAT_32BITS;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    sampler_ret_type);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      inst->desc = brw_sampler_desc(devinfo,
                                    GFX9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    sampler_ret_type);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const brw_builder ubld = bld.group(1, 0).exec_all();
         brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = component(desc, 0);
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    sampler_ret_type);
      const brw_builder ubld = bld.group(1, 0).exec_all();
      brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
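         /* Multiplying by 0x101 replicates the 8-bit index into both the
          * surface (bits 7:0) and sampler (bits 11:8) descriptor fields,
          * e.g. 5 -> 0x505; the AND with 0xfff below trims the result.
          */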
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->ex_desc = 0;

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
}

static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      const fs_inst *inst)
{
   assert(inst);
   const brw_reg *src = inst->src;
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore seek the first valid
    * one and take the size from there.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = brw_type_size_bytes(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
    * compressed multisampled surfaces. There the payload contains MCS data
    * which is already in 16-bits unlike the other parameters that need forced
    * conversion.
    */
   if (inst->opcode != SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
         assert(src[i].file == BAD_FILE ||
                brw_type_size_bytes(src[i].type) == src_type_size);
      }
   }
#endif

   if (devinfo->verx10 < 125)
      return src_type_size * 8;

   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
    * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
    * Format [GFX12:HAS:1209977870] *
    *
    *  ld2dms_w       SIMD8H and SIMD16H Only
    *  ld_mcs         SIMD8H and SIMD16H Only
    *  ld2dms         REMOVEDBY(GEN:HAS:1406788836)
    */
   if (inst->opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL ||
       inst->opcode == SHADER_OPCODE_TXF_MCS_LOGICAL)
      src_type_size = 2;

   return src_type_size * 8;
}

static void
lower_sampler_logical_send(const brw_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const brw_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const brw_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const brw_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const brw_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const brw_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const brw_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const brw_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const brw_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const brw_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const brw_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const brw_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
   const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;

   const unsigned msg_payload_type_bit_size =
      get_sampler_msg_payload_type_bit_size(devinfo, inst);

   /* 16-bit payloads are available only on gfx11+ */
   assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);

   lower_sampler_logical_send(bld, inst, coordinate,
                              shadow_c, lod, lod2, min_lod,
                              sample_index,
                              mcs, surface, sampler,
                              surface_handle, sampler_handle,
                              tg4_offset,
                              msg_payload_type_bit_size,
                              coord_components, grad_components,
                              residency);
}

/**
 * Predicate the specified instruction on the vector mask.
 */
static void
emit_predicate_on_vector_mask(const brw_builder &bld, fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const brw_builder ubld = bld.exec_all().group(1, 0);

   const fs_visitor &s = *bld.shader;
   const brw_reg vector_mask = ubld.vgrf(BRW_TYPE_UW);
   ubld.UNDEF(vector_mask);
   ubld.emit(SHADER_OPCODE_READ_ARCH_REG, vector_mask, retype(brw_sr0_reg(3),
                                                              BRW_TYPE_UD));
   const unsigned subreg = sample_mask_flag_subreg(s);

   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      assert(s.devinfo->ver < 20);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}
1338 
1339 static void
1340 setup_surface_descriptors(const brw_builder &bld, fs_inst *inst, uint32_t desc,
1341                           const brw_reg &surface, const brw_reg &surface_handle)
1342 {
1343    const brw_compiler *compiler = bld.shader->compiler;
1344 
1345    /* We must have exactly one of surface and surface_handle */
1346    assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1347 
1348    if (surface.file == IMM) {
1349       inst->desc = desc | (surface.ud & 0xff);
1350       inst->src[0] = brw_imm_ud(0);
1351       inst->src[1] = brw_imm_ud(0); /* ex_desc */
1352    } else if (surface_handle.file != BAD_FILE) {
1353       /* Bindless surface */
1354       inst->desc = desc | GFX9_BTI_BINDLESS;
1355       inst->src[0] = brw_imm_ud(0);
1356 
1357       /* We assume that the driver provided the handle in the top 20 bits so
1358        * we can use the surface handle directly as the extended descriptor.
1359        */
1360       inst->src[1] = retype(surface_handle, BRW_TYPE_UD);
1361       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1362    } else {
1363       inst->desc = desc;
1364       const brw_builder ubld = bld.exec_all().group(1, 0);
1365       brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1366       ubld.AND(tmp, surface, brw_imm_ud(0xff));
1367       inst->src[0] = component(tmp, 0);
1368       inst->src[1] = brw_imm_ud(0); /* ex_desc */
1369    }
1370 }
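/* Summary of the three cases above (a sketch of the convention, not
 * normative documentation):
 *
 *    surface is IMM     -> BTI folded into the descriptor: desc | (bti & 0xff)
 *    surface_handle set -> bindless: desc | GFX9_BTI_BINDLESS, and the handle
 *                          itself becomes the extended descriptor in src[1]
 *    surface is a VGRF  -> indirect BTI: src[0] holds a SIMD1 (surface & 0xff)
 */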
1371 
1372 static void
1373 setup_lsc_surface_descriptors(const brw_builder &bld, fs_inst *inst,
1374                               uint32_t desc, const brw_reg &surface)
1375 {
1376    const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1377    const brw_compiler *compiler = bld.shader->compiler;
1378 
1379    inst->src[0] = brw_imm_ud(0); /* desc */
1380 
1381    enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
1382    switch (surf_type) {
1383    case LSC_ADDR_SURFTYPE_BSS:
1384       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1385       /* fall-through */
1386    case LSC_ADDR_SURFTYPE_SS:
1387       assert(surface.file != BAD_FILE);
1388       /* We assume that the driver provided the handle in the top 20 bits so
1389        * we can use the surface handle directly as the extended descriptor.
1390        */
1391       inst->src[1] = retype(surface, BRW_TYPE_UD);
1392       /* Gfx20+ assumes ExBSO with UGM */
1393       if (devinfo->ver >= 20 && inst->sfid == GFX12_SFID_UGM)
1394          inst->send_ex_bso = true;
1395       break;
1396 
1397    case LSC_ADDR_SURFTYPE_BTI:
1398       assert(surface.file != BAD_FILE);
1399       if (surface.file == IMM) {
1400          inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
1401       } else {
1402          const brw_builder ubld = bld.exec_all().group(1, 0);
1403          brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
1404          ubld.SHL(tmp, surface, brw_imm_ud(24));
1405          inst->src[1] = component(tmp, 0);
1406       }
1407       break;
1408 
1409    case LSC_ADDR_SURFTYPE_FLAT:
1410       inst->src[1] = brw_imm_ud(0);
1411       break;
1412 
1413    default:
1414       unreachable("Invalid LSC surface address type");
1415    }
1416 }
1417 
1418 static enum lsc_addr_size
1419 lsc_addr_size_for_type(enum brw_reg_type type)
1420 {
1421    switch (brw_type_size_bytes(type)) {
1422    case 2: return LSC_ADDR_SIZE_A16;
1423    case 4: return LSC_ADDR_SIZE_A32;
1424    case 8: return LSC_ADDR_SIZE_A64;
1425    default: unreachable("invalid type size");
1426    }
1427 }
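/* Usage sketch (illustrative): 64-bit flat addresses and 32-bit surface
 * offsets map as
 *
 *    lsc_addr_size_for_type(BRW_TYPE_UQ) == LSC_ADDR_SIZE_A64
 *    lsc_addr_size_for_type(BRW_TYPE_UD) == LSC_ADDR_SIZE_A32
 */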
1428 
1429 static void
1430 lower_lsc_memory_logical_send(const brw_builder &bld, fs_inst *inst)
1431 {
1432    const intel_device_info *devinfo = bld.shader->devinfo;
1433    assert(devinfo->has_lsc);
1434 
1435    assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1436    assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1437    assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1438    assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1439    assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1440    assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1441 
1442    /* Get the logical send arguments. */
1443    const enum lsc_opcode op = (lsc_opcode) inst->src[MEMORY_LOGICAL_OPCODE].ud;
1444    const enum memory_logical_mode mode =
1445       (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1446    const enum lsc_addr_surface_type binding_type =
1447       (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1448    const brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1449    const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1450    const unsigned coord_components =
1451       inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1452    enum lsc_data_size data_size =
1453       (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1454    const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1455    const enum memory_flags flags =
1456       (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1457    const bool transpose = flags & MEMORY_FLAG_TRANSPOSE;
1458    const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1459    const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1460    const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1461    const bool has_side_effects = inst->has_side_effects();
1462 
1463    const uint32_t data_size_B = lsc_data_size_bytes(data_size);
1464    const enum brw_reg_type data_type =
1465       brw_type_with_size(data0.type, data_size_B * 8);
1466 
1467    const enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1468 
1469    brw_reg payload = addr;
1470 
1471    if (addr.file != VGRF || !addr.is_contiguous()) {
1472       if (inst->force_writemask_all) {
1473          const brw_builder dbld = bld.group(bld.shader->dispatch_width, 0);
1474          payload = dbld.move_to_vgrf(addr, coord_components);
1475       } else {
1476          payload = bld.move_to_vgrf(addr, coord_components);
1477       }
1478    }
1479 
1480    unsigned ex_mlen = 0;
1481    brw_reg payload2;
1482    if (data0.file != BAD_FILE) {
1483       if (transpose) {
1484          assert(data1.file == BAD_FILE);
1485 
1486          payload2 = data0;
1487          ex_mlen = DIV_ROUND_UP(components, 8);
1488       } else {
1489          brw_reg data[8];
1490          unsigned size = 0;
1491 
1492          assert(components < 8);
1493 
1494          for (unsigned i = 0; i < components; i++)
1495             data[size++] = offset(data0, bld, i);
1496 
1497          if (data1.file != BAD_FILE) {
1498             for (unsigned i = 0; i < components; i++)
1499                data[size++] = offset(data1, bld, i);
1500          }
1501 
1502          payload2 = bld.vgrf(data0.type, size);
1503          bld.LOAD_PAYLOAD(payload2, data, size, 0);
1504          ex_mlen = (size * brw_type_size_bytes(data_type) * inst->exec_size) / REG_SIZE;
1505       }
1506    }
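   /* Worked example for the sizes above (assuming REG_SIZE == 32, i.e.
    * pre-Xe2 GRFs): a SIMD16 non-transposed store of two 32-bit components
    * gives ex_mlen = (2 * 4 * 16) / 32 = 4, while a transposed (block)
    * message of 8 components always packs into DIV_ROUND_UP(8, 8) = 1
    * register regardless of dispatch width.
    */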
1507 
1508    /* Bspec: Atomic instruction -> Cache section:
1509     *
1510     *    Atomic messages are always forced to "un-cacheable" in the L1
1511     *    cache.
1512     */
1513    unsigned cache_mode =
1514       lsc_opcode_is_atomic(op) ? (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB) :
1515       lsc_opcode_is_store(op)  ? (unsigned) LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) :
1516       (unsigned) LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS);
1517 
1518    /* If we're a fragment shader, we have to predicate with the sample mask to
1519     * avoid helper invocations in instructions with side effects, unless they
1520     * are explicitly required.  One exception is for scratch writes - even
1521     * though those have side effects, they represent operations that didn't
1522     * originally have any.  We want to avoid accessing undefined values from
1523     * scratch, so we disable helper invocations entirely there.
1524     *
1525     * There are also special cases when we actually want to run on helpers
1526     * (ray queries).
1527     */
1528    if (bld.shader->stage == MESA_SHADER_FRAGMENT && !transpose) {
1529       if (include_helpers)
1530          emit_predicate_on_vector_mask(bld, inst);
1531       else if (has_side_effects && mode != MEMORY_MODE_SCRATCH)
1532          brw_emit_predicate_on_sample_mask(bld, inst);
1533    }
1534 
1535    switch (mode) {
1536    case MEMORY_MODE_UNTYPED:
1537    case MEMORY_MODE_CONSTANT:
1538    case MEMORY_MODE_SCRATCH:
1539       inst->sfid = GFX12_SFID_UGM;
1540       break;
1541    case MEMORY_MODE_TYPED:
1542       inst->sfid = GFX12_SFID_TGM;
1543       break;
1544    case MEMORY_MODE_SHARED_LOCAL:
1545       inst->sfid = GFX12_SFID_SLM;
1546       break;
1547    }
1548    assert(inst->sfid);
1549 
1550    /* Disable LSC data port L1 cache scheme for the TGM load/store for RT
1551     * shaders. (see HSD 18038444588)
1552     */
1553    if (devinfo->ver >= 20 && gl_shader_stage_is_rt(bld.shader->stage) &&
1554        inst->sfid == GFX12_SFID_TGM &&
1555        !lsc_opcode_is_atomic(op)) {
1556       if (lsc_opcode_is_store(op)) {
1557          cache_mode = (unsigned) LSC_CACHE(devinfo, STORE, L1UC_L3WB);
1558       } else {
1559          cache_mode = (unsigned) LSC_CACHE(devinfo, LOAD, L1UC_L3C);
1560       }
1561    }
1562 
1563    inst->desc = lsc_msg_desc(devinfo, op, binding_type, addr_size, data_size,
1564                              lsc_opcode_has_cmask(op) ?
1565                              (1 << components) - 1 : components,
1566                              transpose, cache_mode);
1567 
1568    /* Set up extended descriptors, fills src[0] and src[1]. */
1569    setup_lsc_surface_descriptors(bld, inst, inst->desc, binding);
1570 
1571    inst->opcode = SHADER_OPCODE_SEND;
1572    inst->mlen = lsc_msg_addr_len(devinfo, addr_size,
1573                                  inst->exec_size * coord_components);
1574    inst->ex_mlen = ex_mlen;
1575    inst->header_size = 0;
1576    inst->send_has_side_effects = has_side_effects;
1577    inst->send_is_volatile = !has_side_effects;
1578 
1579    inst->resize_sources(4);
1580 
1581    /* Finally, the payload */
1582    inst->src[2] = payload;
1583    inst->src[3] = payload2;
1584 }
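/* After this lowering, the instruction follows the 4-source SEND convention
 * used throughout this file (sketch):
 *
 *    src[0] = descriptor (immediate or indirect)
 *    src[1] = extended descriptor (the surface handle for BSS/SS bindings)
 *    src[2] = address payload (mlen registers)
 *    src[3] = data payload (ex_mlen registers), BAD_FILE for plain loads
 */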
1585 
1586 static brw_reg
1587 emit_a64_oword_block_header(const brw_builder &bld, const brw_reg &addr)
1588 {
1589    const brw_builder ubld = bld.exec_all().group(8, 0);
1590 
1591    assert(brw_type_size_bytes(addr.type) == 8 && addr.stride == 0);
1592 
1593    brw_reg expanded_addr = addr;
1594    if (addr.file == UNIFORM) {
1595       /* We can't do stride 1 with the UNIFORM file; it requires stride 0 */
1596       brw_builder ubld1 = ubld.group(1, 0);
1597 
1598       brw_reg tmp = ubld1.vgrf(BRW_TYPE_UQ);
1599       ubld1.UNDEF(tmp);
1600 
1601       expanded_addr = component(tmp, 0);
1602       ubld1.MOV(expanded_addr, retype(addr, BRW_TYPE_UQ));
1603    }
1604 
1605    brw_reg header = ubld.vgrf(BRW_TYPE_UD);
1606    ubld.MOV(header, brw_imm_ud(0));
1607 
1608    /* Use a 2-wide MOV to fill out the address */
1609    brw_reg addr_vec2 = expanded_addr;
1610    addr_vec2.type = BRW_TYPE_UD;
1611    addr_vec2.stride = 1;
1612    ubld.group(2, 0).MOV(header, addr_vec2);
1613 
1614    return header;
1615 }
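/* Resulting header layout (sketch; a single SIMD8 force_writemask_all GRF):
 *
 *    DW0: address[31:0]   written by the 2-wide MOV from the
 *    DW1: address[63:32]  dword view of the 64-bit address
 *    DW2..DW7: zero
 */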
1616 
1617 static void
1618 lower_hdc_memory_logical_send(const brw_builder &bld, fs_inst *inst)
1619 {
1620    const intel_device_info *devinfo = bld.shader->devinfo;
1621    const brw_compiler *compiler = bld.shader->compiler;
1622 
1623    assert(inst->src[MEMORY_LOGICAL_OPCODE].file == IMM);
1624    assert(inst->src[MEMORY_LOGICAL_MODE].file == IMM);
1625    assert(inst->src[MEMORY_LOGICAL_BINDING_TYPE].file == IMM);
1626    assert(inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].file == IMM);
1627    assert(inst->src[MEMORY_LOGICAL_DATA_SIZE].file == IMM);
1628    assert(inst->src[MEMORY_LOGICAL_FLAGS].file == IMM);
1629 
1630    /* Get the logical send arguments. */
1631    const enum lsc_opcode op = (lsc_opcode)inst->src[MEMORY_LOGICAL_OPCODE].ud;
1632    const enum memory_logical_mode mode =
1633       (enum memory_logical_mode) inst->src[MEMORY_LOGICAL_MODE].ud;
1634    enum lsc_addr_surface_type binding_type =
1635       (enum lsc_addr_surface_type) inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud;
1636    brw_reg binding = inst->src[MEMORY_LOGICAL_BINDING];
1637    const brw_reg addr = inst->src[MEMORY_LOGICAL_ADDRESS];
1638    const unsigned coord_components =
1639       inst->src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
1640    const unsigned alignment = inst->src[MEMORY_LOGICAL_ALIGNMENT].ud;
1641    const unsigned components = inst->src[MEMORY_LOGICAL_COMPONENTS].ud;
1642    const enum memory_flags flags =
1643       (enum memory_flags) inst->src[MEMORY_LOGICAL_FLAGS].ud;
1644    const bool block = flags & MEMORY_FLAG_TRANSPOSE;
1645    const bool include_helpers = flags & MEMORY_FLAG_INCLUDE_HELPERS;
1646    const brw_reg data0 = inst->src[MEMORY_LOGICAL_DATA0];
1647    const brw_reg data1 = inst->src[MEMORY_LOGICAL_DATA1];
1648    const bool has_side_effects = inst->has_side_effects();
1649    const bool has_dest = inst->dst.file != BAD_FILE && !inst->dst.is_null();
1650 
1651    /* Don't predicate scratch writes on the sample mask.  Otherwise,
1652     * FS helper invocations would load undefined values from scratch memory.
1653     * And scratch memory load/stores are produced from operations without
1654     * side-effects, thus they should not have different behavior in the
1655     * helper invocations.
1656     */
1657    bool allow_sample_mask = has_side_effects && mode != MEMORY_MODE_SCRATCH;
1658 
1659    const enum lsc_data_size data_size =
1660       (enum lsc_data_size) inst->src[MEMORY_LOGICAL_DATA_SIZE].ud;
1661 
1662    /* unpadded data size */
1663    const uint32_t data_bit_size =
1664       data_size == LSC_DATA_SIZE_D8U32 ? 8 :
1665       data_size == LSC_DATA_SIZE_D16U32 ? 16 :
1666       8 * lsc_data_size_bytes(data_size);
1667 
1668    const bool byte_scattered =
1669       data_bit_size < 32 || (alignment != 0 && alignment < 4);
1670    const bool dword_scattered = !byte_scattered && mode == MEMORY_MODE_SCRATCH;
1671    const bool surface_access = !byte_scattered && !dword_scattered && !block;
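   /* Example of the selection above (illustrative): a D32 access with
    * alignment == 2 must use byte_scattered even though the data is
    * dword-sized, a dword-aligned D32 scratch access uses dword_scattered,
    * and aligned non-block accesses fall through to the untyped surface
    * messages (surface_access).
    */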
1672 
1673    /* SLM block reads must use the 16B-aligned OWord Block Read messages,
1674     * as the unaligned message doesn't exist for SLM.
1675     */
1676    const bool oword_aligned = block && mode == MEMORY_MODE_SHARED_LOCAL;
1677    assert(!oword_aligned || (alignment % 16) == 0);
1678    assert(!block || (alignment % 4) == 0);
1679 
1680    enum lsc_addr_size addr_size = lsc_addr_size_for_type(addr.type);
1681    unsigned addr_size_B = coord_components * lsc_addr_size_bytes(addr_size);
1682 
1683    brw_reg header;
1684    brw_builder ubld8 = bld.exec_all().group(8, 0);
1685    brw_builder ubld1 = ubld8.group(1, 0);
1686    if (mode == MEMORY_MODE_SCRATCH) {
1687       header = ubld8.vgrf(BRW_TYPE_UD);
1688       ubld8.emit(SHADER_OPCODE_SCRATCH_HEADER, header, brw_ud8_grf(0, 0));
1689    } else if (block) {
1690       if (addr_size == LSC_ADDR_SIZE_A64) {
1691          header = emit_a64_oword_block_header(bld, addr);
1692       } else {
1693          header = ubld8.vgrf(BRW_TYPE_UD);
1694          ubld8.MOV(header, brw_imm_ud(0));
1695          if (oword_aligned)
1696             ubld1.SHR(component(header, 2), addr, brw_imm_ud(4));
1697          else
1698             ubld1.MOV(component(header, 2), addr);
1699       }
1700    }
1701 
1702    /* If we're a fragment shader, we have to predicate with the sample mask to
1703     * avoid helper invocations in instructions with side effects, unless
1704     * they are explicitly required.
1705     *
1706     * There are also special cases when we actually want to run on helpers
1707     * (ray queries).
1708     */
1709    if (bld.shader->stage == MESA_SHADER_FRAGMENT) {
1710       if (include_helpers)
1711          emit_predicate_on_vector_mask(bld, inst);
1712       else if (allow_sample_mask &&
1713                (header.file == BAD_FILE || !surface_access))
1714          brw_emit_predicate_on_sample_mask(bld, inst);
1715    }
1716 
1717    brw_reg payload, payload2;
1718    unsigned mlen, ex_mlen = 0;
1719 
1720    if (!block) {
1721       brw_reg data[11];
1722       unsigned num_sources = 0;
1723       if (header.file != BAD_FILE)
1724          data[num_sources++] = header;
1725 
1726       for (unsigned i = 0; i < coord_components; i++)
1727          data[num_sources++] = offset(addr, bld, i);
1728 
1729       if (data0.file != BAD_FILE) {
1730          for (unsigned i = 0; i < components; i++)
1731             data[num_sources++] = offset(data0, bld, i);
1732          if (data1.file != BAD_FILE) {
1733             for (unsigned i = 0; i < components; i++)
1734                data[num_sources++] = offset(data1, bld, i);
1735          }
1736       }
1737 
1738       assert(num_sources <= ARRAY_SIZE(data));
1739 
1740       unsigned payload_size_UDs = (header.file != BAD_FILE ? 1 : 0) +
1741                                   (addr_size_B / 4) +
1742                                   (lsc_op_num_data_values(op) * components *
1743                                    lsc_data_size_bytes(data_size) / 4);
1744 
1745       payload = bld.vgrf(BRW_TYPE_UD, payload_size_UDs);
1746       fs_inst *load_payload =
1747          emit_load_payload_with_padding(bld, payload, data, num_sources,
1748                                         header.file != BAD_FILE ? 1 : 0,
1749                                         REG_SIZE);
1750       mlen = load_payload->size_written / REG_SIZE;
1751    } else {
1752       assert(data1.file == BAD_FILE);
1753 
1754       payload = header;
1755       mlen = 1;
1756 
1757       if (data0.file != BAD_FILE) {
1758          payload2 = bld.move_to_vgrf(data0, components);
1759          ex_mlen = components * sizeof(uint32_t) / REG_SIZE;
1760       }
1761    }
1762 
1763 
1764    if (mode == MEMORY_MODE_SHARED_LOCAL) {
1765       binding_type = LSC_ADDR_SURFTYPE_BTI;
1766       binding = brw_imm_ud(GFX7_BTI_SLM);
1767    } else if (mode == MEMORY_MODE_SCRATCH) {
1768       binding_type = LSC_ADDR_SURFTYPE_BTI;
1769       binding = brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
1770    }
1771 
1772    uint32_t sfid, desc;
1773    if (mode == MEMORY_MODE_TYPED) {
1774       assert(addr_size == LSC_ADDR_SIZE_A32);
1775       assert(!block);
1776 
1777       sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1778 
1779       if (lsc_opcode_is_atomic(op)) {
1780          desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1781                                          lsc_op_to_legacy_atomic(op),
1782                                          has_dest);
1783       } else {
1784          desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size,
1785                                              inst->group, components, !has_dest);
1786       }
1787    } else if (mode == MEMORY_MODE_CONSTANT) {
1788       assert(block); /* non-block loads not yet handled */
1789 
1790       sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
1791       desc = brw_dp_oword_block_rw_desc(devinfo, false, components, !has_dest);
1792    } else if (addr_size == LSC_ADDR_SIZE_A64) {
1793       assert(binding_type == LSC_ADDR_SURFTYPE_FLAT);
1794       assert(!dword_scattered);
1795 
1796       sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
1797 
1798       if (lsc_opcode_is_atomic(op)) {
1799          unsigned aop = lsc_op_to_legacy_atomic(op);
1800          if (lsc_opcode_is_atomic_float(op)) {
1801             desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
1802                                                         data_bit_size, aop,
1803                                                         has_dest);
1804          } else {
1805             desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
1806                                                   data_bit_size, aop,
1807                                                   has_dest);
1808          }
1809       } else if (block) {
1810          desc = brw_dp_a64_oword_block_rw_desc(devinfo, oword_aligned,
1811                                                components, !has_dest);
1812       } else if (byte_scattered) {
1813          desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
1814                                                   data_bit_size, !has_dest);
1815       } else {
1816          desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
1817                                                    components, !has_dest);
1818       }
1819    } else {
1820       assert(binding_type != LSC_ADDR_SURFTYPE_FLAT);
1821 
1822       sfid = surface_access ? HSW_SFID_DATAPORT_DATA_CACHE_1
1823                             : GFX7_SFID_DATAPORT_DATA_CACHE;
1824 
1825       if (lsc_opcode_is_atomic(op)) {
1826          unsigned aop = lsc_op_to_legacy_atomic(op);
1827          if (lsc_opcode_is_atomic_float(op)) {
1828             desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
1829                                                     aop, has_dest);
1830          } else {
1831             desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
1832                                               aop, has_dest);
1833          }
1834       } else if (block) {
1835          desc = brw_dp_oword_block_rw_desc(devinfo, oword_aligned,
1836                                            components, !has_dest);
1837       } else if (byte_scattered) {
1838          desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1839                                               data_bit_size, !has_dest);
1840       } else if (dword_scattered) {
1841          desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1842                                                !has_dest);
1843       } else {
1844          desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1845                                                components, !has_dest);
1846       }
1847    }
1848 
1849    assert(sfid);
1850 
1851    /* Update the original instruction. */
1852    inst->opcode = SHADER_OPCODE_SEND;
1853    inst->sfid = sfid;
1854    inst->mlen = mlen;
1855    inst->ex_mlen = ex_mlen;
1856    inst->header_size = header.file != BAD_FILE ? 1 : 0;
1857    inst->send_has_side_effects = has_side_effects;
1858    inst->send_is_volatile = !has_side_effects;
1859 
1860    if (block) {
1861       assert(inst->force_writemask_all);
1862       inst->exec_size = components > 8 ? 16 : 8;
1863    }
1864 
1865    inst->resize_sources(4);
1866 
1867    /* Set up descriptors */
1868    switch (binding_type) {
1869    case LSC_ADDR_SURFTYPE_FLAT:
1870       inst->src[0] = brw_imm_ud(0);
1871       inst->src[1] = brw_imm_ud(0);
1872       break;
1873    case LSC_ADDR_SURFTYPE_BSS:
1874       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1875       /* fall-through */
1876    case LSC_ADDR_SURFTYPE_SS:
1877       desc |= GFX9_BTI_BINDLESS;
1878 
1879       /* We assume that the driver provided the handle in the top 20 bits so
1880        * we can use the surface handle directly as the extended descriptor.
1881        */
1882       inst->src[0] = brw_imm_ud(0);
1883       inst->src[1] = binding;
1884       break;
1885    case LSC_ADDR_SURFTYPE_BTI:
1886       if (binding.file == IMM) {
1887          desc |= binding.ud & 0xff;
1888          inst->src[0] = brw_imm_ud(0);
1889          inst->src[1] = brw_imm_ud(0);
1890       } else {
1891          brw_reg tmp = ubld1.vgrf(BRW_TYPE_UD);
1892          ubld1.AND(tmp, binding, brw_imm_ud(0xff));
1893          inst->src[0] = component(tmp, 0);
1894          inst->src[1] = brw_imm_ud(0);
1895       }
1896       break;
1897    default:
1898       unreachable("Unknown surface type");
1899    }
1900 
1901    inst->desc = desc;
1902 
1903    /* Finally, the payloads */
1904    inst->src[2] = payload;
1905    inst->src[3] = payload2;
1906 }
1907 
1908 static void
1909 lower_lsc_varying_pull_constant_logical_send(const brw_builder &bld,
1910                                              fs_inst *inst)
1911 {
1912    const intel_device_info *devinfo = bld.shader->devinfo;
1913    ASSERTED const brw_compiler *compiler = bld.shader->compiler;
1914 
1915    brw_reg surface        = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
1916    brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
1917    brw_reg offset_B       = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
1918    brw_reg alignment_B    = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];
1919 
1920    /* We are switching the instruction from an ALU-like instruction to a
1921     * send-from-grf instruction.  Since sends can't handle strides or
1922     * source modifiers, we have to make a copy of the offset source.
1923     */
1924    brw_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);
1925 
1926    enum lsc_addr_surface_type surf_type =
1927       surface_handle.file == BAD_FILE ?
1928       LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;
1929 
1930    assert(alignment_B.file == IMM);
1931    unsigned alignment = alignment_B.ud;
1932 
1933    inst->opcode = SHADER_OPCODE_SEND;
1934    inst->sfid = GFX12_SFID_UGM;
1935    inst->resize_sources(3);
1936    inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
1937                        compiler->extended_bindless_surface_offset;
1938 
1939    assert(!compiler->indirect_ubos_use_sampler);
1940 
1941    inst->src[0] = brw_imm_ud(0);
1942    inst->src[2] = ubo_offset; /* payload */
1943 
1944    if (alignment >= 4) {
1945       inst->desc =
1946          lsc_msg_desc(devinfo, LSC_OP_LOAD,
1947                       surf_type, LSC_ADDR_SIZE_A32,
1948                       LSC_DATA_SIZE_D32,
1949                       4 /* num_channels */,
1950                       false /* transpose */,
1951                       LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1952       inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1953 
1954       setup_lsc_surface_descriptors(bld, inst, inst->desc,
1955                                     surface.file != BAD_FILE ?
1956                                     surface : surface_handle);
1957    } else {
1958       inst->desc =
1959          lsc_msg_desc(devinfo, LSC_OP_LOAD,
1960                       surf_type, LSC_ADDR_SIZE_A32,
1961                       LSC_DATA_SIZE_D32,
1962                       1 /* num_channels */,
1963                       false /* transpose */,
1964                       LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
1965       inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, inst->exec_size);
1966 
1967       setup_lsc_surface_descriptors(bld, inst, inst->desc,
1968                                     surface.file != BAD_FILE ?
1969                                     surface : surface_handle);
1970 
1971       /* The byte scattered messages can only read one dword at a time so
1972        * we have to duplicate the message 4 times to read the full vec4.
1973        * Hopefully, dead code will clean up the mess if some of them aren't
1974        * needed.
1975        */
1976       assert(inst->size_written == 16 * inst->exec_size);
1977       inst->size_written /= 4;
1978       for (unsigned c = 1; c < 4; c++) {
1979          /* Emit a copy of the instruction because we're about to modify
1980           * it.  Because this loop starts at 1, we will emit copies for the
1981           * first 3 and the final one will be the modified instruction.
1982           */
1983          bld.emit(*inst);
1984 
1985          /* Offset the source */
1986          inst->src[2] = bld.vgrf(BRW_TYPE_UD);
1987          bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
1988 
1989          /* Offset the destination */
1990          inst->dst = offset(inst->dst, bld, 1);
1991       }
1992    }
1993 }
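/* Sketch of the unaligned unrolling above (illustrative): a vec4 load at
 * ubo_offset becomes four single-dword loads,
 *
 *    dst+0 <- load(ubo_offset +  0)   (emitted copy)
 *    dst+1 <- load(ubo_offset +  4)   (emitted copy)
 *    dst+2 <- load(ubo_offset +  8)   (emitted copy)
 *    dst+3 <- load(ubo_offset + 12)   (the rewritten original)
 *
 * with dead-code elimination expected to drop any unused components.
 */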
1994 
1995 static void
1996 lower_varying_pull_constant_logical_send(const brw_builder &bld, fs_inst *inst)
1997 {
1998    const intel_device_info *devinfo = bld.shader->devinfo;
1999    const brw_compiler *compiler = bld.shader->compiler;
2000 
2001    brw_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
2002    brw_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
2003    brw_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
2004 
2005    /* We are switching the instruction from an ALU-like instruction to a
2006     * send-from-grf instruction.  Since sends can't handle strides or
2007     * source modifiers, we have to make a copy of the offset source.
2008     */
2009    brw_reg ubo_offset = bld.vgrf(BRW_TYPE_UD);
2010    bld.MOV(ubo_offset, offset_B);
2011 
2012    assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == IMM);
2013    unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;
2014 
2015    inst->opcode = SHADER_OPCODE_SEND;
2016    inst->mlen = inst->exec_size / 8;
2017    inst->resize_sources(3);
2018 
2019    /* src[0] & src[1] are filled by setup_surface_descriptors() */
2020    inst->src[2] = ubo_offset; /* payload */
2021 
2022    if (compiler->indirect_ubos_use_sampler) {
2023       const unsigned simd_mode =
2024          inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
2025                                 BRW_SAMPLER_SIMD_MODE_SIMD16;
2026       const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2027                                              GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
2028                                              simd_mode, 0);
2029 
2030       inst->sfid = BRW_SFID_SAMPLER;
2031       setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2032    } else if (alignment >= 4) {
2033       const uint32_t desc =
2034          brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
2035                                         4, /* num_channels */
2036                                         false   /* write */);
2037 
2038       inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2039       setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2040    } else {
2041       const uint32_t desc =
2042          brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
2043                                        32,     /* bit_size */
2044                                        false   /* write */);
2045 
2046       inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2047       setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2048 
2049       /* The byte scattered messages can only read one dword at a time so
2050        * we have to duplicate the message 4 times to read the full vec4.
2051        * Hopefully, dead code will clean up the mess if some of them aren't
2052        * needed.
2053        */
2054       assert(inst->size_written == 16 * inst->exec_size);
2055       inst->size_written /= 4;
2056       for (unsigned c = 1; c < 4; c++) {
2057          /* Emit a copy of the instruction because we're about to modify
2058           * it.  Because this loop starts at 1, we will emit copies for the
2059           * first 3 and the final one will be the modified instruction.
2060           */
2061          bld.emit(*inst);
2062 
2063          /* Offset the source */
2064          inst->src[2] = bld.vgrf(BRW_TYPE_UD);
2065          bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
2066 
2067          /* Offset the destination */
2068          inst->dst = offset(inst->dst, bld, 1);
2069       }
2070    }
2071 }
2072 
2073 static void
2074 lower_interpolator_logical_send(const brw_builder &bld, fs_inst *inst,
2075                                 const struct brw_wm_prog_key *wm_prog_key,
2076                                 const struct brw_wm_prog_data *wm_prog_data)
2077 {
2078    const intel_device_info *devinfo = bld.shader->devinfo;
2079 
2080    /* We have to send something */
2081    brw_reg payload = brw_vec8_grf(0, 0);
2082    unsigned mlen = 1;
2083 
2084    unsigned mode;
2085    switch (inst->opcode) {
2086    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2087       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2088       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
2089       break;
2090 
2091    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2092       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2093       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
2094       break;
2095 
2096    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2097       payload = inst->src[INTERP_SRC_OFFSET];
2098       mlen = 2 * inst->exec_size / 8;
2099       mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
2100       break;
2101 
2102    default:
2103       unreachable("Invalid interpolator instruction");
2104    }
2105 
2106    const bool dynamic_mode =
2107       inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
2108 
2109    brw_reg desc = inst->src[INTERP_SRC_MSG_DESC];
2110    uint32_t desc_imm =
2111       brw_pixel_interp_desc(devinfo,
2112                             /* Leave the mode at 0 if persample_dispatch is
2113                              * dynamic; it will be ORed in below.
2114                              */
2115                             dynamic_mode ? 0 : mode,
2116                             inst->pi_noperspective,
2117                             false /* coarse_pixel_rate */,
2118                             inst->exec_size, inst->group);
2119 
2120    if (wm_prog_data->coarse_pixel_dispatch == INTEL_ALWAYS) {
2121       desc_imm |= (1 << 15);
2122    } else if (wm_prog_data->coarse_pixel_dispatch == INTEL_SOMETIMES) {
2123       STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
2124       brw_reg orig_desc = desc;
2125       const brw_builder &ubld = bld.exec_all().group(8, 0);
2126       desc = ubld.vgrf(BRW_TYPE_UD);
2127       ubld.AND(desc, brw_dynamic_msaa_flags(wm_prog_data),
2128                brw_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));
2129 
2130       /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
2131       if (orig_desc.file == IMM) {
2132          desc_imm |= orig_desc.ud;
2133       } else {
2134          ubld.OR(desc, desc, orig_desc);
2135       }
2136    }
2137 
2138    /* If persample_dispatch is dynamic, select the interpolation mode
2139     * dynamically and OR it into the descriptor to complete the static part
2140     * generated by brw_pixel_interp_desc().
2141     *
2142     * Why does this work? If you look at the SKL PRMs, Volume 7:
2143     * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
2144     *
2145     *   - "Per Message Offset” Message Descriptor
2146     *   - “Sample Position Offset” Message Descriptor
2147     *
2148     * have different formats. Fortunately, a fragment shader dispatched at
2149     * pixel rate, will have gl_SampleID = 0 & gl_NumSamples = 1. So the value
2150     * we pack in “Sample Position Offset” will be a 0 and will cover the X/Y
2151     * components of "Per Message Offset”, which will give us the pixel offset 0x0.
2152     */
2153    if (dynamic_mode) {
2154       brw_reg orig_desc = desc;
2155       const brw_builder &ubld = bld.exec_all().group(8, 0);
2156       desc = ubld.vgrf(BRW_TYPE_UD);
2157 
2158       /* The predicate should have been built in brw_fs_nir.cpp when emitting
2159        * NIR code. This guarantees that we do not have incorrect interactions
2160        * with the flag register holding the predication result.
2161        */
2162       if (orig_desc.file == IMM) {
2163          /* Not using SEL here because we would generate an instruction with 2
2164           * immediate sources which is not supported by HW.
2165           */
2166          set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2167                            ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2168                                                      GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2169          set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2170                            ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
2171                                                      GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2172       } else {
2173          set_predicate_inv(BRW_PREDICATE_NORMAL, false,
2174                            ubld.OR(desc, orig_desc,
2175                                    brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
2176          set_predicate_inv(BRW_PREDICATE_NORMAL, true,
2177                            ubld.OR(desc, orig_desc,
2178                                    brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
2179       }
2180    }
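   /* Descriptor packing assumed by the two blocks above (sketch): the
    * interpolator mode occupies bits [13:12] of the message descriptor
    * (hence the << 12 shifts), and INTEL_MSAA_FLAG_COARSE_PI_MSG lines up
    * with the coarse-pixel bit (1 << 15), so both dynamic pieces can simply
    * be ORed into the otherwise-static descriptor value.
    */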
2181 
2182    inst->opcode = SHADER_OPCODE_SEND;
2183    inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
2184    inst->desc = desc_imm;
2185    inst->ex_desc = 0;
2186    inst->mlen = mlen;
2187    inst->ex_mlen = 0;
2188    inst->send_has_side_effects = false;
2189    inst->send_is_volatile = false;
2190 
2191    inst->resize_sources(3);
2192    inst->src[0] = component(desc, 0);
2193    inst->src[1] = brw_imm_ud(0); /* ex_desc */
2194    inst->src[2] = payload;
2195 }
2196 
2197 static void
2198 lower_btd_logical_send(const brw_builder &bld, fs_inst *inst)
2199 {
2200    const intel_device_info *devinfo = bld.shader->devinfo;
2201    brw_reg global_addr = inst->src[0];
2202    const brw_reg btd_record = inst->src[1];
2203 
2204    const unsigned unit = reg_unit(devinfo);
2205    const unsigned mlen = 2 * unit;
2206    const brw_builder ubld = bld.exec_all();
2207    brw_reg header = ubld.vgrf(BRW_TYPE_UD, 2 * unit);
2208 
2209    ubld.MOV(header, brw_imm_ud(0));
2210    switch (inst->opcode) {
2211    case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2212       assert(brw_type_size_bytes(global_addr.type) == 8 &&
2213              global_addr.stride == 0);
2214       global_addr.type = BRW_TYPE_UD;
2215       global_addr.stride = 1;
2216       ubld.group(2, 0).MOV(header, global_addr);
2217 
2218       /* XXX - There is a Registers Per Thread field in the BTD spawn
2219        *       header starting on Xe3, it doesn't appear to be needed
2220        *       by the hardware so we don't set it.  If it's ever
2221        *       needed though we will need some sort of reloc since
2222        *       we'll have to initialize it based on the prog_data
2223        *       structure of the callee.
2224        */
2225       break;
2226 
2227    case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2228       /* The bottom bit is the Stack ID release bit */
2229       ubld.group(1, 0).MOV(header, brw_imm_ud(1));
2230       break;
2231 
2232    default:
2233       unreachable("Invalid BTD message");
2234    }
2235 
2236    /* Stack IDs are always in R1 regardless of whether we're coming from a
2237     * bindless shader or a regular compute shader.
2238     */
2239    brw_reg stack_ids = retype(offset(header, bld, 1), BRW_TYPE_UW);
2240    bld.exec_all().MOV(stack_ids, retype(brw_vec8_grf(1 * unit, 0),
2241                                         BRW_TYPE_UW));
2242 
2243    unsigned ex_mlen = 0;
2244    brw_reg payload;
2245    if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
2246       ex_mlen = 2 * (inst->exec_size / 8);
2247       payload = bld.move_to_vgrf(btd_record, 1);
2248    } else {
2249       assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
2250       /* All these messages take a BTD and things complain if we don't provide
2251        * one for RETIRE.  However, it shouldn't ever actually get used so fill
2252        * it with zero.
2253        */
2254       ex_mlen = 2 * (inst->exec_size / 8);
2255       payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
2256    }
2257 
2258    /* Update the original instruction. */
2259    inst->opcode = SHADER_OPCODE_SEND;
2260    inst->mlen = mlen;
2261    inst->ex_mlen = ex_mlen;
2262    inst->header_size = 0; /* HW docs require has_header = false */
2263    inst->send_has_side_effects = true;
2264    inst->send_is_volatile = false;
2265 
2266    /* Set up SFID and descriptors */
2267    inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
2268    inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
2269                                    GEN_RT_BTD_MESSAGE_SPAWN);
2270    inst->resize_sources(4);
2271    inst->src[0] = brw_imm_ud(0); /* desc */
2272    inst->src[1] = brw_imm_ud(0); /* ex_desc */
2273    inst->src[2] = header;
2274    inst->src[3] = payload;
2275 }
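/* Message layout produced above (sketch, in units of reg_unit(devinfo)):
 *
 *    header reg 0:  DW0-1 = 64-bit global address for SPAWN, or DW0 bit 0
 *                   set as the Stack ID release flag for RETIRE
 *    header reg 1:  per-lane 16-bit stack IDs copied from R1
 *    payload:       the BTD record (SPAWN) or a zero filler (RETIRE)
 */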
2276 
2277 static void
2278 lower_trace_ray_logical_send(const brw_builder &bld, fs_inst *inst)
2279 {
2280    const intel_device_info *devinfo = bld.shader->devinfo;
2281    /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
2282     * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use UQ/Q
2283     * types on Gfx12.5, we need to tweak the stride with a value of 1 dword
2284     * so that the MOV operates on 2 components rather than twice the same
2285     * component.
2286     */
2287    const brw_reg bvh_level =
2288       inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == IMM ?
2289       inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
2290       bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
2291                        inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
2292    const brw_reg trace_ray_control =
2293       inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == IMM ?
2294       inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
2295       bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
2296                        inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
2297    const brw_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
2298    assert(synchronous_src.file == IMM);
2299    const bool synchronous = synchronous_src.ud;
2300 
2301    const unsigned unit = reg_unit(devinfo);
2302    const unsigned mlen = unit;
2303    const brw_builder ubld = bld.exec_all();
2304    brw_reg header = ubld.vgrf(BRW_TYPE_UD);
2305    ubld.MOV(header, brw_imm_ud(0));
2306 
2307    const brw_reg globals_addr = inst->src[RT_LOGICAL_SRC_GLOBALS];
2308    if (globals_addr.file != UNIFORM) {
2309       brw_reg addr_ud = retype(globals_addr, BRW_TYPE_UD);
2310       addr_ud.stride = 1;
2311       ubld.group(2, 0).MOV(header, addr_ud);
2312    } else {
2313       /* If the globals address comes from a uniform, do not do the SIMD2
2314        * optimization. This occurs in many Vulkan CTS tests.
2315        *
2316        * Many places in the late compiler, including but not limited to an
2317        * assertion in fs_visitor::assign_curb_setup, assume that all uses of a
2318        * UNIFORM will be uniform (i.e., <0,1,0>). The clever SIMD2
2319        * optimization violates that assumption.
2320        */
2321       ubld.group(1, 0).MOV(byte_offset(header, 0),
2322                            subscript(globals_addr, BRW_TYPE_UD, 0));
2323       ubld.group(1, 0).MOV(byte_offset(header, 4),
2324                            subscript(globals_addr, BRW_TYPE_UD, 1));
2325    }
2326 
2327    if (synchronous)
2328       ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));
2329 
2330    const unsigned ex_mlen = inst->exec_size / 8;
2331    brw_reg payload = bld.vgrf(BRW_TYPE_UD);
2332    if (bvh_level.file == IMM &&
2333        trace_ray_control.file == IMM) {
2334       uint32_t high = devinfo->ver >= 20 ? 10 : 9;
2335       bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, high, 8) |
2336                                   (bvh_level.ud & 0x7)));
2337    } else {
2338       bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
2339       bld.OR(payload, payload, bvh_level);
2340    }
2341 
2342    /* When doing synchronous traversal, the HW implicitly computes the
2343     * stack_id using the following formula :
2344     *
2345     *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
2346     *
2347     * Only in the asynchronous case we need to set the stack_id given from the
2348     * payload register.
2349     */
2350    if (!synchronous) {
2351       bld.AND(subscript(payload, BRW_TYPE_UW, 1),
2352               retype(brw_vec8_grf(1 * unit, 0), BRW_TYPE_UW),
2353               brw_imm_uw(0x7ff));
2354    }
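   /* Note on the 0x7ff mask above: the implicit stack_id quoted earlier
    * concatenates EUID[3:0], THREAD_ID[2:0] and SIMD_LANE_ID[3:0], i.e.
    * 4 + 3 + 4 = 11 bits, which is exactly the width 0x7ff preserves in
    * the asynchronous case.
    */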
2355 
2356    /* Update the original instruction. */
2357    inst->opcode = SHADER_OPCODE_SEND;
2358    inst->mlen = mlen;
2359    inst->ex_mlen = ex_mlen;
2360    inst->header_size = 0; /* HW docs require has_header = false */
2361    inst->send_has_side_effects = true;
2362    inst->send_is_volatile = false;
2363 
2364    /* Set up SFID and descriptors */
2365    inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
2366    inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
2367    inst->resize_sources(4);
2368    inst->src[0] = brw_imm_ud(0); /* desc */
2369    inst->src[1] = brw_imm_ud(0); /* ex_desc */
2370    inst->src[2] = header;
2371    inst->src[3] = payload;
2372 }
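/* Payload dword produced above (sketch): bits [2:0] carry the BVH level and
 * bits [9:8] (through [10:8] on Gfx20+, per the `high' bound) carry the
 * trace ray control value; the asynchronous path additionally packs the
 * 11-bit stack ID into the high word of the same dword.
 */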
2373 
2374 static void
2375 lower_get_buffer_size(const brw_builder &bld, fs_inst *inst)
2376 {
2377    const intel_device_info *devinfo = bld.shader->devinfo;
2378    /* Since we can only execute this instruction on uniform bti/surface
2379     * handles, brw_fs_nir.cpp should already have limited this to SIMD8 (SIMD16 on Xe2+).
2380     */
2381    assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));
2382 
2383    brw_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
2384    brw_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
2385    brw_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];
2386 
2387    inst->opcode = SHADER_OPCODE_SEND;
2388    inst->mlen = inst->exec_size / 8;
2389    inst->resize_sources(3);
2390    inst->ex_mlen = 0;
2391    inst->ex_desc = 0;
2392 
2393    /* src[0] & src[1] are filled by setup_surface_descriptors() */
2394    inst->src[2] = lod;
2395 
2396    const uint32_t return_format = GFX8_SAMPLER_RETURN_FORMAT_32BITS;
2397 
2398    const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
2399                                           GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
2400                                           BRW_SAMPLER_SIMD_MODE_SIMD8,
2401                                           return_format);
2402 
2403    inst->dst = retype(inst->dst, BRW_TYPE_UW);
2404    inst->sfid = BRW_SFID_SAMPLER;
2405    setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2406 }
2407 
2408 bool
2409 brw_lower_logical_sends(fs_visitor &s)
2410 {
2411    const intel_device_info *devinfo = s.devinfo;
2412    bool progress = false;
2413 
2414    foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
2415       const brw_builder ibld(&s, block, inst);
2416 
2417       switch (inst->opcode) {
2418       case FS_OPCODE_FB_WRITE_LOGICAL:
2419          assert(s.stage == MESA_SHADER_FRAGMENT);
2420          lower_fb_write_logical_send(ibld, inst,
2421                                      brw_wm_prog_data(s.prog_data),
2422                                      (const brw_wm_prog_key *)s.key,
2423                                      s.fs_payload());
2424          break;
2425 
2426       case FS_OPCODE_FB_READ_LOGICAL:
2427          lower_fb_read_logical_send(ibld, inst, brw_wm_prog_data(s.prog_data));
2428          break;
2429 
2430       case SHADER_OPCODE_TEX_LOGICAL:
2431       case SHADER_OPCODE_TXD_LOGICAL:
2432       case SHADER_OPCODE_TXF_LOGICAL:
2433       case SHADER_OPCODE_TXL_LOGICAL:
2434       case SHADER_OPCODE_TXS_LOGICAL:
2435       case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
2436       case FS_OPCODE_TXB_LOGICAL:
2437       case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
2438       case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
2439       case SHADER_OPCODE_TXF_MCS_LOGICAL:
2440       case SHADER_OPCODE_LOD_LOGICAL:
2441       case SHADER_OPCODE_TG4_LOGICAL:
2442       case SHADER_OPCODE_TG4_BIAS_LOGICAL:
2443       case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
2444       case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
2445       case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
2446       case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
2447       case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
2448       case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
2449          lower_sampler_logical_send(ibld, inst);
2450          break;
2451 
2452       case SHADER_OPCODE_GET_BUFFER_SIZE:
2453          lower_get_buffer_size(ibld, inst);
2454          break;
2455 
2456       case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
2457       case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
2458       case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
2459          if (devinfo->ver >= 20 ||
2460              (devinfo->has_lsc &&
2461               inst->src[MEMORY_LOGICAL_MODE].ud != MEMORY_MODE_TYPED))
2462             lower_lsc_memory_logical_send(ibld, inst);
2463          else
2464             lower_hdc_memory_logical_send(ibld, inst);
2465          break;
2466 
2467       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
2468          if (devinfo->has_lsc && !s.compiler->indirect_ubos_use_sampler)
2469             lower_lsc_varying_pull_constant_logical_send(ibld, inst);
2470          else
2471             lower_varying_pull_constant_logical_send(ibld, inst);
2472          break;
2473 
2474       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2475       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2476       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2477          lower_interpolator_logical_send(ibld, inst,
2478                                          (const brw_wm_prog_key *)s.key,
2479                                          brw_wm_prog_data(s.prog_data));
2480          break;
2481 
2482       case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
2483       case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
2484          lower_btd_logical_send(ibld, inst);
2485          break;
2486 
2487       case RT_OPCODE_TRACE_RAY_LOGICAL:
2488          lower_trace_ray_logical_send(ibld, inst);
2489          break;
2490 
2491       case SHADER_OPCODE_URB_READ_LOGICAL:
2492          if (devinfo->ver < 20)
2493             lower_urb_read_logical_send(ibld, inst);
2494          else
2495             lower_urb_read_logical_send_xe2(ibld, inst);
2496          break;
2497 
2498       case SHADER_OPCODE_URB_WRITE_LOGICAL:
2499          if (devinfo->ver < 20)
2500             lower_urb_write_logical_send(ibld, inst);
2501          else
2502             lower_urb_write_logical_send_xe2(ibld, inst);
2503 
2504          break;
2505 
2506       default:
2507          continue;
2508       }
2509 
2510       progress = true;
2511    }
2512 
2513    if (progress)
2514       s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2515 
2516    return progress;
2517 }
2518 
2519 /**
2520  * Turns the generic expression-style uniform pull constant load instruction
2521  * into a hardware-specific series of instructions for loading a pull
2522  * constant.
2523  *
2524  * The expression style allows the CSE pass before this to optimize out
2525  * repeated loads from the same offset, and gives the pre-register-allocation
2526  * scheduling full flexibility, while the conversion to native instructions
2527  * allows the post-register-allocation scheduler the best information
2528  * possible.
2529  *
2530  * Note that execution masking for setting up pull constant loads is special:
2531  * the channels that need to be written are unrelated to the current execution
2532  * mask, since a later instruction will use one of the result channels as a
2533  * source operand for all 8 or 16 of its channels.
2534  */
2535 bool
2536 brw_lower_uniform_pull_constant_loads(fs_visitor &s)
2537 {
2538    const intel_device_info *devinfo = s.devinfo;
2539    bool progress = false;
2540 
2541    foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
2542       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2543          continue;
2544 
2545       const brw_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
2546       const brw_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
2547       const brw_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
2548       const brw_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
2549       assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
2550       assert(offset_B.file == IMM);
2551       assert(size_B.file == IMM);
2552 
2553       if (devinfo->has_lsc) {
2554          const brw_builder ubld =
2555             brw_builder(&s, block, inst).group(8, 0).exec_all();
2556 
2557          const brw_reg payload = ubld.vgrf(BRW_TYPE_UD);
2558          ubld.MOV(payload, offset_B);
2559 
2560          inst->sfid = GFX12_SFID_UGM;
2561          inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
2562                                    surface_handle.file == BAD_FILE ?
2563                                    LSC_ADDR_SURFTYPE_BTI :
2564                                    LSC_ADDR_SURFTYPE_BSS,
2565                                    LSC_ADDR_SIZE_A32,
2566                                    LSC_DATA_SIZE_D32,
2567                                    inst->size_written / 4,
2568                                    true /* transpose */,
2569                                    LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
2570 
2571          /* Update the original instruction. */
2572          inst->opcode = SHADER_OPCODE_SEND;
2573          inst->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
2574          inst->send_ex_bso = surface_handle.file != BAD_FILE &&
2575                              s.compiler->extended_bindless_surface_offset;
2576          inst->ex_mlen = 0;
2577          inst->header_size = 0;
2578          inst->send_has_side_effects = false;
2579          inst->send_is_volatile = true;
2580          inst->exec_size = 1;
2581 
2582          /* Finally, the payload */
2583 
2584          inst->resize_sources(3);
2585          setup_lsc_surface_descriptors(ubld, inst, inst->desc,
2586                                        surface.file != BAD_FILE ?
2587                                        surface : surface_handle);
2588          inst->src[2] = payload;
2589 
2590          s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2591       } else {
2592          const brw_builder ubld = brw_builder(&s, block, inst).exec_all();
2593          brw_reg header = brw_builder(&s, 8).exec_all().vgrf(BRW_TYPE_UD);
2594 
         ubld.group(8, 0).MOV(header,
                              retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
         ubld.group(1, 0).MOV(component(header, 2),
                              brw_imm_ud(offset_B.ud / 16));

         inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
         inst->opcode = SHADER_OPCODE_SEND;
         inst->header_size = 1;
         inst->mlen = 1;

         uint32_t desc =
            brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
                                       size_B.ud / 4, false /* write */);

         inst->resize_sources(4);

         setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);

         inst->src[2] = header;
         inst->src[3] = brw_reg(); /* unused for reads */

         s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      }

      progress = true;
   }

   return progress;
}

bool
brw_lower_send_descriptors(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND &&
          inst->opcode != SHADER_OPCODE_SEND_GATHER)
         continue;

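      /* Any register-resident descriptor parts are assembled into address
       * registers below, so all of the setup is emitted as single-channel
       * NoMask ALU instructions.
       */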
      const brw_builder ubld = brw_builder(&s, block, inst).exec_all().group(1, 0);

      /* Descriptor */
      const unsigned rlen = inst->dst.is_null() ? 0 : inst->size_written / REG_SIZE;
      unsigned mlen = inst->mlen;
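      /* SEND_GATHER builds its payload from individual register sources
       * (everything past the first three), so the message length is
       * recomputed from the source count rather than taken from mlen.
       */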
      if (inst->opcode == SHADER_OPCODE_SEND_GATHER) {
         assert(inst->sources >= 3);
         mlen = (inst->sources - 3) * reg_unit(devinfo);
      }

      uint32_t desc_imm = inst->desc |
         brw_message_desc(devinfo, mlen, rlen, inst->header_size);

      assert(inst->src[0].file != BAD_FILE);
      assert(inst->src[1].file != BAD_FILE);

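      /* Fold the common descriptor bits computed above into an immediate
       * descriptor when possible; otherwise OR them into the indirect
       * descriptor address register that the send will read from.
       */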
      brw_reg desc = inst->src[0];
      if (desc.file == IMM) {
         inst->src[0] = brw_imm_ud(desc.ud | desc_imm);
      } else {
         brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
                                       BRW_ADDRESS_SUBREG_INDIRECT_DESC);
         ubld.OR(addr_reg, desc, brw_imm_ud(desc_imm));
         inst->src[0] = addr_reg;
      }

      /* Extended descriptor */
      brw_reg ex_desc = inst->src[1];
      uint32_t ex_desc_imm = inst->ex_desc |
         brw_message_ex_desc(devinfo, inst->ex_mlen);

      if (ex_desc.file == IMM)
         ex_desc_imm |= ex_desc.ud;

      bool needs_addr_reg = false;
      if (ex_desc.file != IMM)
         needs_addr_reg = true;
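      /* Prior to Gfx12, bits 15:12 of an extended descriptor cannot be
       * encoded as an instruction immediate, so descriptors with any of
       * those bits set also have to go through the address register.
       */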
      if (devinfo->ver < 12 && ex_desc.file == IMM &&
          (ex_desc_imm & INTEL_MASK(15, 12)) != 0)
         needs_addr_reg = true;

      if (inst->send_ex_bso) {
         needs_addr_reg = true;
         /* When using the extended bindless offset, the whole extended
          * descriptor is the surface handle.
          */
         ex_desc_imm = 0;
      } else {
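         /* With a register extended descriptor, the hardware takes the
          * SFID and EOT bits from the address register rather than from
          * the instruction, so fold them into the value here.
          */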
         if (needs_addr_reg)
            ex_desc_imm |= inst->sfid | inst->eot << 5;
      }

      if (needs_addr_reg) {
         brw_reg addr_reg = ubld.vaddr(BRW_TYPE_UD,
                                       BRW_ADDRESS_SUBREG_INDIRECT_EX_DESC);
         if (ex_desc.file == IMM)
            ubld.MOV(addr_reg, brw_imm_ud(ex_desc_imm));
         else if (ex_desc_imm == 0)
            ubld.MOV(addr_reg, ex_desc);
         else
            ubld.OR(addr_reg, ex_desc, brw_imm_ud(ex_desc_imm));
         inst->src[1] = addr_reg;
      } else {
         inst->src[1] = brw_imm_ud(ex_desc_imm);
      }

      progress = true;
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
   }

   return progress;
}