/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file brw_lower_logical_sends.cpp
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   fs_reg payload_sources[2];
   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(header_size),
                           BRW_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->mlen = header_size;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_read_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
   assert(inst->header_size == 0);

   /* Get the logical send arguments. */
   const fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];

   /* Calculate the total number of components of the payload. */
   const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));

   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the read to this value.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }
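
   /* For example, inst->offset == 2 OWords contributes 2 * 16 == 32 bytes
    * to the A32 flat address formed from the handle's low 24 bits.
    */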

   fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      fs_reg offsets_B = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.SHL(offsets_B, offsets, brw_imm_ud(4)); /* OWords -> Bytes */
      bld.ADD(payload, payload, offsets_B);
   }

   inst->sfid = BRW_SFID_URB;

   assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);

   inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             1 /* num_coordinates */,
                             LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, STORE, L1UC_L3UC),
                             false /* has_dest */);


   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = 0;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   const unsigned length = 1 + per_slot_present + channel_mask_present +
                           inst->components_read(URB_LOGICAL_SRC_DATA);

   fs_reg *payload_sources = new fs_reg[length];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(length),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < length; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = brw_null_reg();

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->mlen = length;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
lower_urb_write_logical_send_xe2(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
   const fs_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
      inst->src[URB_LOGICAL_SRC_DATA] : fs_reg(brw_imm_ud(0));
   assert(type_sz(src.type) == 4);

   /* Calculate the total number of components of the payload. */
   const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
   const unsigned src_sz = type_sz(src.type);

   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the write to this value.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, brw_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      fs_reg offsets_B = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.SHL(offsets_B, offsets, brw_imm_ud(4)); /* OWords -> Bytes */
      bld.ADD(payload, payload, offsets_B);
   }

   const fs_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
   unsigned mask = 0;

   if (cmask.file != BAD_FILE) {
      assert(cmask.file == IMM);
      assert(cmask.type == BRW_REGISTER_TYPE_UD);
      mask = cmask.ud >> 16;
   }
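
   /* The channel-mask immediate appears to carry the mask pre-shifted into
    * its high word (the position it occupies in the legacy URB write
    * header), so the shift above recovers the raw per-channel mask for the
    * LSC cmask encoding below.
    */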

   fs_reg payload2 = bld.move_to_vgrf(src, src_comps);
   const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;

   inst->sfid = BRW_SFID_URB;

   enum lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
   inst->desc = lsc_msg_desc_wcmask(devinfo, op, inst->exec_size,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             1 /* num_coordinates */,
                             LSC_DATA_SIZE_D32, src_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, STORE, L1UC_L3UC),
                             false /* has_dest */, mask);


   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0);
   inst->src[1] = brw_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    fs_reg *dst, fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
      assert(color.type == BRW_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}
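
/* Note: when key->clamp_fragment_color is set, the saturating MOVs above
 * clamp each float component to [0, 1] before it is packed into the FB
 * write payload (e.g. for GL's fragment color clamping).
 */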

static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_thread_payload &fs_payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const fs_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const fs_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const fs_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const fs_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const fs_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 11 &&
       (color1.file != BAD_FILE || key->nr_color_regions > 1)) {
      assert(devinfo->ver < 20);

      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                              BRW_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const fs_reg header_sources[2] = {
            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set computes stencil to render target */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(brw_vec1_grf(0, 0),
                                    BRW_REGISTER_TYPE_UD),
                             brw_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     BRW_REGISTER_TYPE_UW),
                              brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (fs_payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              fs_reg(brw_vec8_grf(fs_payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                    .annotate("FB write src0 alpha");
         const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      const fs_reg tmp(VGRF, bld.shader->alloc.allocate(reg_unit(devinfo)),
                       BRW_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since the mask is stored as unsigned words, one VGRF is
       * always 16 channels wide, but only the lower or upper 8 channels will
       * be used by the hardware when doing a SIMD8 write, depending on
       * whether we have selected the subspans for the first or second half
       * respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = BRW_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(tmp, BRW_REGISTER_TYPE_UW),
                           inst->group % (16 * reg_unit(devinfo))),
              sample_mask);

      for (unsigned i = 0; i < reg_unit(devinfo); i++)
         sources[length++] = byte_offset(tmp, REG_SIZE * i);
   }
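
   /* Retyping gl_SampleMask to UW with a doubled stride (above) reads only
    * the low 16-bit word of each 32-bit channel: channel i's dword starts
    * at byte 4 * i, and a <stride 2> UW region picks exactly those words.
    */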

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(bld.dispatch_width() == 8 * reg_unit(devinfo));

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15 * reg_unit(devinfo));

      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
      length++;
   }

   /* Send from the GRF */
   fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
   fs_inst *load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
   payload.nr = bld.shader->alloc.allocate(regs_written(load));
   load->dst = payload;
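
   /* The payload VGRF starts out with the placeholder number -1 and is only
    * allocated after LOAD_PAYLOAD is emitted, because the allocation size,
    * regs_written(load), is not known until the instruction exists.
    */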

   uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);

   inst->desc =
      (inst->group / 16) << 11 | /* rt slot group */
      brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                        0 /* coarse_rt_write */);

   fs_reg desc = brw_imm_ud(0);
   if (prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
      inst->desc |= (1 << 18);
   } else if (prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
      STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(desc, dynamic_msaa_flags(prog_data),
               brw_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
      desc = component(desc, 0);
   }
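
   /* Since INTEL_MSAA_FLAG_COARSE_RT_WRITES and the descriptor's coarse
    * write bit are both bit 18 (see the STATIC_ASSERT above), the masked
    * dynamic flag can be ORed into the descriptor as-is through src[0],
    * with no shifting required.
    */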

   uint32_t ex_desc = 0;
   if (devinfo->ver >= 11) {
      /* Set the "Render Target Index" and "Src0 Alpha Present" fields
       * in the extended message descriptor, in lieu of using a header.
       */
      ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;

      if (key->nr_color_regions == 0)
         ex_desc |= 1 << 20; /* Null Render Target */
   }
   inst->ex_desc = ex_desc;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->resize_sources(3);
   inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
   inst->src[0] = desc;
   inst->src[1] = brw_imm_ud(0);
   inst->src[2] = payload;
   inst->mlen = regs_written(load);
   inst->ex_mlen = 0;
   inst->header_size = header_size;
   inst->check_tdr = true;
   inst->send_has_side_effects = true;
}

static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                           BRW_REGISTER_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const fs_reg header_sources[] = {
         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly -- However
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits :
    *   - 14 : Stencil Present to Render Target
    *   - 13 : Source Depth Present to Render Target
    *   - 12 : oMask to Render Target
    *   - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        brw_imm_ud(~INTEL_MASK(14, 11)));

   inst->resize_sources(1);
   inst->src[0] = header;
   inst->opcode = FS_OPCODE_FB_READ;
   inst->mlen = length;
   inst->header_size = length;
}

static bool
is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
{
   return sampler.file != IMM || sampler.ud >= 16;
}
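
/* A sampler is "high" when its index cannot be encoded in the 4-bit sampler
 * field of the message descriptor: either the index is not an immediate or
 * it is >= 16.  Such messages need a header so the Sampler State Pointer
 * can be offset instead (see lower_sampler_logical_send below).
 */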

static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare, bool has_min_lod)
{
   switch (opcode) {
   case SHADER_OPCODE_TEX:
      if (devinfo->ver >= 20 && has_min_lod) {
         return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
                                 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
      } else {
         return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE;
      }
   case FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case SHADER_OPCODE_TXL:
      assert(!has_min_lod);
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case SHADER_OPCODE_TXL_LZ:
      assert(!has_min_lod);
      return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                              GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case SHADER_OPCODE_TXD:
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case SHADER_OPCODE_TXF:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_LZ:
      assert(!has_min_lod);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
   case SHADER_OPCODE_TXF_CMS_W:
      assert(!has_min_lod);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case SHADER_OPCODE_TXF_CMS:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
   case SHADER_OPCODE_TXF_UMS:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case SHADER_OPCODE_TXF_MCS:
      assert(!has_min_lod);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case SHADER_OPCODE_LOD:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_LOD;
   case SHADER_OPCODE_TG4:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(!has_min_lod);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case SHADER_OPCODE_TG4_OFFSET_LOD:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
   case SHADER_OPCODE_TG4_OFFSET_BIAS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B;
   case SHADER_OPCODE_TG4_BIAS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B;
   case SHADER_OPCODE_TG4_EXPLICIT_LOD:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L;
   case SHADER_OPCODE_TG4_IMPLICIT_LOD:
      assert(!has_min_lod);
      assert(devinfo->ver >= 20);
      return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C :
                              XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I;
   case SHADER_OPCODE_SAMPLEINFO:
      assert(!has_min_lod);
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst,
                               const fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   fs_reg *src_comps = new fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum brw_reg_type padding_payload_type =
         brw_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    BRW_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Expand the real sources if a component of the requested payload
       * type is larger than the real source component.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(fs_reg(), padding_payload_type);
         }
      }
   }

   fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}
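
/* For example, with a 16-bit payload type at SIMD8 on a 32-byte GRF, each
 * source component occupies 16 bytes, so (32 / 16) - 1 == 1 padding source
 * is inserted after every real source, aligning each parameter on a
 * register boundary as SIMD8H messages expect.
 */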

static bool
shader_opcode_needs_header(opcode op)
{
   switch (op) {
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TG4_OFFSET_BIAS:
   case SHADER_OPCODE_TG4_OFFSET_LOD:
   case SHADER_OPCODE_TG4_BIAS:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD:
   case SHADER_OPCODE_SAMPLEINFO:
      return true;
   default:
      break;
   }

   return false;
}

static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
                           const fs_reg &coordinate,
                           const fs_reg &shadow_c,
                           fs_reg lod, const fs_reg &lod2,
                           const fs_reg &min_lod,
                           const fs_reg &sample_index,
                           const fs_reg &mcs,
                           const fs_reg &surface,
                           const fs_reg &sampler,
                           const fs_reg &surface_handle,
                           const fs_reg &sampler_handle,
                           const fs_reg &tg4_offset,
                           unsigned payload_type_bit_size,
                           unsigned coord_components,
                           unsigned grad_components,
                           bool residency)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum brw_reg_type payload_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
   const enum brw_reg_type payload_unsigned_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
   const enum brw_reg_type payload_signed_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   fs_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/surface_handle and
    * sampler/sampler_handle.
    */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (shader_opcode_needs_header(op) || inst->offset != 0 || inst->eot ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler) ||
       residency) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
      for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
         sources[length++] = byte_offset(header, REG_SIZE * header_size);

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
      if (!inst->eot && reg_count < 4 * reg_width) {
         assert(reg_count % reg_width == 0);
         unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }
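
      /* E.g. a response of two channels (reg_count / reg_width == 2) yields
       * mask == 0b1100: channels 2 and 3 are flagged "don't write", and the
       * mask lands in bits 15:12 of the header via inst->offset.
       */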

      if (residency)
         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         if (compiler->use_bindless_sampler_offset) {
            assert(devinfo->ver >= 11);
            ubld1.OR(component(header, 3), sampler_handle, brw_imm_ud(1));
         } else {
            ubld1.MOV(component(header, 3), sampler_handle);
         }
      } else if (is_high_sampler(devinfo, sampler)) {
         fs_reg sampler_state_ptr =
            retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      brw_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == BRW_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(31, 5)));
      }
   }
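
   /* Both high-sampler paths above advance the Sampler State Pointer by 256
    * bytes per group of 16 samplers: e.g. sampler.ud == 20 adds
    * 16 * (20 / 16) * 16 == 256 bytes, and the descriptor later encodes the
    * remainder, sampler.ud % 16 == 4.  The non-immediate variant computes
    * the same offset as (sampler & 0xf0) << 4.
    */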

   /* Change the opcode to account for LOD being zero before the
    * switch-statement that emits sources based on the opcode.
    */
   if (lod.is_zero()) {
      if (op == SHADER_OPCODE_TXL)
         op = SHADER_OPCODE_TXL_LZ;
      else if (op == SHADER_OPCODE_TXF)
         op = SHADER_OPCODE_TXF_LZ;
   }

   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
    */
   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare,
                       min_lod.file != BAD_FILE);

   const bool min_lod_is_first = devinfo->ver >= 20 &&
      (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
       msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);

   if (min_lod_is_first) {
      assert(min_lod.file != BAD_FILE);
      bld.MOV(sources[length++], min_lod);
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TG4_BIAS:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD:
   case SHADER_OPCODE_TG4_OFFSET_LOD:
   case SHADER_OPCODE_TG4_OFFSET_BIAS:
   case SHADER_OPCODE_TXL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode on
       * Xe2+).
       */
      assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], payload_unsigned_type), lod);
      length++;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
      length++;
      break;
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
      /* On Gfx9 the parameters are intermixed: they are u, v, lod, r. */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);

      if (coord_components >= 2) {
         bld.MOV(retype(sources[length], payload_signed_type),
                 offset(coordinate, bld, 1));
      } else {
         sources[length] = brw_imm_d(0);
      }
      length++;

      if (op != SHADER_OPCODE_TXF_LZ) {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }

      for (unsigned i = 2; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      if (op == SHADER_OPCODE_TXF_UMS ||
          op == SHADER_OPCODE_TXF_CMS ||
          op == SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            /* The sampler always writes 4/8 registers' worth of data, but
             * for ld_mcs only the first two registers contain valid data.
             * So with a 16-bit payload we need to split each 32-bit register
             * into two 16-bit payload components.
             *
             * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
             * Shared Functions - 3D Sampler - Messages - Message Format:
             *
             *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
             */
            if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W) {
               fs_reg tmp = offset(mcs, bld, i);
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs :
                       subscript(tmp, payload_unsigned_type, 0));
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs :
                       subscript(tmp, payload_unsigned_type, 1));
            } else {
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs : offset(mcs, bld, i));
            }
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE && !min_lod_is_first) {
      /* Account for all of the missing coordinate sources */
      if (op == FS_OPCODE_TXB && devinfo->ver >= 20 &&
          inst->has_packed_lod_ai_src) {
         /* Bspec 64985:
          *
          * For sample_b sampler message format:
          *
          * SIMD16H/SIMD32H
          * Param Number   0     1  2  3  4  5
          * Param          BIAS  U  V  R  Ai MLOD
          *
          * SIMD16/SIMD32
          * Param Number   0        1  2  3  4
          * Param          BIAS_AI  U  V  R  MLOD
          */
         length += 3 - coord_components;
      } else if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * See bspec 45942, "Enable new message layout for cube array"
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == SHADER_OPCODE_TXD)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);

      /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
      if (devinfo->verx10 == 125 && op == FS_OPCODE_TXB &&
          !inst->shadow_compare)
         bld.MOV(sources[length++], min_lod);
   }

   const fs_reg src_payload =
      fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
             BRW_REGISTER_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
    * elements. In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE * reg_unit(devinfo));
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   unsigned simd_mode = 0;
   if (devinfo->ver < 20) {
      if (payload_type_bit_size == 16) {
         assert(devinfo->ver >= 11);
         simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
            GFX10_SAMPLER_SIMD_MODE_SIMD16H;
      } else {
         simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
            BRW_SAMPLER_SIMD_MODE_SIMD16;
      }
   } else {
      if (payload_type_bit_size == 16) {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
            XE2_SAMPLER_SIMD_MODE_SIMD32H;
      } else {
         simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
            XE2_SAMPLER_SIMD_MODE_SIMD32;
      }
   }

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   assert(msg_type == sampler_msg_type(devinfo, op, inst->shadow_compare,
                                       min_lod.file != BAD_FILE));

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      inst->desc = brw_sampler_desc(devinfo,
                                    GFX9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = component(desc, 0);
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, brw_imm_ud(0xfff));
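
      /* For x < 256, x * 0x101 == x | (x << 8), so a single MUL packs the
       * identical surface and sampler indices into both descriptor fields;
       * the AND keeps only bits 11:0, where the binding table index (7:0)
       * and sampler index (11:8) live.
       */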
1194 
1195       inst->src[0] = component(desc, 0);
1196       inst->src[1] = brw_imm_ud(0); /* ex_desc */
1197    }
1198 
1199    inst->ex_desc = 0;
1200 
1201    inst->src[2] = src_payload;
1202    inst->resize_sources(3);
1203 
1204    if (inst->eot) {
1205       /* EOT sampler messages don't make sense to split because it would
1206        * involve ending half of the thread early.
1207        */
1208       assert(inst->group == 0);
1209       /* We need to use SENDC for EOT sampler messages */
1210       inst->check_tdr = true;
1211       inst->send_has_side_effects = true;
1212    }
1213 
1214    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
1215    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
1216 }
1217 
1218 static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info * devinfo,opcode op,const fs_inst * inst)1219 get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
1220                                       opcode op, const fs_inst *inst)
1221 {
1222    assert(inst);
1223    const fs_reg *src = inst->src;
1224    unsigned src_type_size = 0;
1225 
1226    /* All sources need to have the same size, therefore seek the first valid
1227     * and take the size from there.
1228     */
1229    for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1230       if (src[i].file != BAD_FILE) {
1231          src_type_size = brw_reg_type_to_size(src[i].type);
1232          break;
1233       }
1234    }
1235 
1236    assert(src_type_size == 2 || src_type_size == 4);
1237 
1238 #ifndef NDEBUG
1239    /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
1240     * compressed multisampled surfaces. There the payload contains MCS data
1241     * which is already in 16-bits unlike the other parameters that need forced
1242     * conversion.
1243     */
1244    if (devinfo->verx10 < 125 ||
1245        (op != SHADER_OPCODE_TXF_CMS_W &&
1246         op != SHADER_OPCODE_TXF_CMS)) {
1247       for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1248          assert(src[i].file == BAD_FILE ||
1249                 brw_reg_type_to_size(src[i].type) == src_type_size);
1250       }
1251    }
1252 #endif
1253 
1254    if (devinfo->verx10 < 125)
1255       return src_type_size * 8;
1256 
1257    /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
1258     * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
1259     * Format [GFX12:HAS:1209977870] *
1260     *
1261     *  ld2dms_w       SIMD8H and SIMD16H Only
1262     *  ld_mcs         SIMD8H and SIMD16H Only
1263     *  ld2dms         REMOVEDBY(GEN:HAS:1406788836)
1264     */
1265 
1266    if (op == SHADER_OPCODE_TXF_CMS_W ||
1267        op == SHADER_OPCODE_TXF_CMS ||
1268        op == SHADER_OPCODE_TXF_UMS ||
1269        op == SHADER_OPCODE_TXF_MCS ||
1270        (op == FS_OPCODE_TXB && !inst->has_packed_lod_ai_src &&
1271         devinfo->ver >= 20))
1272       src_type_size = 2;
1273 
1274    return src_type_size * 8;
1275 }
1276 
1277 static void
lower_sampler_logical_send(const fs_builder & bld,fs_inst * inst,opcode op)1278 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
1279 {
1280    const intel_device_info *devinfo = bld.shader->devinfo;
1281    const fs_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
1282    const fs_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
1283    const fs_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
1284    const fs_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
1285    const fs_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
1286    const fs_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
1287    const fs_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
1288    const fs_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
1289    const fs_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
1290    const fs_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
1291    const fs_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
1292    const fs_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
1293    assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
1294    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
1295    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
1296    const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
1297    assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1298    const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1299 
1300    const unsigned msg_payload_type_bit_size =
1301       get_sampler_msg_payload_type_bit_size(devinfo, op, inst);
1302 
1303    /* 16-bit payloads are available only on gfx11+ */
1304    assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
1305 
1306    lower_sampler_logical_send(bld, inst, op, coordinate,
1307                               shadow_c, lod, lod2, min_lod,
1308                               sample_index,
1309                               mcs, surface, sampler,
1310                               surface_handle, sampler_handle,
1311                               tg4_offset,
1312                               msg_payload_type_bit_size,
1313                               coord_components, grad_components,
1314                               residency);
1315 }
1316 
/**
 * Predicate the specified instruction on the vector mask.
 */
static void
emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_builder ubld = bld.exec_all().group(1, 0);

   const fs_visitor &s = *bld.shader;
   const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
   ubld.UNDEF(vector_mask);
   ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
   const unsigned subreg = sample_mask_flag_subreg(s);

   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      assert(s.devinfo->ver < 20);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

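/**
 * Fill out the descriptor (src[0]) and extended descriptor (src[1]) of a
 * SEND from either an immediate or indirect binding table index, or a
 * bindless surface handle.
 */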
static void
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
                          const fs_reg &surface, const fs_reg &surface_handle)
{
   const brw_compiler *compiler = bld.shader->compiler;

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   if (surface.file == IMM) {
      inst->desc = desc | (surface.ud & 0xff);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      inst->desc = desc | GFX9_BTI_BINDLESS;
      inst->src[0] = brw_imm_ud(0);

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
   } else {
      inst->desc = desc;
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, brw_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }
}

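/**
 * LSC counterpart of setup_surface_descriptors(): derive the extended
 * descriptor from the address type encoded in the LSC message descriptor
 * (BSS/SS handle, BTI shifted into the top byte, or 0 for flat addresses).
 */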
static void
setup_lsc_surface_descriptors(const fs_builder &bld, fs_inst *inst,
                              uint32_t desc, const fs_reg &surface)
{
   const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
   const brw_compiler *compiler = bld.shader->compiler;

   inst->src[0] = brw_imm_ud(0); /* desc */

   enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
   switch (surf_type) {
   case LSC_ADDR_SURFTYPE_BSS:
      inst->send_ex_bso = compiler->extended_bindless_surface_offset;
      /* fall-through */
   case LSC_ADDR_SURFTYPE_SS:
      assert(surface.file != BAD_FILE);
      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface, BRW_REGISTER_TYPE_UD);
      break;

   case LSC_ADDR_SURFTYPE_BTI:
      assert(surface.file != BAD_FILE);
      if (surface.file == IMM) {
         inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
      } else {
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(tmp, surface, brw_imm_ud(24));
         inst->src[1] = component(tmp, 0);
      }
      break;

   case LSC_ADDR_SURFTYPE_FLAT:
      inst->src[1] = brw_imm_ud(0);
      break;

   default:
      unreachable("Invalid LSC surface address type");
   }
}

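/**
 * Lower a logical surface read/write/atomic into a legacy data-port SEND:
 * build the payload (with an optional scratch header for stateless
 * messages), pick the SFID, and encode the message descriptor.
 */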
static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool is_typed_access =
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;

   const bool is_surface_access = is_typed_access ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_ud(0xffffffff));

   fs_reg header;
   if (is_stateless) {
      assert(!is_surface_access);
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0;
   if (src.file == BAD_FILE || header.file == BAD_FILE) {
      /* We have split sends on gfx9 and above */
      if (header.file == BAD_FILE) {
         payload = bld.move_to_vgrf(addr, addr_sz);
         payload2 = bld.move_to_vgrf(src, src_sz);
         mlen = addr_sz * (inst->exec_size / 8);
         ex_mlen = src_sz * (inst->exec_size / 8);
      } else {
         assert(src.file == BAD_FILE);
         payload = header;
         payload2 = bld.move_to_vgrf(addr, addr_sz);
         mlen = header_sz;
         ex_mlen = addr_sz * (inst->exec_size / 8);
      }
   } else {
      /* Allocate space for the payload. */
      const unsigned sz = header_sz + addr_sz + src_sz;
      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
      fs_reg *const components = new fs_reg[sz];
      unsigned n = 0;

      /* Construct the payload. */
      if (header.file != BAD_FILE)
         components[n++] = header;

      for (unsigned i = 0; i < addr_sz; i++)
         components[n++] = offset(addr, bld, i);

      for (unsigned i = 0; i < src_sz; i++)
         components[n++] = offset(src, bld, i);

      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

      delete[] components;
   }

   /* Predicate the instruction on the sample mask if no header is
    * provided.
    */
   if ((header.file == BAD_FILE || !is_surface_access) &&
       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

   uint32_t sfid;
   switch (inst->opcode) {
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            false  /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            true   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      if (lsc_opcode_is_atomic_float((enum lsc_opcode) arg.ud)) {
         desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                 lsc_op_to_legacy_atomic(arg.ud),
                                                 !inst->dst.is_null());
      } else {
         desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                           lsc_op_to_legacy_atomic(arg.ud),
                                           !inst->dst.is_null());
      }
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          false   /* write */);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          true    /* write */);
      break;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
                                      lsc_op_to_legacy_atomic(arg.ud),
                                      !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;
   inst->send_ex_bso = surface_handle.file != BAD_FILE &&
                       compiler->extended_bindless_surface_offset;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

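/* Translate a data size in bits to the LSC message encoding; sub-dword
 * sizes use the encodings that occupy a full 32-bit dword (D8U32/D16U32).
 */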
static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)
{
   switch (bit_size / 8) {
   case 1:  return LSC_DATA_SIZE_D8U32;
   case 2:  return LSC_DATA_SIZE_D16U32;
   case 4:  return LSC_DATA_SIZE_D32;
   case 8:  return LSC_DATA_SIZE_D64;
   default:
      unreachable("Unsupported data size.");
   }
}

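/**
 * Lower a logical surface instruction into an LSC (gfx12.5+) SEND, routed
 * to either the SLM or UGM SFID depending on the surface.
 */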
static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
   const unsigned src_sz = type_sz(src.type);
   const unsigned dst_sz = type_sz(inst->dst.type);

   const bool has_side_effects = inst->has_side_effects();

   unsigned ex_mlen = 0;
   fs_reg payload, payload2;
   payload = bld.move_to_vgrf(addr, addr_sz);
   if (src.file != BAD_FILE) {
      payload2 = bld.move_to_vgrf(src, src_comps);
      ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
   }

   /* Predicate the instruction on the sample mask if needed */
   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_ud(0xffffffff));
   if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

   if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      inst->sfid = GFX12_SFID_SLM;
   else
      inst->sfid = GFX12_SFID_UGM;

   /* We should have exactly one of surface and surface_handle. For scratch
    * messages generated by brw_fs_nir.cpp we also allow a special value to
    * know what heap base we should use in STATE_BASE_ADDRESS (SS = Surface
    * State Offset, or BSS = Bindless Surface State Offset).
    */
   bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS;
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) ||
          (non_bindless && surface_handle.file != BAD_FILE));

   enum lsc_addr_surface_type surf_type;
   if (surface_handle.file != BAD_FILE) {
      if (surface.file == BAD_FILE) {
         assert(!non_bindless);
         surf_type = LSC_ADDR_SURFTYPE_BSS;
      } else {
         assert(surface.file == IMM &&
                (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS));
         surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS;
      }
   } else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      surf_type = LSC_ADDR_SURFTYPE_FLAT;
   else
      surf_type = LSC_ADDR_SURFTYPE_BTI;

   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode = (enum lsc_opcode) arg.ud;

      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(dst_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1UC_L3WB),
                                !inst->dst.is_null());
      break;
   }
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

   inst->resize_sources(4);

   if (non_bindless) {
      inst->src[0] = brw_imm_ud(0);     /* desc */
      inst->src[1] = surface_handle;    /* ex_desc */
   } else {
      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);
   }

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

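/**
 * Lower an OWord block read/write into an LSC transposed (block) message,
 * which executes as SIMD1 with a scalar address payload.
 */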
static void
lower_lsc_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const brw_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);
   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

   fs_builder ubld = bld.exec_all().group(1, 0);
   fs_reg stateless_ex_desc;
   if (is_stateless) {
      stateless_ex_desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(stateless_ex_desc,
               retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
               brw_imm_ud(INTEL_MASK(31, 10)));
   }

   fs_reg data;
   if (write) {
      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
      data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
   }

   inst->opcode = SHADER_OPCODE_SEND;
   if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      inst->sfid = GFX12_SFID_SLM;
   else
      inst->sfid = GFX12_SFID_UGM;
   const enum lsc_addr_surface_type surf_type =
      inst->sfid == GFX12_SFID_SLM ?
      LSC_ADDR_SURFTYPE_FLAT :
      surface.file == BAD_FILE ?
      LSC_ADDR_SURFTYPE_BSS : LSC_ADDR_SURFTYPE_BTI;
   inst->desc = lsc_msg_desc(devinfo,
                             write ? LSC_OP_STORE : LSC_OP_LOAD,
                             1 /* exec_size */,
                             surf_type,
                             LSC_ADDR_SIZE_A32,
                             1 /* num_coordinates */,
                             LSC_DATA_SIZE_D32,
                             arg.ud /* num_channels */,
                             true /* transpose */,
                             LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                             !write /* has_dest */);

   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->size_written = lsc_msg_desc_dest_len(devinfo, inst->desc) * REG_SIZE;
   inst->exec_size = 1;
   inst->ex_mlen = write ? DIV_ROUND_UP(arg.ud, 8) : 0;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

   inst->resize_sources(4);

   if (stateless_ex_desc.file != BAD_FILE) {
      inst->src[0] = brw_imm_ud(0);     /* desc */
      inst->src[1] = stateless_ex_desc; /* ex_desc */
   } else {
      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);
   }
   inst->src[2] = addr;          /* payload */
   inst->src[3] = data;          /* payload2 */
}

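/**
 * Lower an OWord block read/write into a legacy data-port SEND whose
 * address lives in a message header (see MH_A32_GO and MH_BTS_GO).
 */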
static void
lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);
   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   const bool align_16B =
      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

   /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
   fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);

   if (is_stateless)
      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
   else
      ubld.MOV(header, brw_imm_d(0));

   /* Address in OWord units when aligned to OWords. */
   if (align_16B)
      ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
   else
      ubld.group(1, 0).MOV(component(header, 2), addr);

   fs_reg data;
   unsigned ex_mlen = 0;
   if (write) {
      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
      data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
      ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
   }

   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = 1;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 1;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;

   const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
                                                    arg.ud, write);
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   inst->src[2] = header;
   inst->src[3] = data;
}

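/**
 * Build the one-register A64 OWord block message header, placing the
 * 64-bit address in its first two dwords.
 */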
static fs_reg
emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
{
   const fs_builder ubld = bld.exec_all().group(8, 0);

   assert(type_sz(addr.type) == 8 && addr.stride == 0);

   fs_reg expanded_addr = addr;
   if (addr.file == UNIFORM) {
      /* We can't do stride 1 with the UNIFORM file; it requires stride 0 */
      expanded_addr = ubld.vgrf(BRW_REGISTER_TYPE_UQ);
      expanded_addr.stride = 0;
      ubld.MOV(expanded_addr, retype(addr, BRW_REGISTER_TYPE_UQ));
   }

   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));

   /* Use a 2-wide MOV to fill out the address */
   fs_reg addr_vec2 = expanded_addr;
   addr_vec2.type = BRW_REGISTER_TYPE_UD;
   addr_vec2.stride = 1;
   ubld.group(2, 0).MOV(header, addr_vec2);

   return header;
}

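/**
 * Predicate a fragment-shader send on the vector mask when helper
 * invocations are required, or on the sample mask for side-effecting
 * instructions when they are not.
 */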
static void
emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
   const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;

   /* If we're a fragment shader, we have to predicate with the sample mask
    * to avoid helper invocations in instructions with side effects, unless
    * they are explicitly required.
    *
    * There are also special cases when we actually want to run on helpers
    * (ray queries).
    */
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
   if (enable_helpers)
      emit_predicate_on_vector_mask(bld, inst);
   else if (inst->has_side_effects())
      brw_emit_predicate_on_sample_mask(bld, inst);
}

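/**
 * Lower a logical A64 (stateless, 64-bit address) instruction into an LSC
 * UGM SEND with a flat address payload.
 */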
static void
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_sz = type_sz(src.type);
   const unsigned dst_sz = type_sz(inst->dst.type);

   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
   fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
                            BRW_REGISTER_TYPE_UD);
   unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;

   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode = (enum lsc_opcode) arg;
      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(dst_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE(devinfo, STORE, L1UC_L3WB),
                                !inst->dst.is_null());
      break;
   }
   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      inst->exec_size = 1;
      inst->desc = lsc_msg_desc(devinfo,
                                LSC_OP_LOAD,
                                1 /* exec_size */,
                                LSC_ADDR_SURFTYPE_FLAT,
                                LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                arg /* num_channels */,
                                true /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      inst->exec_size = 1;
      inst->desc = lsc_msg_desc(devinfo,
                                LSC_OP_STORE,
                                1 /* exec_size */,
                                LSC_ADDR_SURFTYPE_FLAT,
                                LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                arg /* num_channels */,
                                true /* transpose */,
                                LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                false /* has_dest */);
      break;
   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

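/**
 * Lower a logical A64 instruction into a legacy data-port SEND, either as
 * a header-based OWord block message or as a split send with the address
 * and data in separate payloads.
 */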
static void
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   const fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0, header_size = 0;
   if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {

      /* OWORD messages only take a scalar address in a header */
      mlen = 1;
      header_size = 1;
      payload = emit_a64_oword_block_header(bld, addr);

      if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
         ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
         payload2 = retype(bld.move_to_vgrf(src, src_comps),
                           BRW_REGISTER_TYPE_UD);
      }
   } else {
      /* On Skylake and above, we have SENDS */
      mlen = 2 * (inst->exec_size / 8);
      ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
      payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
      payload2 = retype(bld.move_to_vgrf(src, src_comps),
                        BRW_REGISTER_TYPE_UD);
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                false  /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                true   /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            false,   /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            true     /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               false  /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               true   /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      if (lsc_opcode_is_atomic_float((enum lsc_opcode) arg)) {
         desc =
            brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                 type_sz(inst->dst.type) * 8,
                                                 lsc_op_to_legacy_atomic(arg),
                                                 !inst->dst.is_null());
      } else {
         desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
                                               type_sz(inst->dst.type) * 8,
                                               lsc_op_to_legacy_atomic(arg),
                                               !inst->dst.is_null());
      }
      break;

   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_size;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

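/**
 * Lower a varying-offset pull-constant (UBO) load into an LSC UGM SEND.
 * Loads aligned to at least 4 bytes fetch a whole vec4 in one message;
 * smaller alignments fall back to four single-dword loads.
 */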
static void
lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
                                             fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   ASSERTED const brw_compiler *compiler = bld.shader->compiler;

   fs_reg surface        = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
   fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
   fs_reg offset_B       = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
   fs_reg alignment_B    = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];

   /* We are switching the instruction from an ALU-like instruction to a
    * send-from-grf instruction.  Since sends can't handle strides or
    * source modifiers, we have to make a copy of the offset source.
    */
   fs_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);

   enum lsc_addr_surface_type surf_type =
      surface_handle.file == BAD_FILE ?
      LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;

   assert(alignment_B.file == BRW_IMMEDIATE_VALUE);
   unsigned alignment = alignment_B.ud;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(3);
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

   assert(!compiler->indirect_ubos_use_sampler);

   inst->src[0] = brw_imm_ud(0);
   inst->src[2] = ubo_offset; /* payload */

   if (alignment >= 4) {
      inst->desc =
         lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                      surf_type, LSC_ADDR_SIZE_A32,
                      1 /* num_coordinates */,
                      LSC_DATA_SIZE_D32,
                      4 /* num_channels */,
                      false /* transpose */,
                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                      true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);

      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);
   } else {
      inst->desc =
         lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                      surf_type, LSC_ADDR_SIZE_A32,
                      1 /* num_coordinates */,
                      LSC_DATA_SIZE_D32,
                      1 /* num_channels */,
                      false /* transpose */,
                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                      true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);

      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);

      /* The byte scattered messages can only read one dword at a time so
       * we have to duplicate the message 4 times to read the full vec4.
       * Hopefully, dead-code elimination will clean up the mess if some of
       * them aren't needed.
       */
      assert(inst->size_written == 16 * inst->exec_size);
      inst->size_written /= 4;
      for (unsigned c = 1; c < 4; c++) {
         /* Emit a copy of the instruction because we're about to modify
          * it.  Because this loop starts at 1, we will emit copies for the
          * first 3 and the final one will be the modified instruction.
          */
         bld.emit(*inst);

         /* Offset the source */
         inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

         /* Offset the destination */
         inst->dst = offset(inst->dst, bld, 1);
      }
   }
}

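/**
 * Pre-LSC version of the varying-offset pull-constant load, using the
 * sampler or the data cache depending on the compiler configuration.
 */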
static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_compiler *compiler = bld.shader->compiler;

   fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
   fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
   fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];

   /* We are switching the instruction from an ALU-like instruction to a
    * send-from-grf instruction.  Since sends can't handle strides or
    * source modifiers, we have to make a copy of the offset source.
    */
   fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.MOV(ubo_offset, offset_B);

   assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == BRW_IMMEDIATE_VALUE);
   unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = inst->exec_size / 8;
   inst->resize_sources(3);

   /* src[0] & src[1] are filled by setup_surface_descriptors() */
   inst->src[2] = ubo_offset; /* payload */

   if (compiler->indirect_ubos_use_sampler) {
      const unsigned simd_mode =
         inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                BRW_SAMPLER_SIMD_MODE_SIMD16;
      const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
                                             GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
                                             simd_mode, 0);

      inst->sfid = BRW_SFID_SAMPLER;
      setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
   } else if (alignment >= 4) {
      const uint32_t desc =
         brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                        4, /* num_channels */
                                        false   /* write */);

      inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
      setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
   } else {
      const uint32_t desc =
         brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                       32,     /* bit_size */
                                       false   /* write */);

      inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

      /* The byte scattered messages can only read one dword at a time so
       * we have to duplicate the message 4 times to read the full vec4.
       * Hopefully, dead-code elimination will clean up the mess if some of
       * them aren't needed.
       */
      assert(inst->size_written == 16 * inst->exec_size);
      inst->size_written /= 4;
      for (unsigned c = 1; c < 4; c++) {
         /* Emit a copy of the instruction because we're about to modify
          * it.  Because this loop starts at 1, we will emit copies for the
          * first 3 and the final one will be the modified instruction.
          */
         bld.emit(*inst);

         /* Offset the source */
         inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

         /* Offset the destination */
         inst->dst = offset(inst->dst, bld, 1);
      }
   }
}

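/**
 * Lower a logical pixel-interpolator query into a SEND, composing the mode
 * and coarse-pixel bits of the descriptor dynamically when the dispatch
 * mode is only known at run time.
 */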
2448 static void
lower_interpolator_logical_send(const fs_builder & bld,fs_inst * inst,const struct brw_wm_prog_key * wm_prog_key,const struct brw_wm_prog_data * wm_prog_data)2449 lower_interpolator_logical_send(const fs_builder &bld, fs_inst *inst,
2450                                 const struct brw_wm_prog_key *wm_prog_key,
2451                                 const struct brw_wm_prog_data *wm_prog_data)
2452 {
2453    const intel_device_info *devinfo = bld.shader->devinfo;
2454 
2455    /* We have to send something */
2456    fs_reg payload = brw_vec8_grf(0, 0);
2457    unsigned mlen = 1;
2458 
2459    unsigned mode;
2460    switch (inst->opcode) {
2461    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2462       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2463       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
2464       break;
2465 
2466    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2467       assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
2468       mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
2469       break;
2470 
2471    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2472       payload = inst->src[INTERP_SRC_OFFSET];
2473       mlen = 2 * inst->exec_size / 8;
2474       mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
2475       break;
2476 
2477    default:
2478       unreachable("Invalid interpolator instruction");
2479    }
2480 
2481    const bool dynamic_mode =
2482       inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;
2483 
2484    fs_reg desc = inst->src[INTERP_SRC_MSG_DESC];
2485    uint32_t desc_imm =
2486       brw_pixel_interp_desc(devinfo,
2487                             /* Leave the mode at 0 if persample_dispatch is
2488                              * dynamic, it will be ORed in below.
2489                              */
2490                             dynamic_mode ? 0 : mode,
2491                             inst->pi_noperspective,
2492                             false /* coarse_pixel_rate */,
2493                             inst->exec_size, inst->group);
2494 
2495    if (wm_prog_data->coarse_pixel_dispatch == BRW_ALWAYS) {
2496       desc_imm |= (1 << 15);
2497    } else if (wm_prog_data->coarse_pixel_dispatch == BRW_SOMETIMES) {
2498       STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
2499       fs_reg orig_desc = desc;
2500       const fs_builder &ubld = bld.exec_all().group(8, 0);
2501       desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
2502       ubld.AND(desc, dynamic_msaa_flags(wm_prog_data),
2503                brw_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));
2504 
2505       /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
2506       if (orig_desc.file == IMM) {
2507          desc_imm |= orig_desc.ud;
2508       } else {
2509          ubld.OR(desc, desc, orig_desc);
2510       }
2511    }
2512 
2513    /* If persample_dispatch is dynamic, select the interpolation mode
2514     * dynamically and OR into the descriptor to complete the static part
2515     * generated by brw_pixel_interp_desc().
2516     *
2517     * Why does this work? If you look at the SKL PRMs, Volume 7:
2518     * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
2519     *
2520     *   - "Per Message Offset” Message Descriptor
2521     *   - “Sample Position Offset” Message Descriptor
2522     *
2523     * have different formats. Fortunately, a fragment shader dispatched at
2524     * pixel rate, will have gl_SampleID = 0 & gl_NumSamples = 1. So the value
2525     * we pack in “Sample Position Offset” will be a 0 and will cover the X/Y
2526     * components of "Per Message Offset”, which will give us the pixel offset 0x0.
2527     */
   if (dynamic_mode) {
      fs_reg orig_desc = desc;
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);

      /* The predicate should have been built in brw_fs_nir.cpp when emitting
       * NIR code. This guarantees that we do not have incorrect interactions
       * with the flag register holding the predication result.
       */
      if (orig_desc.file == IMM) {
         /* Not using SEL here because we would generate an instruction with 2
          * immediate sources, which is not supported by HW.
          */
         set_predicate_inv(BRW_PREDICATE_NORMAL, false,
                           ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
                                                     GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                           ubld.MOV(desc, brw_imm_ud(orig_desc.ud |
                                                     GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
      } else {
         set_predicate_inv(BRW_PREDICATE_NORMAL, false,
                           ubld.OR(desc, orig_desc,
                                   brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                           ubld.OR(desc, orig_desc,
                                   brw_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
      }
   }

   inst->opcode = SHADER_OPCODE_SEND;
   inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
   inst->desc = desc_imm;
   inst->ex_desc = 0;
   inst->mlen = mlen;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = false;
   inst->send_is_volatile = false;

   inst->resize_sources(3);
   inst->src[0] = component(desc, 0);
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
}

static void
lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   fs_reg global_addr = inst->src[0];
   const fs_reg btd_record = inst->src[1];

   const unsigned unit = reg_unit(devinfo);
   const unsigned mlen = 2 * unit;
   const fs_builder ubld = bld.exec_all();
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2 * unit);

   ubld.MOV(header, brw_imm_ud(0));
   switch (inst->opcode) {
   case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
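      /* The 64-bit global address arrives as a uniform (stride 0) value;
       * retype it to UD with a stride of 1 so the SIMD2 MOV below copies its
       * low and high dwords into the first two dwords of the header.
       */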
      assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
      global_addr.type = BRW_REGISTER_TYPE_UD;
      global_addr.stride = 1;
      ubld.group(2, 0).MOV(header, global_addr);
      break;

   case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
      /* The bottom bit is the Stack ID release bit */
      ubld.group(1, 0).MOV(header, brw_imm_ud(1));
      break;

   default:
      unreachable("Invalid BTD message");
   }

   /* Stack IDs are always in R1 regardless of whether we're coming from a
    * bindless shader or a regular compute shader.
    */
   fs_reg stack_ids = retype(offset(header, bld, 1), BRW_REGISTER_TYPE_UW);
   bld.exec_all().MOV(stack_ids, retype(brw_vec8_grf(1 * unit, 0),
                                        BRW_REGISTER_TYPE_UW));

   unsigned ex_mlen = 0;
   fs_reg payload;
   if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(btd_record, 1);
   } else {
      assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
      /* All these messages take a BTD, and we get complaints if one isn't
       * provided for RETIRE. It should never actually be used, though, so
       * fill it with zero.
       */
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
   inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
                                   GEN_RT_BTD_MESSAGE_SPAWN);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

static void
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
    * stride of 0. Below we're doing a MOV() in SIMD2. Since we can't use
    * UQ/Q types on Gfx12.5, we need to tweak the stride to a value of 1
    * dword so that the MOV operates on 2 components rather than twice on
    * the same component.
    */
   fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD);
   globals_addr.stride = 1;
   const fs_reg bvh_level =
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
                       inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
   const fs_reg trace_ray_control =
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
                       inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
   const fs_reg synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
   assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
   const bool synchronous = synchronous_src.ud;

   const unsigned unit = reg_unit(devinfo);
   const unsigned mlen = unit;
   const fs_builder ubld = bld.exec_all();
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));
   ubld.group(2, 0).MOV(header, globals_addr);
   if (synchronous)
      ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));

   const unsigned ex_mlen = inst->exec_size / 8;
   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
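   /* The payload packs the BVH level into bits 2:0 and the trace ray
    * control value into bits 9:8; the SET_BITS() and SHL/OR forms below
    * build the same layout for the immediate and register cases.
    */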
   if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
       trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
      bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
                                  (bvh_level.ud & 0x7)));
   } else {
      bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
      bld.OR(payload, payload, bvh_level);
   }

   /* When doing synchronous traversal, the HW implicitly computes the
    * stack_id using the following formula:
    *
    *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
    *
    * Only in the asynchronous case do we need to set the stack_id in the
    * payload register.
    */
   if (!synchronous) {
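      /* Mask to the low 11 bits: EUID[3:0] (4 bits), THREAD_ID[2:0]
       * (3 bits) and SIMD_LANE_ID[3:0] (4 bits) add up to 11 bits,
       * hence 0x7ff.
       */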
      bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
              retype(brw_vec8_grf(1 * unit, 0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x7ff));
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
   inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

static void
lower_get_buffer_size(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   /* Since we can only execute this instruction on uniform bti/surface
    * handles, brw_fs_nir.cpp should already have limited this to SIMD8
    * (SIMD16 on Xe2).
    */
   assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));

   fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
   fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
   fs_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];

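   /* The buffer size query is implemented as a sampler resinfo message,
    * with the LOD as its only payload.
    */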
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = inst->exec_size / 8;
   inst->resize_sources(3);
   inst->ex_mlen = 0;
   inst->ex_desc = 0;

   /* src[0] & src[1] are filled by setup_surface_descriptors() */
   inst->src[2] = lod;

   const uint32_t return_format = GFX8_SAMPLER_RETURN_FORMAT_32BITS;

   const uint32_t desc = brw_sampler_desc(devinfo, 0, 0,
                                          GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
                                          BRW_SAMPLER_SIMD_MODE_SIMD8,
                                          return_format);

   inst->dst = retype(inst->dst, BRW_REGISTER_TYPE_UW);
   inst->sfid = BRW_SFID_SAMPLER;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
}

bool
brw_fs_lower_logical_sends(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(s.stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     brw_wm_prog_data(s.prog_data),
                                     (const brw_wm_prog_key *)s.key,
                                     s.fs_payload());
         break;

      case FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         lower_sampler_logical_send(ibld, inst,
                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
      case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_BIAS_LOGICAL:
         assert(devinfo->ver >= 20);
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_BIAS);
         break;

      case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
         assert(devinfo->ver >= 20);
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_EXPLICIT_LOD);
         break;

      case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
         assert(devinfo->ver >= 20);
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_IMPLICIT_LOD);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET_LOD);
         break;

      case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET_BIAS);
         break;

      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
         break;

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         lower_get_buffer_size(ibld, inst);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_surface_logical_send(ibld, inst);
            break;
         }
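         /* Non-LSC platforms fall through to the legacy data-port path. */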
      case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_block_logical_send(ibld, inst);
            break;
         }
         lower_surface_block_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_a64_logical_send(ibld, inst);
            break;
         }
         lower_a64_logical_send(ibld, inst);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         if (devinfo->has_lsc && !s.compiler->indirect_ubos_use_sampler)
            lower_lsc_varying_pull_constant_logical_send(ibld, inst);
         else
            lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         lower_interpolator_logical_send(ibld, inst,
                                         (const brw_wm_prog_key *)s.key,
                                         brw_wm_prog_data(s.prog_data));
         break;

      case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
         lower_btd_logical_send(ibld, inst);
         break;

      case RT_OPCODE_TRACE_RAY_LOGICAL:
         lower_trace_ray_logical_send(ibld, inst);
         break;

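      /* Xe2+ (ver >= 20) uses the LSC-based URB messages
       * (lower_urb_read_logical_send_xe2() asserts devinfo->has_lsc).
       */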
      case SHADER_OPCODE_URB_READ_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_read_logical_send(ibld, inst);
         else
            lower_urb_read_logical_send_xe2(ibld, inst);
         break;

      case SHADER_OPCODE_URB_WRITE_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_write_logical_send(ibld, inst);
         else
            lower_urb_write_logical_send_xe2(ibld, inst);
         break;

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
bool
brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      const fs_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
      const fs_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
      const fs_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
      const fs_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
      assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
      assert(offset_B.file == IMM);
      assert(size_B.file == IMM);

      if (devinfo->has_lsc) {
         const fs_builder ubld =
            fs_builder(&s, block, inst).group(8, 0).exec_all();

         const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.MOV(payload, offset_B);

         inst->sfid = GFX12_SFID_UGM;
         inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                   1 /* simd_size */,
                                   surface_handle.file == BAD_FILE ?
                                   LSC_ADDR_SURFTYPE_BTI :
                                   LSC_ADDR_SURFTYPE_BSS,
                                   LSC_ADDR_SIZE_A32,
                                   1 /* num_coordinates */,
                                   LSC_DATA_SIZE_D32,
                                   inst->size_written / 4,
                                   true /* transpose */,
                                   LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                   true /* has_dest */);

         /* Update the original instruction. */
         inst->opcode = SHADER_OPCODE_SEND;
         inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
         inst->send_ex_bso = surface_handle.file != BAD_FILE &&
                             s.compiler->extended_bindless_surface_offset;
         inst->ex_mlen = 0;
         inst->header_size = 0;
         inst->send_has_side_effects = false;
         inst->send_is_volatile = true;
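         /* The transposed LSC load is a SIMD1 message that returns
          * size_written / 4 dwords as one contiguous block, which is why
          * exec_size drops to 1 here.
          */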
         inst->exec_size = 1;

         /* Finally, the payload */
         inst->resize_sources(3);
         setup_lsc_surface_descriptors(ubld, inst, inst->desc,
                                       surface.file != BAD_FILE ?
                                       surface : surface_handle);
         inst->src[2] = payload;

         s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      } else {
         const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
         fs_reg header = fs_builder(&s, 8).exec_all().vgrf(BRW_REGISTER_TYPE_UD);

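         /* Build the oword block read header from g0, placing the offset,
          * in owords (16 bytes), in DWord 2 of the header.
          */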
         ubld.group(8, 0).MOV(header,
                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         ubld.group(1, 0).MOV(component(header, 2),
                              brw_imm_ud(offset_B.ud / 16));

         inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
         inst->opcode = SHADER_OPCODE_SEND;
         inst->header_size = 1;
         inst->mlen = 1;

         uint32_t desc =
            brw_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
                                       size_B.ud / 4, false /* write */);

         inst->resize_sources(4);

         setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);

         inst->src[2] = header;
         inst->src[3] = fs_reg(); /* unused for reads */

         s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      }

      progress = true;
   }

   return progress;
}