/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file brw_lower_logical_sends.cpp
 */

#include "brw_eu.h"
#include "brw_fs.h"

using namespace brw;

static void
lower_urb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   fs_reg *payload_sources = new fs_reg[inst->mlen];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}
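
/* Note: the lowered SENDs in this file share one source convention, visible
 * in the inline comments above: src[0] is the dynamic part of the message
 * descriptor (an immediate zero when the whole descriptor fits in
 * inst->desc), src[1] is the dynamic part of the extended descriptor,
 * src[2] is the payload, and src[3] is an optional second payload (null
 * here).
 */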

static void
lower_urb_write_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

   fs_reg *payload_sources = new fs_reg[inst->mlen];
   fs_reg payload = fs_reg(VGRF, bld.shader->alloc.allocate(inst->mlen),
                           BRW_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < inst->mlen; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, inst->mlen, header_size);

   delete [] payload_sources;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = brw_null_reg();

   inst->sfid = BRW_SFID_URB;
   inst->desc = brw_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = brw_null_reg();
}

static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    fs_reg *dst, fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
      assert(color.type == BRW_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const struct brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_visitor::thread_payload &payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 6) {
      /* TODO: Support SIMD32 on gfx4-5 */
      assert(bld.group() < 16);

      /* For gfx4-5, we always have a header consisting of g0 and g1.  We have
       * an implied MOV from g0,g1 to the start of the message.  The MOV from
       * g0 is handled by the hardware and the MOV from g1 is provided by the
       * generator.  This is required because, on gfx4-5, the generator may
       * generate two write messages with different message lengths in order
       * to handle AA data properly.
       *
       * Also, since the pixel mask goes in the g0 portion of the message and
       * since render target writes are the last thing in the shader, we write
       * the pixel mask directly into g0 and it will get copied as part of the
       * implied write.
       */
      if (prog_data->uses_kill) {
         bld.exec_all().group(1, 0)
            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
                 brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      length = 2;
   } else if ((devinfo->verx10 <= 70 &&
               prog_data->uses_kill) ||
              (devinfo->ver < 11 &&
               (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                              BRW_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const fs_reg header_sources[2] = {
            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set computes stencil to render target */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(brw_vec1_grf(0, 0),
                                    BRW_REGISTER_TYPE_UD),
                             brw_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     BRW_REGISTER_TYPE_UW),
                              brw_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                    .annotate("FB write src0 alpha");
         const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
                               BRW_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words, one vgrf is always
       * 16-wide, but only the lower or upper 8 channels will be used by the
       * hardware when doing a SIMD8 write, depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = BRW_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;
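
      /* Retyping the 32-bit mask to UW and doubling its stride makes the MOV
       * below read only the low 16-bit word of each dword channel, which is
       * the part the oMask payload consumes.
       */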

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
                           inst->group % 16),
              sample_mask);
      length++;
   }

   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(devinfo->ver >= 9);
      assert(bld.dispatch_width() == 8);

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15);

      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
      length++;
   }

   fs_inst *load;
   if (devinfo->ver >= 7) {
      /* Send from the GRF */
      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
      payload.nr = bld.shader->alloc.allocate(regs_written(load));
      load->dst = payload;

      uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);

      inst->desc =
         (inst->group / 16) << 11 | /* rt slot group */
         brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                           prog_data->per_coarse_pixel_dispatch);
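
      /* inst->group is either 0 or 16 here, so the shift above sets
       * descriptor bit 11 (the render target slot group) only for the upper
       * half of a SIMD32 framebuffer write.
       */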

      uint32_t ex_desc = 0;
      if (devinfo->ver >= 11) {
         /* Set the "Render Target Index" and "Src0 Alpha Present" fields
          * in the extended message descriptor, in lieu of using a header.
          */
         ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;

         if (key->nr_color_regions == 0)
            ex_desc |= 1 << 20; /* Null Render Target */
      }
      inst->ex_desc = ex_desc;

      inst->opcode = SHADER_OPCODE_SEND;
      inst->resize_sources(3);
      inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
      inst->src[2] = payload;
      inst->mlen = regs_written(load);
      inst->ex_mlen = 0;
      inst->header_size = header_size;
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   } else {
      /* Send from the MRF */
      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
                              sources, length, payload_header_size);

      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
       * will do this for us if we just give it a COMPR4 destination.
       */
      if (devinfo->ver < 6 && bld.dispatch_width() == 16)
         load->dst.nr |= BRW_MRF_COMPR4;

      if (devinfo->ver < 6) {
         /* Set up src[0] for the implied MOV from grf0-1 */
         inst->resize_sources(1);
         inst->src[0] = brw_vec8_grf(0, 0);
      } else {
         inst->resize_sources(0);
      }
      inst->base_mrf = 1;
      inst->opcode = FS_OPCODE_FB_WRITE;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
   }
}

static void
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
                                           BRW_REGISTER_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const fs_reg header_sources[] = {
         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly.  However,
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits :
    *   - 14 : Stencil Present to Render Target
    *   - 13 : Source Depth Present to Render Target
    *   - 12 : oMask to Render Target
    *   - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        brw_imm_ud(~INTEL_MASK(14, 11)));
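
   /* For reference, INTEL_MASK(14, 11) == 0xf << 11 == 0x7800, so the AND
    * above clears exactly the header bits 14:11 called out in the BSpec
    * quote.
    */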

   inst->resize_sources(1);
   inst->src[0] = header;
   inst->opcode = FS_OPCODE_FB_READ;
   inst->mlen = length;
   inst->header_size = length;
}

static void
lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
   fs_reg msg_end = msg_begin;

   /* g0 header. */
   msg_end = offset(msg_end, bld.group(8, 0), 1);

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   msg_end = offset(msg_end, bld, coord_components);

   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
    * require all three components to be present and zero if they are unused.
    */
   if (coord_components > 0 &&
       (has_lod || shadow_c.file != BAD_FILE ||
        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
      assert(coord_components <= 3);
      for (unsigned i = 0; i < 3 - coord_components; i++)
         bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));

      msg_end = offset(msg_end, bld, 3 - coord_components);
   }

   if (op == SHADER_OPCODE_TXD) {
      /* TXD unsupported in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* the slots for u and v are always present, but r is optional */
      if (coord_components < 2)
         msg_end = offset(msg_end, bld, 2 - coord_components);

      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));

      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   }

   if (has_lod) {
      /* Bias/LOD with a shadow comparator is unsupported in SIMD16; *without*
       * a shadow comparator (including RESINFO) it's unsupported in SIMD8
       * mode.
       */
      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
             bld.dispatch_width() == 16);

      const brw_reg_type type =
         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, brw_imm_f(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = msg_begin.nr;
   inst->mlen = msg_end.nr - msg_begin.nr;
   inst->header_size = 1;
}

static void
lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &sample_index,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
      header_size = 1;
      message.nr--;
   }

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   fs_reg msg_end = offset(msg_coords, bld, coord_components);
   fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case SHADER_OPCODE_TXL:
   case FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXD:
      /**
       *  P   =  u,    v,    r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF:
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

static bool
is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
{
   if (devinfo->verx10 <= 70)
      return false;
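
   /* Samplers 0-15 fit in the 4-bit "Sampler Index" field of the message
    * descriptor; a higher index, or one that isn't known at compile time,
    * has to go through the header-based sampler state pointer path instead.
    */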

   return sampler.file != IMM || sampler.ud >= 16;
}

static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 opcode opcode, bool shadow_compare)
{
   assert(devinfo->ver >= 5);
   switch (opcode) {
   case SHADER_OPCODE_TEX:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE;
   case FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case SHADER_OPCODE_TXL:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case SHADER_OPCODE_TXL_LZ:
      return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                              GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case SHADER_OPCODE_TXD:
      assert(!shadow_compare || devinfo->verx10 >= 75);
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case SHADER_OPCODE_TXF:
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_LZ:
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
   case SHADER_OPCODE_TXF_CMS_W:
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case SHADER_OPCODE_TXF_CMS:
      return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case SHADER_OPCODE_TXF_UMS:
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case SHADER_OPCODE_TXF_MCS:
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case SHADER_OPCODE_LOD:
      return GFX5_SAMPLER_MESSAGE_LOD;
   case SHADER_OPCODE_TG4:
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case SHADER_OPCODE_SAMPLEINFO:
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const fs_reg &dst,
                               const fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   fs_reg *src_comps = new fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum brw_reg_type padding_payload_type =
         brw_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    BRW_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Expand the real sources if a component of the requested payload type
       * is larger than the real source component.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(fs_reg(), padding_payload_type);
         }
      }
   }

   fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}
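
/* A worked example of the padding above: in a SIMD8 message with 16-bit
 * sources, each component occupies only half a GRF (8 lanes * 2 bytes == 16
 * bytes), so with requested_alignment_sz == REG_SIZE one null padding source
 * is emitted after every real source to keep each component aligned to a
 * full register boundary.
 */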

static void
lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                fs_reg lod, const fs_reg &lod2,
                                const fs_reg &min_lod,
                                const fs_reg &sample_index,
                                const fs_reg &mcs,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                const fs_reg &surface_handle,
                                const fs_reg &sampler_handle,
                                const fs_reg &tg4_offset,
                                unsigned payload_type_bit_size,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum brw_reg_type payload_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_F);
   const enum brw_reg_type payload_unsigned_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_UD);
   const enum brw_reg_type payload_signed_type =
      brw_reg_type_from_bit_size(payload_type_bit_size, BRW_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface and surface_handle, and exactly one
    * of sampler and sampler_handle.
    */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler)) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
      header_size = 1;
      length++;

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
         assert(regs_written(inst) % reg_width == 0);
         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }
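
      /* For example, a SIMD8 op (reg_width == 1) whose destination was
       * shrunk to two registers yields mask == ~0x3 & 0xf == 0xc, telling
       * the sampler not to return the z and w channels of the response.
       */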

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8, 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, they're absolute pointers relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ubld1.MOV(component(header, 3), sampler_handle);
      } else if (is_high_sampler(devinfo, sampler)) {
         fs_reg sampler_state_ptr =
            retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      brw_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == BRW_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
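
         /* Both branches compute the same offset: the pointer in header.3
          * advances by 256 bytes for each group of 16 samplers (16 sampler
          * states of 16 bytes each).  For example, sampler index 20 falls in
          * group 1, adding 1 * 16 * 16 == 256 bytes to the base pointer.
          */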
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(31, 5)));
      }
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXL:
      if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
         op = SHADER_OPCODE_TXL_LZ;
         break;
      }
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], payload_unsigned_type), lod);
      length++;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], payload_unsigned_type), brw_imm_ud(0));
      length++;
      break;
   case SHADER_OPCODE_TXF:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
       * On Gfx9 they are u, v, lod, r
       */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);

      if (devinfo->ver >= 9) {
         if (coord_components >= 2) {
            bld.MOV(retype(sources[length], payload_signed_type),
                    offset(coordinate, bld, 1));
         } else {
            sources[length] = brw_imm_d(0);
         }
         length++;
      }

      if (devinfo->ver >= 9 && lod.is_zero()) {
         op = SHADER_OPCODE_TXF_LZ;
      } else {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }

      for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      if (op == SHADER_OPCODE_TXF_UMS ||
          op == SHADER_OPCODE_TXF_CMS ||
          op == SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (devinfo->verx10 >= 125 && op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 4;
         else if (op == SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            bld.MOV(retype(sources[length++], payload_unsigned_type),
                    mcs.file == IMM ? mcs : offset(mcs, bld, i));
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      if (op == SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * Wa_1209978020
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == SHADER_OPCODE_TXD)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);
   }

   const fs_reg src_payload =
      fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
                                              BRW_REGISTER_TYPE_F);
   /* In case of 16-bit payload each component takes one full register in
    * both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
    * elements. In SIMD8H case hardware simply expects the components to be
    * padded (i.e., aligned on reg boundary).
    */
   fs_inst *load_payload_inst =
      emit_load_payload_with_padding(bld, src_payload, sources, length,
                                     header_size, REG_SIZE);
   unsigned mlen = load_payload_inst->size_written / REG_SIZE;
   unsigned simd_mode = 0;
   if (payload_type_bit_size == 16) {
      assert(devinfo->ver >= 11);
      simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
                                         GFX10_SAMPLER_SIMD_MODE_SIMD16H;
   } else {
      simd_mode = inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                         BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare);

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo, surface.ud,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0);
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->ver >= 9);
      inst->desc = brw_sampler_desc(devinfo,
                                    GFX9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = desc;
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gfx7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
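         /* Multiplying the 8-bit index by 0x101 replicates it into both the
          * binding table index (bits 7:0) and the sampler index (bits 11:8)
          * of the descriptor in a single instruction: e.g. index 5 yields
          * 5 * 0x101 == 0x505.
          */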
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      ubld.AND(desc, desc, brw_imm_ud(0xfff));
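
      /* Only the binding table index (bits 7:0) and sampler index (bits 11:8)
       * belong in the dynamic part of the descriptor, so mask everything else
       * off before it is combined with the immediate part in inst->desc.
       */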

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->ex_desc = 0;

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

static unsigned
get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
                                      opcode op, const fs_reg *src)
{
   unsigned src_type_size = 0;

   /* All sources need to have the same size, therefore seek the first valid
    * one and take the size from there.
    */
   for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
      if (src[i].file != BAD_FILE) {
         src_type_size = brw_reg_type_to_size(src[i].type);
         break;
      }
   }

   assert(src_type_size == 2 || src_type_size == 4);

#ifndef NDEBUG
   /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
    * compressed multisampled surfaces. There the payload contains MCS data
    * which is already in 16-bits unlike the other parameters that need forced
    * conversion.
    */
   if (devinfo->verx10 < 125 ||
       (op != SHADER_OPCODE_TXF_CMS_W &&
        op != SHADER_OPCODE_TXF_CMS)) {
      for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
         assert(src[i].file == BAD_FILE ||
                brw_reg_type_to_size(src[i].type) == src_type_size);
      }
   }
#endif

   if (devinfo->verx10 < 125)
      return src_type_size * 8;

   /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP
    * Bspec: 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages -
    * Message Format [GFX12:HAS:1209977870]:
    *
    *  ld2dms_w       SIMD8H and SIMD16H Only
    *  ld_mcs         SIMD8H and SIMD16H Only
    *  ld2dms         REMOVEDBY(GEN:HAS:1406788836)
    */
1232 
1233    if (op == SHADER_OPCODE_TXF_CMS_W ||
1234        op == SHADER_OPCODE_TXF_CMS ||
1235        op == SHADER_OPCODE_TXF_UMS ||
1236        op == SHADER_OPCODE_TXF_MCS)
1237       src_type_size = 2;
1238 
1239    return src_type_size * 8;
1240 }
1241 
1242 static void
lower_sampler_logical_send(const fs_builder & bld,fs_inst * inst,opcode op)1243 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
1244 {
1245    const intel_device_info *devinfo = bld.shader->devinfo;
1246    const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
1247    const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
1248    const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
1249    const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
1250    const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
1251    const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
1252    const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
1253    const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
1254    const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
1255    const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
1256    const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
1257    const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
1258    assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
1259    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
1260    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
1261    const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
1262 
1263    if (devinfo->ver >= 7) {
1264       const unsigned msg_payload_type_bit_size =
1265          get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
1266 
1267       /* 16-bit payloads are available only on gfx11+ */
1268       assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
1269 
1270       lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
1271                                       shadow_c, lod, lod2, min_lod,
1272                                       sample_index,
1273                                       mcs, surface, sampler,
1274                                       surface_handle, sampler_handle,
1275                                       tg4_offset,
1276                                       msg_payload_type_bit_size,
1277                                       coord_components, grad_components);
1278    } else if (devinfo->ver >= 5) {
1279       lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
1280                                       shadow_c, lod, lod2, sample_index,
1281                                       surface, sampler,
1282                                       coord_components, grad_components);
1283    } else {
1284       lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
1285                                       shadow_c, lod, lod2,
1286                                       surface, sampler,
1287                                       coord_components, grad_components);
1288    }
1289 }
1290 
1291 /**
1292  * Predicate the specified instruction on the vector mask.
1293  */
1294 static void
emit_predicate_on_vector_mask(const fs_builder & bld,fs_inst * inst)1295 emit_predicate_on_vector_mask(const fs_builder &bld, fs_inst *inst)
1296 {
1297    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1298           bld.group() == inst->group &&
1299           bld.dispatch_width() == inst->exec_size);
1300 
1301    const fs_builder ubld = bld.exec_all().group(1, 0);
1302 
1303    const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
1304    const fs_reg vector_mask = ubld.vgrf(BRW_REGISTER_TYPE_UW);
1305    ubld.emit(SHADER_OPCODE_READ_SR_REG, vector_mask, brw_imm_ud(3));
1306    const unsigned subreg = sample_mask_flag_subreg(v);
1307 
   ubld.MOV(brw_flag_subreg(subreg + inst->group / 16), vector_mask);

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      /* Combine the vector mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

static void
setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
                          const fs_reg &surface, const fs_reg &surface_handle)
{
   const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   if (surface.file == IMM) {
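      /* An immediate surface is a binding table index, which lives in the
       * low 8 bits of the message descriptor.
       */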
      inst->desc = desc | (surface.ud & 0xff);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->ver >= 9);
      inst->desc = desc | GFX9_BTI_BINDLESS;
      inst->src[0] = brw_imm_ud(0);

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      inst->desc = desc;
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, brw_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }
}

static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg &allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool is_typed_access =
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;

   const bool is_surface_access = is_typed_access ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_d(0xffff));
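   /* An immediate all-ones mask means the message may run for every channel;
    * only a register mask leads to the predication emitted further below.
    */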

   /* From the BDW PRM Volume 7, page 147:
    *
    *  "For the Data Cache Data Port*, the header must be present for the
    *   following message types: [...] Typed read/write/atomics"
    *
    * Earlier generations have similar wording.  Because of this restriction
    * we don't attempt to implement sample masks via predication for such
    * messages prior to Gfx9, since we have to provide a header anyway.  On
    * Gfx11+ the header has been removed, so we can only use predication.
    *
    * For all stateless A32 messages, we also need a header.
    */
   fs_reg header;
   if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
      fs_builder ubld = bld.exec_all().group(8, 0);
      header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (is_stateless) {
         assert(!is_surface_access);
         ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
      } else {
         ubld.MOV(header, brw_imm_d(0));
         if (is_surface_access)
            ubld.group(1, 0).MOV(component(header, 7), sample_mask);
      }
   }
   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0;
   if (devinfo->ver >= 9 &&
       (src.file == BAD_FILE || header.file == BAD_FILE)) {
      /* We have split sends on gfx9 and above */
      if (header.file == BAD_FILE) {
         payload = bld.move_to_vgrf(addr, addr_sz);
         payload2 = bld.move_to_vgrf(src, src_sz);
         mlen = addr_sz * (inst->exec_size / 8);
         ex_mlen = src_sz * (inst->exec_size / 8);
      } else {
         assert(src.file == BAD_FILE);
         payload = header;
         payload2 = bld.move_to_vgrf(addr, addr_sz);
         mlen = header_sz;
         ex_mlen = addr_sz * (inst->exec_size / 8);
      }
   } else {
      /* Allocate space for the payload. */
      const unsigned sz = header_sz + addr_sz + src_sz;
      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
      fs_reg *const components = new fs_reg[sz];
      unsigned n = 0;

      /* Construct the payload. */
      if (header.file != BAD_FILE)
         components[n++] = header;

      for (unsigned i = 0; i < addr_sz; i++)
         components[n++] = offset(addr, bld, i);

      for (unsigned i = 0; i < src_sz; i++)
         components[n++] = offset(src, bld, i);

      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
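      /* For illustration (hypothetical numbers): a SIMD16 untyped write of
       * four components with a one-component address and a header takes
       * 1 + (1 + 4) * 16 / 8 = 11 message registers.
       */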
      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

      delete[] components;
   }

   /* Predicate the instruction on the sample mask if no header is
    * provided.
    */
   if ((header.file == BAD_FILE || !is_surface_access) &&
       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

   uint32_t sfid;
   switch (inst->opcode) {
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      sfid = devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
             devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
                                 BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX7_SFID_DATAPORT_DATA_CACHE);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = (devinfo->verx10 >= 75 ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GFX6_SFID_DATAPORT_RENDER_CACHE);
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            false  /* write */);
      break;

   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(arg.ud == 32); /* bit_size */
      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
                                            true   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                        arg.ud, /* atomic_op */
                                        !inst->dst.is_null());
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                              arg.ud, /* atomic_op */
                                              !inst->dst.is_null());
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          false   /* write */);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          true    /* write */);
      break;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
                                      arg.ud, /* atomic_op */
                                      !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static enum lsc_opcode
brw_atomic_op_to_lsc_atomic_op(unsigned op)
{
   switch (op) {
   case BRW_AOP_AND:
      return LSC_OP_ATOMIC_AND;
   case BRW_AOP_OR:
      return LSC_OP_ATOMIC_OR;
   case BRW_AOP_XOR:
      return LSC_OP_ATOMIC_XOR;
   case BRW_AOP_MOV:
      return LSC_OP_ATOMIC_STORE;
   case BRW_AOP_INC:
      return LSC_OP_ATOMIC_INC;
   case BRW_AOP_DEC:
      return LSC_OP_ATOMIC_DEC;
   case BRW_AOP_ADD:
      return LSC_OP_ATOMIC_ADD;
   case BRW_AOP_SUB:
      return LSC_OP_ATOMIC_SUB;
   case BRW_AOP_IMAX:
      return LSC_OP_ATOMIC_MAX;
   case BRW_AOP_IMIN:
      return LSC_OP_ATOMIC_MIN;
   case BRW_AOP_UMAX:
      return LSC_OP_ATOMIC_UMAX;
   case BRW_AOP_UMIN:
      return LSC_OP_ATOMIC_UMIN;
   case BRW_AOP_CMPWR:
      return LSC_OP_ATOMIC_CMPXCHG;
   default:
      unreachable("invalid atomic opcode");
   }
}

static enum lsc_opcode
brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
{
   switch (aop) {
   case BRW_AOP_FMAX:
      return LSC_OP_ATOMIC_FMAX;
   case BRW_AOP_FMIN:
      return LSC_OP_ATOMIC_FMIN;
   case BRW_AOP_FCMPWR:
      return LSC_OP_ATOMIC_FCMPXCHG;
   case BRW_AOP_FADD:
      return LSC_OP_ATOMIC_FADD;
   default:
      unreachable("Unsupported float atomic opcode");
   }
}

static enum lsc_data_size
lsc_bits_to_data_size(unsigned bit_size)
{
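   /* Sub-dword data is carried in a 32-bit container (the D8U32/D16U32
    * encodings below).
    */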
   switch (bit_size / 8) {
   case 1:  return LSC_DATA_SIZE_D8U32;
   case 2:  return LSC_DATA_SIZE_D16U32;
   case 4:  return LSC_DATA_SIZE_D32;
   case 8:  return LSC_DATA_SIZE_D64;
   default:
      unreachable("Unsupported data size.");
   }
}

static void
lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   const fs_reg allow_sample_mask =
      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
   assert(arg.file == IMM);
   assert(allow_sample_mask.file == IMM);

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
   const unsigned src_sz = type_sz(src.type);

   const bool has_side_effects = inst->has_side_effects();

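   /* Build the address payload and, for stores and atomics, the separate
    * data payload.  For illustration (hypothetical numbers): a SIMD16 store
    * of four dword components needs (4 * 4 * 16) / REG_SIZE = 8 extended
    * message registers.
    */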
   unsigned ex_mlen = 0;
   fs_reg payload, payload2;
   payload = bld.move_to_vgrf(addr, addr_sz);
   if (src.file != BAD_FILE) {
      payload2 = bld.move_to_vgrf(src, src_comps);
      ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
   }

   /* Predicate the instruction on the sample mask if needed */
   fs_reg sample_mask = allow_sample_mask.ud ? brw_sample_mask_reg(bld) :
                                               fs_reg(brw_imm_d(0xffff));
   if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
      brw_emit_predicate_on_sample_mask(bld, inst);

   if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      inst->sfid = GFX12_SFID_SLM;
   else
      inst->sfid = GFX12_SFID_UGM;

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   enum lsc_addr_surface_type surf_type;
   if (surface_handle.file != BAD_FILE)
      surf_type = LSC_ADDR_SURFTYPE_BSS;
   else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
      surf_type = LSC_ADDR_SURFTYPE_FLAT;
   else
      surf_type = LSC_ADDR_SURFTYPE_BTI;

   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode =
         inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ?
         brw_atomic_op_to_lsc_fatomic_op(arg.ud) :
         brw_atomic_op_to_lsc_atomic_op(arg.ud);
      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(src_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1UC_L3WB,
                                !inst->dst.is_null());
      break;
   }
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                surf_type, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg.ud),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   default:
      unreachable("Unknown surface logical instruction");
   }

   inst->src[0] = brw_imm_ud(0);

   /* Set up extended descriptors */
   switch (surf_type) {
   case LSC_ADDR_SURFTYPE_FLAT:
      inst->src[1] = brw_imm_ud(0);
      break;
   case LSC_ADDR_SURFTYPE_BSS:
      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
      break;
   case LSC_ADDR_SURFTYPE_BTI:
      if (surface.file == IMM) {
         inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
      } else {
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(tmp, surface, brw_imm_ud(24));
         inst->src[1] = component(tmp, 0);
      }
      break;
   default:
      unreachable("Unknown surface type");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   inst->resize_sources(4);

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver >= 9);

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);
   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);

   const bool is_stateless =
      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);

   const bool has_side_effects = inst->has_side_effects();

   const bool align_16B =
      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;

   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;

   /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
   fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);

   if (is_stateless)
      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
   else
      ubld.MOV(header, brw_imm_d(0));

   /* Address in OWord units when aligned to OWords. */
   if (align_16B)
      ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
   else
      ubld.group(1, 0).MOV(component(header, 2), addr);
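   /* An OWord is 16 bytes, hence the shift by 4 above to convert the byte
    * address; the unaligned variant takes the byte address directly.
    */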

   fs_reg data;
   unsigned ex_mlen = 0;
   if (write) {
      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
      data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
      ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
   }

   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = 1;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 1;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;

   const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
                                                    arg.ud, write);
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

   inst->resize_sources(4);

   inst->src[2] = header;
   inst->src[3] = data;
}

static fs_reg
emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
{
   const fs_builder ubld = bld.exec_all().group(8, 0);

   assert(type_sz(addr.type) == 8 && addr.stride == 0);

   fs_reg expanded_addr = addr;
   if (addr.file == UNIFORM) {
      /* The UNIFORM file can't express the stride-1 access needed below (it
       * requires stride 0), so copy the address into a VGRF first.
       */
      expanded_addr = ubld.vgrf(BRW_REGISTER_TYPE_UQ);
      expanded_addr.stride = 0;
      ubld.MOV(expanded_addr, retype(addr, BRW_REGISTER_TYPE_UQ));
   }

   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));

   /* Use a 2-wide MOV to fill out the address */
   fs_reg addr_vec2 = expanded_addr;
   addr_vec2.type = BRW_REGISTER_TYPE_UD;
   addr_vec2.stride = 1;
   ubld.group(2, 0).MOV(header, addr_vec2);

   return header;
}

static void
emit_fragment_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
   const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;

   /* If we're a fragment shader, we have to predicate with the sample mask
    * to avoid helper invocations in instructions with side effects, unless
    * they are explicitly required.
    *
    * There are also special cases when we actually want to run on helpers
    * (ray queries).
    */
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
   if (enable_helpers)
      emit_predicate_on_vector_mask(bld, inst);
   else if (inst->has_side_effects())
      brw_emit_predicate_on_sample_mask(bld, inst);
}

static void
lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg &src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_sz = type_sz(src.type);

   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
   fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
                            BRW_REGISTER_TYPE_UD);
   unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;

   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32, arg /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      break;
   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(arg),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1STATE_L3MOCS,
                                false /* has_dest */);
      break;
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL: {
      /* Bspec: Atomic instruction -> Cache section:
       *
       *    Atomic messages are always forced to "un-cacheable" in the L1
       *    cache.
       */
      enum lsc_opcode opcode =
         (inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||
          inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||
          inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?
         brw_atomic_op_to_lsc_atomic_op(arg) :
         brw_atomic_op_to_lsc_fatomic_op(arg);
      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
                                1 /* num_coordinates */,
                                lsc_bits_to_data_size(src_sz * 8),
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_STORE_L1UC_L3WB,
                                !inst->dst.is_null());
      break;
   }
   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   const fs_reg &addr = inst->src[A64_LOGICAL_ADDRESS];
   const fs_reg &src = inst->src[A64_LOGICAL_SRC];
   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[A64_LOGICAL_ARG].file == IMM);
   const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
   const bool has_side_effects = inst->has_side_effects();

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0, header_size = 0;
   if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
       inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
      assert(devinfo->ver >= 9);

      /* OWORD messages only take a scalar address in a header */
      mlen = 1;
      header_size = 1;
      payload = emit_a64_oword_block_header(bld, addr);

      if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
         ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
         payload2 = retype(bld.move_to_vgrf(src, src_comps),
                           BRW_REGISTER_TYPE_UD);
      }
   } else if (devinfo->ver >= 9) {
      /* On Skylake and above, we have SENDS */
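      /* The A64 address is 64 bits per channel, i.e. two GRFs for every
       * eight channels of the primary payload.
       */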
      mlen = 2 * (inst->exec_size / 8);
      ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
      payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
      payload2 = retype(bld.move_to_vgrf(src, src_comps),
                        BRW_REGISTER_TYPE_UD);
   } else {
      /* Add two because the address is 64-bit */
      const unsigned dwords = 2 + src_comps;
      mlen = dwords * (inst->exec_size / 8);

      fs_reg sources[5];

      sources[0] = addr;

      for (unsigned i = 0; i < src_comps; i++)
         sources[1 + i] = offset(src, bld, i);

      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
      bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                false  /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                true   /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            false,   /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            true     /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               false  /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               true   /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                  16, /* bit_size */
                                                  arg,   /* atomic_op */
                                                  !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                  32, /* bit_size */
                                                  arg,   /* atomic_op */
                                                  !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown A64 logical instruction");
   }

   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_size;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

static void
lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
                                             fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   ASSERTED const brw_compiler *compiler = bld.shader->compiler;

   fs_reg index = inst->src[0];

   /* We are switching the instruction from an ALU-like instruction to a
    * send-from-grf instruction.  Since sends can't handle strides or
    * source modifiers, we have to make a copy of the offset source.
    */
   fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1);

   assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
   unsigned alignment = inst->src[2].ud;

   inst->opcode = SHADER_OPCODE_SEND;
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(3);
   inst->src[0] = brw_imm_ud(0);

   if (index.file == IMM) {
      inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud));
   } else {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.SHL(tmp, index, brw_imm_ud(24));
      inst->src[1] = component(tmp, 0);
   }

   assert(!compiler->indirect_ubos_use_sampler);

   inst->src[2] = ubo_offset; /* payload */
   if (alignment >= 4) {
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                                LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                4 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   } else {
      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                                LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
                                1 /* num_coordinates */,
                                LSC_DATA_SIZE_D32,
                                1 /* num_channels */,
                                false /* transpose */,
                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
                                true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
      /* The byte scattered messages can only read one dword at a time, so
       * we have to duplicate the message four times to read the full vec4.
       * Hopefully dead-code elimination will clean up the copies that turn
       * out not to be needed.
       */
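      /* A full vec4 of dwords is 16 bytes per channel, which is what the
       * assert below checks before size_written is split across the four
       * single-dword loads.
       */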
      assert(inst->size_written == 16 * inst->exec_size);
      inst->size_written /= 4;
      for (unsigned c = 1; c < 4; c++) {
         /* Emit a copy of the instruction because we're about to modify
          * it.  Because this loop starts at 1, we will emit copies for the
          * first 3 and the final one will be the modified instruction.
          */
         bld.emit(*inst);

         /* Offset the source */
         inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

         /* Offset the destination */
         inst->dst = offset(inst->dst, bld, 1);
      }
   }
}

static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const brw_compiler *compiler = bld.shader->compiler;

   if (devinfo->ver >= 7) {
      fs_reg index = inst->src[0];
      /* We are switching the instruction from an ALU-like instruction to a
       * send-from-grf instruction.  Since sends can't handle strides or
       * source modifiers, we have to make a copy of the offset source.
       */
      fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(ubo_offset, inst->src[1]);

      assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
      unsigned alignment = inst->src[2].ud;

      inst->opcode = SHADER_OPCODE_SEND;
      inst->mlen = inst->exec_size / 8;
      inst->resize_sources(3);

      if (index.file == IMM) {
         inst->desc = index.ud & 0xff;
         inst->src[0] = brw_imm_ud(0);
      } else {
         inst->desc = 0;
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.AND(tmp, index, brw_imm_ud(0xff));
         inst->src[0] = component(tmp, 0);
      }
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
      inst->src[2] = ubo_offset; /* payload */

      if (compiler->indirect_ubos_use_sampler) {
         const unsigned simd_mode =
            inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                   BRW_SAMPLER_SIMD_MODE_SIMD16;

         inst->sfid = BRW_SFID_SAMPLER;
         inst->desc |= brw_sampler_desc(devinfo, 0, 0,
                                        GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
                                        simd_mode, 0);
      } else if (alignment >= 4) {
         inst->sfid = (devinfo->verx10 >= 75 ?
                       HSW_SFID_DATAPORT_DATA_CACHE_1 :
                       GFX7_SFID_DATAPORT_DATA_CACHE);
         inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                      4, /* num_channels */
                                                      false   /* write */);
      } else {
         inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                                     32,     /* bit_size */
                                                     false   /* write */);
         /* The byte scattered messages can only read one dword at a time, so
          * we have to duplicate the message four times to read the full vec4.
          * Hopefully dead-code elimination will clean up the copies that turn
          * out not to be needed.
          */
         assert(inst->size_written == 16 * inst->exec_size);
         inst->size_written /= 4;
         for (unsigned c = 1; c < 4; c++) {
            /* Emit a copy of the instruction because we're about to modify
             * it.  Because this loop starts at 1, we will emit copies for the
             * first 3 and the final one will be the modified instruction.
             */
            bld.emit(*inst);

            /* Offset the source */
            inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
            bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));

            /* Offset the destination */
            inst->dst = offset(inst->dst, bld, 1);
         }
      }
   } else {
      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
                           BRW_REGISTER_TYPE_UD);

      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);

      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
      inst->resize_sources(1);
      inst->base_mrf = payload.nr;
      inst->header_size = 1;
      inst->mlen = 1 + inst->exec_size / 8;
   }
}

static void
lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->devinfo->ver < 6);

   inst->base_mrf = 2;
   inst->mlen = inst->sources * inst->exec_size / 8;
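   /* For example, a two-source SIMD8 math message occupies two MRFs
    * starting at m2.
    */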

   if (inst->sources > 1) {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
      const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
      const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

      inst->resize_sources(1);
      inst->src[0] = src0;

      assert(inst->exec_size == 8);
      bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
   }
}

static void
lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   fs_reg global_addr = inst->src[0];
   const fs_reg &btd_record = inst->src[1];

   const unsigned mlen = 2;
   const fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);

   ubld.MOV(header, brw_imm_ud(0));
   switch (inst->opcode) {
   case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
      global_addr.type = BRW_REGISTER_TYPE_UD;
      global_addr.stride = 1;
      ubld.group(2, 0).MOV(header, global_addr);
      break;

   case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
      /* The bottom bit is the Stack ID release bit */
      ubld.group(1, 0).MOV(header, brw_imm_ud(1));
      break;

   default:
      unreachable("Invalid BTD message");
   }

   /* Stack IDs are always in R1 regardless of whether we're coming from a
    * bindless shader or a regular compute shader.
    */
   fs_reg stack_ids =
      retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW);
   bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));

   unsigned ex_mlen = 0;
   fs_reg payload;
   if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(btd_record, 1);
   } else {
      assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
      /* All these messages take a BTD and things complain if we don't provide
       * one for RETIRE.  However, it shouldn't ever actually get used so fill
       * it with zero.
       */
      ex_mlen = 2 * (inst->exec_size / 8);
      payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0; /* HW docs require has_header = false */
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   /* Set up SFID and descriptors */
   inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
   inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
                                   GEN_RT_BTD_MESSAGE_SPAWN);
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = header;
   inst->src[3] = payload;
}

static void
lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   /* The emit_uniformize() in brw_fs_nir.cpp will generate a horizontal
    * stride of 0.  Below we do a MOV in SIMD2.  Since we can't use UQ/Q
    * types on Gfx12.5, we need to tweak the stride to 1 dword so that the
    * MOV operates on two components rather than twice on the same
    * component.
    */
   fs_reg globals_addr = retype(inst->src[RT_LOGICAL_SRC_GLOBALS], BRW_REGISTER_TYPE_UD);
   globals_addr.stride = 1;
   const fs_reg &bvh_level =
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_BVH_LEVEL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_BVH_LEVEL],
                       inst->components_read(RT_LOGICAL_SRC_BVH_LEVEL));
   const fs_reg &trace_ray_control =
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL].file == BRW_IMMEDIATE_VALUE ?
      inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL] :
      bld.move_to_vgrf(inst->src[RT_LOGICAL_SRC_TRACE_RAY_CONTROL],
                       inst->components_read(RT_LOGICAL_SRC_TRACE_RAY_CONTROL));
   const fs_reg &synchronous_src = inst->src[RT_LOGICAL_SRC_SYNCHRONOUS];
   assert(synchronous_src.file == BRW_IMMEDIATE_VALUE);
   const bool synchronous = synchronous_src.ud;

   const unsigned mlen = 1;
   const fs_builder ubld = bld.exec_all().group(8, 0);
   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(header, brw_imm_ud(0));
   ubld.group(2, 0).MOV(header, globals_addr);
   if (synchronous)
      ubld.group(1, 0).MOV(byte_offset(header, 16), brw_imm_ud(synchronous));

   const unsigned ex_mlen = inst->exec_size / 8;
   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
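   /* The per-channel payload packs the trace-ray control in bits 9:8 and
    * the BVH level in bits 2:0, whether built from immediates or computed
    * at runtime below.
    */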
   if (bvh_level.file == BRW_IMMEDIATE_VALUE &&
       trace_ray_control.file == BRW_IMMEDIATE_VALUE) {
      bld.MOV(payload, brw_imm_ud(SET_BITS(trace_ray_control.ud, 9, 8) |
                                  (bvh_level.ud & 0x7)));
   } else {
      bld.SHL(payload, trace_ray_control, brw_imm_ud(8));
      bld.OR(payload, payload, bvh_level);
   }

   /* When doing synchronous traversal, the HW implicitly computes the
    * stack_id using the following formula:
    *
    *    EUID[3:0] & THREAD_ID[2:0] & SIMD_LANE_ID[3:0]
    *
    * Only in the asynchronous case do we need to set the stack_id from the
    * payload register.
    */
2538    if (!synchronous) {
2539       bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
2540               retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
2541               brw_imm_uw(0x7ff));
2542    }
2543 
2544    /* Update the original instruction. */
2545    inst->opcode = SHADER_OPCODE_SEND;
2546    inst->mlen = mlen;
2547    inst->ex_mlen = ex_mlen;
2548    inst->header_size = 0; /* HW docs require has_header = false */
2549    inst->send_has_side_effects = true;
2550    inst->send_is_volatile = false;
2551 
2552    /* Set up SFID and descriptors */
2553    inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
2554    inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
2555    inst->resize_sources(4);
2556    inst->src[0] = brw_imm_ud(0); /* desc */
2557    inst->src[1] = brw_imm_ud(0); /* ex_desc */
2558    inst->src[2] = header;
2559    inst->src[3] = payload;
2560 }
2561 
2562 bool
lower_logical_sends()2563 fs_visitor::lower_logical_sends()
2564 {
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     brw_wm_prog_data(prog_data),
                                     (const brw_wm_prog_key *)key,
                                     payload);
         break;

      case FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         lower_sampler_logical_send(ibld, inst,
                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
      case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_surface_logical_send(ibld, inst);
            break;
         }
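         /* Fallthrough on non-LSC platforms: these opcodes share the
          * legacy surface path below.
          */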
      case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
         lower_surface_block_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_a64_logical_send(ibld, inst);
            break;
         }
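         /* Fallthrough on non-LSC platforms: these opcodes share the
          * legacy A64 path below.
          */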
      case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         lower_a64_logical_send(ibld, inst);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
            lower_lsc_varying_pull_constant_logical_send(ibld, inst);
         else
            lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         /* The math opcodes are overloaded for the send-like and
          * expression-like instructions, which seems kind of icky.  Gfx6+
          * has a native (but rather quirky) MATH instruction, so we don't
          * need to do anything here.  On Gfx4-5 we'll have to lower the
          * Gfx6-like logical instructions (which we can easily recognize
          * because they have mlen = 0) into send-like virtual instructions.
          */
         if (devinfo->ver < 6 && inst->mlen == 0) {
            lower_math_logical_send(ibld, inst);
            break;
         } else {
            continue;
         }
      case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
         lower_btd_logical_send(ibld, inst);
         break;

      case RT_OPCODE_TRACE_RAY_LOGICAL:
         lower_trace_ray_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_URB_READ_LOGICAL:
         lower_urb_read_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_URB_WRITE_LOGICAL:
         lower_urb_write_logical_send(ibld, inst);
         break;

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}