/*
 * Copyright © 2010, 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file elk_lower_logical_sends.cpp
 */

#include "elk_eu.h"
#include "elk_fs.h"
#include "elk_fs_builder.h"

using namespace elk;
static void
lower_urb_read_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;

   assert(inst->size_written % REG_SIZE == 0);
   assert(inst->header_size == 0);

   elk_fs_reg payload_sources[2];
   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(header_size),
                           ELK_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(payload, payload_sources, header_size, header_size);

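   /* Rewrite the logical instruction in place as an explicit SEND to the
    * URB shared function.
    */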
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->header_size = header_size;

   inst->sfid = ELK_SFID_URB;
   inst->desc = elk_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_READ,
                             per_slot_present,
                             false,
                             inst->offset);

   inst->mlen = header_size;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_is_volatile = true;

   inst->resize_sources(4);

   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = elk_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = elk_null_reg();
}

static void
lower_urb_read_logical_send_xe2(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   assert(inst->size_written % (REG_SIZE * reg_unit(devinfo)) == 0);
   assert(inst->header_size == 0);

   /* Get the logical send arguments. */
   const elk_fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];

   /* Calculate the total number of components of the payload. */
   const unsigned dst_comps = inst->size_written / (REG_SIZE * reg_unit(devinfo));

   elk_fs_reg payload = bld.vgrf(ELK_REGISTER_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the read to this value.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, elk_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   elk_fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      elk_fs_reg offsets_B = bld.vgrf(ELK_REGISTER_TYPE_UD);
      bld.SHL(offsets_B, offsets, elk_imm_ud(4)); /* OWords -> Bytes */
      bld.ADD(payload, payload, offsets_B);
   }

   inst->sfid = ELK_SFID_URB;

   assert((dst_comps >= 1 && dst_comps <= 4) || dst_comps == 8);

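   /* Describe the access as an LSC flat A32 load of dst_comps D32 channels
    * per lane.
    */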
   inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             1 /* num_coordinates */,
                             LSC_DATA_SIZE_D32, dst_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, STORE, L1UC_L3UC),
                             false /* has_dest */);

   /* Update the original instruction. */
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = 0;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = elk_imm_ud(0);
   inst->src[1] = elk_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = elk_null_reg();
}

static void
lower_urb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const bool per_slot_present =
      inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS].file != BAD_FILE;
   const bool channel_mask_present =
      inst->src[URB_LOGICAL_SRC_CHANNEL_MASK].file != BAD_FILE;

   assert(inst->header_size == 0);

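   /* The payload is the URB handle, then the optional per-slot offsets and
    * channel mask, followed by the data to be written.
    */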
   const unsigned length = 1 + per_slot_present + channel_mask_present +
                           inst->components_read(URB_LOGICAL_SRC_DATA);

   elk_fs_reg *payload_sources = new elk_fs_reg[length];
   elk_fs_reg payload = elk_fs_reg(VGRF, bld.shader->alloc.allocate(length),
                           ELK_REGISTER_TYPE_F);

   unsigned header_size = 0;
   payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_HANDLE];
   if (per_slot_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];

   if (channel_mask_present)
      payload_sources[header_size++] = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];

   for (unsigned i = header_size, j = 0; i < length; i++, j++)
      payload_sources[i] = offset(inst->src[URB_LOGICAL_SRC_DATA], bld, j);

   bld.LOAD_PAYLOAD(payload, payload_sources, length, header_size);

   delete [] payload_sources;

   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->header_size = header_size;
   inst->dst = elk_null_reg();

   inst->sfid = ELK_SFID_URB;
   inst->desc = elk_urb_desc(devinfo,
                             GFX8_URB_OPCODE_SIMD8_WRITE,
                             per_slot_present,
                             channel_mask_present,
                             inst->offset);

   inst->mlen = length;
   inst->ex_desc = 0;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = true;

   inst->resize_sources(4);

   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = elk_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = elk_null_reg();
}

static void
lower_urb_write_logical_send_xe2(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->has_lsc);

   /* Get the logical send arguments. */
   const elk_fs_reg handle = inst->src[URB_LOGICAL_SRC_HANDLE];
   const elk_fs_reg src = inst->components_read(URB_LOGICAL_SRC_DATA) ?
      inst->src[URB_LOGICAL_SRC_DATA] : elk_fs_reg(elk_imm_ud(0));
   assert(type_sz(src.type) == 4);

   /* Calculate the total number of components of the payload. */
   const unsigned src_comps = MAX2(1, inst->components_read(URB_LOGICAL_SRC_DATA));
   const unsigned src_sz = type_sz(src.type);

   elk_fs_reg payload = bld.vgrf(ELK_REGISTER_TYPE_UD);

   bld.MOV(payload, handle);

   /* The low 24 bits of the URB handle are a byte offset into the URB area.
    * Add the (OWord) offset of the write to this value.
    */
   if (inst->offset) {
      bld.ADD(payload, payload, elk_imm_ud(inst->offset * 16));
      inst->offset = 0;
   }

   elk_fs_reg offsets = inst->src[URB_LOGICAL_SRC_PER_SLOT_OFFSETS];
   if (offsets.file != BAD_FILE) {
      elk_fs_reg offsets_B = bld.vgrf(ELK_REGISTER_TYPE_UD);
      bld.SHL(offsets_B, offsets, elk_imm_ud(4)); /* OWords -> Bytes */
      bld.ADD(payload, payload, offsets_B);
   }

   const elk_fs_reg cmask = inst->src[URB_LOGICAL_SRC_CHANNEL_MASK];
   unsigned mask = 0;

   if (cmask.file != BAD_FILE) {
      assert(cmask.file == IMM);
      assert(cmask.type == ELK_REGISTER_TYPE_UD);
      mask = cmask.ud >> 16;
   }

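   /* The data itself goes in a second payload, counted by ex_mlen. */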
   elk_fs_reg payload2 = bld.move_to_vgrf(src, src_comps);
   const unsigned ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;

   inst->sfid = ELK_SFID_URB;

   enum elk_lsc_opcode op = mask ? LSC_OP_STORE_CMASK : LSC_OP_STORE;
   inst->desc = lsc_msg_desc_wcmask(devinfo, op, inst->exec_size,
                             LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A32,
                             1 /* num_coordinates */,
                             LSC_DATA_SIZE_D32, src_comps /* num_channels */,
                             false /* transpose */,
                             LSC_CACHE(devinfo, STORE, L1UC_L3UC),
                             false /* has_dest */, mask);

   /* Update the original instruction. */
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = true;
   inst->send_is_volatile = false;

   inst->resize_sources(4);

   inst->src[0] = elk_imm_ud(0);
   inst->src[1] = elk_imm_ud(0);

   inst->src[2] = payload;
   inst->src[3] = payload2;
}

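/* Copy the color components into the payload sources, saturating them first
 * when fragment color clamping is enabled.
 */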
static void
setup_color_payload(const fs_builder &bld, const elk_wm_prog_key *key,
                    elk_fs_reg *dst, elk_fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
      assert(color.type == ELK_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

static void
lower_fb_write_logical_send(const fs_builder &bld, elk_fs_inst *inst,
                            const struct elk_wm_prog_data *prog_data,
                            const elk_wm_prog_key *key,
                            const elk_fs_thread_payload &payload)
{
   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
   const intel_device_info *devinfo = bld.shader->devinfo;
   const elk_fs_reg color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
   const elk_fs_reg color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
   const elk_fs_reg src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
   const elk_fs_reg src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
   const elk_fs_reg dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
   const elk_fs_reg src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
   elk_fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
   const unsigned components =
      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;

   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);

   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   elk_fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   if (devinfo->ver < 6) {
      /* TODO: Support SIMD32 on gfx4-5 */
      assert(bld.group() < 16);

      /* For gfx4-5, we always have a header consisting of g0 and g1.  We have
       * an implied MOV from g0,g1 to the start of the message.  The MOV from
       * g0 is handled by the hardware and the MOV from g1 is provided by the
       * generator.  This is required because, on gfx4-5, the generator may
       * generate two write messages with different message lengths in order
       * to handle AA data properly.
       *
       * Also, since the pixel mask goes in the g0 portion of the message and
       * since render target writes are the last thing in the shader, we write
       * the pixel mask directly into g0 and it will get copied as part of the
       * implied write.
       */
      if (prog_data->uses_kill) {
         bld.exec_all().group(1, 0)
            .MOV(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW),
                 elk_sample_mask_reg(bld));
      }

      assert(length == 0);
      length = 2;
   } else if ((devinfo->verx10 <= 70 &&
               prog_data->uses_kill) ||
              (devinfo->ver < 11 &&
               (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
      assert(devinfo->ver < 20);

      /* From the Sandy Bridge PRM, volume 4, page 198:
       *
       *     "Dispatched Pixel Enables. One bit per pixel indicating
       *      which pixels were originally enabled when the thread was
       *      dispatched. This field is only required for the end-of-
       *      thread message and on all dual-source messages."
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);

      elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD, 2);
      if (bld.group() < 16) {
         /* The header starts off as g0 and g1 for the first half */
         ubld.group(16, 0).MOV(header, retype(elk_vec8_grf(0, 0),
                                              ELK_REGISTER_TYPE_UD));
      } else {
         /* The header starts off as g0 and g2 for the second half */
         assert(bld.group() < 32);
         const elk_fs_reg header_sources[2] = {
            retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD),
            retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD),
         };
         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);

         /* Gfx12 will require additional fix-ups if we ever hit this path. */
         assert(devinfo->ver < 12);
      }

      uint32_t g00_bits = 0;

      /* Set "Source0 Alpha Present to RenderTarget" bit in message
       * header.
       */
      if (src0_alpha.file != BAD_FILE)
         g00_bits |= 1 << 11;

      /* Set "Stencil Present to Render Target" bit when the shader writes
       * computed stencil.
       */
      if (prog_data->computed_stencil)
         g00_bits |= 1 << 14;

      if (g00_bits) {
         /* OR extra bits into g0.0 */
         ubld.group(1, 0).OR(component(header, 0),
                             retype(elk_vec1_grf(0, 0),
                                    ELK_REGISTER_TYPE_UD),
                             elk_imm_ud(g00_bits));
      }

      /* Set the render target index for choosing BLEND_STATE. */
      if (inst->target > 0) {
         ubld.group(1, 0).MOV(component(header, 2), elk_imm_ud(inst->target));
      }

      if (prog_data->uses_kill) {
         ubld.group(1, 0).MOV(retype(component(header, 15),
                                     ELK_REGISTER_TYPE_UW),
                              elk_sample_mask_reg(bld));
      }

      assert(length == 0);
      sources[0] = header;
      sources[1] = horiz_offset(header, 8);
      length = 2;
   }
   assert(length == 0 || length == 2);
   header_size = length;

   if (payload.aa_dest_stencil_reg[0]) {
      assert(inst->group < 16);
      sources[length] = elk_fs_reg(VGRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              elk_fs_reg(elk_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
      length++;
   }

   if (src0_alpha.file != BAD_FILE) {
      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
         const fs_builder &ubld = bld.exec_all().group(8, i)
                                    .annotate("FB write src0 alpha");
         const elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_F);
         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
         setup_color_payload(ubld, key, &sources[length], tmp, 1);
         length++;
      }
   }

   if (sample_mask.file != BAD_FILE) {
      const elk_fs_reg tmp(VGRF, bld.shader->alloc.allocate(reg_unit(devinfo)),
                       ELK_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
       * relevant.  Since it's unsigned single words, one vgrf is always
       * 16-wide, but only the lower or higher 8 channels will be used by the
       * hardware when doing a SIMD8 write, depending on whether we have
       * selected the subspans for the first or second half respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = ELK_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(horiz_offset(retype(tmp, ELK_REGISTER_TYPE_UW),
                           inst->group % (16 * reg_unit(devinfo))),
              sample_mask);

      for (unsigned i = 0; i < reg_unit(devinfo); i++)
         sources[length++] = byte_offset(tmp, REG_SIZE * i);
   }

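   /* Everything emitted so far counts as header data for LOAD_PAYLOAD; the
    * per-channel color data follows.
    */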
   payload_header_size = length;

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   if (src_stencil.file != BAD_FILE) {
      assert(devinfo->ver >= 9);
      assert(bld.dispatch_width() == 8 * reg_unit(devinfo));

      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
       * available on gfx9+. As such it's impossible to have both enabled at the
       * same time and therefore length cannot overrun the array.
       */
      assert(length < 15 * reg_unit(devinfo));

      sources[length] = bld.vgrf(ELK_REGISTER_TYPE_UD);
      bld.exec_all().annotate("FB write OS")
         .MOV(retype(sources[length], ELK_REGISTER_TYPE_UB),
              subscript(src_stencil, ELK_REGISTER_TYPE_UB, 0));
      length++;
   }

   elk_fs_inst *load;
   if (devinfo->ver >= 7) {
      /* Send from the GRF */
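      /* Defer allocating the payload VGRF until LOAD_PAYLOAD tells us how
       * many registers it writes.
       */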
      elk_fs_reg payload = elk_fs_reg(VGRF, -1, ELK_REGISTER_TYPE_F);
      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
      payload.nr = bld.shader->alloc.allocate(regs_written(load));
      load->dst = payload;

      uint32_t msg_ctl = elk_fb_write_msg_control(inst, prog_data);

      inst->desc =
         (inst->group / 16) << 11 | /* rt slot group */
         elk_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
                           0 /* coarse_rt_write */);

      elk_fs_reg desc = elk_imm_ud(0);
      if (prog_data->coarse_pixel_dispatch == ELK_ALWAYS) {
         inst->desc |= (1 << 18);
      } else if (prog_data->coarse_pixel_dispatch == ELK_SOMETIMES) {
         STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_RT_WRITES == (1 << 18));
         const fs_builder &ubld = bld.exec_all().group(8, 0);
         desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
         ubld.AND(desc, dynamic_msaa_flags(prog_data),
                  elk_imm_ud(INTEL_MSAA_FLAG_COARSE_RT_WRITES));
         desc = component(desc, 0);
      }

      uint32_t ex_desc = 0;
      if (devinfo->ver >= 11) {
         /* Set the "Render Target Index" and "Src0 Alpha Present" fields
          * in the extended message descriptor, in lieu of using a header.
          */
         ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;

         if (key->nr_color_regions == 0)
            ex_desc |= 1 << 20; /* Null Render Target */
      }
      inst->ex_desc = ex_desc;

      inst->opcode = ELK_SHADER_OPCODE_SEND;
      inst->resize_sources(3);
      inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
      inst->src[0] = desc;
      inst->src[1] = elk_imm_ud(0);
      inst->src[2] = payload;
      inst->mlen = regs_written(load);
      inst->ex_mlen = 0;
      inst->header_size = header_size;
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   } else {
      /* Send from the MRF */
      load = bld.LOAD_PAYLOAD(elk_fs_reg(MRF, 1, ELK_REGISTER_TYPE_F),
                              sources, length, payload_header_size);

      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
       * will do this for us if we just give it a COMPR4 destination.
       */
      if (devinfo->ver < 6 && bld.dispatch_width() == 16)
         load->dst.nr |= ELK_MRF_COMPR4;

      if (devinfo->ver < 6) {
         /* Set up src[0] for the implied MOV from grf0-1 */
         inst->resize_sources(1);
         inst->src[0] = elk_vec8_grf(0, 0);
      } else {
         inst->resize_sources(0);
      }
      inst->base_mrf = 1;
      inst->opcode = ELK_FS_OPCODE_FB_WRITE;
      inst->mlen = regs_written(load);
      inst->header_size = header_size;
   }
}

static void
lower_fb_read_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const fs_builder &ubld = bld.exec_all().group(8, 0);
   const unsigned length = 2;
   const elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD, length);

   if (bld.group() < 16) {
      ubld.group(16, 0).MOV(header, retype(elk_vec8_grf(0, 0),
                                           ELK_REGISTER_TYPE_UD));
   } else {
      assert(bld.group() < 32);
      const elk_fs_reg header_sources[] = {
         retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD),
         retype(elk_vec8_grf(2, 0), ELK_REGISTER_TYPE_UD)
      };
      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);

      if (devinfo->ver >= 12) {
         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
          * target message header format was updated accordingly -- However
          * the updated format only works for the lower 16 channels in a
          * SIMD32 thread, since the higher 16 channels want the subspan data
          * from r2 instead of r1, so we need to copy over the contents of
          * r1.1 in order to fix things up.
          */
         ubld.group(1, 0).MOV(component(header, 9),
                              retype(elk_vec1_grf(1, 1), ELK_REGISTER_TYPE_UD));
      }
   }

   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+):
    *
    *   "Must be zero for Render Target Read message."
    *
    * For bits:
    *   - 14 : Stencil Present to Render Target
    *   - 13 : Source Depth Present to Render Target
    *   - 12 : oMask to Render Target
    *   - 11 : Source0 Alpha Present to Render Target
    */
   ubld.group(1, 0).AND(component(header, 0),
                        component(header, 0),
                        elk_imm_ud(~INTEL_MASK(14, 11)));

   inst->resize_sources(1);
   inst->src[0] = header;
   inst->opcode = ELK_FS_OPCODE_FB_READ;
   inst->mlen = length;
   inst->header_size = length;
}

static void
lower_sampler_logical_send_gfx4(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                const elk_fs_reg &lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const bool has_lod = (op == ELK_SHADER_OPCODE_TXL || op == ELK_FS_OPCODE_TXB ||
                         op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS);
   elk_fs_reg msg_begin(MRF, 1, ELK_REGISTER_TYPE_F);
   elk_fs_reg msg_end = msg_begin;

   /* g0 header. */
   msg_end = offset(msg_end, bld.group(8, 0), 1);

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   msg_end = offset(msg_end, bld, coord_components);

   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
    * require all three components to be present and zero if they are unused.
    */
   if (coord_components > 0 &&
       (has_lod || shadow_c.file != BAD_FILE ||
        (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
      assert(coord_components <= 3);
      for (unsigned i = 0; i < 3 - coord_components; i++)
         bld.MOV(offset(msg_end, bld, i), elk_imm_f(0.0f));

      msg_end = offset(msg_end, bld, 3 - coord_components);
   }

   if (op == ELK_SHADER_OPCODE_TXD) {
      /* TXD unsupported in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* the slots for u and v are always present, but r is optional */
      if (coord_components < 2)
         msg_end = offset(msg_end, bld, 2 - coord_components);

      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));

      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   }

   if (has_lod) {
      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
       */
      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
             bld.dispatch_width() == 16);

      const elk_reg_type type =
         (op == ELK_SHADER_OPCODE_TXF || op == ELK_SHADER_OPCODE_TXS ?
          ELK_REGISTER_TYPE_UD : ELK_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == ELK_SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, elk_imm_f(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = msg_begin.nr;
   inst->mlen = msg_end.nr - msg_begin.nr;
   inst->header_size = 1;
}

static void
lower_sampler_logical_send_gfx5(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                const elk_fs_reg &lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &sample_index,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   elk_fs_reg message(MRF, 2, ELK_REGISTER_TYPE_F);
   elk_fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
      header_size = 1;
      message.nr--;
   }

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   elk_fs_reg msg_end = offset(msg_coords, bld, coord_components);
   elk_fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      elk_fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

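   /* Set up the LOD info */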
   switch (op) {
   case ELK_SHADER_OPCODE_TXL:
   case ELK_FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXD:
      /**
       *  P   =  u,    v,    r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case ELK_SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, ELK_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXF:
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case ELK_SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, ELK_REGISTER_TYPE_UD), elk_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), ELK_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

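/* The sampler index field in the message descriptor is only 4 bits, so
 * samplers with index 16 or above (or a dynamically indexed sampler) need
 * the Sampler State Pointer in the message header to be offset instead.
 */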
static bool
is_high_sampler(const struct intel_device_info *devinfo, const elk_fs_reg &sampler)
{
   if (devinfo->verx10 <= 70)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

static unsigned
sampler_msg_type(const intel_device_info *devinfo,
                 elk_opcode opcode, bool shadow_compare, bool has_min_lod)
{
   assert(devinfo->ver >= 5);
   switch (opcode) {
   case ELK_SHADER_OPCODE_TEX:
      if (devinfo->ver >= 20 && has_min_lod) {
         return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
                                 XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
      } else {
         return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE;
      }
   case ELK_FS_OPCODE_TXB:
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
   case ELK_SHADER_OPCODE_TXL:
      assert(!has_min_lod);
      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
   case ELK_SHADER_OPCODE_TXL_LZ:
      assert(!has_min_lod);
      return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
                              GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
   case ELK_SHADER_OPCODE_TXS:
   case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
   case ELK_SHADER_OPCODE_TXD:
      assert(!shadow_compare || devinfo->verx10 >= 75);
      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
   case ELK_SHADER_OPCODE_TXF:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case ELK_SHADER_OPCODE_TXF_LZ:
      assert(!has_min_lod);
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
   case ELK_SHADER_OPCODE_TXF_CMS_W:
      assert(!has_min_lod);
      assert(devinfo->ver >= 9);
      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
   case ELK_SHADER_OPCODE_TXF_CMS:
      assert(!has_min_lod);
      return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
   case ELK_SHADER_OPCODE_TXF_UMS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
   case ELK_SHADER_OPCODE_TXF_MCS:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
   case ELK_SHADER_OPCODE_LOD:
      assert(!has_min_lod);
      return GFX5_SAMPLER_MESSAGE_LOD;
   case ELK_SHADER_OPCODE_TG4:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
   case ELK_SHADER_OPCODE_TG4_OFFSET:
      assert(!has_min_lod);
      assert(devinfo->ver >= 7);
      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
   case ELK_SHADER_OPCODE_SAMPLEINFO:
      assert(!has_min_lod);
      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
   default:
      unreachable("not reached");
   }
}

/**
 * Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
 * the given requested_alignment_sz.
 */
static elk_fs_inst *
emit_load_payload_with_padding(const fs_builder &bld, const elk_fs_reg &dst,
                               const elk_fs_reg *src, unsigned sources,
                               unsigned header_size,
                               unsigned requested_alignment_sz)
{
   unsigned length = 0;
   unsigned num_srcs =
      sources * DIV_ROUND_UP(requested_alignment_sz, bld.dispatch_width());
   elk_fs_reg *src_comps = new elk_fs_reg[num_srcs];

   for (unsigned i = 0; i < header_size; i++)
      src_comps[length++] = src[i];

   for (unsigned i = header_size; i < sources; i++) {
      unsigned src_sz =
         retype(dst, src[i].type).component_size(bld.dispatch_width());
      const enum elk_reg_type padding_payload_type =
         elk_reg_type_from_bit_size(type_sz(src[i].type) * 8,
                                    ELK_REGISTER_TYPE_UD);

      src_comps[length++] = src[i];

      /* Pad out the real sources when the component size of the requested
       * payload type is larger than that of the real source.
       */
      if (src_sz < requested_alignment_sz) {
         for (unsigned j = 0; j < (requested_alignment_sz / src_sz) - 1; j++) {
            src_comps[length++] = retype(elk_fs_reg(), padding_payload_type);
         }
      }
   }

   elk_fs_inst *inst = bld.LOAD_PAYLOAD(dst, src_comps, length, header_size);
   delete[] src_comps;

   return inst;
}

static void
lower_sampler_logical_send_gfx7(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op,
                                const elk_fs_reg &coordinate,
                                const elk_fs_reg &shadow_c,
                                elk_fs_reg lod, const elk_fs_reg &lod2,
                                const elk_fs_reg &min_lod,
                                const elk_fs_reg &sample_index,
                                const elk_fs_reg &mcs,
                                const elk_fs_reg &surface,
                                const elk_fs_reg &sampler,
                                const elk_fs_reg &surface_handle,
                                const elk_fs_reg &sampler_handle,
                                const elk_fs_reg &tg4_offset,
                                unsigned payload_type_bit_size,
                                unsigned coord_components,
                                unsigned grad_components,
                                bool residency)
{
   const elk_compiler *compiler = bld.shader->compiler;
   const intel_device_info *devinfo = bld.shader->devinfo;
   const enum elk_reg_type payload_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_F);
   const enum elk_reg_type payload_unsigned_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_UD);
   const enum elk_reg_type payload_signed_type =
      elk_reg_type_from_bit_size(payload_type_bit_size, ELK_REGISTER_TYPE_D);
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   elk_fs_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(payload_type);

   /* We must have exactly one of surface/surface_handle and exactly one of
    * sampler/sampler_handle.
    */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == ELK_SHADER_OPCODE_TG4 || op == ELK_SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == ELK_SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler) ||
       residency) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      elk_fs_reg header = retype(sources[0], ELK_REGISTER_TYPE_UD);
      for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
         sources[length++] = byte_offset(header, REG_SIZE * header_size);

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      unsigned reg_count = regs_written(inst) - reg_unit(devinfo) * residency;
      if (!inst->eot && reg_count < 4 * reg_width) {
         assert(reg_count % reg_width == 0);
         unsigned mask = ~((1 << (reg_count / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      if (residency)
         inst->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      ubld.MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), elk_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), elk_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         if (compiler->use_bindless_sampler_offset) {
            assert(devinfo->ver >= 11);
            ubld1.OR(component(header, 3), sampler_handle, elk_imm_ud(1));
         } else {
            ubld1.MOV(component(header, 3), sampler_handle);
         }
      } else if (is_high_sampler(devinfo, sampler)) {
         elk_fs_reg sampler_state_ptr =
            retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD);

         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         if (devinfo->ver >= 11) {
            sampler_state_ptr = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
            ubld1.AND(sampler_state_ptr,
                      retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
                      elk_imm_ud(INTEL_MASK(31, 5)));
         }

         if (sampler.file == ELK_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

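            /* Advance the Sampler State Pointer to the 16-sampler block
             * containing this sampler; the low 4 bits remain in the
             * descriptor.
             */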
            ubld1.ADD(component(header, 3), sampler_state_ptr,
                      elk_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            elk_fs_reg tmp = ubld1.vgrf(ELK_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, elk_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, elk_imm_ud(4));
            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
         }
      } else if (devinfo->ver >= 11) {
         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
          * with the ones included in g0.3 bits 4:0.  Mask them out.
          */
         ubld1.AND(component(header, 3),
                   retype(elk_vec1_grf(0, 3), ELK_REGISTER_TYPE_UD),
                   elk_imm_ud(INTEL_MASK(31, 5)));
      }
   }

   /* Change the opcode to account for LOD being zero before the
    * switch-statement that emits sources based on the opcode.
    */
   if (devinfo->ver >= 9 && lod.is_zero()) {
      if (op == ELK_SHADER_OPCODE_TXL)
         op = ELK_SHADER_OPCODE_TXL_LZ;
      else if (op == ELK_SHADER_OPCODE_TXF)
         op = ELK_SHADER_OPCODE_TXF_LZ;
   }

   /* On Xe2 and newer platforms, min_lod is the first parameter specifically
    * so that a bunch of other, possibly unused, parameters don't need to also
    * be included.
    */
   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare,
                       min_lod.file != BAD_FILE);

   const bool min_lod_is_first = devinfo->ver >= 20 &&
      (msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
       msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);

   if (min_lod_is_first) {
      assert(min_lod.file != BAD_FILE);
      bld.MOV(sources[length++], min_lod);
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case ELK_FS_OPCODE_TXB:
   case ELK_SHADER_OPCODE_TXL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case ELK_SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
       * Xe2+).
       */
      assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case ELK_SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], payload_unsigned_type), lod);
      length++;
      break;
   case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], payload_unsigned_type), elk_imm_ud(0));
      length++;
      break;
   case ELK_SHADER_OPCODE_TXF:
   case ELK_SHADER_OPCODE_TXF_LZ:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
       * On Gfx9 they are u, v, lod, r
       */
      bld.MOV(retype(sources[length++], payload_signed_type), coordinate);

      if (devinfo->ver >= 9) {
         if (coord_components >= 2) {
            bld.MOV(retype(sources[length], payload_signed_type),
                    offset(coordinate, bld, 1));
         } else {
            sources[length] = elk_imm_d(0);
         }
         length++;
      }

      if (op != ELK_SHADER_OPCODE_TXF_LZ) {
         bld.MOV(retype(sources[length], payload_signed_type), lod);
         length++;
      }

      for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case ELK_SHADER_OPCODE_TXF_CMS:
   case ELK_SHADER_OPCODE_TXF_CMS_W:
   case ELK_SHADER_OPCODE_TXF_UMS:
   case ELK_SHADER_OPCODE_TXF_MCS:
      if (op == ELK_SHADER_OPCODE_TXF_UMS ||
          op == ELK_SHADER_OPCODE_TXF_CMS ||
          op == ELK_SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length++], payload_unsigned_type), sample_index);
      }

      /* Data from the multisample control surface. */
      if (op == ELK_SHADER_OPCODE_TXF_CMS || op == ELK_SHADER_OPCODE_TXF_CMS_W) {
         unsigned num_mcs_components = 1;

         /* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
          * Shared Functions - 3D Sampler - Messages - Message Format:
          *
          *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
          */
         if (op == ELK_SHADER_OPCODE_TXF_CMS_W)
            num_mcs_components = 2;

         for (unsigned i = 0; i < num_mcs_components; ++i) {
            /* The sampler always writes 4/8 registers worth of data, but for
             * ld_mcs only the first two registers contain valid data. So
             * with a 16-bit payload we need to split the two 32-bit registers
             * into four 16-bit registers.
             *
             * From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
             * Shared Functions - 3D Sampler - Messages - Message Format:
             *
             *    ld2dms_w   si  mcs0 mcs1 mcs2  mcs3  u  v  r
             */
            if (devinfo->verx10 >= 125 && op == ELK_SHADER_OPCODE_TXF_CMS_W) {
               elk_fs_reg tmp = offset(mcs, bld, i);
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs :
                       subscript(tmp, payload_unsigned_type, 0));
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs :
                       subscript(tmp, payload_unsigned_type, 1));
            } else {
               bld.MOV(retype(sources[length++], payload_unsigned_type),
                       mcs.file == IMM ? mcs : offset(mcs, bld, i));
            }
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case ELK_SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], payload_signed_type),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], payload_type),
                 offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE && !min_lod_is_first) {
      /* Account for all of the missing coordinate sources */
      if (op == ELK_SHADER_OPCODE_TXD && devinfo->verx10 >= 125) {
         /* On DG2 and newer platforms, sample_d can only be used with 1D and
          * 2D surfaces, so the maximum number of gradient components is 2.
          * In spite of this limitation, the Bspec lists a mysterious R
          * component before the min_lod, so the maximum coordinate components
          * is 3.
          *
          * See bspec 45942, "Enable new message layout for cube array"
          */
         length += 3 - coord_components;
         length += (2 - grad_components) * 2;
      } else {
         length += 4 - coord_components;
         if (op == ELK_SHADER_OPCODE_TXD)
            length += (3 - grad_components) * 2;
      }

      bld.MOV(sources[length++], min_lod);

      /* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
      if (devinfo->verx10 == 125 && op == ELK_FS_OPCODE_TXB &&
          !inst->shadow_compare)
         bld.MOV(sources[length++], min_lod);
   }

1309    const elk_fs_reg src_payload =
1310       elk_fs_reg(VGRF, bld.shader->alloc.allocate(length * reg_width),
1311                                               ELK_REGISTER_TYPE_F);
1312    /* With a 16-bit payload, each component takes one full register in both
1313     * SIMD8H and SIMD16H modes; in either case one register can hold 16
1314     * elements.  In the SIMD8H case the hardware simply expects the
1315     * components to be padded (i.e., aligned on a register boundary).
1316     */
1317    elk_fs_inst *load_payload_inst =
1318       emit_load_payload_with_padding(bld, src_payload, sources, length,
1319                                      header_size, REG_SIZE * reg_unit(devinfo));
1320    unsigned mlen = load_payload_inst->size_written / REG_SIZE;
1321    unsigned simd_mode = 0;
1322    if (devinfo->ver < 20) {
1323       if (payload_type_bit_size == 16) {
1324          assert(devinfo->ver >= 11);
1325          simd_mode = inst->exec_size <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H :
1326             GFX10_SAMPLER_SIMD_MODE_SIMD16H;
1327       } else {
1328          simd_mode = inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
1329             ELK_SAMPLER_SIMD_MODE_SIMD16;
1330       }
1331    } else {
1332       if (payload_type_bit_size == 16) {
1333          simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H :
1334             XE2_SAMPLER_SIMD_MODE_SIMD32H;
1335       } else {
1336          simd_mode = inst->exec_size <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 :
1337             XE2_SAMPLER_SIMD_MODE_SIMD32;
1338       }
1339    }
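   /* E.g. a SIMD16 instruction selects SIMD16 (or SIMD16H for a 16-bit
    * payload) on all platforms here, while a SIMD32 one only exists on Xe2
    * (ver >= 20) and picks the new SIMD32/SIMD32H encodings.
    */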
1340 
1341    /* Generate the SEND. */
1342    inst->opcode = ELK_SHADER_OPCODE_SEND;
1343    inst->mlen = mlen;
1344    inst->header_size = header_size;
1345 
1346    assert(msg_type == sampler_msg_type(devinfo, op, inst->shadow_compare,
1347                                        min_lod.file != BAD_FILE));
1348 
1349    inst->sfid = ELK_SFID_SAMPLER;
1350    if (surface.file == IMM &&
1351        (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
1352       inst->desc = elk_sampler_desc(devinfo, surface.ud,
1353                                     sampler.file == IMM ? sampler.ud % 16 : 0,
1354                                     msg_type,
1355                                     simd_mode,
1356                                     0 /* return_format unused on gfx7+ */);
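      /* The % 16 above folds the sampler index into the descriptor's 4-bit
       * sampler field; samplers beyond 15 are assumed to be reached by
       * offsetting the sampler state pointer in the message header, which is
       * set up elsewhere.
       */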
1357       inst->src[0] = elk_imm_ud(0);
1358       inst->src[1] = elk_imm_ud(0);
1359    } else if (surface_handle.file != BAD_FILE) {
1360       /* Bindless surface */
1361       assert(devinfo->ver >= 9);
1362       inst->desc = elk_sampler_desc(devinfo,
1363                                     GFX9_BTI_BINDLESS,
1364                                     sampler.file == IMM ? sampler.ud % 16 : 0,
1365                                     msg_type,
1366                                     simd_mode,
1367                                     0 /* return_format unused on gfx7+ */);
1368 
1369       /* For bindless samplers, the entire address is included in the message
1370        * header, so we can leave its portion of the message descriptor 0.
1371        */
1372       if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
1373          inst->src[0] = elk_imm_ud(0);
1374       } else {
1375          const fs_builder ubld = bld.group(1, 0).exec_all();
1376          elk_fs_reg desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1377          ubld.SHL(desc, sampler, elk_imm_ud(8));
1378          inst->src[0] = component(desc, 0);
1379       }
1380 
1381       /* We assume that the driver provided the handle in the top 20 bits so
1382        * we can use the surface handle directly as the extended descriptor.
1383        */
1384       inst->src[1] = retype(surface_handle, ELK_REGISTER_TYPE_UD);
1385       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1386    } else {
1387       /* Immediate portion of the descriptor */
1388       inst->desc = elk_sampler_desc(devinfo,
1389                                     0, /* surface */
1390                                     0, /* sampler */
1391                                     msg_type,
1392                                     simd_mode,
1393                                     0 /* return_format unused on gfx7+ */);
1394       const fs_builder ubld = bld.group(1, 0).exec_all();
1395       elk_fs_reg desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1396       if (surface.equals(sampler)) {
1397          /* This case is common in GL */
1398          ubld.MUL(desc, surface, elk_imm_ud(0x101));
1399       } else {
1400          if (sampler_handle.file != BAD_FILE) {
1401             ubld.MOV(desc, surface);
1402          } else if (sampler.file == IMM) {
1403             ubld.OR(desc, surface, elk_imm_ud(sampler.ud << 8));
1404          } else {
1405             ubld.SHL(desc, sampler, elk_imm_ud(8));
1406             ubld.OR(desc, desc, surface);
1407          }
1408       }
1409       ubld.AND(desc, desc, elk_imm_ud(0xfff));
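      /* A worked example of the 0x101 trick above: x * 0x101 == (x << 8) | x,
       * so a single MUL puts the same index in both the sampler field
       * (bits 11:8) and the binding table field (bits 7:0), e.g.
       * 0x5 * 0x101 == 0x505.  The AND with 0xfff then clamps the result to
       * the 12 descriptor bits we may set dynamically.
       */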
1410 
1411       inst->src[0] = component(desc, 0);
1412       inst->src[1] = elk_imm_ud(0); /* ex_desc */
1413    }
1414 
1415    inst->ex_desc = 0;
1416 
1417    inst->src[2] = src_payload;
1418    inst->resize_sources(3);
1419 
1420    if (inst->eot) {
1421       /* It doesn't make sense to split an EOT sampler message, since that
1422        * would involve ending half of the thread early.
1423        */
1424       assert(inst->group == 0);
1425       /* We need to use SENDC for EOT sampler messages */
1426       inst->check_tdr = true;
1427       inst->send_has_side_effects = true;
1428    }
1429 
1430    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
1431    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE * reg_unit(devinfo));
1432 }
1433 
1434 static unsigned
1435 get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
1436                                       elk_opcode op, const elk_fs_reg *src)
1437 {
1438    unsigned src_type_size = 0;
1439 
1440    /* All sources need to have the same size, so find the first valid source
1441     * and take the size from there.
1442     */
1443    for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1444       if (src[i].file != BAD_FILE) {
1445          src_type_size = elk_reg_type_to_size(src[i].type);
1446          break;
1447       }
1448    }
1449 
1450    assert(src_type_size == 2 || src_type_size == 4);
1451 
1452 #ifndef NDEBUG
1453    /* Make sure all sources agree. On gfx12 this doesn't hold when sampling
1454     * compressed multisampled surfaces: there the payload contains MCS data,
1455     * which is already 16-bit, unlike the other parameters that need forced
1456     * conversion.
1457     */
1458    if (devinfo->verx10 < 125 ||
1459        (op != ELK_SHADER_OPCODE_TXF_CMS_W &&
1460         op != ELK_SHADER_OPCODE_TXF_CMS)) {
1461       for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
1462          assert(src[i].file == BAD_FILE ||
1463                 elk_reg_type_to_size(src[i].type) == src_type_size);
1464       }
1465    }
1466 #endif
1467 
1468    if (devinfo->verx10 < 125)
1469       return src_type_size * 8;
1470 
1471    /* Force conversion from 32-bit sources to 16-bit payload. From the XeHP Bspec:
1472     * 3D and GPGPU Programs - Shared Functions - 3D Sampler - Messages - Message
1473     * Format [GFX12:HAS:1209977870]
1474     *
1475     *  ld2dms_w       SIMD8H and SIMD16H Only
1476     *  ld_mcs         SIMD8H and SIMD16H Only
1477     *  ld2dms         REMOVEDBY(GEN:HAS:1406788836)
1478     */
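   /* E.g. ld2dms_w (ELK_SHADER_OPCODE_TXF_CMS_W) with 32-bit sources on
    * verx10 >= 125 is still reported as a 16-bit payload here, matching the
    * SIMD8H/SIMD16H-only rule quoted above.
    */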
1479 
1480    if (op == ELK_SHADER_OPCODE_TXF_CMS_W ||
1481        op == ELK_SHADER_OPCODE_TXF_CMS ||
1482        op == ELK_SHADER_OPCODE_TXF_UMS ||
1483        op == ELK_SHADER_OPCODE_TXF_MCS)
1484       src_type_size = 2;
1485 
1486    return src_type_size * 8;
1487 }
1488 
1489 static void
1490 lower_sampler_logical_send(const fs_builder &bld, elk_fs_inst *inst, elk_opcode op)
1491 {
1492    const intel_device_info *devinfo = bld.shader->devinfo;
1493    const elk_fs_reg coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
1494    const elk_fs_reg shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
1495    const elk_fs_reg lod = inst->src[TEX_LOGICAL_SRC_LOD];
1496    const elk_fs_reg lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
1497    const elk_fs_reg min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
1498    const elk_fs_reg sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
1499    const elk_fs_reg mcs = inst->src[TEX_LOGICAL_SRC_MCS];
1500    const elk_fs_reg surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
1501    const elk_fs_reg sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
1502    const elk_fs_reg surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
1503    const elk_fs_reg sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
1504    const elk_fs_reg tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
1505    assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
1506    const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
1507    assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
1508    const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
1509    assert(inst->src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1510    const bool residency = inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1511    /* residency is only supported on Gfx8+ */
1512    assert(!residency || devinfo->ver >= 8);
1513 
1514    if (devinfo->ver >= 7) {
1515       const unsigned msg_payload_type_bit_size =
1516          get_sampler_msg_payload_type_bit_size(devinfo, op, inst->src);
1517 
1518       /* 16-bit payloads are available only on gfx11+ */
1519       assert(msg_payload_type_bit_size != 16 || devinfo->ver >= 11);
1520 
1521       lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
1522                                       shadow_c, lod, lod2, min_lod,
1523                                       sample_index,
1524                                       mcs, surface, sampler,
1525                                       surface_handle, sampler_handle,
1526                                       tg4_offset,
1527                                       msg_payload_type_bit_size,
1528                                       coord_components, grad_components,
1529                                       residency);
1530    } else if (devinfo->ver >= 5) {
1531       lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
1532                                       shadow_c, lod, lod2, sample_index,
1533                                       surface, sampler,
1534                                       coord_components, grad_components);
1535    } else {
1536       lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
1537                                       shadow_c, lod, lod2,
1538                                       surface, sampler,
1539                                       coord_components, grad_components);
1540    }
1541 }
1542 
1543 /**
1544  * Predicate the specified instruction on the vector mask.
1545  */
1546 static void
1547 emit_predicate_on_vector_mask(const fs_builder &bld, elk_fs_inst *inst)
1548 {
1549    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1550           bld.group() == inst->group &&
1551           bld.dispatch_width() == inst->exec_size);
1552 
1553    const fs_builder ubld = bld.exec_all().group(1, 0);
1554 
1555    const elk_fs_visitor &s = *bld.shader;
1556    const elk_fs_reg vector_mask = ubld.vgrf(ELK_REGISTER_TYPE_UW);
1557    ubld.UNDEF(vector_mask);
1558    ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, vector_mask, elk_imm_ud(3));
1559    const unsigned subreg = sample_mask_flag_subreg(s);
1560 
1561    ubld.MOV(elk_flag_subreg(subreg + inst->group / 16), vector_mask);
1562 
1563    if (inst->predicate) {
1564       assert(inst->predicate == ELK_PREDICATE_NORMAL);
1565       assert(!inst->predicate_inverse);
1566       assert(inst->flag_subreg == 0);
1567       assert(s.devinfo->ver < 20);
1568       /* Combine the vector mask with the existing predicate by using a
1569        * vertical predication mode.
1570        */
1571       inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
1572    } else {
1573       inst->flag_subreg = subreg;
1574       inst->predicate = ELK_PREDICATE_NORMAL;
1575       inst->predicate_inverse = false;
1576    }
1577 }
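/* Usage sketch: READ_SR_REG above pulls the vector mask out of the hardware
 * state register into a flag register, so the send afterwards only executes
 * in channels that were enabled at dispatch, folding in any pre-existing
 * predicate via the vertical ALLV mode.
 */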
1578 
1579 static void
1580 setup_surface_descriptors(const fs_builder &bld, elk_fs_inst *inst, uint32_t desc,
1581                           const elk_fs_reg &surface, const elk_fs_reg &surface_handle)
1582 {
1583    const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1584    const elk_compiler *compiler = bld.shader->compiler;
1585 
1586    /* We must have exactly one of surface and surface_handle */
1587    assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
1588 
1589    if (surface.file == IMM) {
1590       inst->desc = desc | (surface.ud & 0xff);
1591       inst->src[0] = elk_imm_ud(0);
1592       inst->src[1] = elk_imm_ud(0); /* ex_desc */
1593    } else if (surface_handle.file != BAD_FILE) {
1594       /* Bindless surface */
1595       assert(devinfo->ver >= 9);
1596       inst->desc = desc | GFX9_BTI_BINDLESS;
1597       inst->src[0] = elk_imm_ud(0);
1598 
1599       /* We assume that the driver provided the handle in the top 20 bits so
1600        * we can use the surface handle directly as the extended descriptor.
1601        */
1602       inst->src[1] = retype(surface_handle, ELK_REGISTER_TYPE_UD);
1603       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1604    } else {
1605       inst->desc = desc;
1606       const fs_builder ubld = bld.exec_all().group(1, 0);
1607       elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1608       ubld.AND(tmp, surface, elk_imm_ud(0xff));
1609       inst->src[0] = component(tmp, 0);
1610       inst->src[1] = elk_imm_ud(0); /* ex_desc */
1611    }
1612 }
1613 
1614 static void
1615 setup_lsc_surface_descriptors(const fs_builder &bld, elk_fs_inst *inst,
1616                               uint32_t desc, const elk_fs_reg &surface)
1617 {
1618    const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
1619    const elk_compiler *compiler = bld.shader->compiler;
1620 
1621    inst->src[0] = elk_imm_ud(0); /* desc */
1622 
1623    enum lsc_addr_surface_type surf_type = lsc_msg_desc_addr_type(devinfo, desc);
1624    switch (surf_type) {
1625    case LSC_ADDR_SURFTYPE_BSS:
1626       inst->send_ex_bso = compiler->extended_bindless_surface_offset;
1627       /* fall-through */
1628    case LSC_ADDR_SURFTYPE_SS:
1629       assert(surface.file != BAD_FILE);
1630       /* We assume that the driver provided the handle in the top 20 bits so
1631        * we can use the surface handle directly as the extended descriptor.
1632        */
1633       inst->src[1] = retype(surface, ELK_REGISTER_TYPE_UD);
1634       break;
1635 
1636    case LSC_ADDR_SURFTYPE_BTI:
1637       assert(surface.file != BAD_FILE);
1638       if (surface.file == IMM) {
1639          inst->src[1] = elk_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
1640       } else {
1641          const fs_builder ubld = bld.exec_all().group(1, 0);
1642          elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1643          ubld.SHL(tmp, surface, elk_imm_ud(24));
1644          inst->src[1] = component(tmp, 0);
1645       }
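      /* Either way the binding table index ends up in bits 31:24 of the
       * extended descriptor; lsc_bti_ex_desc() is assumed to compute the
       * same encoding as the SHL by 24 on the dynamic path.
       */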
1646       break;
1647 
1648    case LSC_ADDR_SURFTYPE_FLAT:
1649       inst->src[1] = elk_imm_ud(0);
1650       break;
1651 
1652    default:
1653       unreachable("Invalid LSC surface address type");
1654    }
1655 }
1656 
1657 static void
1658 lower_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1659 {
1660    const elk_compiler *compiler = bld.shader->compiler;
1661    const intel_device_info *devinfo = bld.shader->devinfo;
1662 
1663    /* Get the logical send arguments. */
1664    const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1665    const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1666    const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1667    const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1668    const UNUSED elk_fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
1669    const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1670    const elk_fs_reg allow_sample_mask =
1671       inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
1672    assert(arg.file == IMM);
1673    assert(allow_sample_mask.file == IMM);
1674 
1675    /* Calculate the total number of components of the payload. */
1676    const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
1677    const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1678 
1679    const bool is_typed_access =
1680       inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
1681       inst->opcode == ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
1682       inst->opcode == ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
1683 
1684    const bool is_surface_access = is_typed_access ||
1685       inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
1686       inst->opcode == ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
1687       inst->opcode == ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
1688 
1689    const bool is_stateless =
1690       surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
1691                               surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
1692 
1693    const bool has_side_effects = inst->has_side_effects();
1694 
1695    elk_fs_reg sample_mask = allow_sample_mask.ud ? elk_sample_mask_reg(bld) :
1696                                                elk_fs_reg(elk_imm_ud(0xffffffff));
1697 
1698    /* From the BDW PRM Volume 7, page 147:
1699     *
1700     *  "For the Data Cache Data Port*, the header must be present for the
1701     *   following message types: [...] Typed read/write/atomics"
1702     *
1703     * Earlier generations have a similar wording.  Because of this restriction
1704     * we don't attempt to implement sample masks via predication for such
1705     * messages prior to Gfx9, since we have to provide a header anyway.  On
1706     * Gfx11+ the header has been removed so we can only use predication.
1707     *
1708     * For all stateless A32 messages, we also need a header.
1709     */
1710    elk_fs_reg header;
1711    if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
1712       fs_builder ubld = bld.exec_all().group(8, 0);
1713       header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1714       if (is_stateless) {
1715          assert(!is_surface_access);
1716          ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
1717       } else {
1718          ubld.MOV(header, elk_imm_d(0));
1719          if (is_surface_access)
1720             ubld.group(1, 0).MOV(component(header, 7), sample_mask);
1721       }
1722    }
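   /* When present, the header occupies one full register; for surface
    * accesses the sample mask is written into its last dword (component 7)
    * so the data port can discard disabled channels.
    */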
1723    const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
1724 
1725    elk_fs_reg payload, payload2;
1726    unsigned mlen, ex_mlen = 0;
1727    if (devinfo->ver >= 9 &&
1728        (src.file == BAD_FILE || header.file == BAD_FILE)) {
1729       /* We have split sends on gfx9 and above */
1730       if (header.file == BAD_FILE) {
1731          payload = bld.move_to_vgrf(addr, addr_sz);
1732          payload2 = bld.move_to_vgrf(src, src_sz);
1733          mlen = addr_sz * (inst->exec_size / 8);
1734          ex_mlen = src_sz * (inst->exec_size / 8);
1735       } else {
1736          assert(src.file == BAD_FILE);
1737          payload = header;
1738          payload2 = bld.move_to_vgrf(addr, addr_sz);
1739          mlen = header_sz;
1740          ex_mlen = addr_sz * (inst->exec_size / 8);
1741       }
1742    } else {
1743       /* Allocate space for the payload. */
1744       const unsigned sz = header_sz + addr_sz + src_sz;
1745       payload = bld.vgrf(ELK_REGISTER_TYPE_UD, sz);
1746       elk_fs_reg *const components = new elk_fs_reg[sz];
1747       unsigned n = 0;
1748 
1749       /* Construct the payload. */
1750       if (header.file != BAD_FILE)
1751          components[n++] = header;
1752 
1753       for (unsigned i = 0; i < addr_sz; i++)
1754          components[n++] = offset(addr, bld, i);
1755 
1756       for (unsigned i = 0; i < src_sz; i++)
1757          components[n++] = offset(src, bld, i);
1758 
1759       bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
1760       mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
1761 
1762       delete[] components;
1763    }
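   /* E.g. a SIMD16 typed write with a header, two address components and
    * four data components gives mlen = 1 + (2 + 4) * 16 / 8 = 13 registers.
    */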
1764 
1765    /* Predicate the instruction on the sample mask if no header is
1766     * provided.
1767     */
1768    if ((header.file == BAD_FILE || !is_surface_access) &&
1769        sample_mask.file != BAD_FILE && sample_mask.file != IMM)
1770       elk_emit_predicate_on_sample_mask(bld, inst);
1771 
1772    uint32_t sfid;
1773    switch (inst->opcode) {
1774    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1775    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1776       /* Byte scattered opcodes go through the normal data cache */
1777       sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
1778       break;
1779 
1780    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1781    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1782       sfid =  devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
1783               devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
1784                                   ELK_DATAPORT_READ_TARGET_RENDER_CACHE;
1785       break;
1786 
1787    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1788    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1789    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1790       /* Untyped Surface messages go through the data cache but the SFID value
1791        * changed on Haswell.
1792        */
1793       sfid = (devinfo->verx10 >= 75 ?
1794               HSW_SFID_DATAPORT_DATA_CACHE_1 :
1795               GFX7_SFID_DATAPORT_DATA_CACHE);
1796       break;
1797 
1798    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1799    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1800    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1801       /* Typed surface messages go through the render cache on IVB and the
1802        * data cache on HSW+.
1803        */
1804       sfid = (devinfo->verx10 >= 75 ?
1805               HSW_SFID_DATAPORT_DATA_CACHE_1 :
1806               GFX6_SFID_DATAPORT_RENDER_CACHE);
1807       break;
1808 
1809    default:
1810       unreachable("Unsupported surface opcode");
1811    }
1812 
1813    uint32_t desc;
1814    switch (inst->opcode) {
1815    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1816       desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1817                                             arg.ud, /* num_channels */
1818                                             false   /* write */);
1819       break;
1820 
1821    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
1822       desc = elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
1823                                             arg.ud, /* num_channels */
1824                                             true    /* write */);
1825       break;
1826 
1827    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
1828       desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1829                                            arg.ud, /* bit_size */
1830                                            false   /* write */);
1831       break;
1832 
1833    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
1834       desc = elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
1835                                            arg.ud, /* bit_size */
1836                                            true    /* write */);
1837       break;
1838 
1839    case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
1840       assert(arg.ud == 32); /* bit_size */
1841       desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1842                                             false  /* write */);
1843       break;
1844 
1845    case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
1846       assert(arg.ud == 32); /* bit_size */
1847       desc = elk_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
1848                                             true   /* write */);
1849       break;
1850 
1851    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
1852       if (elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg.ud)) {
1853          desc = elk_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
1854                                                  lsc_op_to_legacy_atomic(arg.ud),
1855                                                  !inst->dst.is_null());
1856       } else {
1857          desc = elk_dp_untyped_atomic_desc(devinfo, inst->exec_size,
1858                                            lsc_op_to_legacy_atomic(arg.ud),
1859                                            !inst->dst.is_null());
1860       }
1861       break;
1862 
1863    case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
1864       desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1865                                           arg.ud, /* num_channels */
1866                                           false   /* write */);
1867       break;
1868 
1869    case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
1870       desc = elk_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
1871                                           arg.ud, /* num_channels */
1872                                           true    /* write */);
1873       break;
1874 
1875    case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
1876       desc = elk_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
1877                                       lsc_op_to_legacy_atomic(arg.ud),
1878                                       !inst->dst.is_null());
1879       break;
1880 
1881    default:
1882       unreachable("Unknown surface logical instruction");
1883    }
1884 
1885    /* Update the original instruction. */
1886    inst->opcode = ELK_SHADER_OPCODE_SEND;
1887    inst->mlen = mlen;
1888    inst->ex_mlen = ex_mlen;
1889    inst->header_size = header_sz;
1890    inst->send_has_side_effects = has_side_effects;
1891    inst->send_is_volatile = !has_side_effects;
1892    inst->send_ex_bso = surface_handle.file != BAD_FILE &&
1893                        compiler->extended_bindless_surface_offset;
1894 
1895    /* Set up SFID and descriptors */
1896    inst->sfid = sfid;
1897    setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
1898 
1899    inst->resize_sources(4);
1900 
1901    /* Finally, the payload */
1902    inst->src[2] = payload;
1903    inst->src[3] = payload2;
1904 }
1905 
1906 static enum lsc_data_size
1907 lsc_bits_to_data_size(unsigned bit_size)
1908 {
1909    switch (bit_size / 8) {
1910    case 1:  return LSC_DATA_SIZE_D8U32;
1911    case 2:  return LSC_DATA_SIZE_D16U32;
1912    case 4:  return LSC_DATA_SIZE_D32;
1913    case 8:  return LSC_DATA_SIZE_D64;
1914    default:
1915       unreachable("Unsupported data size.");
1916    }
1917 }
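/* Usage sketch for lsc_bits_to_data_size(): sub-dword data is widened to
 * 32 bits in the register file, so 8- and 16-bit accesses map to the
 * D8U32/D16U32 encodings, while 32- and 64-bit accesses use the natural
 * D32/D64 sizes.
 */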
1918 
1919 static void
1920 lower_lsc_surface_logical_send(const fs_builder &bld, elk_fs_inst *inst)
1921 {
1922    const elk_compiler *compiler = bld.shader->compiler;
1923    const intel_device_info *devinfo = bld.shader->devinfo;
1924    assert(devinfo->has_lsc);
1925 
1926    /* Get the logical send arguments. */
1927    const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
1928    const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
1929    const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
1930    const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
1931    const UNUSED elk_fs_reg dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
1932    const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
1933    const elk_fs_reg allow_sample_mask =
1934       inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
1935    assert(arg.file == IMM);
1936    assert(allow_sample_mask.file == IMM);
1937 
1938    /* Calculate the total number of components of the payload. */
1939    const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
1940    const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
1941    const unsigned src_sz = type_sz(src.type);
1942    const unsigned dst_sz = type_sz(inst->dst.type);
1943 
1944    const bool has_side_effects = inst->has_side_effects();
1945 
1946    unsigned ex_mlen = 0;
1947    elk_fs_reg payload, payload2;
1948    payload = bld.move_to_vgrf(addr, addr_sz);
1949    if (src.file != BAD_FILE) {
1950       payload2 = bld.move_to_vgrf(src, src_comps);
1951       ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
1952    }
1953 
1954    /* Predicate the instruction on the sample mask if needed */
1955    elk_fs_reg sample_mask = allow_sample_mask.ud ? elk_sample_mask_reg(bld) :
1956                                                elk_fs_reg(elk_imm_ud(0xffffffff));
1957    if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
1958       elk_emit_predicate_on_sample_mask(bld, inst);
1959 
1960    if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
1961       inst->sfid = GFX12_SFID_SLM;
1962    else
1963       inst->sfid = GFX12_SFID_UGM;
1964 
1965    /* We should have exactly one of surface and surface_handle. For scratch
1966     * messages generated by elk_fs_nir.cpp we also allow a special value
1967     * indicating which heap base in STATE_BASE_ADDRESS to use (SS = Surface
1968     * State Offset, or BSS = Bindless Surface State Offset).
1969     */
1970    bool non_bindless = surface.file == IMM && surface.ud == GFX125_NON_BINDLESS;
1971    assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE) ||
1972           (non_bindless && surface_handle.file != BAD_FILE));
1973 
1974    enum lsc_addr_surface_type surf_type;
1975    if (surface_handle.file != BAD_FILE) {
1976       if (surface.file == BAD_FILE) {
1977          assert(!non_bindless);
1978          surf_type = LSC_ADDR_SURFTYPE_BSS;
1979       } else {
1980          assert(surface.file == IMM &&
1981                 (surface.ud == 0 || surface.ud == GFX125_NON_BINDLESS));
1982          surf_type = non_bindless ? LSC_ADDR_SURFTYPE_SS : LSC_ADDR_SURFTYPE_BSS;
1983       }
1984    } else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
1985       surf_type = LSC_ADDR_SURFTYPE_FLAT;
1986    else
1987       surf_type = LSC_ADDR_SURFTYPE_BTI;
1988 
1989    switch (inst->opcode) {
1990    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
1991       inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
1992                                 surf_type, LSC_ADDR_SIZE_A32,
1993                                 1 /* num_coordinates */,
1994                                 LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
1995                                 false /* transpose */,
1996                                 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
1997                                 true /* has_dest */);
1998       break;
1999    case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
2000       inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
2001                                 surf_type, LSC_ADDR_SIZE_A32,
2002                                 1 /* num_coordinates */,
2003                                 LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
2004                                 false /* transpose */,
2005                                 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2006                                 false /* has_dest */);
2007       break;
2008    case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: {
2009       /* Bspec: Atomic instruction -> Cache section:
2010        *
2011        *    Atomic messages are always forced to "un-cacheable" in the L1
2012        *    cache.
2013        */
2014       enum elk_lsc_opcode opcode = (enum elk_lsc_opcode) arg.ud;
2015 
2016       inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
2017                                 surf_type, LSC_ADDR_SIZE_A32,
2018                                 1 /* num_coordinates */,
2019                                 lsc_bits_to_data_size(dst_sz * 8),
2020                                 1 /* num_channels */,
2021                                 false /* transpose */,
2022                                 LSC_CACHE(devinfo, STORE, L1UC_L3WB),
2023                                 !inst->dst.is_null());
2024       break;
2025    }
2026    case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
2027       inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
2028                                 surf_type, LSC_ADDR_SIZE_A32,
2029                                 1 /* num_coordinates */,
2030                                 lsc_bits_to_data_size(arg.ud),
2031                                 1 /* num_channels */,
2032                                 false /* transpose */,
2033                                 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2034                                 true /* has_dest */);
2035       break;
2036    case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
2037       inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
2038                                 surf_type, LSC_ADDR_SIZE_A32,
2039                                 1 /* num_coordinates */,
2040                                 lsc_bits_to_data_size(arg.ud),
2041                                 1 /* num_channels */,
2042                                 false /* transpose */,
2043                                 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2044                                 false /* has_dest */);
2045       break;
2046    default:
2047       unreachable("Unknown surface logical instruction");
2048    }
2049 
2050    /* Update the original instruction. */
2051    inst->opcode = ELK_SHADER_OPCODE_SEND;
2052    inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2053    inst->ex_mlen = ex_mlen;
2054    inst->header_size = 0;
2055    inst->send_has_side_effects = has_side_effects;
2056    inst->send_is_volatile = !has_side_effects;
2057    inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
2058                        compiler->extended_bindless_surface_offset;
2059 
2060    inst->resize_sources(4);
2061 
2062    if (non_bindless) {
2063       inst->src[0] = elk_imm_ud(0);     /* desc */
2064       inst->src[1] = surface_handle;    /* ex_desc */
2065    } else {
2066       setup_lsc_surface_descriptors(bld, inst, inst->desc,
2067                                     surface.file != BAD_FILE ?
2068                                     surface : surface_handle);
2069    }
2070 
2071    /* Finally, the payload */
2072    inst->src[2] = payload;
2073    inst->src[3] = payload2;
2074 }
2075 
2076 static void
2077 lower_lsc_block_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2078 {
2079    const elk_compiler *compiler = bld.shader->compiler;
2080    const intel_device_info *devinfo = bld.shader->devinfo;
2081    assert(devinfo->has_lsc);
2082 
2083    /* Get the logical send arguments. */
2084    const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
2085    const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
2086    const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
2087    const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
2088    const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
2089    assert(arg.file == IMM);
2090    assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
2091    assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
2092 
2093    const bool is_stateless =
2094       surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
2095                               surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
2096 
2097    const bool has_side_effects = inst->has_side_effects();
2098 
2099    const bool write = inst->opcode == ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
2100 
2101    fs_builder ubld = bld.exec_all().group(1, 0);
2102    elk_fs_reg stateless_ex_desc;
2103    if (is_stateless) {
2104       stateless_ex_desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2105       ubld.AND(stateless_ex_desc,
2106                retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2107                elk_imm_ud(INTEL_MASK(31, 10)));
2108    }
2109 
2110    elk_fs_reg data;
2111    if (write) {
2112       const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
2113       data = retype(bld.move_to_vgrf(src, src_sz), ELK_REGISTER_TYPE_UD);
2114    }
2115 
2116    inst->opcode = ELK_SHADER_OPCODE_SEND;
2117    if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
2118       inst->sfid = GFX12_SFID_SLM;
2119    else
2120       inst->sfid = GFX12_SFID_UGM;
2121    const enum lsc_addr_surface_type surf_type =
2122       inst->sfid == GFX12_SFID_SLM ?
2123       LSC_ADDR_SURFTYPE_FLAT :
2124       surface.file == BAD_FILE ?
2125       LSC_ADDR_SURFTYPE_BSS : LSC_ADDR_SURFTYPE_BTI;
2126    inst->desc = lsc_msg_desc(devinfo,
2127                              write ? LSC_OP_STORE : LSC_OP_LOAD,
2128                              1 /* exec_size */,
2129                              surf_type,
2130                              LSC_ADDR_SIZE_A32,
2131                              1 /* num_coordinates */,
2132                              LSC_DATA_SIZE_D32,
2133                              arg.ud /* num_channels */,
2134                              true /* transpose */,
2135                              LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2136                              !write /* has_dest */);
2137 
2138    inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2139    inst->size_written = lsc_msg_desc_dest_len(devinfo, inst->desc) * REG_SIZE;
2140    inst->exec_size = 1;
2141    inst->ex_mlen = write ? DIV_ROUND_UP(arg.ud, 8) : 0;
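   /* arg.ud is the block size in dwords and a 32-byte GRF holds eight of
    * them, so e.g. a 16-dword block write carries its data in
    * DIV_ROUND_UP(16, 8) = 2 extended-payload registers.
    */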
2142    inst->header_size = 0;
2143    inst->send_has_side_effects = has_side_effects;
2144    inst->send_is_volatile = !has_side_effects;
2145    inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
2146                        compiler->extended_bindless_surface_offset;
2147 
2148    inst->resize_sources(4);
2149 
2150    if (stateless_ex_desc.file != BAD_FILE) {
2151       inst->src[0] = elk_imm_ud(0);     /* desc */
2152       inst->src[1] = stateless_ex_desc; /* ex_desc */
2153    } else {
2154       setup_lsc_surface_descriptors(bld, inst, inst->desc,
2155                                     surface.file != BAD_FILE ?
2156                                     surface : surface_handle);
2157    }
2158    inst->src[2] = addr;          /* payload */
2159    inst->src[3] = data;          /* payload2 */
2160 }
2161 
2162 static void
2163 lower_surface_block_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2164 {
2165    const intel_device_info *devinfo = bld.shader->devinfo;
2166    assert(devinfo->ver >= 9);
2167 
2168    /* Get the logical send arguments. */
2169    const elk_fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
2170    const elk_fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
2171    const elk_fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
2172    const elk_fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
2173    const elk_fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
2174    assert(arg.file == IMM);
2175    assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
2176    assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
2177 
2178    const bool is_stateless =
2179       surface.file == IMM && (surface.ud == ELK_BTI_STATELESS ||
2180                               surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
2181 
2182    const bool has_side_effects = inst->has_side_effects();
2183 
2184    const bool align_16B =
2185       inst->opcode != ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
2186 
2187    const bool write = inst->opcode == ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
2188 
2189    /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
2190    fs_builder ubld = bld.exec_all().group(8, 0);
2191    elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2192 
2193    if (is_stateless)
2194       ubld.emit(ELK_SHADER_OPCODE_SCRATCH_HEADER, header);
2195    else
2196       ubld.MOV(header, elk_imm_d(0));
2197 
2198    /* Address in OWord units when aligned to OWords. */
2199    if (align_16B)
2200       ubld.group(1, 0).SHR(component(header, 2), addr, elk_imm_ud(4));
2201    else
2202       ubld.group(1, 0).MOV(component(header, 2), addr);
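   /* E.g. a byte address of 0x130 becomes OWord offset 0x13 in the aligned
    * case, since an OWord is 16 bytes (hence the SHR by 4); the unaligned
    * variant passes the byte address through unchanged.
    */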
2203 
2204    elk_fs_reg data;
2205    unsigned ex_mlen = 0;
2206    if (write) {
2207       const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
2208       data = retype(bld.move_to_vgrf(src, src_sz), ELK_REGISTER_TYPE_UD);
2209       ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
2210    }
2211 
2212    inst->opcode = ELK_SHADER_OPCODE_SEND;
2213    inst->mlen = 1;
2214    inst->ex_mlen = ex_mlen;
2215    inst->header_size = 1;
2216    inst->send_has_side_effects = has_side_effects;
2217    inst->send_is_volatile = !has_side_effects;
2218 
2219    inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
2220 
2221    const uint32_t desc = elk_dp_oword_block_rw_desc(devinfo, align_16B,
2222                                                     arg.ud, write);
2223    setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
2224 
2225    inst->resize_sources(4);
2226 
2227    inst->src[2] = header;
2228    inst->src[3] = data;
2229 }
2230 
2231 static elk_fs_reg
2232 emit_a64_oword_block_header(const fs_builder &bld, const elk_fs_reg &addr)
2233 {
2234    const fs_builder ubld = bld.exec_all().group(8, 0);
2235 
2236    assert(type_sz(addr.type) == 8 && addr.stride == 0);
2237 
2238    elk_fs_reg expanded_addr = addr;
2239    if (addr.file == UNIFORM) {
2240       /* We can't do stride 1 with the UNIFORM file; it requires stride 0 */
2241       expanded_addr = ubld.vgrf(ELK_REGISTER_TYPE_UQ);
2242       expanded_addr.stride = 0;
2243       ubld.MOV(expanded_addr, retype(addr, ELK_REGISTER_TYPE_UQ));
2244    }
2245 
2246    elk_fs_reg header = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2247    ubld.MOV(header, elk_imm_ud(0));
2248 
2249    /* Use a 2-wide MOV to fill out the address */
2250    elk_fs_reg addr_vec2 = expanded_addr;
2251    addr_vec2.type = ELK_REGISTER_TYPE_UD;
2252    addr_vec2.stride = 1;
2253    ubld.group(2, 0).MOV(header, addr_vec2);
2254 
2255    return header;
2256 }
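/* Usage sketch: the header built above is a single zeroed register whose
 * first two dwords hold the low and high halves of the 64-bit block address,
 * as written by the 2-wide MOV.
 */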
2257 
2258 static void
2259 emit_fragment_mask(const fs_builder &bld, elk_fs_inst *inst)
2260 {
2261    assert(inst->src[A64_LOGICAL_ENABLE_HELPERS].file == IMM);
2262    const bool enable_helpers = inst->src[A64_LOGICAL_ENABLE_HELPERS].ud;
2263 
2264    /* If we're a fragment shader, we have to predicate with the sample mask
2265     * to avoid helper invocations in instructions with side effects, unless
2266     * they are explicitly required.
2267     *
2268     * There are also special cases when we actually want to run on helpers
2269     * (ray queries).
2270     */
2271    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
2272    if (enable_helpers)
2273       emit_predicate_on_vector_mask(bld, inst);
2274    else if (inst->has_side_effects())
2275       elk_emit_predicate_on_sample_mask(bld, inst);
2276 }
2277 
2278 static void
2279 lower_lsc_a64_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2280 {
2281    const intel_device_info *devinfo = bld.shader->devinfo;
2282 
2283    /* Get the logical send arguments. */
2284    const elk_fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
2285    const elk_fs_reg src = inst->src[A64_LOGICAL_SRC];
2286    const unsigned src_sz = type_sz(src.type);
2287    const unsigned dst_sz = type_sz(inst->dst.type);
2288 
2289    const unsigned src_comps = inst->components_read(1);
2290    assert(inst->src[A64_LOGICAL_ARG].file == IMM);
2291    const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
2292    const bool has_side_effects = inst->has_side_effects();
2293 
2294    elk_fs_reg payload = retype(bld.move_to_vgrf(addr, 1), ELK_REGISTER_TYPE_UD);
2295    elk_fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
2296                             ELK_REGISTER_TYPE_UD);
2297    unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;
2298 
2299    switch (inst->opcode) {
2300    case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
2301       inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
2302                                 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2303                                 1 /* num_coordinates */,
2304                                 LSC_DATA_SIZE_D32, arg /* num_channels */,
2305                                 false /* transpose */,
2306                                 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2307                                 true /* has_dest */);
2308       break;
2309    case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
2310       inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
2311                                 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2312                                 1 /* num_coordinates */,
2313                                 LSC_DATA_SIZE_D32, arg /* num_channels */,
2314                                 false /* transpose */,
2315                                 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2316                                 false /* has_dest */);
2317       break;
2318    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
2319       inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
2320                                 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2321                                 1 /* num_coordinates */,
2322                                 lsc_bits_to_data_size(arg),
2323                                 1 /* num_channels */,
2324                                 false /* transpose */,
2325                                 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2326                                 true /* has_dest */);
2327       break;
2328    case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
2329       inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
2330                                 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2331                                 1 /* num_coordinates */,
2332                                 lsc_bits_to_data_size(arg),
2333                                 1 /* num_channels */,
2334                                 false /* transpose */,
2335                                 LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS),
2336                                 false /* has_dest */);
2337       break;
2338    case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: {
2339       /* Bspec: Atomic instruction -> Cache section:
2340        *
2341        *    Atomic messages are always forced to "un-cacheable" in the L1
2342        *    cache.
2343        */
2344       enum elk_lsc_opcode opcode = (enum elk_lsc_opcode) arg;
2345       inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
2346                                 LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
2347                                 1 /* num_coordinates */,
2348                                 lsc_bits_to_data_size(dst_sz * 8),
2349                                 1 /* num_channels */,
2350                                 false /* transpose */,
2351                                 LSC_CACHE(devinfo, STORE, L1UC_L3WB),
2352                                 !inst->dst.is_null());
2353       break;
2354    }
2355    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
2356    case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
2357       inst->exec_size = 1;
2358       inst->desc = lsc_msg_desc(devinfo,
2359                                 LSC_OP_LOAD,
2360                                 1 /* exec_size */,
2361                                 LSC_ADDR_SURFTYPE_FLAT,
2362                                 LSC_ADDR_SIZE_A64,
2363                                 1 /* num_coordinates */,
2364                                 LSC_DATA_SIZE_D32,
2365                                 arg /* num_channels */,
2366                                 true /* transpose */,
2367                                 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2368                                 true /* has_dest */);
2369       break;
2370    case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
2371       inst->exec_size = 1;
2372       inst->desc = lsc_msg_desc(devinfo,
2373                                 LSC_OP_STORE,
2374                                 1 /* exec_size */,
2375                                 LSC_ADDR_SURFTYPE_FLAT,
2376                                 LSC_ADDR_SIZE_A64,
2377                                 1 /* num_coordinates */,
2378                                 LSC_DATA_SIZE_D32,
2379                                 arg /* num_channels */,
2380                                 true /* transpose */,
2381                                 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
2382                                 false /* has_dest */);
2383 
2384       break;
2385    default:
2386       unreachable("Unknown A64 logical instruction");
2387    }
2388 
2389    if (bld.shader->stage == MESA_SHADER_FRAGMENT)
2390       emit_fragment_mask(bld, inst);
2391 
2392    /* Update the original instruction. */
2393    inst->opcode = ELK_SHADER_OPCODE_SEND;
2394    inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
2395    inst->ex_mlen = ex_mlen;
2396    inst->header_size = 0;
2397    inst->send_has_side_effects = has_side_effects;
2398    inst->send_is_volatile = !has_side_effects;
2399 
2400    /* Set up SFID and descriptors */
2401    inst->sfid = GFX12_SFID_UGM;
2402    inst->resize_sources(4);
2403    inst->src[0] = elk_imm_ud(0); /* desc */
2404    inst->src[1] = elk_imm_ud(0); /* ex_desc */
2405    inst->src[2] = payload;
2406    inst->src[3] = payload2;
2407 }
2408 
2409 static void
2410 lower_a64_logical_send(const fs_builder &bld, elk_fs_inst *inst)
2411 {
2412    const intel_device_info *devinfo = bld.shader->devinfo;
2413 
2414    const elk_fs_reg addr = inst->src[A64_LOGICAL_ADDRESS];
2415    const elk_fs_reg src = inst->src[A64_LOGICAL_SRC];
2416    const unsigned src_comps = inst->components_read(1);
2417    assert(inst->src[A64_LOGICAL_ARG].file == IMM);
2418    const unsigned arg = inst->src[A64_LOGICAL_ARG].ud;
2419    const bool has_side_effects = inst->has_side_effects();
2420 
2421    elk_fs_reg payload, payload2;
2422    unsigned mlen, ex_mlen = 0, header_size = 0;
2423    if (inst->opcode == ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
2424        inst->opcode == ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
2425        inst->opcode == ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
2426       assert(devinfo->ver >= 9);
2427 
2428       /* OWORD messages only take a scalar address in a header */
2429       mlen = 1;
2430       header_size = 1;
2431       payload = emit_a64_oword_block_header(bld, addr);
2432 
2433       if (inst->opcode == ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
2434          ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
2435          payload2 = retype(bld.move_to_vgrf(src, src_comps),
2436                            ELK_REGISTER_TYPE_UD);
2437       }
2438    } else if (devinfo->ver >= 9) {
2439       /* On Skylake and above, we have SENDS */
2440       mlen = 2 * (inst->exec_size / 8);
2441       ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
2442       payload = retype(bld.move_to_vgrf(addr, 1), ELK_REGISTER_TYPE_UD);
2443       payload2 = retype(bld.move_to_vgrf(src, src_comps),
2444                         ELK_REGISTER_TYPE_UD);
2445    } else {
2446       /* Add two because the address is 64-bit */
2447       const unsigned dwords = 2 + src_comps;
2448       mlen = dwords * (inst->exec_size / 8);
2449 
2450       elk_fs_reg sources[5];
2451 
2452       sources[0] = addr;
2453 
2454       for (unsigned i = 0; i < src_comps; i++)
2455          sources[1 + i] = offset(src, bld, i);
2456 
2457       payload = bld.vgrf(ELK_REGISTER_TYPE_UD, dwords);
2458       bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
2459    }
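   /* E.g. a pre-Skylake SIMD8 untyped write of four components packs the
    * 64-bit address (2 dwords) plus 4 data dwords into a single payload, so
    * mlen = (2 + 4) * 8 / 8 = 6 registers.
    */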
2460 
   uint32_t desc;
   switch (inst->opcode) {
   case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                false  /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      desc = elk_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                true   /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      desc = elk_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      desc = elk_dp_a64_oword_block_rw_desc(devinfo,
                                            false,   /* align_16B */
                                            arg,     /* num_dwords */
                                            false    /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      desc = elk_dp_a64_oword_block_rw_desc(devinfo,
                                            true,    /* align_16B */
                                            arg,     /* num_dwords */
                                            true     /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               false  /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = elk_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               true   /* write */);
      break;

   case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      if (elk_lsc_opcode_is_atomic_float((enum elk_lsc_opcode) arg)) {
         desc =
            elk_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                 type_sz(inst->dst.type) * 8,
                                                 lsc_op_to_legacy_atomic(arg),
                                                 !inst->dst.is_null());
      } else {
         desc = elk_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size,
                                               type_sz(inst->dst.type) * 8,
                                               lsc_op_to_legacy_atomic(arg),
                                               !inst->dst.is_null());
      }
      break;

   default:
      unreachable("Unknown A64 logical instruction");
   }

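   /* Predicate the message on the sample (or vector) mask so that helper
    * invocations do not perform memory accesses with side effects unless
    * they were explicitly enabled; see emit_fragment_mask().
    */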
   if (bld.shader->stage == MESA_SHADER_FRAGMENT)
      emit_fragment_mask(bld, inst);

   /* Update the original instruction. */
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_size;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(4);
   inst->src[0] = elk_imm_ud(0); /* desc */
   inst->src[1] = elk_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

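/* Lower a varying (per-channel) UBO pull-constant load into an LSC UGM
 * message.  Loads known to be at least 4-byte aligned use a single
 * LOAD_CMASK of four dword channels; lower alignments fall back to four
 * scalar dword loads, one per component of the vec4.
 */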
static void
lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
                                             elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   ASSERTED const elk_compiler *compiler = bld.shader->compiler;

   elk_fs_reg surface        = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
   elk_fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
   elk_fs_reg offset_B       = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
   elk_fs_reg alignment_B    = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT];

   /* We are switching the instruction from an ALU-like instruction to a
    * send-from-grf instruction.  Since sends can't handle strides or
    * source modifiers, we have to make a copy of the offset source.
    */
   elk_fs_reg ubo_offset = bld.move_to_vgrf(offset_B, 1);

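   /* A bindless surface handle selects the bindless surface state (BSS)
    * addressing model; otherwise the surface is a binding table index (BTI).
    */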
   enum lsc_addr_surface_type surf_type =
      surface_handle.file == BAD_FILE ?
      LSC_ADDR_SURFTYPE_BTI : LSC_ADDR_SURFTYPE_BSS;

   assert(alignment_B.file == ELK_IMMEDIATE_VALUE);
   unsigned alignment = alignment_B.ud;

   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->sfid = GFX12_SFID_UGM;
   inst->resize_sources(3);
   inst->send_ex_bso = surf_type == LSC_ADDR_SURFTYPE_BSS &&
                       compiler->extended_bindless_surface_offset;

   assert(!compiler->indirect_ubos_use_sampler);

   inst->src[0] = elk_imm_ud(0);
   inst->src[2] = ubo_offset; /* payload */

   if (alignment >= 4) {
      inst->desc =
         lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
                      surf_type, LSC_ADDR_SIZE_A32,
                      1 /* num_coordinates */,
                      LSC_DATA_SIZE_D32,
                      4 /* num_channels */,
                      false /* transpose */,
                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                      true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);

      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);
   } else {
      inst->desc =
         lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
                      surf_type, LSC_ADDR_SIZE_A32,
                      1 /* num_coordinates */,
                      LSC_DATA_SIZE_D32,
                      1 /* num_channels */,
                      false /* transpose */,
                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                      true /* has_dest */);
      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);

      setup_lsc_surface_descriptors(bld, inst, inst->desc,
                                    surface.file != BAD_FILE ?
                                    surface : surface_handle);

      /* This message only reads one dword per channel at a time, so we
       * have to duplicate it 4 times to read the full vec4.  Hopefully,
       * dead code elimination will clean up the mess if some of them
       * aren't needed.
       */
      assert(inst->size_written == 16 * inst->exec_size);
      inst->size_written /= 4;
      for (unsigned c = 1; c < 4; c++) {
         /* Emit a copy of the instruction because we're about to modify
          * it.  Because this loop starts at 1, we will emit copies for the
          * first 3 and the final one will be the modified instruction.
          */
         bld.emit(*inst);

         /* Offset the source */
         inst->src[2] = bld.vgrf(ELK_REGISTER_TYPE_UD);
         bld.ADD(inst->src[2], ubo_offset, elk_imm_ud(c * 4));

         /* Offset the destination */
         inst->dst = offset(inst->dst, bld, 1);
      }
   }
}

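/* Pre-LSC lowering of the same operation.  On Gfx7+ the load becomes a
 * sampler ld message, an untyped surface read, or four byte-scattered
 * reads, depending on the platform and the known alignment; on Gfx4-6 it
 * becomes the dedicated VARYING_PULL_CONSTANT_LOAD_GFX4 virtual opcode.
 */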
static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   const elk_compiler *compiler = bld.shader->compiler;

   if (devinfo->ver >= 7) {
      elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
      elk_fs_reg surface_handle = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE];
      elk_fs_reg offset_B = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];

      /* We are switching the instruction from an ALU-like instruction to a
       * send-from-grf instruction.  Since sends can't handle strides or
       * source modifiers, we have to make a copy of the offset source.
       */
      elk_fs_reg ubo_offset = bld.vgrf(ELK_REGISTER_TYPE_UD);
      bld.MOV(ubo_offset, offset_B);

      assert(inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].file == ELK_IMMEDIATE_VALUE);
      unsigned alignment = inst->src[PULL_VARYING_CONSTANT_SRC_ALIGNMENT].ud;

      inst->opcode = ELK_SHADER_OPCODE_SEND;
      inst->mlen = inst->exec_size / 8;
      inst->resize_sources(3);

      /* src[0] & src[1] are filled by setup_surface_descriptors() */
      inst->src[2] = ubo_offset; /* payload */

      if (compiler->indirect_ubos_use_sampler) {
         const unsigned simd_mode =
            inst->exec_size <= 8 ? ELK_SAMPLER_SIMD_MODE_SIMD8 :
                                   ELK_SAMPLER_SIMD_MODE_SIMD16;
         const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
                                                GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
                                                simd_mode, 0);

         inst->sfid = ELK_SFID_SAMPLER;
         setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
      } else if (alignment >= 4) {
         const uint32_t desc =
            elk_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                           4,      /* num_channels */
                                           false   /* write */);

         inst->sfid = (devinfo->verx10 >= 75 ?
                       HSW_SFID_DATAPORT_DATA_CACHE_1 :
                       GFX7_SFID_DATAPORT_DATA_CACHE);
         setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
      } else {
         const uint32_t desc =
            elk_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                          32,     /* bit_size */
                                          false   /* write */);

         inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         setup_surface_descriptors(bld, inst, desc, surface, surface_handle);

         /* The byte scattered messages can only read one dword at a time so
          * we have to duplicate the message 4 times to read the full vec4.
          * Hopefully, dead code will clean up the mess if some of them aren't
          * needed.
          */
         assert(inst->size_written == 16 * inst->exec_size);
         inst->size_written /= 4;
         for (unsigned c = 1; c < 4; c++) {
            /* Emit a copy of the instruction because we're about to modify
             * it.  Because this loop starts at 1, we will emit copies for the
             * first 3 and the final one will be the modified instruction.
             */
            bld.emit(*inst);

            /* Offset the source */
            inst->src[2] = bld.vgrf(ELK_REGISTER_TYPE_UD);
            bld.ADD(inst->src[2], ubo_offset, elk_imm_ud(c * 4));

            /* Offset the destination */
            inst->dst = offset(inst->dst, bld, 1);
         }
      }
   } else {
      elk_fs_reg surface = inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE];
      elk_fs_reg offset = inst->src[PULL_VARYING_CONSTANT_SRC_OFFSET];
      assert(inst->src[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE].file == BAD_FILE);

      const elk_fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
                               ELK_REGISTER_TYPE_UD);

      bld.MOV(byte_offset(payload, REG_SIZE), offset);

      inst->opcode = ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
      inst->base_mrf = payload.nr;
      inst->header_size = 1;
      inst->mlen = 1 + inst->exec_size / 8;

      inst->resize_sources(1);
      inst->src[0] = surface;
   }
}

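/* Gfx4-5 math is a send to the mathbox shared function; the operands are
 * passed in MRFs starting at m2, one register per source at SIMD8.
 */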
static void
lower_math_logical_send(const fs_builder &bld, elk_fs_inst *inst)
{
   assert(bld.shader->devinfo->ver < 6);

   inst->base_mrf = 2;
   inst->mlen = inst->sources * inst->exec_size / 8;

   if (inst->sources > 1) {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      const bool is_int_div = inst->opcode != ELK_SHADER_OPCODE_POW;
      const elk_fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
      const elk_fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

      inst->resize_sources(1);
      inst->src[0] = src0;

      assert(inst->exec_size == 8);
      bld.MOV(elk_fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
   }
}

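/* Lower a pixel interpolator (PI) message.  The message descriptor encodes
 * the interpolation mode; when per-sample dispatch is only known at run
 * time, the mode bits are ORed into an indirect descriptor instead of
 * being baked into the immediate.
 */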
static void
lower_interpolator_logical_send(const fs_builder &bld, elk_fs_inst *inst,
                                const struct elk_wm_prog_key *wm_prog_key,
                                const struct elk_wm_prog_data *wm_prog_data)
{
   const intel_device_info *devinfo = bld.shader->devinfo;

   /* We have to send something */
   elk_fs_reg payload = elk_vec8_grf(0, 0);
   unsigned mlen = 1;

   unsigned mode;
   switch (inst->opcode) {
   case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
      mode = GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE;
      break;

   case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      assert(inst->src[INTERP_SRC_OFFSET].file == BAD_FILE);
      mode = GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET;
      break;

   case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      payload = inst->src[INTERP_SRC_OFFSET];
      mlen = 2 * inst->exec_size / 8;
      mode = GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET;
      break;

   default:
      unreachable("Invalid interpolator instruction");
   }

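   /* INTERP_SRC_DYNAMIC_MODE is only present when the choice between
    * per-sample and shared-offset interpolation has to be made at run time,
    * in which case the mode bits are selected under predication below.
    */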
   const bool dynamic_mode =
      inst->src[INTERP_SRC_DYNAMIC_MODE].file != BAD_FILE;

   elk_fs_reg desc = inst->src[INTERP_SRC_MSG_DESC];
   uint32_t desc_imm =
      elk_pixel_interp_desc(devinfo,
                            /* Leave the mode at 0 if persample_dispatch is
                             * dynamic, it will be ORed in below.
                             */
                            dynamic_mode ? 0 : mode,
                            inst->pi_noperspective,
                            false /* coarse_pixel_rate */,
                            inst->exec_size, inst->group);

   if (wm_prog_data->coarse_pixel_dispatch == ELK_ALWAYS) {
      desc_imm |= (1 << 15);
   } else if (wm_prog_data->coarse_pixel_dispatch == ELK_SOMETIMES) {
      STATIC_ASSERT(INTEL_MSAA_FLAG_COARSE_PI_MSG == (1 << 15));
      elk_fs_reg orig_desc = desc;
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);
      ubld.AND(desc, dynamic_msaa_flags(wm_prog_data),
               elk_imm_ud(INTEL_MSAA_FLAG_COARSE_PI_MSG));

      /* And, if it's AT_OFFSET, we might have a non-trivial descriptor */
      if (orig_desc.file == IMM) {
         desc_imm |= orig_desc.ud;
      } else {
         ubld.OR(desc, desc, orig_desc);
      }
   }

   /* If persample_dispatch is dynamic, select the interpolation mode
    * dynamically and OR into the descriptor to complete the static part
    * generated by elk_pixel_interp_desc().
    *
    * Why does this work? If you look at the SKL PRMs, Volume 7:
    * 3D-Media-GPGPU, Shared Functions Pixel Interpolater, you'll see that
    *
    *   - "Per Message Offset" Message Descriptor
    *   - "Sample Position Offset" Message Descriptor
    *
    * have different formats. Fortunately, a fragment shader dispatched at
    * pixel rate will have gl_SampleID = 0 & gl_NumSamples = 1, so the value
    * we pack in "Sample Position Offset" will be 0 and will cover the X/Y
    * components of "Per Message Offset", which gives us the pixel offset 0x0.
    */
   if (dynamic_mode) {
      elk_fs_reg orig_desc = desc;
      const fs_builder &ubld = bld.exec_all().group(8, 0);
      desc = ubld.vgrf(ELK_REGISTER_TYPE_UD);

      /* The predicate should have been built in elk_fs_nir.cpp when emitting
       * NIR code. This guarantees that we do not have incorrect interactions
       * with the flag register holding the predication result.
       */
      if (orig_desc.file == IMM) {
         /* Not using SEL here because we would generate an instruction with
          * two immediate sources, which is not supported by the hardware.
          */
         set_predicate_inv(ELK_PREDICATE_NORMAL, false,
                           ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
                                                     GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
         set_predicate_inv(ELK_PREDICATE_NORMAL, true,
                           ubld.MOV(desc, elk_imm_ud(orig_desc.ud |
                                                     GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
      } else {
         set_predicate_inv(ELK_PREDICATE_NORMAL, false,
                           ubld.OR(desc, orig_desc,
                                   elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE << 12)));
         set_predicate_inv(ELK_PREDICATE_NORMAL, true,
                           ubld.OR(desc, orig_desc,
                                   elk_imm_ud(GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET << 12)));
      }
   }

   assert(bld.shader->devinfo->ver >= 7);
   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->sfid = GFX7_SFID_PIXEL_INTERPOLATOR;
   inst->desc = desc_imm;
   inst->ex_desc = 0;
   inst->mlen = mlen;
   inst->ex_mlen = 0;
   inst->send_has_side_effects = false;
   inst->send_is_volatile = false;

   inst->resize_sources(3);
   inst->src[0] = component(desc, 0);
   inst->src[1] = elk_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
}

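/* Buffer sizes are queried with a sampler resinfo message, which reports
 * the surface dimensions; the consumer reconstructs the byte size from
 * the returned values.
 */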
static void
lower_get_buffer_size(const fs_builder &bld, elk_fs_inst *inst)
{
   const intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver >= 7);
   /* Since we can only execute this instruction on uniform bti/surface
    * handles, elk_fs_nir.cpp should already have limited this to SIMD8.
    */
   assert(inst->exec_size == (devinfo->ver < 20 ? 8 : 16));

   elk_fs_reg surface = inst->src[GET_BUFFER_SIZE_SRC_SURFACE];
   elk_fs_reg surface_handle = inst->src[GET_BUFFER_SIZE_SRC_SURFACE_HANDLE];
   elk_fs_reg lod = inst->src[GET_BUFFER_SIZE_SRC_LOD];

   inst->opcode = ELK_SHADER_OPCODE_SEND;
   inst->mlen = inst->exec_size / 8;
   inst->resize_sources(3);
   inst->ex_mlen = 0;
   inst->ex_desc = 0;

   /* src[0] & src[1] are filled by setup_surface_descriptors() */
   inst->src[2] = lod;

   const uint32_t return_format = devinfo->ver >= 8 ?
      GFX8_SAMPLER_RETURN_FORMAT_32BITS : ELK_SAMPLER_RETURN_FORMAT_SINT32;

   const uint32_t desc = elk_sampler_desc(devinfo, 0, 0,
                                          GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
                                          ELK_SAMPLER_SIMD_MODE_SIMD8,
                                          return_format);

   inst->dst = retype(inst->dst, ELK_REGISTER_TYPE_UW);
   inst->sfid = ELK_SFID_SAMPLER;
   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
}

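/* Walk the program and replace each *_LOGICAL opcode with the SEND (or
 * send-like virtual) instruction sequence appropriate for this generation.
 */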
bool
elk_fs_visitor::lower_logical_sends()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      switch (inst->opcode) {
      case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
         assert(stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     elk_wm_prog_data(prog_data),
                                     (const elk_wm_prog_key *)key,
                                     fs_payload());
         break;

      case ELK_FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TEX);
         break;

      case ELK_SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXD);
         break;

      case ELK_SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF);
         break;

      case ELK_SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXL);
         break;

      case ELK_SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXS);
         break;

      case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         lower_sampler_logical_send(ibld, inst,
                                    ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case ELK_FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_FS_OPCODE_TXB);
         break;

      case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS);
         break;

      case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
      case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_CMS_W);
         break;

      case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_UMS);
         break;

      case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TXF_MCS);
         break;

      case ELK_SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_LOD);
         break;

      case ELK_SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4);
         break;

      case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_TG4_OFFSET);
         break;

      case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, ELK_SHADER_OPCODE_SAMPLEINFO);
         break;

      case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
         lower_get_buffer_size(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_surface_logical_send(ibld, inst);
            break;
         }
         /* Fallthrough: without LSC these share the legacy surface path. */
      case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_block_logical_send(ibld, inst);
            break;
         }
         lower_surface_block_logical_send(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_a64_logical_send(ibld, inst);
            break;
         }
         lower_a64_logical_send(ibld, inst);
         break;

      case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
            lower_lsc_varying_pull_constant_logical_send(ibld, inst);
         else
            lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_RCP:
      case ELK_SHADER_OPCODE_RSQ:
      case ELK_SHADER_OPCODE_SQRT:
      case ELK_SHADER_OPCODE_EXP2:
      case ELK_SHADER_OPCODE_LOG2:
      case ELK_SHADER_OPCODE_SIN:
      case ELK_SHADER_OPCODE_COS:
      case ELK_SHADER_OPCODE_POW:
      case ELK_SHADER_OPCODE_INT_QUOTIENT:
      case ELK_SHADER_OPCODE_INT_REMAINDER:
         /* The math opcodes are overloaded for the send-like and
          * expression-like instructions which seems kind of icky.  Gfx6+ has
          * a native (but rather quirky) MATH instruction so we don't need to
          * do anything here.  On Gfx4-5 we'll have to lower the Gfx6-like
          * logical instructions (which we can easily recognize because they
          * have mlen = 0) into send-like virtual instructions.
          */
         if (devinfo->ver < 6 && inst->mlen == 0) {
            lower_math_logical_send(ibld, inst);
            break;
         } else {
            continue;
         }

      case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         lower_interpolator_logical_send(ibld, inst,
                                         (const elk_wm_prog_key *)key,
                                         elk_wm_prog_data(prog_data));
         break;

      case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_read_logical_send(ibld, inst);
         else
            lower_urb_read_logical_send_xe2(ibld, inst);
         break;

      case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
         if (devinfo->ver < 20)
            lower_urb_write_logical_send(ibld, inst);
         else
            lower_urb_write_logical_send_xe2(ibld, inst);
         break;

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * gives the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
bool
elk_fs_visitor::lower_uniform_pull_constant_loads()
{
   bool progress = false;

   foreach_block_and_inst (block, elk_fs_inst, inst, cfg) {
      if (inst->opcode != ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      const elk_fs_reg surface = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE];
      const elk_fs_reg surface_handle = inst->src[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE];
      const elk_fs_reg offset_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_OFFSET];
      const elk_fs_reg size_B = inst->src[PULL_UNIFORM_CONSTANT_SRC_SIZE];
      assert(surface.file == BAD_FILE || surface_handle.file == BAD_FILE);
      assert(offset_B.file == IMM);
      assert(size_B.file == IMM);

      if (devinfo->has_lsc) {
         const fs_builder ubld =
            fs_builder(this, block, inst).group(8, 0).exec_all();

         const elk_fs_reg payload = ubld.vgrf(ELK_REGISTER_TYPE_UD);
         ubld.MOV(payload, offset_B);

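         /* This is a transposed (block) LSC load: a single channel reads
          * size_written / 4 dwords starting at offset_B, so the execution
          * size collapses to 1 below.
          */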
         inst->sfid = GFX12_SFID_UGM;
         inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                   1 /* simd_size */,
                                   surface_handle.file == BAD_FILE ?
                                   LSC_ADDR_SURFTYPE_BTI :
                                   LSC_ADDR_SURFTYPE_BSS,
                                   LSC_ADDR_SIZE_A32,
                                   1 /* num_coordinates */,
                                   LSC_DATA_SIZE_D32,
                                   inst->size_written / 4,
                                   true /* transpose */,
                                   LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                   true /* has_dest */);

         /* Update the original instruction. */
         inst->opcode = ELK_SHADER_OPCODE_SEND;
         inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
         inst->send_ex_bso = surface_handle.file != BAD_FILE &&
                             compiler->extended_bindless_surface_offset;
         inst->ex_mlen = 0;
         inst->header_size = 0;
         inst->send_has_side_effects = false;
         inst->send_is_volatile = true;
         inst->exec_size = 1;

         /* Finally, the payload */
         inst->resize_sources(3);
         setup_lsc_surface_descriptors(ubld, inst, inst->desc,
                                       surface.file != BAD_FILE ?
                                       surface : surface_handle);
         inst->src[2] = payload;

         invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      } else if (devinfo->ver >= 7) {
         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
         elk_fs_reg header = fs_builder(this, 8).exec_all().vgrf(ELK_REGISTER_TYPE_UD);

         ubld.group(8, 0).MOV(header,
                              retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
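         /* OWord block reads take the offset in OWords (16 bytes) in the
          * third dword of the header, hence the division by 16.
          */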
         ubld.group(1, 0).MOV(component(header, 2),
                              elk_imm_ud(offset_B.ud / 16));

         inst->sfid = GFX6_SFID_DATAPORT_CONSTANT_CACHE;
         inst->opcode = ELK_SHADER_OPCODE_SEND;
         inst->header_size = 1;
         inst->mlen = 1;

         uint32_t desc =
            elk_dp_oword_block_rw_desc(devinfo, true /* align_16B */,
                                       size_B.ud / 4, false /* write */);

         inst->resize_sources(4);

         setup_surface_descriptors(ubld, inst, desc, surface, surface_handle);

         inst->src[2] = header;
         inst->src[3] = elk_fs_reg(); /* unused for reads */

         invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
      } else {
         assert(surface_handle.file == BAD_FILE);
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
         inst->mlen = 1;
      }

      progress = true;
   }

   return progress;
}