1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file brw_fs_generator.cpp
25  *
26  * This file supports generating code from the FS LIR to the actual
27  * native instructions.
28  */
29 
30 #include "brw_eu.h"
31 #include "brw_fs.h"
32 #include "brw_cfg.h"
33 #include "util/mesa-sha1.h"
34 #include "util/half_float.h"
35 
36 static enum brw_reg_file
37 brw_file_from_reg(fs_reg *reg)
38 {
39    switch (reg->file) {
40    case ARF:
41       return BRW_ARCHITECTURE_REGISTER_FILE;
42    case FIXED_GRF:
43    case VGRF:
44       return BRW_GENERAL_REGISTER_FILE;
45    case MRF:
46       return BRW_MESSAGE_REGISTER_FILE;
47    case IMM:
48       return BRW_IMMEDIATE_VALUE;
49    case BAD_FILE:
50    case ATTR:
51    case UNIFORM:
52       unreachable("not reached");
53    }
54    return BRW_ARCHITECTURE_REGISTER_FILE;
55 }
56 
57 static struct brw_reg
58 brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
59                     fs_reg *reg, bool compressed)
60 {
61    struct brw_reg brw_reg;
62 
63    switch (reg->file) {
64    case MRF:
65       assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
66       FALLTHROUGH;
67    case VGRF:
68       if (reg->stride == 0) {
69          brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
70       } else {
71          /* From the Haswell PRM:
72           *
73           *  "VertStride must be used to cross GRF register boundaries. This
74           *   rule implies that elements within a 'Width' cannot cross GRF
75           *   boundaries."
76           *
77           * The maximum width value that could satisfy this restriction is:
78           */
79          const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
80 
81          /* Because the hardware can only split source regions at a whole
82           * multiple of width during decompression (i.e. vertically), clamp
83           * the value obtained above to the physical execution size of a
84           * single decompressed chunk of the instruction:
85           */
86          const unsigned phys_width = compressed ? inst->exec_size / 2 :
87                                      inst->exec_size;
88 
89          const unsigned max_hw_width = 16;
90 
91          /* XXX - The equation above is strictly speaking not correct on
92           *       hardware that supports unbalanced GRF writes -- On Gfx9+
93           *       each decompressed chunk of the instruction may have a
94           *       different execution size when the number of components
95           *       written to each destination GRF is not the same.
96           */
97          if (reg->stride > 4) {
98             assert(reg != &inst->dst);
99             assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
100             brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
101             brw_reg = stride(brw_reg, reg->stride, 1, 0);
102          } else {
103             const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
104             brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
105             brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
106          }
107 
108          if (devinfo->verx10 == 70) {
109             /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
110              *  "Each DF (Double Float) operand uses an element size of 4 rather
111              *   than 8 and all regioning parameters are twice what the values
112              *   would be based on the true element size: ExecSize, Width,
113              *   HorzStride, and VertStride. Each DF operand uses a pair of
114              *   channels and all masking and swizzing should be adjusted
115              *   appropriately."
116              *
117              * From the IvyBridge PRM (Special Requirements for Handling Double
118              * Precision Data Types, page 71):
119              *  "In Align1 mode, all regioning parameters like stride, execution
120              *   size, and width must use the syntax of a pair of packed
121              *   floats. The offsets for these data types must be 64-bit
122              *   aligned. The execution size and regioning parameters are in terms
123              *   of floats."
124              *
125              * Summarized: when handling DF-typed arguments, ExecSize,
126              * VertStride, and Width must be doubled.
127              *
128              * It applies to BayTrail too.
129              */
130             if (type_sz(reg->type) == 8) {
131                brw_reg.width++;
132                if (brw_reg.vstride > 0)
133                   brw_reg.vstride++;
134                assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
135             }
136 
137             /* When converting from DF->F, we set the destination stride to 2
138              * because each d2f conversion implicitly writes 2 floats, the
139              * first one being the converted value. IVB/BYT actually writes two
140              * F components per SIMD channel, and every other component is
141              * filled with garbage.
142              */
143             if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
144                 type_sz(inst->dst.type) < 8) {
145                assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
146                brw_reg.hstride--;
147             }
148          }
149       }
150 
151       brw_reg = retype(brw_reg, reg->type);
152       brw_reg = byte_offset(brw_reg, reg->offset);
153       brw_reg.abs = reg->abs;
154       brw_reg.negate = reg->negate;
155       break;
156    case ARF:
157    case FIXED_GRF:
158    case IMM:
159       assert(reg->offset == 0);
160       brw_reg = reg->as_brw_reg();
161       break;
162    case BAD_FILE:
163       /* Probably unused. */
164       brw_reg = brw_null_reg();
165       break;
166    case ATTR:
167    case UNIFORM:
168       unreachable("not reached");
169    }
170 
171    /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
172     * region, but on IVB and BYT DF regions must be programmed in terms of
173     * floats. A <0,2,1> region accomplishes this.
174     */
175    if (devinfo->verx10 == 70 &&
176        type_sz(reg->type) == 8 &&
177        brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
178        brw_reg.width == BRW_WIDTH_1 &&
179        brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
180       brw_reg.width = BRW_WIDTH_2;
181       brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
182    }
183 
184    return brw_reg;
185 }
186 
187 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
188                            void *mem_ctx,
189                            struct brw_stage_prog_data *prog_data,
190                            bool runtime_check_aads_emit,
191                            gl_shader_stage stage)
192 
193    : compiler(compiler), log_data(log_data),
194      devinfo(compiler->devinfo),
195      prog_data(prog_data), dispatch_width(0),
196      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
197      shader_name(NULL), stage(stage), mem_ctx(mem_ctx)
198 {
199    p = rzalloc(mem_ctx, struct brw_codegen);
200    brw_init_codegen(&compiler->isa, p, mem_ctx);
201 
202    /* In the FS code generator, we are very careful to ensure that we always
203     * set the right execution size so we don't need the EU code to "help" us
204     * by trying to infer it.  Sometimes, it infers the wrong thing.
205     */
206    p->automatic_exec_sizes = false;
207 }
208 
209 fs_generator::~fs_generator()
210 {
211 }
212 
213 class ip_record : public exec_node {
214 public:
215    DECLARE_RALLOC_CXX_OPERATORS(ip_record)
216 
217    ip_record(int ip)
218    {
219       this->ip = ip;
220    }
221 
222    int ip;
223 };
224 
225 bool
226 fs_generator::patch_halt_jumps()
227 {
228    if (this->discard_halt_patches.is_empty())
229       return false;
230 
231    int scale = brw_jump_scale(p->devinfo);
232 
233    if (devinfo->ver >= 6) {
234       /* There is a somewhat strange undocumented requirement of using
235        * HALT, according to the simulator.  If some channel has HALTed to
236        * a particular UIP, then by the end of the program, every channel
237        * must have HALTed to that UIP.  Furthermore, the tracking is a
238        * stack, so you can't do the final halt of a UIP after starting
239        * halting to a new UIP.
240        *
241        * Symptoms of not emitting this instruction on actual hardware
242        * included GPU hangs and sparkly rendering on the piglit discard
243        * tests.
244        */
245       brw_inst *last_halt = brw_HALT(p);
246       brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
247       brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
248    }
249 
250    int ip = p->nr_insn;
251 
252    foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
253       brw_inst *patch = &p->store[patch_ip->ip];
254 
255       assert(brw_inst_opcode(p->isa, patch) == BRW_OPCODE_HALT);
256       if (devinfo->ver >= 6) {
257          /* HALT takes a half-instruction distance from the pre-incremented IP. */
258          brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
259       } else {
260          brw_set_src1(p, patch, brw_imm_d((ip - patch_ip->ip) * scale));
261       }
262    }
263 
264    this->discard_halt_patches.make_empty();
265 
266    if (devinfo->ver < 6) {
267       /* From the g965 PRM:
268        *
269        *    "As DMask is not automatically reloaded into AMask upon completion
270        *    of this instruction, software has to manually restore AMask upon
271        *    completion."
272        *
273        * DMask lives in the bottom 16 bits of sr0.1.
274        */
275       brw_inst *reset = brw_MOV(p, brw_mask_reg(BRW_AMASK),
276                                    retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW));
277       brw_inst_set_exec_size(devinfo, reset, BRW_EXECUTE_1);
278       brw_inst_set_mask_control(devinfo, reset, BRW_MASK_DISABLE);
279       brw_inst_set_qtr_control(devinfo, reset, BRW_COMPRESSION_NONE);
280       brw_inst_set_thread_control(devinfo, reset, BRW_THREAD_SWITCH);
281    }
282 
283    if (devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X) {
284       /* From the g965 PRM:
285        *
286        *    "[DevBW, DevCL] Erratum: The subfields in mask stack register are
287        *    reset to zero during graphics reset, however, they are not
288        *    initialized at thread dispatch. These subfields will retain the
289        *    values from the previous thread. Software should make sure the
290        *    mask stack is empty (reset to zero) before terminating the thread.
291        *    In case that this is not practical, software may have to reset the
292        *    mask stack at the beginning of each kernel, which will impact the
293        *    performance."
294        *
295        * Luckily we can rely on:
296        *
297        *    "[DevBW, DevCL] This register access restriction is not
298        *    applicable, hardware does ensure execution pipeline coherency,
299        *    when a mask stack register is used as an explicit source and/or
300        *    destination."
301        */
302       brw_push_insn_state(p);
303       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
304       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
305 
306       brw_set_default_exec_size(p, BRW_EXECUTE_2);
307       brw_MOV(p, vec2(brw_mask_stack_depth_reg(0)), brw_imm_uw(0));
308 
309       brw_set_default_exec_size(p, BRW_EXECUTE_16);
310       /* Reset the if stack. */
311       brw_MOV(p, retype(brw_mask_stack_reg(0), BRW_REGISTER_TYPE_UW),
312               brw_imm_uw(0));
313 
314       brw_pop_insn_state(p);
315    }
316 
317    return true;
318 }
319 
320 void
321 fs_generator::generate_send(fs_inst *inst,
322                             struct brw_reg dst,
323                             struct brw_reg desc,
324                             struct brw_reg ex_desc,
325                             struct brw_reg payload,
326                             struct brw_reg payload2)
327 {
328    const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE &&
329                             dst.nr == BRW_ARF_NULL;
330    const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE;
331 
332    uint32_t desc_imm = inst->desc |
333       brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
334 
335    uint32_t ex_desc_imm = inst->ex_desc |
336       brw_message_ex_desc(devinfo, inst->ex_mlen);
337 
338    if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
339       /* If we have any sort of extended descriptor, then we need SENDS.  This
340        * also covers the dual-payload case because ex_mlen goes in ex_desc.
341        */
342       brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
343                                       desc, desc_imm, ex_desc, ex_desc_imm,
344                                       inst->eot);
345       if (inst->check_tdr)
346          brw_inst_set_opcode(p->isa, brw_last_inst,
347                              devinfo->ver >= 12 ? BRW_OPCODE_SENDC : BRW_OPCODE_SENDSC);
348    } else {
349       brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
350                                    inst->eot);
351       if (inst->check_tdr)
352          brw_inst_set_opcode(p->isa, brw_last_inst, BRW_OPCODE_SENDC);
353    }
354 }
355 
356 void
357 fs_generator::fire_fb_write(fs_inst *inst,
358                             struct brw_reg payload,
359                             struct brw_reg implied_header,
360                             GLuint nr)
361 {
362    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
363 
364    if (devinfo->ver < 6) {
365       brw_push_insn_state(p);
366       brw_set_default_exec_size(p, BRW_EXECUTE_8);
367       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
368       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
369       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
370       brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1),
371               offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1));
372       brw_pop_insn_state(p);
373    }
374 
375    uint32_t msg_control = brw_fb_write_msg_control(inst, prog_data);
376 
377    /* We assume render targets start at 0, because headerless FB write
378     * messages set "Render Target Index" to 0.  Using a different binding
379     * table index would make it impossible to use headerless messages.
380     */
381    const uint32_t surf_index = inst->target;
382 
383    brw_inst *insn = brw_fb_WRITE(p,
384                                  payload,
385                                  retype(implied_header, BRW_REGISTER_TYPE_UW),
386                                  msg_control,
387                                  surf_index,
388                                  nr,
389                                  0,
390                                  inst->eot,
391                                  inst->last_rt,
392                                  inst->header_size != 0);
393 
394    if (devinfo->ver >= 6)
395       brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16);
396 }
397 
398 void
399 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
400 {
401    if (devinfo->verx10 <= 70) {
402       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
403       brw_set_default_flag_reg(p, 0, 0);
404    }
405 
406    const struct brw_reg implied_header =
407       devinfo->ver < 6 ? payload : brw_null_reg();
408 
409    if (inst->base_mrf >= 0)
410       payload = brw_message_reg(inst->base_mrf);
411 
412    if (!runtime_check_aads_emit) {
413       fire_fb_write(inst, payload, implied_header, inst->mlen);
414    } else {
415       /* This can only happen in gen < 6 */
416       assert(devinfo->ver < 6);
417 
418       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
419 
420       /* Check runtime bit to detect if we have to send AA data or not */
421       brw_push_insn_state(p);
422       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
423       brw_set_default_exec_size(p, BRW_EXECUTE_1);
424       brw_AND(p,
425               v1_null_ud,
426               retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
427               brw_imm_ud(1<<26));
428       brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
429 
430       int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
431       brw_pop_insn_state(p);
432       {
433          /* Don't send AA data */
434          fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
435       }
436       brw_land_fwd_jump(p, jmp);
437       fire_fb_write(inst, payload, implied_header, inst->mlen);
438    }
439 }
440 
441 void
442 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
443                                struct brw_reg payload)
444 {
445    assert(inst->size_written % REG_SIZE == 0);
446    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
447    /* We assume that render targets start at binding table index 0. */
448    const unsigned surf_index = inst->target;
449 
450    gfx9_fb_READ(p, dst, payload, surf_index,
451                 inst->header_size, inst->size_written / REG_SIZE,
452                 prog_data->persample_dispatch);
453 }
454 
455 void
456 fs_generator::generate_mov_indirect(fs_inst *inst,
457                                     struct brw_reg dst,
458                                     struct brw_reg reg,
459                                     struct brw_reg indirect_byte_offset)
460 {
461    assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
462    assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
463    assert(!reg.abs && !reg.negate);
464    assert(reg.type == dst.type);
465 
466    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
467 
468    if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
469       imm_byte_offset += indirect_byte_offset.ud;
470 
471       reg.nr = imm_byte_offset / REG_SIZE;
472       reg.subnr = imm_byte_offset % REG_SIZE;
473       if (type_sz(reg.type) > 4 && !devinfo->has_64bit_float) {
474          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
475                     subscript(reg, BRW_REGISTER_TYPE_D, 0));
476          brw_set_default_swsb(p, tgl_swsb_null());
477          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
478                     subscript(reg, BRW_REGISTER_TYPE_D, 1));
479       } else {
480          brw_MOV(p, dst, reg);
481       }
482    } else {
483       /* Prior to Broadwell, there are only 8 address registers. */
484       assert(inst->exec_size <= 8 || devinfo->ver >= 8);
485 
486       /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
487       struct brw_reg addr = vec8(brw_address_reg(0));
488 
489       /* Whether we can use destination dependency control without running the
490        * risk of a hang if an instruction gets shot down.
491        */
492       const bool use_dep_ctrl = !inst->predicate &&
493                                 inst->exec_size == dispatch_width;
494       brw_inst *insn;
495 
496       /* The destination stride of an instruction (in bytes) must be greater
497        * than or equal to the size of the rest of the instruction.  Since the
498        * address register is of type UW, we can't use a D-type instruction.
499        * In order to get around this, we retype to UW and use a stride.
500        */
501       indirect_byte_offset =
502          retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
503 
504       /* There are a number of reasons why we don't use the base offset here.
505        * One reason is that the field is only 9 bits which means we can only
506        * use it to access the first 16 GRFs.  Also, from the Haswell PRM
507        * section "Register Region Restrictions":
508        *
509        *    "The lower bits of the AddressImmediate must not overflow to
510        *    change the register address.  The lower 5 bits of Address
511        *    Immediate when added to lower 5 bits of address register gives
512        *    the sub-register offset. The upper bits of Address Immediate
513        *    when added to upper bits of address register gives the register
514        *    address. Any overflow from sub-register offset is dropped."
515        *
516        * Since the indirect may cause us to cross a register boundary, this
517        * makes the base offset almost useless.  We could try and do something
518        * clever where we use an actual base offset if base_offset % 32 == 0 but
519        * that would mean we were generating different code depending on the
520        * base offset.  Instead, for the sake of consistency, we'll just do the
521        * add ourselves.  This restriction is only listed in the Haswell PRM
522        * but empirical testing indicates that it applies on all older
523        * generations and is lifted on Broadwell.
524        *
525        * In the end, while base_offset is nice to look at in the generated
526        * code, using it saves us 0 instructions and would require quite a bit
527        * of case-by-case work.  It's just not worth it.
528        *
529        * Due to a hardware bug some platforms (particularly Gfx11+) seem to
530        * require the address components of all channels to be valid whether or
531        * not they're active, which causes issues if we use VxH addressing
532        * under non-uniform control-flow.  We can easily work around that by
533        * initializing the whole address register with a pipelined NoMask MOV
534        * instruction.
535        */
536       if (devinfo->ver >= 7) {
537          insn = brw_MOV(p, addr, brw_imm_uw(imm_byte_offset));
538          brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
539          brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
540          if (devinfo->ver >= 12)
541             brw_set_default_swsb(p, tgl_swsb_null());
542          else
543             brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
544       }
545 
546       insn = brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
547       if (devinfo->ver >= 12)
548          brw_set_default_swsb(p, tgl_swsb_regdist(1));
549       else if (devinfo->ver >= 7)
550          brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
551 
552       if (type_sz(reg.type) > 4 &&
553           ((devinfo->verx10 == 70) ||
554            devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) ||
555            !devinfo->has_64bit_float || devinfo->verx10 >= 125)) {
556          /* IVB has an issue (which we found empirically) where it reads two
557           * address register components per channel for indirectly addressed
558           * 64-bit sources.
559           *
560           * From the Cherryview PRM Vol 7. "Register Region Restrictions":
561           *
562           *    "When source or destination datatype is 64b or operation is
563           *    integer DWord multiply, indirect addressing must not be used."
564           *
565           * To work around both of these, we do two integer MOVs instead of one
566           * 64-bit MOV.  Because no double value should ever cross a register
567           * boundary, it's safe to use the immediate offset in the indirect
568           * here to handle adding 4 bytes to the offset and avoid the extra
569           * ADD to the register file.
570           */
571          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
572                     retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
573          brw_set_default_swsb(p, tgl_swsb_null());
574          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
575                     retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
576       } else {
577          struct brw_reg ind_src = brw_VxH_indirect(0, 0);
578 
579          brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
580 
581          if (devinfo->ver == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
582              !inst->get_next()->is_tail_sentinel() &&
583              ((fs_inst *)inst->get_next())->mlen > 0) {
584             /* From the Sandybridge PRM:
585              *
586              *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
587              *    instruction that “indexed/indirect” source AND is followed
588              *    by a send, the instruction requires a “Switch”. This is to
589              *    avoid race condition where send may dispatch before MRF is
590              *    updated."
591              */
592             brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
593          }
594       }
595    }
596 }
597 
598 void
599 fs_generator::generate_shuffle(fs_inst *inst,
600                                struct brw_reg dst,
601                                struct brw_reg src,
602                                struct brw_reg idx)
603 {
604    assert(src.file == BRW_GENERAL_REGISTER_FILE);
605    assert(!src.abs && !src.negate);
606 
607    /* Ivy bridge has some strange behavior that makes this a real pain to
608     * implement for 64-bit values so we just don't bother.
609     */
610    assert((devinfo->verx10 >= 75 && devinfo->has_64bit_float) ||
611           type_sz(src.type) <= 4);
612 
613    /* Because we're using the address register, we're limited to 8-wide
614     * execution on gfx7.  On gfx8, we're limited to 16-wide by the address
615     * register file and 8-wide for 64-bit types.  We could try and make this
616     * instruction splittable higher up in the compiler but that gets weird
617     * because it reads all of the channels regardless of execution size.  It's
618     * easier just to split it here.
619     */
620    const unsigned lower_width =
621       devinfo->ver <= 7 || element_sz(src) > 4 || element_sz(dst) > 4 ? 8 :
622       MIN2(16, inst->exec_size);
623 
624    brw_set_default_exec_size(p, cvt(lower_width) - 1);
625    for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
626       brw_set_default_group(p, group);
627 
628       if ((src.vstride == 0 && src.hstride == 0) ||
629           idx.file == BRW_IMMEDIATE_VALUE) {
630          /* Trivial, the source is already uniform or the index is a constant.
631           * We will typically not get here if the optimizer is doing its job,
632           * but asserting would be mean.
633           */
634          const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
635          struct brw_reg group_src = stride(suboffset(src, i), 0, 1, 0);
636          struct brw_reg group_dst = suboffset(dst, group << (dst.hstride - 1));
637          brw_MOV(p, group_dst, group_src);
638       } else {
639          /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
640          struct brw_reg addr = vec8(brw_address_reg(0));
641 
642          struct brw_reg group_idx = suboffset(idx, group);
643 
644          if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
645             /* Things get grumpy if the register is too wide. */
646             group_idx.width--;
647             group_idx.vstride--;
648          }
649 
650          assert(type_sz(group_idx.type) <= 4);
651          if (type_sz(group_idx.type) == 4) {
652             /* The destination stride of an instruction (in bytes) must be
653              * greater than or equal to the size of the rest of the
654              * instruction.  Since the address register is of type UW, we
655              * can't use a D-type instruction.  In order to get around this,
656              * we retype to UW and use a stride.
657              */
658             group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
659          }
660 
661          uint32_t src_start_offset = src.nr * REG_SIZE + src.subnr;
662 
663          /* From the Haswell PRM:
664           *
665           *    "When a sequence of NoDDChk and NoDDClr are used, the last
666           *    instruction that completes the scoreboard clear must have a
667           *    non-zero execution mask. This means, if any kind of predication
668           *    can change the execution mask or channel enable of the last
669           *    instruction, the optimization must be avoided.  This is to
670           *    avoid instructions being shot down the pipeline when no writes
671           *    are required."
672           *
673           * Whenever predication is enabled or the instructions being emitted
674           * aren't the full width, it's possible that it will be run with zero
675           * channels enabled so we can't use dependency control without
676           * running the risk of a hang if an instruction gets shot down.
677           */
678          const bool use_dep_ctrl = !inst->predicate &&
679                                    lower_width == dispatch_width;
680          brw_inst *insn;
681 
682          /* Due to a hardware bug some platforms (particularly Gfx11+) seem
683           * to require the address components of all channels to be valid
684           * whether or not they're active, which causes issues if we use VxH
685           * addressing under non-uniform control-flow.  We can easily work
686           * around that by initializing the whole address register with a
687           * pipelined NoMask MOV instruction.
688           */
689          insn = brw_MOV(p, addr, brw_imm_uw(src_start_offset));
690          brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
691          brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
692          if (devinfo->ver >= 12)
693             brw_set_default_swsb(p, tgl_swsb_null());
694          else
695             brw_inst_set_no_dd_clear(devinfo, insn, use_dep_ctrl);
696 
697          /* Take into account the component size and horizontal stride. */
698          assert(src.vstride == src.hstride + src.width);
699          insn = brw_SHL(p, addr, group_idx,
700                         brw_imm_uw(util_logbase2(type_sz(src.type)) +
701                                    src.hstride - 1));
702          if (devinfo->ver >= 12)
703             brw_set_default_swsb(p, tgl_swsb_regdist(1));
704          else
705             brw_inst_set_no_dd_check(devinfo, insn, use_dep_ctrl);
706 
707          /* Add on the register start offset */
708          brw_ADD(p, addr, addr, brw_imm_uw(src_start_offset));
709          brw_MOV(p, suboffset(dst, group << (dst.hstride - 1)),
710                  retype(brw_VxH_indirect(0, 0), src.type));
711       }
712 
713       brw_set_default_swsb(p, tgl_swsb_null());
714    }
715 }
716 
717 void
718 fs_generator::generate_quad_swizzle(const fs_inst *inst,
719                                     struct brw_reg dst, struct brw_reg src,
720                                     unsigned swiz)
721 {
722    /* Requires a quad. */
723    assert(inst->exec_size >= 4);
724 
725    if (src.file == BRW_IMMEDIATE_VALUE ||
726        has_scalar_region(src)) {
727       /* The value is uniform across all channels */
728       brw_MOV(p, dst, src);
729 
730    } else if (devinfo->ver < 11 && type_sz(src.type) == 4) {
731       /* This only works on 8-wide 32-bit values */
732       assert(inst->exec_size == 8);
733       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
734       assert(src.vstride == src.width + 1);
735       brw_set_default_access_mode(p, BRW_ALIGN_16);
736       struct brw_reg swiz_src = stride(src, 4, 4, 1);
737       swiz_src.swizzle = swiz;
738       brw_MOV(p, dst, swiz_src);
739 
740    } else {
741       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
742       assert(src.vstride == src.width + 1);
743       const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
744 
745       switch (swiz) {
746       case BRW_SWIZZLE_XXXX:
747       case BRW_SWIZZLE_YYYY:
748       case BRW_SWIZZLE_ZZZZ:
749       case BRW_SWIZZLE_WWWW:
750          brw_MOV(p, dst, stride(src_0, 4, 4, 0));
751          break;
752 
753       case BRW_SWIZZLE_XXZZ:
754       case BRW_SWIZZLE_YYWW:
755          brw_MOV(p, dst, stride(src_0, 2, 2, 0));
756          break;
757 
758       case BRW_SWIZZLE_XYXY:
759       case BRW_SWIZZLE_ZWZW:
760          assert(inst->exec_size == 4);
761          brw_MOV(p, dst, stride(src_0, 0, 2, 1));
762          break;
763 
764       default:
765          assert(inst->force_writemask_all);
766          brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
767 
768          for (unsigned c = 0; c < 4; c++) {
769             brw_inst *insn = brw_MOV(
770                p, stride(suboffset(dst, c),
771                          4 * inst->dst.stride, 1, 4 * inst->dst.stride),
772                stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
773 
774             if (devinfo->ver < 12) {
775                brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
776                brw_inst_set_no_dd_check(devinfo, insn, c > 0);
777             }
778 
779             brw_set_default_swsb(p, tgl_swsb_null());
780          }
781 
782          break;
783       }
784    }
785 }
786 
787 void
788 fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
789 {
790    struct brw_inst *insn;
791 
792    insn = brw_next_insn(p, BRW_OPCODE_SEND);
793 
794    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
795    brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
796    if (devinfo->ver < 12)
797       brw_set_src1(p, insn, brw_imm_ud(0u));
798 
799    /* For XeHP and newer send a message to the message gateway to terminate a
800     * compute shader. For older devices, a message is sent to the thread
801     * spawner.
802     */
803    if (devinfo->verx10 >= 125)
804       brw_inst_set_sfid(devinfo, insn, BRW_SFID_MESSAGE_GATEWAY);
805    else
806       brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
807    brw_inst_set_mlen(devinfo, insn, 1);
808    brw_inst_set_rlen(devinfo, insn, 0);
809    brw_inst_set_eot(devinfo, insn, inst->eot);
810    brw_inst_set_header_present(devinfo, insn, false);
811 
812    brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
813 
814    if (devinfo->ver < 11) {
815       brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
816 
817       /* Note that even though the thread has a URB resource associated with it,
818        * we set the "do not dereference URB" bit, because the URB resource is
819        * managed by the fixed-function unit, so it will free it automatically.
820        */
821       brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
822    }
823 
824    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
825 }
826 
827 void
828 fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
829 {
830    brw_barrier(p, src);
831    if (devinfo->ver >= 12) {
832       brw_set_default_swsb(p, tgl_swsb_null());
833       brw_SYNC(p, TGL_SYNC_BAR);
834    } else {
835       brw_WAIT(p);
836    }
837 }
838 
839 bool
840 fs_generator::generate_linterp(fs_inst *inst,
841                                struct brw_reg dst, struct brw_reg *src)
842 {
843    /* PLN reads:
844     *                      /   in SIMD16   \
845     *    -----------------------------------
846     *   | src1+0 | src1+1 | src1+2 | src1+3 |
847     *   |-----------------------------------|
848     *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
849     *    -----------------------------------
850     *
851     * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
852     *
853     *    -----------------------------------
854     *   | src1+0 | src1+1 | src1+2 | src1+3 |
855     *   |-----------------------------------|
856     *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
857     *   |-----------------------------------|
858     *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
859     *    -----------------------------------
860     *
861     * See also: emit_interpolation_setup_gfx4().
862     */
863    struct brw_reg delta_x = src[0];
864    struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
865    struct brw_reg interp = src[1];
866    brw_inst *i[2];
867 
868    /* nir_lower_interpolation() will do the lowering to MAD instructions for
869     * us on gfx11+
870     */
871    assert(devinfo->ver < 11);
872 
873    if (devinfo->has_pln) {
874       if (devinfo->ver <= 6 && (delta_x.nr & 1) != 0) {
875          /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
876           *
877           *    "[DevSNB]:<src1> must be even register aligned."
878           *
879           * This restriction is lifted on Ivy Bridge.
880           *
881           * This means that we need to split PLN into LINE+MAC on-the-fly.
882           * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so
883           * we have to split into SIMD8 pieces.  For gfx4 (!has_pln), the
884           * coordinate registers are laid out differently so we leave it as a
885           * SIMD16 instruction.
886           */
887          assert(inst->exec_size == 8 || inst->exec_size == 16);
888          assert(inst->group % 16 == 0);
889 
890          brw_push_insn_state(p);
891          brw_set_default_exec_size(p, BRW_EXECUTE_8);
892 
893          /* Thanks to two accumulators, we can emit all the LINEs and then all
894           * the MACs.  This improves parallelism a bit.
895           */
896          for (unsigned g = 0; g < inst->exec_size / 8; g++) {
897             brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
898                                       offset(delta_x, g * 2));
899             brw_inst_set_group(devinfo, line, inst->group + g * 8);
900 
901             /* LINE writes the accumulator automatically on gfx4-5.  On Sandy
902              * Bridge and later, we have to explicitly enable it.
903              */
904             if (devinfo->ver >= 6)
905                brw_inst_set_acc_wr_control(p->devinfo, line, true);
906 
907             /* brw_set_default_saturate() is called before emitting
908              * instructions, so the saturate bit is set in each instruction,
909              * so we need to unset it on the LINE instructions.
910              */
911             brw_inst_set_saturate(p->devinfo, line, false);
912          }
913 
914          for (unsigned g = 0; g < inst->exec_size / 8; g++) {
915             brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1),
916                                     offset(delta_x, g * 2 + 1));
917             brw_inst_set_group(devinfo, mac, inst->group + g * 8);
918             brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod);
919          }
920 
921          brw_pop_insn_state(p);
922 
923          return true;
924       } else {
925          brw_PLN(p, dst, interp, delta_x);
926 
927          return false;
928       }
929    } else {
930       i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
931       i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);
932 
933       brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
934 
935       /* brw_set_default_saturate() is called before emitting instructions, so
936        * the saturate bit is set in each instruction, so we need to unset it on
937        * the first instruction.
938        */
939       brw_inst_set_saturate(p->devinfo, i[0], false);
940 
941       return true;
942    }
943 }
944 
945 void
946 fs_generator::generate_get_buffer_size(fs_inst *inst,
947                                        struct brw_reg dst,
948                                        struct brw_reg src,
949                                        struct brw_reg surf_index)
950 {
951    assert(devinfo->ver >= 7);
952    assert(surf_index.file == BRW_IMMEDIATE_VALUE);
953 
954    uint32_t simd_mode;
955    int rlen = 4;
956 
957    switch (inst->exec_size) {
958    case 8:
959       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
960       break;
961    case 16:
962       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
963       break;
964    default:
965       unreachable("Invalid width for texture instruction");
966    }
967 
968    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
969       rlen = 8;
970       dst = vec16(dst);
971    }
972 
973    uint32_t return_format =
974       devinfo->ver >= 8 ? GFX8_SAMPLER_RETURN_FORMAT_32BITS :
975                           BRW_SAMPLER_RETURN_FORMAT_SINT32;
976    brw_SAMPLE(p,
977               retype(dst, BRW_REGISTER_TYPE_UW),
978               inst->base_mrf,
979               src,
980               surf_index.ud,
981               0,
982               GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
983               rlen, /* response length */
984               inst->mlen,
985               inst->header_size > 0,
986               simd_mode,
987               return_format);
988 }
989 
990 void
991 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst,
992                            struct brw_reg surface_index,
993                            struct brw_reg sampler_index)
994 {
995    assert(devinfo->ver < 7);
996    assert(inst->size_written % REG_SIZE == 0);
997    int msg_type = -1;
998    uint32_t simd_mode;
999    uint32_t return_format;
1000 
1001    /* Sampler EOT message of less than the dispatch width would kill the
1002     * thread prematurely.
1003     */
1004    assert(!inst->eot || inst->exec_size == dispatch_width);
1005 
1006    switch (dst.type) {
1007    case BRW_REGISTER_TYPE_D:
1008       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
1009       break;
1010    case BRW_REGISTER_TYPE_UD:
1011       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
1012       break;
1013    default:
1014       return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1015       break;
1016    }
1017 
1018    /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
1019     * is set as part of the message descriptor.  On gfx4, the PRM seems to
1020     * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
1021     * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
1022     * gone from the message descriptor entirely and you just get UINT32 all
1023     * the time regardless.  Since we can really only do non-UINT32 on gfx4,
1024     * just stomp it to UINT32 all the time.
1025     */
1026    if (inst->opcode == SHADER_OPCODE_TXS)
1027       return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
1028 
1029    switch (inst->exec_size) {
1030    case 8:
1031       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1032       break;
1033    case 16:
1034       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1035       break;
1036    default:
1037       unreachable("Invalid width for texture instruction");
1038    }
1039 
1040    if (devinfo->ver >= 5) {
1041       switch (inst->opcode) {
1042       case SHADER_OPCODE_TEX:
1043 	 if (inst->shadow_compare) {
1044 	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
1045 	 } else {
1046 	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE;
1047 	 }
1048 	 break;
1049       case FS_OPCODE_TXB:
1050 	 if (inst->shadow_compare) {
1051 	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
1052 	 } else {
1053 	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
1054 	 }
1055 	 break;
1056       case SHADER_OPCODE_TXL:
1057 	 if (inst->shadow_compare) {
1058 	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
1059 	 } else {
1060 	    msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
1061 	 }
1062 	 break;
1063       case SHADER_OPCODE_TXS:
1064 	 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
1065 	 break;
1066       case SHADER_OPCODE_TXD:
1067          assert(!inst->shadow_compare);
1068          msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
1069 	 break;
1070       case SHADER_OPCODE_TXF:
1071 	 msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
1072 	 break;
1073       case SHADER_OPCODE_TXF_CMS:
1074          msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
1075          break;
1076       case SHADER_OPCODE_LOD:
1077          msg_type = GFX5_SAMPLER_MESSAGE_LOD;
1078          break;
1079       case SHADER_OPCODE_TG4:
1080          assert(devinfo->ver == 6);
1081          assert(!inst->shadow_compare);
1082          msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
1083          break;
1084       case SHADER_OPCODE_SAMPLEINFO:
1085          msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
1086          break;
1087       default:
1088 	 unreachable("not reached");
1089       }
1090    } else {
1091       switch (inst->opcode) {
1092       case SHADER_OPCODE_TEX:
1093 	 /* Note that G45 and older determines shadow compare and dispatch width
1094 	  * from message length for most messages.
1095 	  */
1096          if (inst->exec_size == 8) {
1097             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
1098             if (inst->shadow_compare) {
1099                assert(inst->mlen == 6);
1100             } else {
1101                assert(inst->mlen <= 4);
1102             }
1103          } else {
1104             if (inst->shadow_compare) {
1105                msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
1106                assert(inst->mlen == 9);
1107             } else {
1108                msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
1109                assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
1110             }
1111          }
1112 	 break;
1113       case FS_OPCODE_TXB:
1114 	 if (inst->shadow_compare) {
1115             assert(inst->exec_size == 8);
1116 	    assert(inst->mlen == 6);
1117 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
1118 	 } else {
1119 	    assert(inst->mlen == 9);
1120 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
1121 	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1122 	 }
1123 	 break;
1124       case SHADER_OPCODE_TXL:
1125 	 if (inst->shadow_compare) {
1126             assert(inst->exec_size == 8);
1127 	    assert(inst->mlen == 6);
1128 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
1129 	 } else {
1130 	    assert(inst->mlen == 9);
1131 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
1132 	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1133 	 }
1134 	 break;
1135       case SHADER_OPCODE_TXD:
1136 	 /* There is no sample_d_c message; comparisons are done manually */
1137          assert(inst->exec_size == 8);
1138 	 assert(inst->mlen == 7 || inst->mlen == 10);
1139 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
1140 	 break;
1141       case SHADER_OPCODE_TXF:
1142          assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
1143 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1144 	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1145 	 break;
1146       case SHADER_OPCODE_TXS:
1147 	 assert(inst->mlen == 3);
1148 	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
1149 	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1150 	 break;
1151       default:
1152 	 unreachable("not reached");
1153       }
1154    }
1155    assert(msg_type != -1);
1156 
1157    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
1158       dst = vec16(dst);
1159    }
1160 
1161    assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
1162 
1163    /* Load the message header if present.  If there's a texture offset,
1164     * we need to set it up explicitly and load the offset bitfield.
1165     * Otherwise, we can use an implied move from g0 to the first message reg.
1166     */
1167    struct brw_reg src = brw_null_reg();
1168    if (inst->header_size != 0) {
1169       if (devinfo->ver < 6 && !inst->offset) {
1170          /* Set up an implied move from g0 to the MRF. */
1171          src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
1172       } else {
1173          const tgl_swsb swsb = brw_get_default_swsb(p);
1174          assert(inst->base_mrf != -1);
1175          struct brw_reg header_reg = brw_message_reg(inst->base_mrf);
1176 
1177          brw_push_insn_state(p);
1178          brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1179          brw_set_default_exec_size(p, BRW_EXECUTE_8);
1180          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1181          brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1182          /* Explicitly set up the message header by copying g0 to the MRF. */
1183          brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
1184          brw_set_default_swsb(p, tgl_swsb_regdist(1));
1185 
1186          brw_set_default_exec_size(p, BRW_EXECUTE_1);
1187          if (inst->offset) {
1188             /* Set the offset bits in DWord 2. */
1189             brw_MOV(p, get_element_ud(header_reg, 2),
1190                        brw_imm_ud(inst->offset));
1191          }
1192 
1193          brw_pop_insn_state(p);
1194          brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1195       }
1196    }
1197 
1198    assert(surface_index.file == BRW_IMMEDIATE_VALUE);
1199    assert(sampler_index.file == BRW_IMMEDIATE_VALUE);
1200 
1201    brw_SAMPLE(p,
1202               retype(dst, BRW_REGISTER_TYPE_UW),
1203               inst->base_mrf,
1204               src,
1205               surface_index.ud,
1206               sampler_index.ud % 16,
1207               msg_type,
1208               inst->size_written / REG_SIZE,
1209               inst->mlen,
1210               inst->header_size != 0,
1211               simd_mode,
1212               return_format);
1213 }
1214 
1215 
1216 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1217  * looking like:
1218  *
1219  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1220  *
1221  * Ideally, we want to produce:
1222  *
1223  *           DDX                     DDY
1224  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1225  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1226  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1227  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1228  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1229  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1230  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1231  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1232  *
1233  * and add another set of two more subspans if in 16-pixel dispatch mode.
1234  *
1235  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1236  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1237  * pair.  But the ideal approximation may impose a huge performance cost on
1238  * sample_d.  On at least Haswell, sample_d instruction does some
1239  * optimizations if the same LOD is used for all pixels in the subspan.
1240  *
1241  * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1242  * appropriate swizzling.
1243  */
1244 void
1245 fs_generator::generate_ddx(const fs_inst *inst,
1246                            struct brw_reg dst, struct brw_reg src)
1247 {
1248    unsigned vstride, width;
1249 
1250    if (devinfo->ver >= 8) {
1251       if (inst->opcode == FS_OPCODE_DDX_FINE) {
1252          /* produce accurate derivatives */
1253          vstride = BRW_VERTICAL_STRIDE_2;
1254          width = BRW_WIDTH_2;
1255       } else {
1256          /* replicate the derivative at the top-left pixel to other pixels */
1257          vstride = BRW_VERTICAL_STRIDE_4;
1258          width = BRW_WIDTH_4;
1259       }
1260 
1261       struct brw_reg src0 = byte_offset(src, type_sz(src.type));
1262       struct brw_reg src1 = src;
1263 
1264       src0.vstride = vstride;
1265       src0.width   = width;
1266       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1267       src1.vstride = vstride;
1268       src1.width   = width;
1269       src1.hstride = BRW_HORIZONTAL_STRIDE_0;
1270 
1271       brw_ADD(p, dst, src0, negate(src1));
1272    } else {
1273       /* On Haswell and earlier, the region used above appears to not work
1274        * correctly for compressed instructions.  At least on Haswell and
1275        * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1276        * would have to split to SIMD8 no matter which method we choose, we
1277        * may as well use ALIGN16 on all platforms gfx7 and earlier.
1278        */
1279       struct brw_reg src0 = stride(src, 4, 4, 1);
1280       struct brw_reg src1 = stride(src, 4, 4, 1);
1281       if (inst->opcode == FS_OPCODE_DDX_FINE) {
1282          src0.swizzle = BRW_SWIZZLE_XXZZ;
1283          src1.swizzle = BRW_SWIZZLE_YYWW;
1284       } else {
1285          src0.swizzle = BRW_SWIZZLE_XXXX;
1286          src1.swizzle = BRW_SWIZZLE_YYYY;
1287       }
1288 
1289       brw_push_insn_state(p);
1290       brw_set_default_access_mode(p, BRW_ALIGN_16);
1291       brw_ADD(p, dst, negate(src0), src1);
1292       brw_pop_insn_state(p);
1293    }
1294 }
1295 
1296 /* The negate_value boolean is used to negate the derivative computation for
1297  * FBOs, since they place the origin at the upper left instead of the lower
1298  * left.
1299  */
1300 void
1301 fs_generator::generate_ddy(const fs_inst *inst,
1302                            struct brw_reg dst, struct brw_reg src)
1303 {
1304    const uint32_t type_size = type_sz(src.type);
1305 
1306    if (inst->opcode == FS_OPCODE_DDY_FINE) {
1307       /* produce accurate derivatives.
1308        *
1309        * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
1310        * "Register Region Restrictions", Section "1. Special Restrictions":
1311        *
1312        *    "In Align16 mode, the channel selects and channel enables apply to
1313        *     a pair of half-floats, because these parameters are defined for
1314        *     DWord elements ONLY. This is applicable when both source and
1315        *     destination are half-floats."
1316        *
1317        * So for half-float operations we use the Gfx11+ Align1 path. CHV
1318        * inherits its FP16 hardware from SKL, so it is not affected.
1319        */
1320       if (devinfo->ver >= 11 ||
1321           (devinfo->platform == INTEL_PLATFORM_BDW && src.type == BRW_REGISTER_TYPE_HF)) {
1322          src = stride(src, 0, 2, 1);
1323 
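         /* With the <0;2,1> region each 4-wide ADD below reads the top row
          * of a 2x2 subspan through its first (negated) source and the
          * bottom row through its second source, so every pixel of the
          * subspan receives the row difference for its column.
          */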
1324          brw_push_insn_state(p);
1325          brw_set_default_exec_size(p, BRW_EXECUTE_4);
1326          for (uint32_t g = 0; g < inst->exec_size; g += 4) {
1327             brw_set_default_group(p, inst->group + g);
1328             brw_ADD(p, byte_offset(dst, g * type_size),
1329                        negate(byte_offset(src,  g * type_size)),
1330                        byte_offset(src, (g + 2) * type_size));
1331             brw_set_default_swsb(p, tgl_swsb_null());
1332          }
1333          brw_pop_insn_state(p);
1334       } else {
1335          struct brw_reg src0 = stride(src, 4, 4, 1);
1336          struct brw_reg src1 = stride(src, 4, 4, 1);
1337          src0.swizzle = BRW_SWIZZLE_XYXY;
1338          src1.swizzle = BRW_SWIZZLE_ZWZW;
1339 
1340          brw_push_insn_state(p);
1341          brw_set_default_access_mode(p, BRW_ALIGN_16);
1342          brw_ADD(p, dst, negate(src0), src1);
1343          brw_pop_insn_state(p);
1344       }
1345    } else {
1346       /* replicate the derivative at the top-left pixel to other pixels */
1347       if (devinfo->ver >= 8) {
1348          struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
1349          struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
1350 
1351          brw_ADD(p, dst, negate(src0), src1);
1352       } else {
1353          /* On Haswell and earlier, the region used above appears to not work
1354           * correctly for compressed instructions.  At least on Haswell and
1355           * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1356           * would have to split to SIMD8 no matter which method we choose, we
1357           * may as well use ALIGN16 on all platforms gfx7 and earlier.
1358           */
1359          struct brw_reg src0 = stride(src, 4, 4, 1);
1360          struct brw_reg src1 = stride(src, 4, 4, 1);
1361          src0.swizzle = BRW_SWIZZLE_XXXX;
1362          src1.swizzle = BRW_SWIZZLE_ZZZZ;
1363 
1364          brw_push_insn_state(p);
1365          brw_set_default_access_mode(p, BRW_ALIGN_16);
1366          brw_ADD(p, dst, negate(src0), src1);
1367          brw_pop_insn_state(p);
1368       }
1369    }
1370 }
1371 
1372 void
1373 fs_generator::generate_halt(fs_inst *)
1374 {
1375    /* This HALT will be patched up at FB write time to point UIP at the end of
1376     * the program, and at brw_uip_jip() JIP will be set to the end of the
1377     * current block (or the program).
1378     */
1379    this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1380    brw_HALT(p);
1381 }
1382 
1383 void
1384 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1385 {
1386    /* The 32-wide messages only respect the first 16-wide half of the channel
1387     * enable signals which are replicated identically for the second group of
1388     * 16 channels, so we cannot use them unless the write is marked
1389     * force_writemask_all.
1390     */
1391    const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1392                                MIN2(16, inst->exec_size);
1393    const unsigned block_size = 4 * lower_size / REG_SIZE;
1394    const tgl_swsb swsb = brw_get_default_swsb(p);
1395    assert(inst->mlen != 0);
1396 
1397    brw_push_insn_state(p);
1398    brw_set_default_exec_size(p, cvt(lower_size) - 1);
1399    brw_set_default_compression(p, lower_size > 8);
1400 
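   /* Emit one MOV into the message payload plus one OWord block write per
    * lower_size-wide group of channels, advancing the source by block_size
    * registers and the scratch offset by block_size * REG_SIZE bytes for
    * each group.
    */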
1401    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1402       brw_set_default_group(p, inst->group + lower_size * i);
1403 
1404       if (i > 0) {
1405          assert(swsb.mode & TGL_SBID_SET);
1406          brw_set_default_swsb(p, tgl_swsb_sbid(TGL_SBID_SRC, swsb.sbid));
1407       } else {
1408          brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1409       }
1410 
1411       brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1412               retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1413 
1414       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1415       brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1416                                     block_size,
1417                                     inst->offset + block_size * REG_SIZE * i);
1418    }
1419 
1420    brw_pop_insn_state(p);
1421 }
1422 
1423 void
1424 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1425 {
1426    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1427    assert(inst->mlen != 0);
1428 
1429    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1430                                 inst->exec_size / 8, inst->offset);
1431 }
1432 
1433 void
1434 fs_generator::generate_scratch_read_gfx7(fs_inst *inst, struct brw_reg dst)
1435 {
1436    assert(inst->exec_size <= 16 || inst->force_writemask_all);
1437 
1438    gfx7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1439 }
1440 
1441 /* The A32 messages take a buffer base address in header.5:[31:0] (See
1442  * MH1_A32_PSM for typed messages or MH_A32_GO for byte/dword scattered
1443  * and OWord block messages in the SKL PRM Vol. 2d for more details.)
1444  * Unfortunately, there are a number of subtle differences:
1445  *
1446  * For the block read/write messages:
1447  *
1448  *   - We always stomp header.2 to fill in the actual scratch address (in
1449  *     units of OWORDs) so we don't care what's in there.
1450  *
1451  *   - They rely on per-thread scratch space value in header.3[3:0] to do
1452  *     bounds checking so that needs to be valid.  The upper bits of
1453  *     header.3 are ignored, though, so we can copy all of g0.3.
1454  *
1455  *   - They ignore header.5[9:0] and assume the address is 1KB aligned.
1456  *
1457  *
1458  * For the byte/dword scattered read/write messages:
1459  *
1460  *   - We want header.2 to be zero because that gets added to the per-channel
1461  *     offset in the non-header portion of the message.
1462  *
1463  *   - Contrary to what the docs claim, they don't do any bounds checking so
1464  *     the value of header.3[3:0] doesn't matter.
1465  *
1466  *   - They consider all of header.5 for the base address and header.5[9:0]
1467  *     are not ignored.  This means that we can't copy g0.5 verbatim because
1468  *     g0.5[9:0] contains the FFTID on most platforms.  Instead, we have to
1469  *     use an AND to mask off the bottom 10 bits.
1470  *
1471  *
1472  * For block messages, just copying g0 gives a valid header because all the
1473  * garbage gets ignored except for header.2 which we stomp as part of message
1474  * setup.  For byte/dword scattered messages, we can just zero out the header
1475  * and copy over the bits we need from g0.5.  This opcode, however, tries to
1476  * satisfy the requirements of both by starting with 0 and filling out the
1477  * information required by either set of opcodes.
1478  */
1479 void
1480 fs_generator::generate_scratch_header(fs_inst *inst, struct brw_reg dst)
1481 {
1482    assert(inst->exec_size == 8 && inst->force_writemask_all);
1483    assert(dst.file == BRW_GENERAL_REGISTER_FILE);
1484 
1485    dst.type = BRW_REGISTER_TYPE_UD;
1486 
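   /* The MOV below zeroes the whole header and the two ANDs that follow fill
    * in individual DWords of it.  On pre-Gfx12 parts the NoDDClr/NoDDChk bits
    * keep the three writes to the same GRF from being serialized against one
    * another; on Gfx12+ the default SWSB is cleared after the initial MOV
    * instead.
    */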
1487    brw_inst *insn = brw_MOV(p, dst, brw_imm_ud(0));
1488    if (devinfo->ver >= 12)
1489       brw_set_default_swsb(p, tgl_swsb_null());
1490    else
1491       brw_inst_set_no_dd_clear(p->devinfo, insn, true);
1492 
1493    /* Copy the per-thread scratch space size from g0.3[3:0] */
1494    brw_set_default_exec_size(p, BRW_EXECUTE_1);
1495    insn = brw_AND(p, suboffset(dst, 3),
1496                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
1497                      brw_imm_ud(INTEL_MASK(3, 0)));
1498    if (devinfo->ver < 12) {
1499       brw_inst_set_no_dd_clear(p->devinfo, insn, true);
1500       brw_inst_set_no_dd_check(p->devinfo, insn, true);
1501    }
1502 
1503    /* Copy the scratch base address from g0.5[31:10] */
1504    insn = brw_AND(p, suboffset(dst, 5),
1505                      retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
1506                      brw_imm_ud(INTEL_MASK(31, 10)));
1507    if (devinfo->ver < 12)
1508       brw_inst_set_no_dd_check(p->devinfo, insn, true);
1509 }
1510 
1511 void
1512 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1513                                                   struct brw_reg dst,
1514                                                   struct brw_reg index,
1515                                                   struct brw_reg offset)
1516 {
1517    assert(type_sz(dst.type) == 4);
1518    assert(inst->mlen != 0);
1519 
1520    assert(index.file == BRW_IMMEDIATE_VALUE &&
1521 	  index.type == BRW_REGISTER_TYPE_UD);
1522    uint32_t surf_index = index.ud;
1523 
1524    assert(offset.file == BRW_IMMEDIATE_VALUE &&
1525 	  offset.type == BRW_REGISTER_TYPE_UD);
1526    uint32_t read_offset = offset.ud;
1527 
1528    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1529 			read_offset, surf_index);
1530 }
1531 
1532 void
1533 fs_generator::generate_uniform_pull_constant_load_gfx7(fs_inst *inst,
1534                                                        struct brw_reg dst,
1535                                                        struct brw_reg index,
1536                                                        struct brw_reg payload)
1537 {
1538    assert(index.type == BRW_REGISTER_TYPE_UD);
1539    assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1540    assert(type_sz(dst.type) == 4);
1541    assert(!devinfo->has_lsc);
1542 
1543    if (index.file == BRW_IMMEDIATE_VALUE) {
1544       const uint32_t surf_index = index.ud;
1545 
1546       brw_push_insn_state(p);
1547       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1548       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1549       brw_pop_insn_state(p);
1550 
1551       brw_inst_set_sfid(devinfo, send, GFX6_SFID_DATAPORT_CONSTANT_CACHE);
1552       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1553       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1554       brw_set_desc(p, send,
1555                    brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
1556                                                              REG_SIZE), true) |
1557                    brw_dp_desc(devinfo, surf_index,
1558                                GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
1559                                BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)));
1560 
1561    } else {
1562       const tgl_swsb swsb = brw_get_default_swsb(p);
1563       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1564 
1565       brw_push_insn_state(p);
1566       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1567 
1568       /* a0.0 = surf_index & 0xff */
1569       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1570       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1571       brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1572       brw_set_dest(p, insn_and, addr);
1573       brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1574       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1575 
1576       /* dst = send(payload, a0.0 | <descriptor>) */
1577       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
1578       brw_send_indirect_message(
1579          p, GFX6_SFID_DATAPORT_CONSTANT_CACHE,
1580          retype(dst, BRW_REGISTER_TYPE_UD),
1581          retype(payload, BRW_REGISTER_TYPE_UD), addr,
1582          brw_message_desc(devinfo, 1,
1583                           DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
1584          brw_dp_desc(devinfo, 0 /* surface */,
1585                      GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
1586                      BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size)),
1587          false /* EOT */);
1588 
1589       brw_pop_insn_state(p);
1590    }
1591 }
1592 
1593 void
1594 fs_generator::generate_varying_pull_constant_load_gfx4(fs_inst *inst,
1595                                                        struct brw_reg dst,
1596                                                        struct brw_reg index)
1597 {
1598    assert(devinfo->ver < 7); /* Should use the gfx7 variant. */
1599    assert(inst->header_size != 0);
1600    assert(inst->mlen);
1601 
1602    assert(index.file == BRW_IMMEDIATE_VALUE &&
1603 	  index.type == BRW_REGISTER_TYPE_UD);
1604    uint32_t surf_index = index.ud;
1605 
1606    uint32_t simd_mode, rlen, msg_type;
1607    if (inst->exec_size == 16) {
1608       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1609       rlen = 8;
1610    } else {
1611       assert(inst->exec_size == 8);
1612       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1613       rlen = 4;
1614    }
1615 
1616    if (devinfo->ver >= 5)
1617       msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
1618    else {
1619       /* We always use the SIMD16 message so that we only have to load U, and
1620        * not V or R.
1621        */
1622       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1623       assert(inst->mlen == 3);
1624       assert(inst->size_written == 8 * REG_SIZE);
1625       rlen = 8;
1626       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1627    }
1628 
1629    struct brw_reg header = brw_vec8_grf(0, 0);
1630    gfx6_resolve_implied_move(p, &header, inst->base_mrf);
1631 
1632    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1633    brw_inst_set_compression(devinfo, send, false);
1634    brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
1635    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1636    brw_set_src0(p, send, header);
1637    if (devinfo->ver < 6)
1638       brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1639 
1640    /* Our surface is set up as floats, regardless of what actual data is
1641     * stored in it.
1642     */
1643    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1644    brw_set_desc(p, send,
1645                 brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
1646                 brw_sampler_desc(devinfo, surf_index,
1647                                  0, /* sampler (unused) */
1648                                  msg_type, simd_mode, return_format));
1649 }
1650 
1651 void
1652 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1653                                                 struct brw_reg dst,
1654                                                 struct brw_reg src,
1655                                                 struct brw_reg msg_data,
1656                                                 unsigned msg_type)
1657 {
1658    const bool has_payload = inst->src[0].file != BAD_FILE;
1659    assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1660    assert(inst->size_written % REG_SIZE == 0);
1661 
1662    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
1663 
1664    brw_pixel_interpolator_query(p,
1665          retype(dst, BRW_REGISTER_TYPE_UW),
1666          /* If we don't have a payload, what we send doesn't matter */
1667          has_payload ? src : brw_vec8_grf(0, 0),
1668          inst->pi_noperspective,
1669          prog_data->per_coarse_pixel_dispatch,
1670          msg_type,
1671          msg_data,
1672          has_payload ? 2 * inst->exec_size / 8 : 1,
1673          inst->size_written / REG_SIZE);
1674 }
1675 
1676 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1677  * the ADD instruction.
1678  */
1679 void
1680 fs_generator::generate_set_sample_id(fs_inst *inst,
1681                                      struct brw_reg dst,
1682                                      struct brw_reg src0,
1683                                      struct brw_reg src1)
1684 {
1685    assert(dst.type == BRW_REGISTER_TYPE_D ||
1686           dst.type == BRW_REGISTER_TYPE_UD);
1687    assert(src0.type == BRW_REGISTER_TYPE_D ||
1688           src0.type == BRW_REGISTER_TYPE_UD);
1689 
1690    const struct brw_reg reg = stride(src1, 1, 4, 0);
1691    const unsigned lower_size = MIN2(inst->exec_size,
1692                                     devinfo->ver >= 8 ? 16 : 8);
1693 
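   /* With the <1;4,0> region every group of four channels (one 2x2 subspan)
    * reads the same element of src1 and consecutive subspans read consecutive
    * elements, so each ADD applies one src1 value per subspan on top of src0.
    */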
1694    for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1695       brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
1696                                offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
1697                                              (i * lower_size / (1 << src0.width))) *
1698                                             type_sz(src0.type) / REG_SIZE),
1699                                suboffset(reg, i * lower_size / 4));
1700       brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
1701       brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
1702       brw_inst_set_compression(devinfo, insn, lower_size > 8);
1703       brw_set_default_swsb(p, tgl_swsb_null());
1704    }
1705 }
1706 
1707 void
1708 fs_generator::generate_pack_half_2x16_split(fs_inst *,
1709                                             struct brw_reg dst,
1710                                             struct brw_reg x,
1711                                             struct brw_reg y)
1712 {
1713    assert(devinfo->ver >= 7);
1714    assert(dst.type == BRW_REGISTER_TYPE_UD);
1715    assert(x.type == BRW_REGISTER_TYPE_F);
1716    assert(y.type == BRW_REGISTER_TYPE_F);
1717 
1718    /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1719     *
1720     *   Because this instruction does not have a 16-bit floating-point type,
1721     *   the destination data type must be Word (W).
1722     *
1723     *   The destination must be DWord-aligned and specify a horizontal stride
1724     *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1725     *   each destination channel and the upper word is not modified.
1726     */
1727    const enum brw_reg_type t = devinfo->ver > 7
1728       ? BRW_REGISTER_TYPE_HF : BRW_REGISTER_TYPE_W;
1729    struct brw_reg dst_w = spread(retype(dst, t), 2);
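   /* dst_w addresses every other word of dst, i.e. the low word of each
    * 32-bit channel, satisfying the HorzStride-of-2 requirement quoted above.
    */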
1730 
1731    if (y.file == IMM) {
1732       const uint32_t hhhh0000 = _mesa_float_to_half(y.f) << 16;
1733 
1734       brw_MOV(p, dst, brw_imm_ud(hhhh0000));
1735       brw_set_default_swsb(p, tgl_swsb_regdist(1));
1736    } else {
1737       /* Give each 32-bit channel of dst the form below, where "." means
1738        * unchanged.
1739        *   0x....hhhh
1740        */
1741       brw_F32TO16(p, dst_w, y);
1742 
1743       /* Now the form:
1744        *   0xhhhh0000
1745        */
1746       brw_set_default_swsb(p, tgl_swsb_regdist(1));
1747       brw_SHL(p, dst, dst, brw_imm_ud(16u));
1748    }
1749 
1750    /* And, finally the form of packHalf2x16's output:
1751     *   0xhhhhllll
1752     */
1753    brw_F32TO16(p, dst_w, x);
1754 }
1755 
1756 void
1757 fs_generator::enable_debug(const char *shader_name)
1758 {
1759    debug_flag = true;
1760    this->shader_name = shader_name;
1761 }
1762 
1763 int
1764 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
1765                             struct shader_stats shader_stats,
1766                             const brw::performance &perf,
1767                             struct brw_compile_stats *stats)
1768 {
1769    /* align to 64 byte boundary. */
1770    brw_realign(p, 64);
1771 
1772    this->dispatch_width = dispatch_width;
1773 
1774    int start_offset = p->next_insn_offset;
1775 
1776    int loop_count = 0, send_count = 0, nop_count = 0;
1777    bool is_accum_used = false;
1778 
1779    struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg);
1780 
1781    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1782       if (inst->opcode == SHADER_OPCODE_UNDEF)
1783          continue;
1784 
1785       struct brw_reg src[4], dst;
1786       unsigned int last_insn_offset = p->next_insn_offset;
1787       bool multiple_instructions_emitted = false;
1788       tgl_swsb swsb = inst->sched;
1789 
1790       /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
1791        * "Register Region Restrictions" section: for BDW, SKL:
1792        *
1793        *    "A POW/FDIV operation must not be followed by an instruction
1794        *     that requires two destination registers."
1795        *
1796        * The documentation is often lacking annotations for Atom parts,
1797        * and empirically this affects CHV as well.
1798        */
1799       if (devinfo->ver >= 8 &&
1800           devinfo->ver <= 9 &&
1801           p->nr_insn > 1 &&
1802           brw_inst_opcode(p->isa, brw_last_inst) == BRW_OPCODE_MATH &&
1803           brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
1804           inst->dst.component_size(inst->exec_size) > REG_SIZE) {
1805          brw_NOP(p);
1806          last_insn_offset = p->next_insn_offset;
1807 
1808          /* In order to avoid spurious instruction count differences when the
1809           * instruction schedule changes, keep track of the number of inserted
1810           * NOPs.
1811           */
1812          nop_count++;
1813       }
1814 
1815       /* Wa_14010017096:
1816        *
1817        * Clear accumulator register before end of thread.
1818        */
1819       if (inst->eot && is_accum_used && devinfo->ver >= 12) {
1820          brw_set_default_exec_size(p, BRW_EXECUTE_16);
1821          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1822          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1823          brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1824          brw_MOV(p, brw_acc_reg(8), brw_imm_f(0.0f));
1825          last_insn_offset = p->next_insn_offset;
1826          swsb = tgl_swsb_dst_dep(swsb, 1);
1827       }
1828 
1829       if (!is_accum_used && !inst->eot) {
1830          is_accum_used = inst->writes_accumulator_implicitly(devinfo) ||
1831                          inst->dst.is_accumulator();
1832       }
1833 
1834       /* Wa_14013745556:
1835        *
1836        * Always use @1 SWSB for EOT.
1837        */
1838       if (inst->eot && devinfo->ver >= 12) {
1839          if (tgl_swsb_src_dep(swsb).mode) {
1840             brw_set_default_exec_size(p, BRW_EXECUTE_1);
1841             brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1842             brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1843             brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
1844             brw_SYNC(p, TGL_SYNC_NOP);
1845             last_insn_offset = p->next_insn_offset;
1846          }
1847 
1848          swsb = tgl_swsb_dst_dep(swsb, 1);
1849       }
1850 
1851       if (unlikely(debug_flag))
1852          disasm_annotate(disasm_info, inst, p->next_insn_offset);
1853 
1854       /* If the instruction writes to more than one register, it needs to be
1855        * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
1856        * hardware figures out by itself what the right compression mode is,
1857        * but we still need to know whether the instruction is compressed to
1858        * set up the source register regions appropriately.
1859        *
1860        * XXX - This is wrong for instructions that write a single register but
1861        *       read more than one which should strictly speaking be treated as
1862        *       compressed.  For instructions that don't write any registers it
1863        *       relies on the destination being a null register of the correct
1864        *       type and regioning so the instruction is considered compressed
1865        *       or not accordingly.
1866        */
1867       const bool compressed =
1868            inst->dst.component_size(inst->exec_size) > REG_SIZE;
1869       brw_set_default_compression(p, compressed);
1870       brw_set_default_group(p, inst->group);
1871 
1872       for (unsigned int i = 0; i < inst->sources; i++) {
1873          src[i] = brw_reg_from_fs_reg(devinfo, inst,
1874                                       &inst->src[i], compressed);
1875 	 /* The accumulator result appears to get used for the
1876 	  * conditional modifier generation.  When negating a UD
1877 	  * value, there is a 33rd bit generated for the sign in the
1878 	  * accumulator value, so now you can't check, for example,
1879 	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1880 	  */
1881 	 assert(!inst->conditional_mod ||
1882 		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1883 		!inst->src[i].negate);
1884       }
1885       dst = brw_reg_from_fs_reg(devinfo, inst,
1886                                 &inst->dst, compressed);
1887 
1888       brw_set_default_access_mode(p, BRW_ALIGN_1);
1889       brw_set_default_predicate_control(p, inst->predicate);
1890       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1891       /* On gfx7 and above, hardware automatically adds the group onto the
1892        * flag subregister number.  On Sandy Bridge and older, we have to do it
1893        * ourselves.
1894        */
1895       const unsigned flag_subreg = inst->flag_subreg +
1896          (devinfo->ver >= 7 ? 0 : inst->group / 16);
1897       brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
1898       brw_set_default_saturate(p, inst->saturate);
1899       brw_set_default_mask_control(p, inst->force_writemask_all);
1900       brw_set_default_acc_write_control(p, inst->writes_accumulator);
1901       brw_set_default_swsb(p, swsb);
1902 
1903       unsigned exec_size = inst->exec_size;
1904       if (devinfo->verx10 == 70 &&
1905           (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
1906          exec_size *= 2;
1907       }
1908 
1909       brw_set_default_exec_size(p, cvt(exec_size) - 1);
1910 
1911       assert(inst->force_writemask_all || inst->exec_size >= 4);
1912       assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
1913       assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver));
1914       assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1915 
1916       switch (inst->opcode) {
1917       case BRW_OPCODE_SYNC:
1918          assert(src[0].file == BRW_IMMEDIATE_VALUE);
1919          brw_SYNC(p, tgl_sync_function(src[0].ud));
1920          break;
1921       case BRW_OPCODE_MOV:
1922 	 brw_MOV(p, dst, src[0]);
1923 	 break;
1924       case BRW_OPCODE_ADD:
1925 	 brw_ADD(p, dst, src[0], src[1]);
1926 	 break;
1927       case BRW_OPCODE_MUL:
1928 	 brw_MUL(p, dst, src[0], src[1]);
1929 	 break;
1930       case BRW_OPCODE_AVG:
1931 	 brw_AVG(p, dst, src[0], src[1]);
1932 	 break;
1933       case BRW_OPCODE_MACH:
1934 	 brw_MACH(p, dst, src[0], src[1]);
1935 	 break;
1936 
1937       case BRW_OPCODE_DP4A:
1938          assert(devinfo->ver >= 12);
1939          brw_DP4A(p, dst, src[0], src[1], src[2]);
1940          break;
1941 
1942       case BRW_OPCODE_LINE:
1943          brw_LINE(p, dst, src[0], src[1]);
1944          break;
1945 
1946       case BRW_OPCODE_MAD:
1947          assert(devinfo->ver >= 6);
1948          if (devinfo->ver < 10)
1949             brw_set_default_access_mode(p, BRW_ALIGN_16);
1950          brw_MAD(p, dst, src[0], src[1], src[2]);
1951 	 break;
1952 
1953       case BRW_OPCODE_LRP:
1954          assert(devinfo->ver >= 6 && devinfo->ver <= 10);
1955          if (devinfo->ver < 10)
1956             brw_set_default_access_mode(p, BRW_ALIGN_16);
1957          brw_LRP(p, dst, src[0], src[1], src[2]);
1958 	 break;
1959 
1960       case BRW_OPCODE_ADD3:
1961          assert(devinfo->verx10 >= 125);
1962          brw_ADD3(p, dst, src[0], src[1], src[2]);
1963          break;
1964 
1965       case BRW_OPCODE_FRC:
1966 	 brw_FRC(p, dst, src[0]);
1967 	 break;
1968       case BRW_OPCODE_RNDD:
1969 	 brw_RNDD(p, dst, src[0]);
1970 	 break;
1971       case BRW_OPCODE_RNDE:
1972 	 brw_RNDE(p, dst, src[0]);
1973 	 break;
1974       case BRW_OPCODE_RNDZ:
1975 	 brw_RNDZ(p, dst, src[0]);
1976 	 break;
1977 
1978       case BRW_OPCODE_AND:
1979 	 brw_AND(p, dst, src[0], src[1]);
1980 	 break;
1981       case BRW_OPCODE_OR:
1982 	 brw_OR(p, dst, src[0], src[1]);
1983 	 break;
1984       case BRW_OPCODE_XOR:
1985 	 brw_XOR(p, dst, src[0], src[1]);
1986 	 break;
1987       case BRW_OPCODE_NOT:
1988 	 brw_NOT(p, dst, src[0]);
1989 	 break;
1990       case BRW_OPCODE_ASR:
1991 	 brw_ASR(p, dst, src[0], src[1]);
1992 	 break;
1993       case BRW_OPCODE_SHR:
1994 	 brw_SHR(p, dst, src[0], src[1]);
1995 	 break;
1996       case BRW_OPCODE_SHL:
1997 	 brw_SHL(p, dst, src[0], src[1]);
1998 	 break;
1999       case BRW_OPCODE_ROL:
2000 	 assert(devinfo->ver >= 11);
2001 	 assert(src[0].type == dst.type);
2002 	 brw_ROL(p, dst, src[0], src[1]);
2003 	 break;
2004       case BRW_OPCODE_ROR:
2005 	 assert(devinfo->ver >= 11);
2006 	 assert(src[0].type == dst.type);
2007 	 brw_ROR(p, dst, src[0], src[1]);
2008 	 break;
2009       case BRW_OPCODE_F32TO16:
2010          assert(devinfo->ver >= 7);
2011          brw_F32TO16(p, dst, src[0]);
2012          break;
2013       case BRW_OPCODE_F16TO32:
2014          assert(devinfo->ver >= 7);
2015          brw_F16TO32(p, dst, src[0]);
2016          break;
2017       case BRW_OPCODE_CMP:
2018          if (inst->exec_size >= 16 && devinfo->verx10 == 70 &&
2019              dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2020             /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
2021              * implemented in the compiler is not sufficient. Overriding the
2022              * type when the destination is the null register is necessary but
2023              * not sufficient by itself.
2024              */
2025             dst.type = BRW_REGISTER_TYPE_D;
2026          }
2027          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
2028 	 break;
2029       case BRW_OPCODE_CMPN:
2030          if (inst->exec_size >= 16 && devinfo->verx10 == 70 &&
2031              dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2032             /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
2033              * implemented in the compiler is not sufficient. Overriding the
2034              * type when the destination is the null register is necessary but
2035              * not sufficient by itself.
2036              */
2037             dst.type = BRW_REGISTER_TYPE_D;
2038          }
2039          brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]);
2040          break;
2041       case BRW_OPCODE_SEL:
2042 	 brw_SEL(p, dst, src[0], src[1]);
2043 	 break;
2044       case BRW_OPCODE_CSEL:
2045          assert(devinfo->ver >= 8);
2046          if (devinfo->ver < 10)
2047             brw_set_default_access_mode(p, BRW_ALIGN_16);
2048          brw_CSEL(p, dst, src[0], src[1], src[2]);
2049          break;
2050       case BRW_OPCODE_BFREV:
2051          assert(devinfo->ver >= 7);
2052          brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
2053                    retype(src[0], BRW_REGISTER_TYPE_UD));
2054          break;
2055       case BRW_OPCODE_FBH:
2056          assert(devinfo->ver >= 7);
2057          brw_FBH(p, retype(dst, src[0].type), src[0]);
2058          break;
2059       case BRW_OPCODE_FBL:
2060          assert(devinfo->ver >= 7);
2061          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
2062                  retype(src[0], BRW_REGISTER_TYPE_UD));
2063          break;
2064       case BRW_OPCODE_LZD:
2065          brw_LZD(p, dst, src[0]);
2066          break;
2067       case BRW_OPCODE_CBIT:
2068          assert(devinfo->ver >= 7);
2069          brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
2070                   retype(src[0], BRW_REGISTER_TYPE_UD));
2071          break;
2072       case BRW_OPCODE_ADDC:
2073          assert(devinfo->ver >= 7);
2074          brw_ADDC(p, dst, src[0], src[1]);
2075          break;
2076       case BRW_OPCODE_SUBB:
2077          assert(devinfo->ver >= 7);
2078          brw_SUBB(p, dst, src[0], src[1]);
2079          break;
2080       case BRW_OPCODE_MAC:
2081          brw_MAC(p, dst, src[0], src[1]);
2082          break;
2083 
2084       case BRW_OPCODE_BFE:
2085          assert(devinfo->ver >= 7);
2086          if (devinfo->ver < 10)
2087             brw_set_default_access_mode(p, BRW_ALIGN_16);
2088          brw_BFE(p, dst, src[0], src[1], src[2]);
2089          break;
2090 
2091       case BRW_OPCODE_BFI1:
2092          assert(devinfo->ver >= 7);
2093          brw_BFI1(p, dst, src[0], src[1]);
2094          break;
2095       case BRW_OPCODE_BFI2:
2096          assert(devinfo->ver >= 7);
2097          if (devinfo->ver < 10)
2098             brw_set_default_access_mode(p, BRW_ALIGN_16);
2099          brw_BFI2(p, dst, src[0], src[1], src[2]);
2100          break;
2101 
2102       case BRW_OPCODE_IF:
2103 	 if (inst->src[0].file != BAD_FILE) {
2104 	    /* The instruction has an embedded compare (only allowed on gfx6) */
2105 	    assert(devinfo->ver == 6);
2106 	    gfx6_IF(p, inst->conditional_mod, src[0], src[1]);
2107 	 } else {
2108 	    brw_IF(p, brw_get_default_exec_size(p));
2109 	 }
2110 	 break;
2111 
2112       case BRW_OPCODE_ELSE:
2113 	 brw_ELSE(p);
2114 	 break;
2115       case BRW_OPCODE_ENDIF:
2116 	 brw_ENDIF(p);
2117 	 break;
2118 
2119       case BRW_OPCODE_DO:
2120 	 brw_DO(p, brw_get_default_exec_size(p));
2121 	 break;
2122 
2123       case BRW_OPCODE_BREAK:
2124 	 brw_BREAK(p);
2125 	 break;
2126       case BRW_OPCODE_CONTINUE:
2127          brw_CONT(p);
2128 	 break;
2129 
2130       case BRW_OPCODE_WHILE:
2131 	 brw_WHILE(p);
2132          loop_count++;
2133 	 break;
2134 
2135       case SHADER_OPCODE_RCP:
2136       case SHADER_OPCODE_RSQ:
2137       case SHADER_OPCODE_SQRT:
2138       case SHADER_OPCODE_EXP2:
2139       case SHADER_OPCODE_LOG2:
2140       case SHADER_OPCODE_SIN:
2141       case SHADER_OPCODE_COS:
2142          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2143 	 if (devinfo->ver >= 6) {
2144             assert(inst->mlen == 0);
2145             assert(devinfo->ver >= 7 || inst->exec_size == 8);
2146             gfx6_math(p, dst, brw_math_function(inst->opcode),
2147                       src[0], brw_null_reg());
2148 	 } else {
2149             assert(inst->mlen >= 1);
2150             assert(devinfo->ver == 5 || devinfo->platform == INTEL_PLATFORM_G4X || inst->exec_size == 8);
2151             gfx4_math(p, dst,
2152                       brw_math_function(inst->opcode),
2153                       inst->base_mrf, src[0],
2154                       BRW_MATH_PRECISION_FULL);
2155             send_count++;
2156 	 }
2157 	 break;
2158       case SHADER_OPCODE_INT_QUOTIENT:
2159       case SHADER_OPCODE_INT_REMAINDER:
2160       case SHADER_OPCODE_POW:
2161          assert(devinfo->verx10 < 125);
2162          assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
2163          if (devinfo->ver >= 6) {
2164             assert(inst->mlen == 0);
2165             assert((devinfo->ver >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
2166                    inst->exec_size == 8);
2167             gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
2168          } else {
2169             assert(inst->mlen >= 1);
2170             assert(inst->exec_size == 8);
2171             gfx4_math(p, dst, brw_math_function(inst->opcode),
2172                       inst->base_mrf, src[0],
2173                       BRW_MATH_PRECISION_FULL);
2174             send_count++;
2175 	 }
2176 	 break;
2177       case FS_OPCODE_LINTERP:
2178 	 multiple_instructions_emitted = generate_linterp(inst, dst, src);
2179 	 break;
2180       case FS_OPCODE_PIXEL_X:
2181          assert(src[0].type == BRW_REGISTER_TYPE_UW);
2182          assert(src[1].type == BRW_REGISTER_TYPE_UW);
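         /* Here and in the PIXEL_Y case below, the payload is read as
          * alternating groups of four UW X values and four UW Y values:
          * subnr selects the first group of the wanted coordinate and the
          * <8;4,1> region then gathers every other group of four.  src[1] is
          * added on top in the coarse-pixel case.
          */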
2183          src[0].subnr = 0 * type_sz(src[0].type);
2184          if (src[1].file == BRW_IMMEDIATE_VALUE) {
2185             assert(src[1].ud == 0);
2186             brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2187          } else {
2188             /* Coarse pixel case */
2189             brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
2190          }
2191          break;
2192       case FS_OPCODE_PIXEL_Y:
2193          assert(src[0].type == BRW_REGISTER_TYPE_UW);
2194          assert(src[1].type == BRW_REGISTER_TYPE_UW);
2195          src[0].subnr = 4 * type_sz(src[0].type);
2196          if (src[1].file == BRW_IMMEDIATE_VALUE) {
2197             assert(src[1].ud == 0);
2198             brw_MOV(p, dst, stride(src[0], 8, 4, 1));
2199          } else {
2200             /* Coarse pixel case */
2201             brw_ADD(p, dst, stride(src[0], 8, 4, 1), src[1]);
2202          }
2203          break;
2204 
2205       case SHADER_OPCODE_SEND:
2206          generate_send(inst, dst, src[0], src[1], src[2],
2207                        inst->ex_mlen > 0 ? src[3] : brw_null_reg());
2208          send_count++;
2209          break;
2210 
2211       case SHADER_OPCODE_GET_BUFFER_SIZE:
2212          generate_get_buffer_size(inst, dst, src[0], src[1]);
2213          send_count++;
2214          break;
2215       case SHADER_OPCODE_TEX:
2216       case FS_OPCODE_TXB:
2217       case SHADER_OPCODE_TXD:
2218       case SHADER_OPCODE_TXF:
2219       case SHADER_OPCODE_TXF_CMS:
2220       case SHADER_OPCODE_TXL:
2221       case SHADER_OPCODE_TXS:
2222       case SHADER_OPCODE_LOD:
2223       case SHADER_OPCODE_TG4:
2224       case SHADER_OPCODE_SAMPLEINFO:
2225          assert(inst->src[0].file == BAD_FILE);
2226          generate_tex(inst, dst, src[1], src[2]);
2227          send_count++;
2228          break;
2229 
2230       case FS_OPCODE_DDX_COARSE:
2231       case FS_OPCODE_DDX_FINE:
2232          generate_ddx(inst, dst, src[0]);
2233          break;
2234       case FS_OPCODE_DDY_COARSE:
2235       case FS_OPCODE_DDY_FINE:
2236          generate_ddy(inst, dst, src[0]);
2237 	 break;
2238 
2239       case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
2240 	 generate_scratch_write(inst, src[0]);
2241          send_count++;
2242 	 break;
2243 
2244       case SHADER_OPCODE_GFX4_SCRATCH_READ:
2245 	 generate_scratch_read(inst, dst);
2246          send_count++;
2247 	 break;
2248 
2249       case SHADER_OPCODE_GFX7_SCRATCH_READ:
2250 	 generate_scratch_read_gfx7(inst, dst);
2251          send_count++;
2252 	 break;
2253 
2254       case SHADER_OPCODE_SCRATCH_HEADER:
2255          generate_scratch_header(inst, dst);
2256          break;
2257 
2258       case SHADER_OPCODE_MOV_INDIRECT:
2259          generate_mov_indirect(inst, dst, src[0], src[1]);
2260          break;
2261 
2262       case SHADER_OPCODE_MOV_RELOC_IMM:
2263          assert(src[0].file == BRW_IMMEDIATE_VALUE);
2264          brw_MOV_reloc_imm(p, dst, dst.type, src[0].ud);
2265          break;
2266 
2267       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2268          assert(inst->force_writemask_all);
2269 	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
2270          send_count++;
2271 	 break;
2272 
2273       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
2274          assert(inst->force_writemask_all);
2275 	 generate_uniform_pull_constant_load_gfx7(inst, dst, src[0], src[1]);
2276          send_count++;
2277 	 break;
2278 
2279       case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
2280 	 generate_varying_pull_constant_load_gfx4(inst, dst, src[0]);
2281          send_count++;
2282 	 break;
2283 
2284       case FS_OPCODE_REP_FB_WRITE:
2285       case FS_OPCODE_FB_WRITE:
2286 	 generate_fb_write(inst, src[0]);
2287          send_count++;
2288 	 break;
2289 
2290       case FS_OPCODE_FB_READ:
2291          generate_fb_read(inst, dst, src[0]);
2292          send_count++;
2293          break;
2294 
2295       case BRW_OPCODE_HALT:
2296          generate_halt(inst);
2297          break;
2298 
2299       case SHADER_OPCODE_INTERLOCK:
2300       case SHADER_OPCODE_MEMORY_FENCE: {
2301          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2302          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2303 
2304          const enum opcode send_op = inst->opcode == SHADER_OPCODE_INTERLOCK ?
2305             BRW_OPCODE_SENDC : BRW_OPCODE_SEND;
2306 
2307          brw_memory_fence(p, dst, src[0], send_op,
2308                           brw_message_target(inst->sfid),
2309                           inst->desc,
2310                           /* commit_enable */ src[1].ud,
2311                           /* bti */ src[2].ud);
2312          send_count++;
2313          break;
2314       }
2315 
2316       case FS_OPCODE_SCHEDULING_FENCE:
2317          if (inst->sources == 0 && swsb.regdist == 0 &&
2318                                    swsb.mode == TGL_SBID_NULL) {
2319             if (unlikely(debug_flag))
2320                disasm_info->use_tail = true;
2321             break;
2322          }
2323 
2324          if (devinfo->ver >= 12) {
2325             /* Use the available SWSB information to stall.  A single SYNC is
2326              * sufficient since if there were multiple dependencies, the
2327              * scoreboard algorithm already injected other SYNCs before this
2328              * instruction.
2329              */
2330             brw_SYNC(p, TGL_SYNC_NOP);
2331          } else {
2332             for (unsigned i = 0; i < inst->sources; i++) {
2333                /* Emit a MOV to force a stall until the instruction producing the
2334                 * registers finishes.
2335                 */
2336                brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
2337                        retype(src[i], BRW_REGISTER_TYPE_UW));
2338             }
2339 
2340             if (inst->sources > 1)
2341                multiple_instructions_emitted = true;
2342          }
2343 
2344          break;
2345 
2346       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2347          brw_find_live_channel(p, dst, false);
2348          break;
2349       case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL:
2350          brw_find_live_channel(p, dst, true);
2351          break;
2352 
2353       case FS_OPCODE_LOAD_LIVE_CHANNELS: {
2354          assert(devinfo->ver >= 8);
2355          assert(inst->force_writemask_all && inst->group == 0);
2356          assert(inst->dst.file == BAD_FILE);
2357          brw_set_default_exec_size(p, BRW_EXECUTE_1);
2358          brw_MOV(p, retype(brw_flag_subreg(inst->flag_subreg),
2359                            BRW_REGISTER_TYPE_UD),
2360                  retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));
2361          break;
2362       }
2363       case SHADER_OPCODE_BROADCAST:
2364          assert(inst->force_writemask_all);
2365          brw_broadcast(p, dst, src[0], src[1]);
2366          break;
2367 
2368       case SHADER_OPCODE_SHUFFLE:
2369          generate_shuffle(inst, dst, src[0], src[1]);
2370          break;
2371 
2372       case SHADER_OPCODE_SEL_EXEC:
2373          assert(inst->force_writemask_all);
2374          assert(devinfo->has_64bit_float || type_sz(dst.type) <= 4);
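         /* Fill every channel of dst with src[1] while the execution mask is
          * disabled, then overwrite just the enabled channels with src[0],
          * i.e. a per-channel select on the current execution mask.
          */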
2375          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2376          brw_MOV(p, dst, src[1]);
2377          brw_set_default_mask_control(p, BRW_MASK_ENABLE);
2378          brw_set_default_swsb(p, tgl_swsb_null());
2379          brw_MOV(p, dst, src[0]);
2380          break;
2381 
2382       case SHADER_OPCODE_QUAD_SWIZZLE:
2383          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2384          assert(src[1].type == BRW_REGISTER_TYPE_UD);
2385          generate_quad_swizzle(inst, dst, src[0], src[1].ud);
2386          break;
2387 
2388       case SHADER_OPCODE_CLUSTER_BROADCAST: {
2389          assert((devinfo->platform != INTEL_PLATFORM_CHV &&
2390                  !intel_device_info_is_9lp(devinfo) &&
2391                  devinfo->has_64bit_float) || type_sz(src[0].type) <= 4);
2392          assert(!src[0].negate && !src[0].abs);
2393          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2394          assert(src[1].type == BRW_REGISTER_TYPE_UD);
2395          assert(src[2].file == BRW_IMMEDIATE_VALUE);
2396          assert(src[2].type == BRW_REGISTER_TYPE_UD);
2397          const unsigned component = src[1].ud;
2398          const unsigned cluster_size = src[2].ud;
2399          assert(inst->src[0].file != ARF && inst->src[0].file != FIXED_GRF);
2400          const unsigned s = inst->src[0].stride;
2401          unsigned vstride = cluster_size * s;
2402          unsigned width = cluster_size;
2403 
2404          /* The maximum exec_size is 32, but the maximum width is only 16. */
2405          if (inst->exec_size == width) {
2406             vstride = 0;
2407             width = 1;
2408          }
2409 
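         /* For example, with component = 2, cluster_size = 4 and a stride of
          * 1, channels 0-3 read element 2, channels 4-7 read element 6, and
          * so on.
          */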
2410          struct brw_reg strided = stride(suboffset(src[0], component * s),
2411                                          vstride, width, 0);
2412          brw_MOV(p, dst, strided);
2413          break;
2414       }
2415 
2416       case FS_OPCODE_SET_SAMPLE_ID:
2417          generate_set_sample_id(inst, dst, src[0], src[1]);
2418          break;
2419 
2420       case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2421           generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2422           break;
2423 
2424       case SHADER_OPCODE_HALT_TARGET:
2425          /* This is the place where the final HALT needs to be inserted if
2426           * we've emitted any discards.  If not, this will emit no code.
2427           */
2428          if (!patch_halt_jumps()) {
2429             if (unlikely(debug_flag)) {
2430                disasm_info->use_tail = true;
2431             }
2432          }
2433          break;
2434 
2435       case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2436          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2437                                            GFX7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2438          send_count++;
2439          break;
2440 
2441       case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2442          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2443                                            GFX7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2444          send_count++;
2445          break;
2446 
2447       case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2448          generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2449                                            GFX7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2450          send_count++;
2451          break;
2452 
2453       case CS_OPCODE_CS_TERMINATE:
2454          generate_cs_terminate(inst, src[0]);
2455          send_count++;
2456          break;
2457 
2458       case SHADER_OPCODE_BARRIER:
2459 	 generate_barrier(inst, src[0]);
2460          send_count++;
2461 	 break;
2462 
2463       case BRW_OPCODE_DIM:
2464          assert(devinfo->platform == INTEL_PLATFORM_HSW);
2465          assert(src[0].type == BRW_REGISTER_TYPE_DF);
2466          assert(dst.type == BRW_REGISTER_TYPE_DF);
2467          brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
2468          break;
2469 
2470       case SHADER_OPCODE_RND_MODE: {
2471          assert(src[0].file == BRW_IMMEDIATE_VALUE);
2472          /*
2473           * Changes the floating point rounding mode by updating the control
2474           * register field defined at cr0.0[5-6] bits.
2475           */
2476          enum brw_rnd_mode mode =
2477             (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT);
2478          brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK);
2479       }
2480          break;
2481 
2482       case SHADER_OPCODE_FLOAT_CONTROL_MODE:
2483          assert(src[0].file == BRW_IMMEDIATE_VALUE);
2484          assert(src[1].file == BRW_IMMEDIATE_VALUE);
2485          brw_float_controls_mode(p, src[0].d, src[1].d);
2486          break;
2487 
2488       case SHADER_OPCODE_READ_SR_REG:
2489          if (devinfo->ver >= 12) {
2490             /* There is a SWSB restriction that requires that any time sr0 is
2491              * accessed both the instruction doing the access and the next one
2492              * have SWSB set to RegDist(1).
2493              */
2494             if (brw_get_default_swsb(p).mode != TGL_SBID_NULL)
2495                brw_SYNC(p, TGL_SYNC_NOP);
2496             assert(src[0].file == BRW_IMMEDIATE_VALUE);
2497             brw_set_default_swsb(p, tgl_swsb_regdist(1));
2498             brw_MOV(p, dst, brw_sr0_reg(src[0].ud));
2499             brw_set_default_swsb(p, tgl_swsb_regdist(1));
2500             brw_AND(p, dst, dst, brw_imm_ud(0xffffffff));
2501          } else {
2502             brw_MOV(p, dst, brw_sr0_reg(src[0].ud));
2503          }
2504          break;
2505 
2506       default:
2507          unreachable("Unsupported opcode");
2508 
2509       case SHADER_OPCODE_LOAD_PAYLOAD:
2510          unreachable("Should be lowered by lower_load_payload()");
2511       }
2512 
2513       if (multiple_instructions_emitted)
2514          continue;
2515 
2516       if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2517          assert(p->next_insn_offset == last_insn_offset + 16 ||
2518                 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2519                  "emitting more than 1 instruction");
2520 
2521          brw_inst *last = &p->store[last_insn_offset / 16];
2522 
2523          if (inst->conditional_mod)
2524             brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2525          if (devinfo->ver < 12) {
2526             brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2527             brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2528          }
2529       }
2530    }
2531 
2532    brw_set_uip_jip(p, start_offset);
2533 
2534    /* end of program sentinel */
2535    disasm_new_inst_group(disasm_info, p->next_insn_offset);
2536 
2537    /* `send_count` explicitly does not include spills or fills, as we'd
2538     * like to use it as a metric for intentional memory access or other
2539     * shared function use.  Otherwise, subtle changes to scheduling or
2540     * register allocation could cause it to fluctuate wildly - and that
2541     * effect is already counted in spill/fill counts.
2542     */
2543    send_count -= shader_stats.spill_count;
2544    send_count -= shader_stats.fill_count;
2545 
2546 #ifndef NDEBUG
2547    bool validated =
2548 #else
2549    if (unlikely(debug_flag))
2550 #endif
2551       brw_validate_instructions(&compiler->isa, p->store,
2552                                 start_offset,
2553                                 p->next_insn_offset,
2554                                 disasm_info);
2555 
2556    int before_size = p->next_insn_offset - start_offset;
2557    brw_compact_instructions(p, start_offset, disasm_info);
2558    int after_size = p->next_insn_offset - start_offset;
2559 
2560    if (unlikely(debug_flag)) {
2561       unsigned char sha1[21];
2562       char sha1buf[41];
2563 
2564       _mesa_sha1_compute(p->store + start_offset / sizeof(brw_inst),
2565                          after_size, sha1);
2566       _mesa_sha1_format(sha1buf, sha1);
2567 
2568       fprintf(stderr, "Native code for %s (sha1 %s)\n"
2569               "SIMD%d shader: %d instructions. %d loops. %u cycles. "
2570               "%d:%d spills:fills, %u sends, "
2571               "scheduled with mode %s. "
2572               "Promoted %u constants. "
2573               "Compacted %d to %d bytes (%.0f%%)\n",
2574               shader_name, sha1buf,
2575               dispatch_width, before_size / 16,
2576               loop_count, perf.latency,
2577               shader_stats.spill_count,
2578               shader_stats.fill_count,
2579               send_count,
2580               shader_stats.scheduler_mode,
2581               shader_stats.promoted_constants,
2582               before_size, after_size,
2583               100.0f * (before_size - after_size) / before_size);
2584 
2585       /* overriding the shader makes disasm_info invalid */
2586       if (!brw_try_override_assembly(p, start_offset, sha1buf)) {
2587          dump_assembly(p->store, start_offset, p->next_insn_offset,
2588                        disasm_info, perf.block_latency);
2589       } else {
2590          fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf);
2591       }
2592    }
2593    ralloc_free(disasm_info);
2594 #ifndef NDEBUG
2595    if (!validated && !debug_flag) {
2596       fprintf(stderr,
2597             "Validation failed. Rerun with INTEL_DEBUG=shaders to get more information.\n");
2598    }
2599 #endif
2600    assert(validated);
2601 
2602    brw_shader_debug_log(compiler, log_data,
2603                         "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
2604                         "%d:%d spills:fills, %u sends, "
2605                         "scheduled with mode %s, "
2606                         "Promoted %u constants, "
2607                         "compacted %d to %d bytes.\n",
2608                         _mesa_shader_stage_to_abbrev(stage),
2609                         dispatch_width, before_size / 16 - nop_count,
2610                         loop_count, perf.latency,
2611                         shader_stats.spill_count,
2612                         shader_stats.fill_count,
2613                         send_count,
2614                         shader_stats.scheduler_mode,
2615                         shader_stats.promoted_constants,
2616                         before_size, after_size);
2617    if (stats) {
2618       stats->dispatch_width = dispatch_width;
2619       stats->instructions = before_size / 16 - nop_count;
2620       stats->sends = send_count;
2621       stats->loops = loop_count;
2622       stats->cycles = perf.latency;
2623       stats->spills = shader_stats.spill_count;
2624       stats->fills = shader_stats.fill_count;
2625    }
2626 
2627    return start_offset;
2628 }
2629 
2630 void
2631 fs_generator::add_const_data(void *data, unsigned size)
2632 {
2633    assert(prog_data->const_data_size == 0);
2634    if (size > 0) {
2635       prog_data->const_data_size = size;
2636       prog_data->const_data_offset = brw_append_data(p, data, size, 32);
2637    }
2638 }
2639 
2640 void
2641 fs_generator::add_resume_sbt(unsigned num_resume_shaders, uint64_t *sbt)
2642 {
2643    assert(brw_shader_stage_is_bindless(stage));
2644    struct brw_bs_prog_data *bs_prog_data = brw_bs_prog_data(prog_data);
2645    if (num_resume_shaders > 0) {
2646       bs_prog_data->resume_sbt_offset =
2647          brw_append_data(p, sbt, num_resume_shaders * sizeof(uint64_t), 32);
2648       for (unsigned i = 0; i < num_resume_shaders; i++) {
2649          size_t offset = bs_prog_data->resume_sbt_offset + i * sizeof(*sbt);
2650          assert(offset <= UINT32_MAX);
2651          brw_add_reloc(p, BRW_SHADER_RELOC_SHADER_START_OFFSET,
2652                        BRW_SHADER_RELOC_TYPE_U32,
2653                        (uint32_t)offset, (uint32_t)sbt[i]);
2654       }
2655    }
2656 }
2657 
2658 const unsigned *
2659 fs_generator::get_assembly()
2660 {
2661    prog_data->relocs = brw_get_shader_relocs(p, &prog_data->num_relocs);
2662 
2663    return brw_get_program(p, &prog_data->program_size);
2664 }
2665