1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file
25  *
26  * This file drives the GLSL IR -> LIR translation, contains the
27  * optimizations on the LIR, and drives the generation of native code
28  * from the LIR.
29  */
30 
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_fs_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_private.h"
38 #include "intel_nir.h"
39 #include "shader_enums.h"
40 #include "dev/intel_debug.h"
41 #include "dev/intel_wa.h"
42 #include "compiler/glsl_types.h"
43 #include "compiler/nir/nir_builder.h"
44 #include "util/u_math.h"
45 
46 using namespace brw;
47 
48 static void
49 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);
50 
51 void
52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
53               const brw_reg *src, unsigned sources)
54 {
55    memset((void*)this, 0, sizeof(*this));
56 
57    initialize_sources(this, src, sources);
58 
59    for (unsigned i = 0; i < sources; i++)
60       this->src[i] = src[i];
61 
62    this->opcode = opcode;
63    this->dst = dst;
64    this->exec_size = exec_size;
65 
66    assert(dst.file != IMM && dst.file != UNIFORM);
67 
68    assert(this->exec_size != 0);
69 
70    this->conditional_mod = BRW_CONDITIONAL_NONE;
71 
72    /* This will be the case for almost all instructions. */
73    switch (dst.file) {
74    case VGRF:
75    case ADDRESS:
76    case ARF:
77    case FIXED_GRF:
78    case ATTR:
79       this->size_written = dst.component_size(exec_size);
80       break;
81    case BAD_FILE:
82       this->size_written = 0;
83       break;
84    case IMM:
85    case UNIFORM:
86       unreachable("Invalid destination register file");
87    }
88 
89    this->writes_accumulator = false;
90 }
91 
92 fs_inst::fs_inst()
93 {
94    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
95 }
96 
97 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
98 {
99    init(opcode, exec_size, reg_undef, NULL, 0);
100 }
101 
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
103 {
104    init(opcode, exec_size, dst, NULL, 0);
105 }
106 
107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
108                  const brw_reg &src0)
109 {
110    const brw_reg src[1] = { src0 };
111    init(opcode, exec_size, dst, src, 1);
112 }
113 
114 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
115                  const brw_reg &src0, const brw_reg &src1)
116 {
117    const brw_reg src[2] = { src0, src1 };
118    init(opcode, exec_size, dst, src, 2);
119 }
120 
121 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
122                  const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
123 {
124    const brw_reg src[3] = { src0, src1, src2 };
125    init(opcode, exec_size, dst, src, 3);
126 }
127 
128 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
129                  const brw_reg src[], unsigned sources)
130 {
131    init(opcode, exec_width, dst, src, sources);
132 }
133 
134 fs_inst::fs_inst(const fs_inst &that)
135 {
136    memcpy((void*)this, &that, sizeof(that));
137    initialize_sources(this, that.src, that.sources);
138 }
139 
140 fs_inst::~fs_inst()
141 {
142    if (this->src != this->builtin_src)
143       delete[] this->src;
144 }
145 
146 static void
147 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
148 {
149    if (num_sources > ARRAY_SIZE(inst->builtin_src))
150       inst->src = new brw_reg[num_sources];
151    else
152       inst->src = inst->builtin_src;
153 
154    for (unsigned i = 0; i < num_sources; i++)
155       inst->src[i] = src[i];
156 
157    inst->sources = num_sources;
158 }
159 
160 void
161 fs_inst::resize_sources(uint8_t num_sources)
162 {
163    if (this->sources == num_sources)
164       return;
165 
166    brw_reg *old_src = this->src;
167    brw_reg *new_src;
168 
169    const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);
170 
171    if (old_src == this->builtin_src) {
172       if (num_sources > builtin_size) {
173          new_src = new brw_reg[num_sources];
174          for (unsigned i = 0; i < this->sources; i++)
175             new_src[i] = old_src[i];
176 
177       } else {
178          new_src = old_src;
179       }
180    } else {
181       if (num_sources <= builtin_size) {
182          new_src = this->builtin_src;
183          assert(this->sources > num_sources);
184          for (unsigned i = 0; i < num_sources; i++)
185             new_src[i] = old_src[i];
186 
187       } else if (num_sources < this->sources) {
188          new_src = old_src;
189 
190       } else {
191          new_src = new brw_reg[num_sources];
192          for (unsigned i = 0; i < this->sources; i++)
193             new_src[i] = old_src[i];
194       }
195 
196       if (old_src != new_src)
197          delete[] old_src;
198    }
199 
200    this->sources = num_sources;
201    this->src = new_src;
202 }
203 
204 bool
205 fs_inst::is_send_from_grf() const
206 {
207    switch (opcode) {
208    case SHADER_OPCODE_SEND:
209    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
210    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
211    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
212    case SHADER_OPCODE_INTERLOCK:
213    case SHADER_OPCODE_MEMORY_FENCE:
214    case SHADER_OPCODE_BARRIER:
215       return true;
216    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
217       return src[1].file == VGRF;
218    default:
219       return false;
220    }
221 }
222 
223 bool
224 fs_inst::is_control_source(unsigned arg) const
225 {
226    switch (opcode) {
227    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
228       return arg == 0;
229 
230    case SHADER_OPCODE_BROADCAST:
231    case SHADER_OPCODE_SHUFFLE:
232    case SHADER_OPCODE_QUAD_SWIZZLE:
233    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
234    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
235    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
236       return arg == 1;
237 
238    case SHADER_OPCODE_MOV_INDIRECT:
239    case SHADER_OPCODE_CLUSTER_BROADCAST:
240       return arg == 1 || arg == 2;
241 
242    case SHADER_OPCODE_SEND:
243       return arg == 0 || arg == 1;
244 
245    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
246    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
247    case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
248       return arg != MEMORY_LOGICAL_BINDING &&
249              arg != MEMORY_LOGICAL_ADDRESS &&
250              arg != MEMORY_LOGICAL_DATA0 &&
251              arg != MEMORY_LOGICAL_DATA1;
252 
253    case SHADER_OPCODE_QUAD_SWAP:
254    case SHADER_OPCODE_INCLUSIVE_SCAN:
255    case SHADER_OPCODE_EXCLUSIVE_SCAN:
256    case SHADER_OPCODE_VOTE_ANY:
257    case SHADER_OPCODE_VOTE_ALL:
258    case SHADER_OPCODE_REDUCE:
259       return arg != 0;
260 
261    default:
262       return false;
263    }
264 }
265 
266 bool
267 fs_inst::is_payload(unsigned arg) const
268 {
269    switch (opcode) {
270    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
271    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
272    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
273    case SHADER_OPCODE_INTERLOCK:
274    case SHADER_OPCODE_MEMORY_FENCE:
275    case SHADER_OPCODE_BARRIER:
276       return arg == 0;
277 
278    case SHADER_OPCODE_SEND:
279       return arg == 2 || arg == 3;
280 
281    default:
282       return false;
283    }
284 }
285 
286 bool
287 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
288 {
289    if (is_send_from_grf())
290       return false;
291 
292    /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
293     *
294     * "When multiplying a DW and any lower precision integer, source modifier
295     *  is not supported."
296     */
297    if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
298                               opcode == BRW_OPCODE_MAD)) {
299       const brw_reg_type exec_type = get_exec_type(this);
300       const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
301          MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
302          MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));
303 
304       if (brw_type_is_int(exec_type) &&
305           brw_type_size_bytes(exec_type) >= 4 &&
306           brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
307          return false;
308    }
309 
310    switch (opcode) {
311    case BRW_OPCODE_ADDC:
312    case BRW_OPCODE_BFE:
313    case BRW_OPCODE_BFI1:
314    case BRW_OPCODE_BFI2:
315    case BRW_OPCODE_BFREV:
316    case BRW_OPCODE_CBIT:
317    case BRW_OPCODE_FBH:
318    case BRW_OPCODE_FBL:
319    case BRW_OPCODE_ROL:
320    case BRW_OPCODE_ROR:
321    case BRW_OPCODE_SUBB:
322    case BRW_OPCODE_DP4A:
323    case BRW_OPCODE_DPAS:
324    case SHADER_OPCODE_BROADCAST:
325    case SHADER_OPCODE_CLUSTER_BROADCAST:
326    case SHADER_OPCODE_MOV_INDIRECT:
327    case SHADER_OPCODE_SHUFFLE:
328    case SHADER_OPCODE_INT_QUOTIENT:
329    case SHADER_OPCODE_INT_REMAINDER:
330    case SHADER_OPCODE_REDUCE:
331    case SHADER_OPCODE_INCLUSIVE_SCAN:
332    case SHADER_OPCODE_EXCLUSIVE_SCAN:
333    case SHADER_OPCODE_VOTE_ANY:
334    case SHADER_OPCODE_VOTE_ALL:
335    case SHADER_OPCODE_VOTE_EQUAL:
336    case SHADER_OPCODE_BALLOT:
337    case SHADER_OPCODE_QUAD_SWAP:
338    case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
339    case SHADER_OPCODE_READ_FROM_CHANNEL:
340       return false;
341    default:
342       return true;
343    }
344 }
345 
346 bool
347 fs_inst::can_do_cmod() const
348 {
349    switch (opcode) {
350    case BRW_OPCODE_ADD:
351    case BRW_OPCODE_ADD3:
352    case BRW_OPCODE_ADDC:
353    case BRW_OPCODE_AND:
354    case BRW_OPCODE_ASR:
355    case BRW_OPCODE_AVG:
356    case BRW_OPCODE_CMP:
357    case BRW_OPCODE_CMPN:
358    case BRW_OPCODE_DP2:
359    case BRW_OPCODE_DP3:
360    case BRW_OPCODE_DP4:
361    case BRW_OPCODE_DPH:
362    case BRW_OPCODE_FRC:
363    case BRW_OPCODE_LINE:
364    case BRW_OPCODE_LRP:
365    case BRW_OPCODE_LZD:
366    case BRW_OPCODE_MAC:
367    case BRW_OPCODE_MACH:
368    case BRW_OPCODE_MAD:
369    case BRW_OPCODE_MOV:
370    case BRW_OPCODE_MUL:
371    case BRW_OPCODE_NOT:
372    case BRW_OPCODE_OR:
373    case BRW_OPCODE_PLN:
374    case BRW_OPCODE_RNDD:
375    case BRW_OPCODE_RNDE:
376    case BRW_OPCODE_RNDU:
377    case BRW_OPCODE_RNDZ:
378    case BRW_OPCODE_SHL:
379    case BRW_OPCODE_SHR:
380    case BRW_OPCODE_SUBB:
381    case BRW_OPCODE_XOR:
382       break;
383    default:
384       return false;
385    }
386 
387    /* The accumulator result appears to get used for the conditional modifier
388     * generation.  When negating a UD value, there is a 33rd bit generated for
389     * the sign in the accumulator value, so now you can't check, for example,
390     * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
391     */
392    for (unsigned i = 0; i < sources; i++) {
393       if (brw_type_is_uint(src[i].type) && src[i].negate)
394          return false;
395    }
396 
397    if (dst.file == ARF && dst.nr == BRW_ARF_SCALAR && src[0].file == IMM)
398       return false;
399 
400    return true;
401 }
402 
403 bool
404 fs_inst::can_change_types() const
405 {
406    return dst.type == src[0].type &&
407           !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
408           (opcode == BRW_OPCODE_MOV ||
409            (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
410            (opcode == BRW_OPCODE_SEL &&
411             dst.type == src[1].type &&
412             predicate != BRW_PREDICATE_NONE &&
413             !src[1].abs && !src[1].negate && src[1].file != ATTR));
414 }
415 
416 void
417 fs_visitor::vfail(const char *format, va_list va)
418 {
419    char *msg;
420 
421    if (failed)
422       return;
423 
424    failed = true;
425 
426    msg = ralloc_vasprintf(mem_ctx, format, va);
427    msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
428          dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
429 
430    this->fail_msg = msg;
431 
432    if (unlikely(debug_enabled)) {
433       fprintf(stderr, "%s",  msg);
434    }
435 }
436 
437 void
438 fs_visitor::fail(const char *format, ...)
439 {
440    va_list va;
441 
442    va_start(va, format);
443    vfail(format, va);
444    va_end(va);
445 }
446 
447 /**
448  * Mark this program as impossible to compile with dispatch width greater
449  * than n.
450  *
451  * During the SIMD8 compile (which happens first), we can detect and flag
452  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
453  * SIMD16+ compile altogether.
454  *
455  * During a compile of dispatch width greater than n (if one happens anyway),
456  * this just calls fail().
457  */
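/* A hypothetical use, for illustration: calling limit_dispatch_width(16, msg)
 * from a SIMD8 compile records max_dispatch_width = 16 so the SIMD32 compile
 * is skipped, whereas the same call reached during a SIMD32 compile simply
 * fails that compile via fail().
 */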
458 void
459 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
460 {
461    if (dispatch_width > n) {
462       fail("%s", msg);
463    } else {
464       max_dispatch_width = MIN2(max_dispatch_width, n);
465       brw_shader_perf_log(compiler, log_data,
466                           "Shader dispatch width limited to SIMD%d: %s\n",
467                           n, msg);
468    }
469 }
470 
471 /**
472  * Returns true if the instruction has a flag that means it won't
473  * update an entire destination register.
474  *
475  * For example, dead code elimination and live variable analysis want to know
476  * when a write to a variable screens off any preceding values that were in
477  * it.
478  */
479 bool
480 fs_inst::is_partial_write() const
481 {
482    if (this->predicate && !this->predicate_trivial &&
483        this->opcode != BRW_OPCODE_SEL)
484       return true;
485 
486    if (!this->dst.is_contiguous())
487       return true;
488 
489    if (this->dst.offset % REG_SIZE != 0)
490       return true;
491 
492    return this->size_written % REG_SIZE != 0;
493 }
494 
495 unsigned
496 fs_inst::components_read(unsigned i) const
497 {
498    /* Return zero if the source is not present. */
499    if (src[i].file == BAD_FILE)
500       return 0;
501 
502    switch (opcode) {
503    case BRW_OPCODE_PLN:
504       return i == 0 ? 1 : 2;
505 
506    case FS_OPCODE_PIXEL_X:
507    case FS_OPCODE_PIXEL_Y:
508       assert(i < 2);
509       if (i == 0)
510          return 2;
511       else
512          return 1;
513 
514    case FS_OPCODE_FB_WRITE_LOGICAL:
515       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
516       /* First/second FB write color. */
517       if (i < 2)
518          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
519       else
520          return 1;
521 
522    case SHADER_OPCODE_TEX_LOGICAL:
523    case SHADER_OPCODE_TXD_LOGICAL:
524    case SHADER_OPCODE_TXF_LOGICAL:
525    case SHADER_OPCODE_TXL_LOGICAL:
526    case SHADER_OPCODE_TXS_LOGICAL:
527    case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
528    case FS_OPCODE_TXB_LOGICAL:
529    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
530    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
531    case SHADER_OPCODE_TXF_MCS_LOGICAL:
532    case SHADER_OPCODE_LOD_LOGICAL:
533    case SHADER_OPCODE_TG4_LOGICAL:
534    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
535    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
536    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
537    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
538    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
539    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
540    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
541       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
542              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
543              src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
544       /* Texture coordinates. */
545       if (i == TEX_LOGICAL_SRC_COORDINATE)
546          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
547       /* Texture derivatives. */
548       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
549                opcode == SHADER_OPCODE_TXD_LOGICAL)
550          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
551       /* Texture offset. */
552       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
553          return 2;
554       /* MCS */
555       else if (i == TEX_LOGICAL_SRC_MCS) {
556          if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
557             return 2;
558          else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
559             return 4;
560          else
561             return 1;
562       } else
563          return 1;
564 
565    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
566       if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
567          return 0;
568       /* fallthrough */
569    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
570       if (i == MEMORY_LOGICAL_DATA1)
571          return 0;
572       /* fallthrough */
573    case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
574       if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
575          return src[MEMORY_LOGICAL_COMPONENTS].ud;
576       else if (i == MEMORY_LOGICAL_ADDRESS)
577          return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
578       else
579          return 1;
580 
581    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
582       return (i == 0 ? 2 : 1);
583 
584    case SHADER_OPCODE_URB_WRITE_LOGICAL:
585       assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
586 
587       if (i == URB_LOGICAL_SRC_DATA)
588          return src[URB_LOGICAL_SRC_COMPONENTS].ud;
589       else
590          return 1;
591 
592    case BRW_OPCODE_DPAS:
593       unreachable("Do not use components_read() for DPAS.");
594 
595    default:
596       return 1;
597    }
598 }
599 
600 unsigned
601 fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
602 {
603    switch (opcode) {
604    case SHADER_OPCODE_SEND:
605       if (arg == 2) {
606          return mlen * REG_SIZE;
607       } else if (arg == 3) {
608          return ex_mlen * REG_SIZE;
609       }
610       break;
611 
612    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
613    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
614       if (arg == 0)
615          return mlen * REG_SIZE;
616       break;
617 
618    case BRW_OPCODE_PLN:
619       if (arg == 0)
620          return 16;
621       break;
622 
623    case SHADER_OPCODE_LOAD_PAYLOAD:
624       if (arg < this->header_size)
625          return retype(src[arg], BRW_TYPE_UD).component_size(8);
626       break;
627 
628    case SHADER_OPCODE_BARRIER:
629       return REG_SIZE;
630 
631    case SHADER_OPCODE_MOV_INDIRECT:
632       if (arg == 0) {
633          assert(src[2].file == IMM);
634          return src[2].ud;
635       }
636       break;
637 
638    case BRW_OPCODE_DPAS: {
639       /* This is a little bit sketchy. There's no way to get at devinfo from
640        * here, so the regular reg_unit() cannot be used. However, on
641        * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
642        * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
643        * coincidence, so this isn't so bad.
644        */
645       const unsigned reg_unit = this->exec_size / 8;
646 
647       switch (arg) {
648       case 0:
649          if (src[0].type == BRW_TYPE_HF) {
650             return rcount * reg_unit * REG_SIZE / 2;
651          } else {
652             return rcount * reg_unit * REG_SIZE;
653          }
654       case 1:
655          return sdepth * reg_unit * REG_SIZE;
656       case 2:
657          /* This is simpler than the formula described in the Bspec, but it
658           * covers all of the cases that we support. Each inner sdepth
659           * iteration of the DPAS consumes a single dword for int8, uint8, or
660           * float16 types. These are the one source types currently
661           * supportable through Vulkan. This is independent of reg_unit.
662           */
663          return rcount * sdepth * 4;
664       default:
665          unreachable("Invalid source number.");
666       }
667       break;
668    }
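   /* A worked example with hypothetical parameters: for a SIMD8 DPAS
    * (reg_unit == 1) with rcount == 8 and sdepth == 8, the formulas above
    * give src0 = 8 * 1 * 32 / 2 = 128 bytes when src0 is HF (256 otherwise),
    * src1 = 8 * 1 * 32 = 256 bytes, and src2 = 8 * 8 * 4 = 256 bytes.
    */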
669 
670    default:
671       break;
672    }
673 
674    switch (src[arg].file) {
675    case UNIFORM:
676    case IMM:
677       return components_read(arg) * brw_type_size_bytes(src[arg].type);
678    case BAD_FILE:
679    case ADDRESS:
680    case ARF:
681    case FIXED_GRF:
682    case VGRF:
683    case ATTR:
684       /* Regardless of exec_size, values marked as scalar are SIMD8. */
685       return components_read(arg) *
686              src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
687    }
688    return 0;
689 }
690 
691 namespace {
692    unsigned
693    predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
694    {
695       if (devinfo->ver >= 20) {
696          return 1;
697       } else {
698          switch (predicate) {
699          case BRW_PREDICATE_NONE:            return 1;
700          case BRW_PREDICATE_NORMAL:          return 1;
701          case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
702          case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
703          case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
704          case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
705          case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
706          case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
707          case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
708          case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
709          case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
710          case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
711          default: unreachable("Unsupported predicate");
712          }
713       }
714    }
715 }
716 
717 unsigned
718 fs_inst::flags_read(const intel_device_info *devinfo) const
719 {
720    if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
721                              predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
722       /* The vertical predication modes combine corresponding bits from
723        * f0.0 and f1.0 on Gfx7+.
724        */
725       const unsigned shift = 4;
726       return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
727    } else if (predicate) {
728       return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
729    } else {
730       unsigned mask = 0;
731       for (int i = 0; i < sources; i++) {
732          mask |= brw_fs_flag_mask(src[i], size_read(devinfo, i));
733       }
734       return mask;
735    }
736 }
737 
738 unsigned
739 fs_inst::flags_written(const intel_device_info *devinfo) const
740 {
741    if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
742                            opcode != BRW_OPCODE_CSEL &&
743                            opcode != BRW_OPCODE_IF &&
744                            opcode != BRW_OPCODE_WHILE)) {
745       return brw_fs_flag_mask(this, 1);
746    } else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
747       return brw_fs_flag_mask(this, 32);
748    } else {
749       return brw_fs_flag_mask(dst, size_written);
750    }
751 }
752 
753 bool
754 fs_inst::has_sampler_residency() const
755 {
756    switch (opcode) {
757    case SHADER_OPCODE_TEX_LOGICAL:
758    case FS_OPCODE_TXB_LOGICAL:
759    case SHADER_OPCODE_TXL_LOGICAL:
760    case SHADER_OPCODE_TXD_LOGICAL:
761    case SHADER_OPCODE_TXF_LOGICAL:
762    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
763    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
764    case SHADER_OPCODE_TXS_LOGICAL:
765    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
766    case SHADER_OPCODE_TG4_LOGICAL:
767    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
768    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
769    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
770    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
771    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
772       assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
773       return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
774    default:
775       return false;
776    }
777 }
778 
779 /* \sa inst_is_raw_move in brw_eu_validate. */
780 bool
781 fs_inst::is_raw_move() const
782 {
783    if (opcode != BRW_OPCODE_MOV)
784       return false;
785 
786    if (src[0].file == IMM) {
787       if (brw_type_is_vector_imm(src[0].type))
788          return false;
789    } else if (src[0].negate || src[0].abs) {
790       return false;
791    }
792 
793    if (saturate)
794       return false;
795 
796    return src[0].type == dst.type ||
797           (brw_type_is_int(src[0].type) &&
798            brw_type_is_int(dst.type) &&
799            brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
800 }
801 
802 bool
803 fs_inst::uses_address_register_implicitly() const
804 {
805    switch (opcode) {
806    case SHADER_OPCODE_BROADCAST:
807    case SHADER_OPCODE_SHUFFLE:
808    case SHADER_OPCODE_MOV_INDIRECT:
809       return true;
810    default:
811       return false;
812    }
813 }
814 
815 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
816  * This brings in those uniform definitions
817  */
818 void
819 fs_visitor::import_uniforms(fs_visitor *v)
820 {
821    this->uniforms = v->uniforms;
822 }
823 
824 enum intel_barycentric_mode
825 brw_barycentric_mode(const struct brw_wm_prog_key *key,
826                      nir_intrinsic_instr *intr)
827 {
828    const glsl_interp_mode mode =
829       (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
830 
831    /* Barycentric modes don't make sense for flat inputs. */
832    assert(mode != INTERP_MODE_FLAT);
833 
834    unsigned bary;
835    switch (intr->intrinsic) {
836    case nir_intrinsic_load_barycentric_pixel:
837    case nir_intrinsic_load_barycentric_at_offset:
838       /* When per sample interpolation is dynamic, assume sample
839        * interpolation. We'll dynamically remap things so that the FS thread
840        * payload is not affected.
841        */
842       bary = key->persample_interp == INTEL_SOMETIMES ?
843              INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
844              INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
845       break;
846    case nir_intrinsic_load_barycentric_centroid:
847       bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
848       break;
849    case nir_intrinsic_load_barycentric_sample:
850    case nir_intrinsic_load_barycentric_at_sample:
851       bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
852       break;
853    default:
854       unreachable("invalid intrinsic");
855    }
856 
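   /* The "+ 3" below relies on intel_barycentric_mode keeping the three
    * nonperspective modes immediately after their perspective counterparts,
    * in the same pixel/centroid/sample order (an assumption about the enum
    * layout rather than something established in this file).
    */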
857    if (mode == INTERP_MODE_NOPERSPECTIVE)
858       bary += 3;
859 
860    return (enum intel_barycentric_mode) bary;
861 }
862 
863 /**
864  * Walk backwards from the end of the program looking for a URB write that
865  * isn't in control flow, and mark it with EOT.
866  *
867  * Return true if successful or false if a separate EOT write is needed.
868  */
869 bool
870 fs_visitor::mark_last_urb_write_with_eot()
871 {
872    foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
873       if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
874          prev->eot = true;
875 
876          /* Delete now dead instructions. */
877          foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
878             if (dead == prev)
879                break;
880             dead->remove();
881          }
882          return true;
883       } else if (prev->is_control_flow() || prev->has_side_effects()) {
884          break;
885       }
886    }
887 
888    return false;
889 }
890 
891 static unsigned
892 round_components_to_whole_registers(const intel_device_info *devinfo,
893                                     unsigned c)
894 {
895    return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
896 }
897 
898 void
899 fs_visitor::assign_curb_setup()
900 {
901    unsigned uniform_push_length =
902       round_components_to_whole_registers(devinfo, prog_data->nr_params);
903 
904    unsigned ubo_push_length = 0;
905    unsigned ubo_push_start[4];
906    for (int i = 0; i < 4; i++) {
907       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
908       ubo_push_length += prog_data->ubo_ranges[i].length;
909 
910       assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
911       assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
912    }
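   /* For example (hypothetical sizes, assuming reg_unit(devinfo) == 1): with
    * uniform_push_length == 2 and ubo_ranges[] lengths {3, 1, 0, 0}, the loop
    * above produces ubo_push_start[] == {16, 40, 48, 48} (in 32-bit units)
    * and ubo_push_length == 4, so curb_read_length below becomes 6.
    */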
913 
914    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
915    if (stage == MESA_SHADER_FRAGMENT &&
916        ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
917       prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
918 
919    uint64_t used = 0;
920    bool is_compute = gl_shader_stage_is_compute(stage);
921 
922    if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
923       assert(devinfo->has_lsc);
924       fs_builder ubld = fs_builder(this, 1).exec_all().at(
925          cfg->first_block(), cfg->first_block()->start());
926 
927       /* The base offset for our push data is passed in as R0.0[31:6]. We have
928        * to mask off the bottom 6 bits.
929        */
930       brw_reg base_addr =
931          ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
932                   brw_imm_ud(INTEL_MASK(31, 6)));
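      /* INTEL_MASK(31, 6) is 0xffffffc0, so a hypothetical R0.0 value of
       * 0x00402345 yields a 64-byte-aligned base address of 0x00402340.
       */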
933 
934       /* On Gfx12-HP we load constants at the start of the program using A32
935        * stateless messages.
936        */
937       for (unsigned i = 0; i < uniform_push_length;) {
938          /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
939          unsigned num_regs = MIN2(uniform_push_length - i, 8);
940          assert(num_regs > 0);
941          num_regs = 1 << util_logbase2(num_regs);
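         /* util_logbase2() rounds down, so num_regs becomes the largest power
          * of two not exceeding the remaining length, e.g. a hypothetical
          * uniform_push_length of 11 is loaded as blocks of 8, 2 and 1 GRFs.
          */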
942 
943          /* This pass occurs after all of the optimization passes, so don't
944           * emit an 'ADD addr, base_addr, 0' instruction.
945           */
946          brw_reg addr = i == 0 ? base_addr :
947             ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
948 
949          brw_reg srcs[4] = {
950             brw_imm_ud(0), /* desc */
951             brw_imm_ud(0), /* ex_desc */
952             addr,          /* payload */
953             brw_reg(),      /* payload2 */
954          };
955 
956          brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
957                               BRW_TYPE_UD);
958          fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
959 
960          send->sfid = GFX12_SFID_UGM;
961          uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
962                                       LSC_ADDR_SURFTYPE_FLAT,
963                                       LSC_ADDR_SIZE_A32,
964                                       LSC_DATA_SIZE_D32,
965                                       num_regs * 8 /* num_channels */,
966                                       true /* transpose */,
967                                       LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
968          send->header_size = 0;
969          send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
970          send->size_written =
971             lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
972          send->send_is_volatile = true;
973 
974          send->src[0] = brw_imm_ud(desc |
975                                    brw_message_desc(devinfo,
976                                                     send->mlen,
977                                                     send->size_written / REG_SIZE,
978                                                     send->header_size));
979 
980          i += num_regs;
981       }
982 
983       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
984    }
985 
986    /* Map the offsets in the UNIFORM file to fixed HW regs. */
987    foreach_block_and_inst(block, fs_inst, inst, cfg) {
988       for (unsigned int i = 0; i < inst->sources; i++) {
989 	 if (inst->src[i].file == UNIFORM) {
990             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
991             int constant_nr;
992             if (inst->src[i].nr >= UBO_START) {
993                /* constant_nr is in 32-bit units, the rest are in bytes */
994                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
995                              inst->src[i].offset / 4;
996             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
997                constant_nr = uniform_nr;
998             } else {
999                /* Section 5.11 of the OpenGL 4.1 spec says:
1000                 * "Out-of-bounds reads return undefined values, which include
1001                 *  values from other variables of the active program or zero."
1002                 * Just return the first push constant.
1003                 */
1004                constant_nr = 0;
1005             }
1006 
1007             assert(constant_nr / 8 < 64);
1008             used |= BITFIELD64_BIT(constant_nr / 8);
1009 
1010 	    struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
1011 						  constant_nr / 8,
1012 						  constant_nr % 8);
1013             brw_reg.abs = inst->src[i].abs;
1014             brw_reg.negate = inst->src[i].negate;
1015 
1016             /* The combination of is_scalar for load_uniform, copy prop, and
1017              * lower_btd_logical_send can generate a MOV from a UNIFORM with
1018              * exec size 2 and stride of 1.
1019              */
1020             assert(inst->src[i].stride == 0 || inst->exec_size == 2);
1021             inst->src[i] = byte_offset(
1022                retype(brw_reg, inst->src[i].type),
1023                inst->src[i].offset % 4);
1024 	 }
1025       }
1026    }
1027 
1028    uint64_t want_zero = used & prog_data->zero_push_reg;
1029    if (want_zero) {
1030       fs_builder ubld = fs_builder(this, 8).exec_all().at(
1031          cfg->first_block(), cfg->first_block()->start());
1032 
1033       /* push_reg_mask_param is in 32-bit units */
1034       unsigned mask_param = prog_data->push_reg_mask_param;
1035       struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
1036                                                               mask_param % 8);
1037 
1038       brw_reg b32;
1039       for (unsigned i = 0; i < 64; i++) {
1040          if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1041             brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
1042             ubld.SHL(horiz_offset(shifted, 8),
1043                      byte_offset(retype(mask, BRW_TYPE_W), i / 8),
1044                      brw_imm_v(0x01234567));
1045             ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1046 
1047             fs_builder ubld16 = ubld.group(16, 0);
1048             b32 = ubld16.vgrf(BRW_TYPE_D);
1049             ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1050          }
1051 
1052          if (want_zero & BITFIELD64_BIT(i)) {
1053             assert(i < prog_data->curb_read_length);
1054             struct brw_reg push_reg =
1055                retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
1056 
1057             ubld.AND(push_reg, push_reg, component(b32, i % 16));
1058          }
1059       }
1060 
1061       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1062    }
1063 
1064    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1065    this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1066 }
1067 
1068 /*
1069  * Build up an array of indices into the urb_setup array that
1070  * references the active entries of the urb_setup array.
1071  * Used to accelerate walking the active entries of the urb_setup array
1072  * on each upload.
1073  */
1074 void
1075 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1076 {
1077    /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1078     * skip per-primitive attributes here.
1079     */
1080 
1081    /* Make sure uint8_t is sufficient */
1082    STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1083    uint8_t index = 0;
1084    for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1085       if (wm_prog_data->urb_setup[attr] >= 0) {
1086          wm_prog_data->urb_setup_attribs[index++] = attr;
1087       }
1088    }
1089    wm_prog_data->urb_setup_attribs_count = index;
1090 }
1091 
1092 void
1093 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1094 {
1095    for (int i = 0; i < inst->sources; i++) {
1096       if (inst->src[i].file == ATTR) {
1097          assert(inst->src[i].nr == 0);
1098          int grf = payload().num_regs +
1099                    prog_data->curb_read_length +
1100                    inst->src[i].offset / REG_SIZE;
1101 
1102          /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
1103           *
1104           * VertStride must be used to cross GRF register boundaries. This
1105           * rule implies that elements within a 'Width' cannot cross GRF
1106           * boundaries.
1107           *
1108           * So, for registers that are large enough, we have to split the exec
1109           * size in two and trust the compression state to sort it out.
1110           */
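         /* As a hypothetical example, a 16-wide float source with stride 1
          * spans 16 * 4 = 64 bytes (two GRFs), so exec_size is halved to 8
          * below and the resulting region is <8;8,1>, leaving the second half
          * to the compression state.
          */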
1111          unsigned total_size = inst->exec_size *
1112                                inst->src[i].stride *
1113                                brw_type_size_bytes(inst->src[i].type);
1114 
1115          assert(total_size <= 2 * REG_SIZE);
1116          const unsigned exec_size =
1117             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1118 
1119          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1120          struct brw_reg reg =
1121             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1122                                inst->src[i].offset % REG_SIZE),
1123                    exec_size * inst->src[i].stride,
1124                    width, inst->src[i].stride);
1125          reg.abs = inst->src[i].abs;
1126          reg.negate = inst->src[i].negate;
1127 
1128          inst->src[i] = reg;
1129       }
1130    }
1131 }
1132 
1133 int
1134 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1135                                 const brw_stage_prog_data *prog_data)
1136 {
1137    if (prog_data->nr_params == 0)
1138       return -1;
1139 
1140    if (devinfo->verx10 >= 125)
1141       return -1;
1142 
1143    /* The local thread id is always the last parameter in the list */
1144    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1145    if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1146       return prog_data->nr_params - 1;
1147 
1148    return -1;
1149 }
1150 
1151 /**
1152  * Get the mask of SIMD channels enabled during dispatch and not yet disabled
1153  * by discard.  Due to the layout of the sample mask in the fragment shader
1154  * thread payload, \p bld is required to have a dispatch_width() not greater
1155  * than 16 for fragment shaders.
1156  */
1157 brw_reg
1158 brw_sample_mask_reg(const fs_builder &bld)
1159 {
1160    const fs_visitor &s = *bld.shader;
1161 
1162    if (s.stage != MESA_SHADER_FRAGMENT) {
1163       return brw_imm_ud(0xffffffff);
1164    } else if (s.devinfo->ver >= 20 ||
1165               brw_wm_prog_data(s.prog_data)->uses_kill) {
1166       return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
1167    } else {
1168       assert(bld.dispatch_width() <= 16);
1169       assert(s.devinfo->ver < 20);
1170       return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
1171                     BRW_TYPE_UW);
1172    }
1173 }
1174 
1175 uint32_t
1176 brw_fb_write_msg_control(const fs_inst *inst,
1177                          const struct brw_wm_prog_data *prog_data)
1178 {
1179    uint32_t mctl;
1180 
1181    if (prog_data->dual_src_blend) {
1182       assert(inst->exec_size < 32);
1183 
1184       if (inst->group % 16 == 0)
1185          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
1186       else if (inst->group % 16 == 8)
1187          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
1188       else
1189          unreachable("Invalid dual-source FB write instruction group");
1190    } else {
1191       assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
1192 
1193       if (inst->exec_size == 16)
1194          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1195       else if (inst->exec_size == 8)
1196          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1197       else if (inst->exec_size == 32)
1198          mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
1199       else
1200          unreachable("Invalid FB write execution size");
1201    }
1202 
1203    return mctl;
1204 }
1205 
1206 /**
1207  * Predicate the specified instruction on the sample mask.
1208  */
1209 void
1210 brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
1211 {
1212    assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
1213           bld.group() == inst->group &&
1214           bld.dispatch_width() == inst->exec_size);
1215 
1216    const fs_visitor &s = *bld.shader;
1217    const brw_reg sample_mask = brw_sample_mask_reg(bld);
1218    const unsigned subreg = sample_mask_flag_subreg(s);
1219 
1220    if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
1221       assert(sample_mask.file == ARF &&
1222              sample_mask.nr == brw_flag_subreg(subreg).nr &&
1223              sample_mask.subnr == brw_flag_subreg(
1224                 subreg + inst->group / 16).subnr);
1225    } else {
1226       bld.group(1, 0).exec_all()
1227          .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
1228    }
1229 
1230    if (inst->predicate) {
1231       assert(inst->predicate == BRW_PREDICATE_NORMAL);
1232       assert(!inst->predicate_inverse);
1233       assert(inst->flag_subreg == 0);
1234       assert(s.devinfo->ver < 20);
1235       /* Combine the sample mask with the existing predicate by using a
1236        * vertical predication mode.
1237        */
1238       inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
1239    } else {
1240       inst->flag_subreg = subreg;
1241       inst->predicate = BRW_PREDICATE_NORMAL;
1242       inst->predicate_inverse = false;
1243    }
1244 }
1245 
1246 brw::register_pressure::register_pressure(const fs_visitor *v)
1247 {
1248    const fs_live_variables &live = v->live_analysis.require();
1249    const unsigned num_instructions = v->cfg->num_blocks ?
1250       v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
1251 
1252    regs_live_at_ip = new unsigned[num_instructions]();
1253 
1254    for (unsigned reg = 0; reg < v->alloc.count; reg++) {
1255       for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
1256          regs_live_at_ip[ip] += v->alloc.sizes[reg];
1257    }
1258 
1259    const unsigned payload_count = v->first_non_payload_grf;
1260 
1261    int *payload_last_use_ip = new int[payload_count];
1262    v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
1263 
1264    for (unsigned reg = 0; reg < payload_count; reg++) {
1265       for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
1266          ++regs_live_at_ip[ip];
1267    }
1268 
1269    delete[] payload_last_use_ip;
1270 }
1271 
1272 brw::register_pressure::~register_pressure()
1273 {
1274    delete[] regs_live_at_ip;
1275 }
1276 
1277 void
1278 fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
1279 {
1280    live_analysis.invalidate(c);
1281    regpressure_analysis.invalidate(c);
1282    performance_analysis.invalidate(c);
1283    idom_analysis.invalidate(c);
1284    def_analysis.invalidate(c);
1285 }
1286 
1287 void
1288 fs_visitor::debug_optimizer(const nir_shader *nir,
1289                             const char *pass_name,
1290                             int iteration, int pass_num) const
1291 {
1292    if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
1293       return;
1294 
1295    char *filename;
1296    int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
1297                       debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
1298                       _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
1299                       iteration, pass_num, pass_name);
1300    if (ret == -1)
1301       return;
1302 
1303    FILE *file = stderr;
1304    if (__normal_user()) {
1305       file = fopen(filename, "w");
1306       if (!file)
1307          file = stderr;
1308    }
1309 
1310    brw_print_instructions(*this, file);
1311 
1312    if (file != stderr)
1313       fclose(file);
1314 
1315    free(filename);
1316 }
1317 
1318 static uint32_t
1319 brw_compute_max_register_pressure(fs_visitor &s)
1320 {
1321    const register_pressure &rp = s.regpressure_analysis.require();
1322    uint32_t ip = 0, max_pressure = 0;
1323    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1324       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
1325       ip++;
1326    }
1327    return max_pressure;
1328 }
1329 
1330 static fs_inst **
1331 save_instruction_order(const struct cfg_t *cfg)
1332 {
1333    /* Before we schedule anything, stash off the instruction order as an array
1334     * of fs_inst *.  This way, we can reset it between scheduling passes to
1335     * prevent dependencies between the different scheduling modes.
1336     */
1337    int num_insts = cfg->last_block()->end_ip + 1;
1338    fs_inst **inst_arr = new fs_inst * [num_insts];
1339 
1340    int ip = 0;
1341    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1342       assert(ip >= block->start_ip && ip <= block->end_ip);
1343       inst_arr[ip++] = inst;
1344    }
1345    assert(ip == num_insts);
1346 
1347    return inst_arr;
1348 }
1349 
1350 static void
1351 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
1352 {
1353    ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
1354 
1355    int ip = 0;
1356    foreach_block (block, cfg) {
1357       block->instructions.make_empty();
1358 
1359       assert(ip == block->start_ip);
1360       for (; ip <= block->end_ip; ip++)
1361          block->instructions.push_tail(inst_arr[ip]);
1362    }
1363    assert(ip == num_insts);
1364 }
1365 
1366 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
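/* For example (hypothetical sizes): 700 bytes of scratch rounds up to the 1KB
 * minimum, while 1500 bytes rounds up to the next power of two, 2048.
 */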
1367 static inline unsigned
1368 brw_get_scratch_size(int size)
1369 {
1370    return MAX2(1024, util_next_power_of_two(size));
1371 }
1372 
1373 void
1374 brw_allocate_registers(fs_visitor &s, bool allow_spilling)
1375 {
1376    const struct intel_device_info *devinfo = s.devinfo;
1377    const nir_shader *nir = s.nir;
1378    bool allocated;
1379 
1380    static const enum instruction_scheduler_mode pre_modes[] = {
1381       SCHEDULE_PRE,
1382       SCHEDULE_PRE_NON_LIFO,
1383       SCHEDULE_NONE,
1384       SCHEDULE_PRE_LIFO,
1385    };
1386 
1387    static const char *scheduler_mode_name[] = {
1388       [SCHEDULE_PRE] = "top-down",
1389       [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
1390       [SCHEDULE_PRE_LIFO] = "lifo",
1391       [SCHEDULE_POST] = "post",
1392       [SCHEDULE_NONE] = "none",
1393    };
1394 
1395    uint32_t best_register_pressure = UINT32_MAX;
1396    enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
1397 
1398    brw_opt_compact_virtual_grfs(s);
1399 
1400    if (s.needs_register_pressure)
1401       s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
1402 
1403    s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
1404 
1405    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
1406 
1407    /* Before we schedule anything, stash off the instruction order as an array
1408     * of fs_inst *.  This way, we can reset it between scheduling passes to
1409     * prevent dependencies between the different scheduling modes.
1410     */
1411    fs_inst **orig_order = save_instruction_order(s.cfg);
1412    fs_inst **best_pressure_order = NULL;
1413 
1414    void *scheduler_ctx = ralloc_context(NULL);
1415    instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
1416 
1417    /* Try each scheduling heuristic to see if it can successfully register
1418     * allocate without spilling.  They should be ordered by decreasing
1419     * performance but increasing likelihood of allocating.
1420     */
1421    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
1422       enum instruction_scheduler_mode sched_mode = pre_modes[i];
1423 
1424       brw_schedule_instructions_pre_ra(s, sched, sched_mode);
1425       s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
1426 
1427       s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
1428 
1429       if (0) {
1430          brw_assign_regs_trivial(s);
1431          allocated = true;
1432          break;
1433       }
1434 
1435       /* We should only spill registers on the last scheduling. */
1436       assert(!s.spilled_any_registers);
1437 
1438       allocated = brw_assign_regs(s, false, spill_all);
1439       if (allocated)
1440          break;
1441 
1442       /* Save the maximum register pressure */
1443       uint32_t this_pressure = brw_compute_max_register_pressure(s);
1444 
1445       if (0) {
1446          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
1447                  scheduler_mode_name[sched_mode], this_pressure);
1448       }
1449 
1450       if (this_pressure < best_register_pressure) {
1451          best_register_pressure = this_pressure;
1452          best_sched = sched_mode;
1453          delete[] best_pressure_order;
1454          best_pressure_order = save_instruction_order(s.cfg);
1455       }
1456 
1457       /* Reset back to the original order before trying the next mode */
1458       restore_instruction_order(s.cfg, orig_order);
1459       s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1460    }
1461 
1462    ralloc_free(scheduler_ctx);
1463 
1464    if (!allocated) {
1465       if (0) {
1466          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
1467                  scheduler_mode_name[best_sched]);
1468       }
1469       restore_instruction_order(s.cfg, best_pressure_order);
1470       s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
1471 
1472       allocated = brw_assign_regs(s, allow_spilling, spill_all);
1473    }
1474 
1475    delete[] orig_order;
1476    delete[] best_pressure_order;
1477 
1478    if (!allocated) {
1479       s.fail("Failure to register allocate.  Reduce number of "
1480            "live scalar values to avoid this.");
1481    } else if (s.spilled_any_registers) {
1482       brw_shader_perf_log(s.compiler, s.log_data,
1483                           "%s shader triggered register spilling.  "
1484                           "Try reducing the number of live scalar "
1485                           "values to improve performance.\n",
1486                           _mesa_shader_stage_to_string(s.stage));
1487    }
1488 
1489    if (s.failed)
1490       return;
1491 
1492    int pass_num = 0;
1493 
1494    s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);
1495 
1496    brw_opt_bank_conflicts(s);
1497 
1498    s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);
1499 
1500    brw_schedule_instructions_post_ra(s);
1501 
1502    s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);
1503 
1504    /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
1505     * of part of assign_regs since both bank conflicts optimization and post
1506     * RA scheduling take advantage of distinguishing references to registers
1507     * that were allocated from references that were already fixed.
1508     *
1509     * TODO: Change the passes above, then move this lowering to be part of
1510     * assign_regs.
1511     */
1512    brw_lower_vgrfs_to_fixed_grfs(s);
1513 
1514    s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
1515 
1516    brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);
1517 
1518    if (s.last_scratch > 0) {
1519       /* We currently only support up to 2MB of scratch space.  If we
1520        * need to support more eventually, the documentation suggests
1521        * that we could allocate a larger buffer, and partition it out
1522        * ourselves.  We'd just have to undo the hardware's address
1523        * calculation by subtracting (FFTID * Per Thread Scratch Space)
1524        * and then add FFTID * (Larger Per Thread Scratch Space).
1525        *
1526        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
1527        * Thread Group Tracking > Local Memory/Scratch Space.
1528        */
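      /* As a purely hypothetical illustration of that scheme: if 3MB per
       * thread were needed against a 2MB hardware limit, a 4MB-per-thread
       * buffer could be allocated and each thread's base recomputed as
       * addr - FFTID * 2MB + FFTID * 4MB; that partitioning is not
       * implemented here.
       */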
1529       if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
1530          /* Take the max of any previously compiled variant of the shader. In the
1531           * case of bindless shaders with return parts, this will also take the
1532           * max of all parts.
1533           */
1534          s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
1535                                            s.prog_data->total_scratch);
1536       } else {
1537          s.fail("Scratch space required is larger than supported");
1538       }
1539    }
1540 
1541    if (s.failed)
1542       return;
1543 
1544    brw_lower_scoreboard(s);
1545 
1546    s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
1547 }
1548 
1549 unsigned
1550 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
1551                              unsigned threads)
1552 {
1553    assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
1554    assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
1555    return cs_prog_data->push.per_thread.size * threads +
1556           cs_prog_data->push.cross_thread.size;
1557 }
1558 
1559 struct intel_cs_dispatch_info
1560 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
1561                          const struct brw_cs_prog_data *prog_data,
1562                          const unsigned *override_local_size)
1563 {
1564    struct intel_cs_dispatch_info info = {};
1565 
1566    const unsigned *sizes =
1567       override_local_size ? override_local_size :
1568                             prog_data->local_size;
1569 
1570    const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
1571    assert(simd >= 0 && simd < 3);
1572 
1573    info.group_size = sizes[0] * sizes[1] * sizes[2];
1574    info.simd_size = 8u << simd;
1575    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
1576 
1577    const uint32_t remainder = info.group_size & (info.simd_size - 1);
1578    if (remainder > 0)
1579       info.right_mask = ~0u >> (32 - remainder);
1580    else
1581       info.right_mask = ~0u >> (32 - info.simd_size);
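   /* For example (hypothetical workgroup): a 10x2x2 local size gives
    * group_size == 40; at SIMD32 that is threads == 2 with remainder == 8,
    * so right_mask == 0xff covers only the 8 live channels of the last
    * thread. With no remainder, the full ~0u >> (32 - simd_size) mask is used.
    */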
1582 
1583    return info;
1584 }
1585 
1586 void
1587 brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
1588 {
1589    assert(phase == s.phase + 1);
1590    s.phase = phase;
1591    brw_validate(s);
1592 }
1593 
1594 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
1595 {
1596    return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
1597 }
1598 
1599 namespace brw {
1600    brw_reg
1601    fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
1602                      brw_reg_type type, unsigned n)
1603    {
1604       if (!regs[0])
1605          return brw_reg();
1606 
1607       if (bld.dispatch_width() > 16) {
1608          const brw_reg tmp = bld.vgrf(type, n);
1609          const brw::fs_builder hbld = bld.exec_all().group(16, 0);
1610          const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1611          brw_reg *const components = new brw_reg[m * n];
1612 
1613          for (unsigned c = 0; c < n; c++) {
1614             for (unsigned g = 0; g < m; g++)
1615                components[c * m + g] =
1616                   offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
1617          }
1618 
1619          hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
1620 
1621          delete[] components;
1622          return tmp;
1623 
1624       } else {
1625          return brw_reg(retype(brw_vec8_grf(regs[0], 0), type));
1626       }
1627    }
1628 
1629    brw_reg
1630    fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
1631    {
1632       if (!regs[0])
1633          return brw_reg();
1634       else if (bld.shader->devinfo->ver >= 20)
1635          return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);
1636 
1637       const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
1638       const brw::fs_builder hbld = bld.exec_all().group(8, 0);
1639       const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
1640       brw_reg *const components = new brw_reg[2 * m];
1641 
1642       for (unsigned c = 0; c < 2; c++) {
1643          for (unsigned g = 0; g < m; g++)
1644             components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
1645                                            hbld, c + 2 * (g % 2));
1646       }
1647 
1648       hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
1649 
1650       delete[] components;
1651       return tmp;
1652    }
1653 
1654    void
1655    check_dynamic_msaa_flag(const fs_builder &bld,
1656                            const struct brw_wm_prog_data *wm_prog_data,
1657                            enum intel_msaa_flags flag)
1658    {
1659       fs_inst *inst = bld.AND(bld.null_reg_ud(),
1660                               dynamic_msaa_flags(wm_prog_data),
1661                               brw_imm_ud(flag));
1662       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1663    }
1664 }
1665