1 /*
2  * Copyright © 2010 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 /** @file
25  *
26  * This file drives the GLSL IR -> LIR translation, contains the
27  * optimizations on the LIR, and drives the generation of native code
28  * from the LIR.
29  */
30 
31 #include "brw_eu.h"
32 #include "brw_fs.h"
33 #include "brw_builder.h"
34 #include "brw_fs_live_variables.h"
35 #include "brw_nir.h"
36 #include "brw_cfg.h"
37 #include "brw_private.h"
38 #include "intel_nir.h"
39 #include "shader_enums.h"
40 #include "dev/intel_debug.h"
41 #include "dev/intel_wa.h"
42 #include "compiler/glsl_types.h"
43 #include "compiler/nir/nir_builder.h"
44 #include "util/u_math.h"
45 
46 using namespace brw;
47 
48 static void
49 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);
50 
51 void
52 fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
53               const brw_reg *src, unsigned sources)
54 {
55    memset((void*)this, 0, sizeof(*this));
56 
57    initialize_sources(this, src, sources);
58 
59    for (unsigned i = 0; i < sources; i++)
60       this->src[i] = src[i];
61 
62    this->opcode = opcode;
63    this->dst = dst;
64    this->exec_size = exec_size;
65 
66    assert(dst.file != IMM && dst.file != UNIFORM);
67 
68    assert(this->exec_size != 0);
69 
70    this->conditional_mod = BRW_CONDITIONAL_NONE;
71 
72    /* This will be the case for almost all instructions. */
73    switch (dst.file) {
74    case VGRF:
75    case ADDRESS:
76    case ARF:
77    case FIXED_GRF:
78    case ATTR:
79       this->size_written = dst.component_size(exec_size);
80       break;
81    case BAD_FILE:
82       this->size_written = 0;
83       break;
84    case IMM:
85    case UNIFORM:
86       unreachable("Invalid destination register file");
87    }
88 
89    this->writes_accumulator = false;
90 }
91 
92 fs_inst::fs_inst()
93 {
94    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
95 }
96 
97 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
98 {
99    init(opcode, exec_size, reg_undef, NULL, 0);
100 }
101 
102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
103 {
104    init(opcode, exec_size, dst, NULL, 0);
105 }
106 
107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
108                  const brw_reg &src0)
109 {
110    const brw_reg src[1] = { src0 };
111    init(opcode, exec_size, dst, src, 1);
112 }
113 
114 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
115                  const brw_reg &src0, const brw_reg &src1)
116 {
117    const brw_reg src[2] = { src0, src1 };
118    init(opcode, exec_size, dst, src, 2);
119 }
120 
121 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
122                  const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
123 {
124    const brw_reg src[3] = { src0, src1, src2 };
125    init(opcode, exec_size, dst, src, 3);
126 }
127 
128 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
129                  const brw_reg src[], unsigned sources)
130 {
131    init(opcode, exec_width, dst, src, sources);
132 }
133 
134 fs_inst::fs_inst(const fs_inst &that)
135 {
136    memcpy((void*)this, &that, sizeof(that));
137    initialize_sources(this, that.src, that.sources);
138 }
139 
140 fs_inst::~fs_inst()
141 {
142    if (this->src != this->builtin_src)
143       delete[] this->src;
144 }
145 
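/* Set up the instruction's source array.  Source counts that fit in the
 * inline builtin_src[] storage reuse it; larger counts get a heap
 * allocation that is released in ~fs_inst() or resize_sources().
 */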
146 static void
147 initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
148 {
149    if (num_sources > ARRAY_SIZE(inst->builtin_src))
150       inst->src = new brw_reg[num_sources];
151    else
152       inst->src = inst->builtin_src;
153 
154    for (unsigned i = 0; i < num_sources; i++)
155       inst->src[i] = src[i];
156 
157    inst->sources = num_sources;
158 }
159 
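/* Change the number of sources of an existing instruction, moving the
 * source array between the inline builtin_src[] storage and a heap
 * allocation as needed, and freeing a heap array once it is unused.
 */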
160 void
161 fs_inst::resize_sources(uint8_t num_sources)
162 {
163    if (this->sources == num_sources)
164       return;
165 
166    brw_reg *old_src = this->src;
167    brw_reg *new_src;
168 
169    const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);
170 
171    if (old_src == this->builtin_src) {
172       if (num_sources > builtin_size) {
173          new_src = new brw_reg[num_sources];
174          for (unsigned i = 0; i < this->sources; i++)
175             new_src[i] = old_src[i];
176 
177       } else {
178          new_src = old_src;
179       }
180    } else {
181       if (num_sources <= builtin_size) {
182          new_src = this->builtin_src;
183          assert(this->sources > num_sources);
184          for (unsigned i = 0; i < num_sources; i++)
185             new_src[i] = old_src[i];
186 
187       } else if (num_sources < this->sources) {
188          new_src = old_src;
189 
190       } else {
191          new_src = new brw_reg[num_sources];
192          for (unsigned i = 0; i < this->sources; i++)
193             new_src[i] = old_src[i];
194       }
195 
196       if (old_src != new_src)
197          delete[] old_src;
198    }
199 
200    this->sources = num_sources;
201    this->src = new_src;
202 }
203 
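/* Whether this instruction is (or is lowered to) a SEND-style message whose
 * payload is built in the GRF; can_do_source_mods() uses this to reject
 * source modifiers on such instructions.
 */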
204 bool
205 fs_inst::is_send_from_grf() const
206 {
207    switch (opcode) {
208    case SHADER_OPCODE_SEND:
209    case SHADER_OPCODE_SEND_GATHER:
210    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
211    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
212    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
213    case SHADER_OPCODE_INTERLOCK:
214    case SHADER_OPCODE_MEMORY_FENCE:
215    case SHADER_OPCODE_BARRIER:
216       return true;
217    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
218       return src[1].file == VGRF;
219    default:
220       return false;
221    }
222 }
223 
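/* Whether source number 'arg' of this instruction carries control
 * information (descriptors, component counts, indices) rather than
 * per-channel data.
 */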
224 bool
225 fs_inst::is_control_source(unsigned arg) const
226 {
227    switch (opcode) {
228    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
229       return arg == 0;
230 
231    case SHADER_OPCODE_BROADCAST:
232    case SHADER_OPCODE_SHUFFLE:
233    case SHADER_OPCODE_QUAD_SWIZZLE:
234    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
235    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
236    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
237       return arg == 1;
238 
239    case SHADER_OPCODE_MOV_INDIRECT:
240    case SHADER_OPCODE_CLUSTER_BROADCAST:
241       return arg == 1 || arg == 2;
242 
243    case SHADER_OPCODE_SEND:
244    case SHADER_OPCODE_SEND_GATHER:
245       return arg == 0 || arg == 1;
246 
247    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
248    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
249    case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
250       return arg != MEMORY_LOGICAL_BINDING &&
251              arg != MEMORY_LOGICAL_ADDRESS &&
252              arg != MEMORY_LOGICAL_DATA0 &&
253              arg != MEMORY_LOGICAL_DATA1;
254 
255    case SHADER_OPCODE_QUAD_SWAP:
256    case SHADER_OPCODE_INCLUSIVE_SCAN:
257    case SHADER_OPCODE_EXCLUSIVE_SCAN:
258    case SHADER_OPCODE_VOTE_ANY:
259    case SHADER_OPCODE_VOTE_ALL:
260    case SHADER_OPCODE_REDUCE:
261       return arg != 0;
262 
263    default:
264       return false;
265    }
266 }
267 
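/* Whether source number 'arg' is a message payload for this instruction,
 * e.g. the payload and extended payload sources of SHADER_OPCODE_SEND.
 */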
268 bool
269 fs_inst::is_payload(unsigned arg) const
270 {
271    switch (opcode) {
272    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
273    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
274    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
275    case SHADER_OPCODE_INTERLOCK:
276    case SHADER_OPCODE_MEMORY_FENCE:
277    case SHADER_OPCODE_BARRIER:
278       return arg == 0;
279 
280    case SHADER_OPCODE_SEND:
281       return arg == 2 || arg == 3;
282 
283    case SHADER_OPCODE_SEND_GATHER:
284       return arg >= 2;
285 
286    default:
287       return false;
288    }
289 }
290 
291 bool
292 fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
293 {
294    if (is_send_from_grf())
295       return false;
296 
297    /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
298     *
299     * "When multiplying a DW and any lower precision integer, source modifier
300     *  is not supported."
301     */
302    if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
303                               opcode == BRW_OPCODE_MAD)) {
304       const brw_reg_type exec_type = get_exec_type(this);
305       const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
306          MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
307          MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));
308 
309       if (brw_type_is_int(exec_type) &&
310           brw_type_size_bytes(exec_type) >= 4 &&
311           brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
312          return false;
313    }
314 
315    switch (opcode) {
316    case BRW_OPCODE_ADDC:
317    case BRW_OPCODE_BFE:
318    case BRW_OPCODE_BFI1:
319    case BRW_OPCODE_BFI2:
320    case BRW_OPCODE_BFREV:
321    case BRW_OPCODE_CBIT:
322    case BRW_OPCODE_FBH:
323    case BRW_OPCODE_FBL:
324    case BRW_OPCODE_ROL:
325    case BRW_OPCODE_ROR:
326    case BRW_OPCODE_SUBB:
327    case BRW_OPCODE_DP4A:
328    case BRW_OPCODE_DPAS:
329    case SHADER_OPCODE_BROADCAST:
330    case SHADER_OPCODE_CLUSTER_BROADCAST:
331    case SHADER_OPCODE_MOV_INDIRECT:
332    case SHADER_OPCODE_SHUFFLE:
333    case SHADER_OPCODE_INT_QUOTIENT:
334    case SHADER_OPCODE_INT_REMAINDER:
335    case SHADER_OPCODE_REDUCE:
336    case SHADER_OPCODE_INCLUSIVE_SCAN:
337    case SHADER_OPCODE_EXCLUSIVE_SCAN:
338    case SHADER_OPCODE_VOTE_ANY:
339    case SHADER_OPCODE_VOTE_ALL:
340    case SHADER_OPCODE_VOTE_EQUAL:
341    case SHADER_OPCODE_BALLOT:
342    case SHADER_OPCODE_QUAD_SWAP:
343    case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
344    case SHADER_OPCODE_READ_FROM_CHANNEL:
345       return false;
346    default:
347       return true;
348    }
349 }
350 
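/* Whether a conditional modifier may be attached to this instruction,
 * given its opcode and the source restrictions described below.
 */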
351 bool
352 fs_inst::can_do_cmod() const
353 {
354    switch (opcode) {
355    case BRW_OPCODE_ADD:
356    case BRW_OPCODE_ADD3:
357    case BRW_OPCODE_ADDC:
358    case BRW_OPCODE_AND:
359    case BRW_OPCODE_ASR:
360    case BRW_OPCODE_AVG:
361    case BRW_OPCODE_CMP:
362    case BRW_OPCODE_CMPN:
363    case BRW_OPCODE_DP2:
364    case BRW_OPCODE_DP3:
365    case BRW_OPCODE_DP4:
366    case BRW_OPCODE_DPH:
367    case BRW_OPCODE_FRC:
368    case BRW_OPCODE_LINE:
369    case BRW_OPCODE_LRP:
370    case BRW_OPCODE_LZD:
371    case BRW_OPCODE_MAC:
372    case BRW_OPCODE_MACH:
373    case BRW_OPCODE_MAD:
374    case BRW_OPCODE_MOV:
375    case BRW_OPCODE_MUL:
376    case BRW_OPCODE_NOT:
377    case BRW_OPCODE_OR:
378    case BRW_OPCODE_PLN:
379    case BRW_OPCODE_RNDD:
380    case BRW_OPCODE_RNDE:
381    case BRW_OPCODE_RNDU:
382    case BRW_OPCODE_RNDZ:
383    case BRW_OPCODE_SHL:
384    case BRW_OPCODE_SHR:
385    case BRW_OPCODE_SUBB:
386    case BRW_OPCODE_XOR:
387       break;
388    default:
389       return false;
390    }
391 
392    /* The accumulator result appears to get used for the conditional modifier
393     * generation.  When negating a UD value, there is a 33rd bit generated for
394     * the sign in the accumulator value, so now you can't check, for example,
395     * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
396     */
397    for (unsigned i = 0; i < sources; i++) {
398       if (brw_type_is_uint(src[i].type) && src[i].negate)
399          return false;
400    }
401 
402    if (dst.file == ARF && dst.nr == BRW_ARF_SCALAR && src[0].file == IMM)
403       return false;
404 
405    return true;
406 }
407 
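/* Whether the destination and source types of this instruction can be
 * rewritten together without changing its behavior; only unmodified MOV,
 * single-source LOAD_PAYLOAD, and predicated SEL qualify.
 */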
408 bool
409 fs_inst::can_change_types() const
410 {
411    return dst.type == src[0].type &&
412           !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
413           (opcode == BRW_OPCODE_MOV ||
414            (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
415            (opcode == BRW_OPCODE_SEL &&
416             dst.type == src[1].type &&
417             predicate != BRW_PREDICATE_NONE &&
418             !src[1].abs && !src[1].negate && src[1].file != ATTR));
419 }
420 
421 void
422 fs_visitor::vfail(const char *format, va_list va)
423 {
424    char *msg;
425 
426    if (failed)
427       return;
428 
429    failed = true;
430 
431    msg = ralloc_vasprintf(mem_ctx, format, va);
432    msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
433          dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
434 
435    this->fail_msg = msg;
436 
437    if (unlikely(debug_enabled)) {
438       fprintf(stderr, "%s",  msg);
439    }
440 }
441 
442 void
443 fs_visitor::fail(const char *format, ...)
444 {
445    va_list va;
446 
447    va_start(va, format);
448    vfail(format, va);
449    va_end(va);
450 }
451 
452 /**
453  * Mark this program as impossible to compile with dispatch width greater
454  * than n.
455  *
456  * During the SIMD8 compile (which happens first), we can detect and flag
457  * things that are unsupported in SIMD16+ mode, so the compiler can skip the
458  * SIMD16+ compile altogether.
459  *
460  * During a compile of dispatch width greater than n (if one happens anyway),
461  * this just calls fail().
462  */
463 void
464 fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
465 {
466    if (dispatch_width > n) {
467       fail("%s", msg);
468    } else {
469       max_dispatch_width = MIN2(max_dispatch_width, n);
470       brw_shader_perf_log(compiler, log_data,
471                           "Shader dispatch width limited to SIMD%d: %s\n",
472                           n, msg);
473    }
474 }
475 
476 /**
477  * Returns true if the instruction has a flag that means it won't
478  * update an entire destination register.
479  *
480  * For example, dead code elimination and live variable analysis want to know
481  * when a write to a variable screens off any preceding values that were in
482  * it.
483  */
484 bool
485 fs_inst::is_partial_write() const
486 {
487    if (this->predicate && !this->predicate_trivial &&
488        this->opcode != BRW_OPCODE_SEL)
489       return true;
490 
491    if (!this->dst.is_contiguous())
492       return true;
493 
494    if (this->dst.offset % REG_SIZE != 0)
495       return true;
496 
497    return this->size_written % REG_SIZE != 0;
498 }
499 
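/* Number of logical components read from source 'i'.  Most opcodes read a
 * single component per source; the logical texture, memory, URB and FB
 * write opcodes encode their component counts in immediate sources.
 */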
500 unsigned
501 fs_inst::components_read(unsigned i) const
502 {
503    /* Return zero if the source is not present. */
504    if (src[i].file == BAD_FILE)
505       return 0;
506 
507    switch (opcode) {
508    case BRW_OPCODE_PLN:
509       return i == 0 ? 1 : 2;
510 
511    case FS_OPCODE_PIXEL_X:
512    case FS_OPCODE_PIXEL_Y:
513       assert(i < 2);
514       if (i == 0)
515          return 2;
516       else
517          return 1;
518 
519    case FS_OPCODE_FB_WRITE_LOGICAL:
520       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
521       /* First/second FB write color. */
522       if (i < 2)
523          return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
524       else
525          return 1;
526 
527    case SHADER_OPCODE_TEX_LOGICAL:
528    case SHADER_OPCODE_TXD_LOGICAL:
529    case SHADER_OPCODE_TXF_LOGICAL:
530    case SHADER_OPCODE_TXL_LOGICAL:
531    case SHADER_OPCODE_TXS_LOGICAL:
532    case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
533    case FS_OPCODE_TXB_LOGICAL:
534    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
535    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
536    case SHADER_OPCODE_TXF_MCS_LOGICAL:
537    case SHADER_OPCODE_LOD_LOGICAL:
538    case SHADER_OPCODE_TG4_LOGICAL:
539    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
540    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
541    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
542    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
543    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
544    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
545    case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
546       assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
547              src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
548              src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
549       /* Texture coordinates. */
550       if (i == TEX_LOGICAL_SRC_COORDINATE)
551          return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
552       /* Texture derivatives. */
553       else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
554                opcode == SHADER_OPCODE_TXD_LOGICAL)
555          return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
556       /* Texture offset. */
557       else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
558          return 2;
559       /* MCS */
560       else if (i == TEX_LOGICAL_SRC_MCS) {
561          if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
562             return 2;
563          else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
564             return 4;
565          else
566             return 1;
567       } else
568          return 1;
569 
570    case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
571       if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
572          return 0;
573       /* fallthrough */
574    case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
575       if (i == MEMORY_LOGICAL_DATA1)
576          return 0;
577       /* fallthrough */
578    case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
579       if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
580          return src[MEMORY_LOGICAL_COMPONENTS].ud;
581       else if (i == MEMORY_LOGICAL_ADDRESS)
582          return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
583       else
584          return 1;
585 
586    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
587       return (i == 0 ? 2 : 1);
588 
589    case SHADER_OPCODE_URB_WRITE_LOGICAL:
590       assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
591 
592       if (i == URB_LOGICAL_SRC_DATA)
593          return src[URB_LOGICAL_SRC_COMPONENTS].ud;
594       else
595          return 1;
596 
597    case BRW_OPCODE_DPAS:
598       unreachable("Do not use components_read() for DPAS.");
599 
600    default:
601       return 1;
602    }
603 }
604 
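/* Number of bytes read from source 'arg', accounting for message lengths,
 * per-opcode payload layouts, and scalar (SIMD8-sized) sources.
 */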
605 unsigned
606 fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
607 {
608    switch (opcode) {
609    case SHADER_OPCODE_SEND:
610       if (arg == 2) {
611          return mlen * REG_SIZE;
612       } else if (arg == 3) {
613          return ex_mlen * REG_SIZE;
614       }
615       break;
616 
617    case SHADER_OPCODE_SEND_GATHER:
618       if (arg >= 3) {
619          /* SEND_GATHER is Xe3+, so no need to pass devinfo around. */
620          const unsigned reg_unit = 2;
621          return REG_SIZE * reg_unit;
622       }
623       break;
624 
625    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
626    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
627       if (arg == 0)
628          return mlen * REG_SIZE;
629       break;
630 
631    case BRW_OPCODE_PLN:
632       if (arg == 0)
633          return 16;
634       break;
635 
636    case SHADER_OPCODE_LOAD_PAYLOAD:
637       if (arg < this->header_size)
638          return retype(src[arg], BRW_TYPE_UD).component_size(8);
639       break;
640 
641    case SHADER_OPCODE_BARRIER:
642       return REG_SIZE;
643 
644    case SHADER_OPCODE_MOV_INDIRECT:
645       if (arg == 0) {
646          assert(src[2].file == IMM);
647          return src[2].ud;
648       }
649       break;
650 
651    case BRW_OPCODE_DPAS: {
652       /* This is a little bit sketchy. There's no way to get at devinfo from
653        * here, so the regular reg_unit() cannot be used. However, on
654        * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
655        * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
656        * coincidence, so this isn't so bad.
657        */
658       const unsigned reg_unit = this->exec_size / 8;
659 
660       switch (arg) {
661       case 0:
662          if (src[0].type == BRW_TYPE_HF) {
663             return rcount * reg_unit * REG_SIZE / 2;
664          } else {
665             return rcount * reg_unit * REG_SIZE;
666          }
667       case 1:
668          return sdepth * reg_unit * REG_SIZE;
669       case 2:
670          /* This is simpler than the formula described in the Bspec, but it
671           * covers all of the cases that we support. Each inner sdepth
672           * iteration of the DPAS consumes a single dword for int8, uint8, or
673           * float16 types. These are the one source types currently
674           * supportable through Vulkan. This is independent of reg_unit.
675           */
676          return rcount * sdepth * 4;
677       default:
678          unreachable("Invalid source number.");
679       }
680       break;
681    }
682 
683    default:
684       break;
685    }
686 
687    switch (src[arg].file) {
688    case UNIFORM:
689    case IMM:
690       return components_read(arg) * brw_type_size_bytes(src[arg].type);
691    case BAD_FILE:
692    case ADDRESS:
693    case ARF:
694    case FIXED_GRF:
695    case VGRF:
696    case ATTR:
697       /* Regardless of exec_size, values marked as scalar are SIMD8. */
698       return components_read(arg) *
699              src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
700    }
701    return 0;
702 }
703 
704 namespace {
705    unsigned
706    predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
707    {
708       if (devinfo->ver >= 20) {
709          return 1;
710       } else {
711          switch (predicate) {
712          case BRW_PREDICATE_NONE:            return 1;
713          case BRW_PREDICATE_NORMAL:          return 1;
714          case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
715          case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
716          case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
717          case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
718          case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
719          case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
720          case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
721          case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
722          case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
723          case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
724          default: unreachable("Unsupported predicate");
725          }
726       }
727    }
728 }
729 
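/* Bitmask of flag-register bits read by this instruction, either through
 * its predicate or through any flag-register sources.
 */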
730 unsigned
731 fs_inst::flags_read(const intel_device_info *devinfo) const
732 {
733    if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
734                              predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
735       /* The vertical predication modes combine corresponding bits from
736        * f0.0 and f1.0 on Gfx7+.
737        */
738       const unsigned shift = 4;
739       return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
740    } else if (predicate) {
741       return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
742    } else {
743       unsigned mask = 0;
744       for (int i = 0; i < sources; i++) {
745          mask |= brw_fs_flag_mask(src[i], size_read(devinfo, i));
746       }
747       return mask;
748    }
749 }
750 
751 unsigned
752 fs_inst::flags_written(const intel_device_info *devinfo) const
753 {
754    if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
755                            opcode != BRW_OPCODE_CSEL &&
756                            opcode != BRW_OPCODE_IF &&
757                            opcode != BRW_OPCODE_WHILE)) {
758       return brw_fs_flag_mask(this, 1);
759    } else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS ||
760               opcode == SHADER_OPCODE_BALLOT ||
761               opcode == SHADER_OPCODE_VOTE_ANY ||
762               opcode == SHADER_OPCODE_VOTE_ALL ||
763               opcode == SHADER_OPCODE_VOTE_EQUAL) {
764       return brw_fs_flag_mask(this, 32);
765    } else {
766       return brw_fs_flag_mask(dst, size_written);
767    }
768 }
769 
770 bool
771 fs_inst::has_sampler_residency() const
772 {
773    switch (opcode) {
774    case SHADER_OPCODE_TEX_LOGICAL:
775    case FS_OPCODE_TXB_LOGICAL:
776    case SHADER_OPCODE_TXL_LOGICAL:
777    case SHADER_OPCODE_TXD_LOGICAL:
778    case SHADER_OPCODE_TXF_LOGICAL:
779    case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
780    case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
781    case SHADER_OPCODE_TXS_LOGICAL:
782    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
783    case SHADER_OPCODE_TG4_LOGICAL:
784    case SHADER_OPCODE_TG4_BIAS_LOGICAL:
785    case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
786    case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
787    case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
788    case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
789       assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
790       return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
791    default:
792       return false;
793    }
794 }
795 
796 /* \sa inst_is_raw_move in brw_eu_validate. */
797 bool
798 fs_inst::is_raw_move() const
799 {
800    if (opcode != BRW_OPCODE_MOV)
801       return false;
802 
803    if (src[0].file == IMM) {
804       if (brw_type_is_vector_imm(src[0].type))
805          return false;
806    } else if (src[0].negate || src[0].abs) {
807       return false;
808    }
809 
810    if (saturate)
811       return false;
812 
813    return src[0].type == dst.type ||
814           (brw_type_is_int(src[0].type) &&
815            brw_type_is_int(dst.type) &&
816            brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
817 }
818 
819 bool
820 fs_inst::uses_address_register_implicitly() const
821 {
822    switch (opcode) {
823    case SHADER_OPCODE_BROADCAST:
824    case SHADER_OPCODE_SHUFFLE:
825    case SHADER_OPCODE_MOV_INDIRECT:
826       return true;
827    default:
828       return false;
829    }
830 }
831 
832 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
833  * This brings in those uniform definitions
834  */
835 void
836 fs_visitor::import_uniforms(fs_visitor *v)
837 {
838    this->uniforms = v->uniforms;
839 }
840 
841 enum intel_barycentric_mode
842 brw_barycentric_mode(const struct brw_wm_prog_key *key,
843                      nir_intrinsic_instr *intr)
844 {
845    const glsl_interp_mode mode =
846       (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
847 
848    /* Barycentric modes don't make sense for flat inputs. */
849    assert(mode != INTERP_MODE_FLAT);
850 
851    unsigned bary;
852    switch (intr->intrinsic) {
853    case nir_intrinsic_load_barycentric_pixel:
854    case nir_intrinsic_load_barycentric_at_offset:
855       /* When per sample interpolation is dynamic, assume sample
856        * interpolation. We'll dynamically remap things so that the FS thread
857        * payload is not affected.
858        */
859       bary = key->persample_interp == INTEL_SOMETIMES ?
860              INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
861              INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
862       break;
863    case nir_intrinsic_load_barycentric_centroid:
864       bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
865       break;
866    case nir_intrinsic_load_barycentric_sample:
867    case nir_intrinsic_load_barycentric_at_sample:
868       bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
869       break;
870    default:
871       unreachable("invalid intrinsic");
872    }
873 
874    if (mode == INTERP_MODE_NOPERSPECTIVE)
875       bary += 3;
876 
877    return (enum intel_barycentric_mode) bary;
878 }
879 
880 /**
881  * Walk backwards from the end of the program looking for a URB write that
882  * isn't in control flow, and mark it with EOT.
883  *
884  * Return true if successful or false if a separate EOT write is needed.
885  */
886 bool
887 fs_visitor::mark_last_urb_write_with_eot()
888 {
889    foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
890       if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
891          prev->eot = true;
892 
893          /* Delete now dead instructions. */
894          foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
895             if (dead == prev)
896                break;
897             dead->remove();
898          }
899          return true;
900       } else if (prev->is_control_flow() || prev->has_side_effects()) {
901          break;
902       }
903    }
904 
905    return false;
906 }
907 
908 static unsigned
909 round_components_to_whole_registers(const intel_device_info *devinfo,
910                                     unsigned c)
911 {
912    return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
913 }
914 
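/* Lay out the push constant (CURB) space: plain uniforms first, then up to
 * four pushed UBO ranges, and rewrite every UNIFORM source to the fixed
 * GRF it occupies after the thread payload.  On Gfx12.5+ compute shaders
 * the push data is instead loaded at the top of the shader with A32
 * stateless LSC messages.
 */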
915 void
916 fs_visitor::assign_curb_setup()
917 {
918    unsigned uniform_push_length =
919       round_components_to_whole_registers(devinfo, prog_data->nr_params);
920 
921    unsigned ubo_push_length = 0;
922    unsigned ubo_push_start[4];
923    for (int i = 0; i < 4; i++) {
924       ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
925       ubo_push_length += prog_data->ubo_ranges[i].length;
926 
927       assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
928       assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
929    }
930 
931    prog_data->curb_read_length = uniform_push_length + ubo_push_length;
932    if (stage == MESA_SHADER_FRAGMENT &&
933        ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
934       prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
935 
936    uint64_t used = 0;
937    bool is_compute = gl_shader_stage_is_compute(stage);
938 
939    if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
940       assert(devinfo->has_lsc);
941       brw_builder ubld = brw_builder(this, 1).exec_all().at(
942          cfg->first_block(), cfg->first_block()->start());
943 
944       /* The base offset for our push data is passed in as R0.0[31:6]. We have
945        * to mask off the bottom 6 bits.
946        */
947       brw_reg base_addr =
948          ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
949                   brw_imm_ud(INTEL_MASK(31, 6)));
950 
951       /* On Gfx12-HP we load constants at the start of the program using A32
952        * stateless messages.
953        */
954       for (unsigned i = 0; i < uniform_push_length;) {
955          /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
956          unsigned num_regs = MIN2(uniform_push_length - i, 8);
957          assert(num_regs > 0);
958          num_regs = 1 << util_logbase2(num_regs);
959 
960          /* This pass occurs after all of the optimization passes, so don't
961           * emit an 'ADD addr, base_addr, 0' instruction.
962           */
963          brw_reg addr = i == 0 ? base_addr :
964             ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
965 
966          brw_reg srcs[4] = {
967             brw_imm_ud(0), /* desc */
968             brw_imm_ud(0), /* ex_desc */
969             addr,          /* payload */
970             brw_reg(),      /* payload2 */
971          };
972 
973          brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
974                               BRW_TYPE_UD);
975          fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
976 
977          send->sfid = GFX12_SFID_UGM;
978          uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
979                                       LSC_ADDR_SURFTYPE_FLAT,
980                                       LSC_ADDR_SIZE_A32,
981                                       LSC_DATA_SIZE_D32,
982                                       num_regs * 8 /* num_channels */,
983                                       true /* transpose */,
984                                       LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
985          send->header_size = 0;
986          send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
987          send->size_written =
988             lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
989          send->send_is_volatile = true;
990 
991          send->src[0] = brw_imm_ud(desc |
992                                    brw_message_desc(devinfo,
993                                                     send->mlen,
994                                                     send->size_written / REG_SIZE,
995                                                     send->header_size));
996 
997          i += num_regs;
998       }
999 
1000       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1001    }
1002 
1003    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1004    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1005       for (unsigned int i = 0; i < inst->sources; i++) {
1006 	 if (inst->src[i].file == UNIFORM) {
1007             int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1008             int constant_nr;
1009             if (inst->src[i].nr >= UBO_START) {
1010                /* constant_nr is in 32-bit units, the rest are in bytes */
1011                constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1012                              inst->src[i].offset / 4;
1013             } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1014                constant_nr = uniform_nr;
1015             } else {
1016                /* Section 5.11 of the OpenGL 4.1 spec says:
1017                 * "Out-of-bounds reads return undefined values, which include
1018                 *  values from other variables of the active program or zero."
1019                 * Just return the first push constant.
1020                 */
1021                constant_nr = 0;
1022             }
1023 
1024             assert(constant_nr / 8 < 64);
1025             used |= BITFIELD64_BIT(constant_nr / 8);
1026 
1027 	    struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
1028 						  constant_nr / 8,
1029 						  constant_nr % 8);
1030             brw_reg.abs = inst->src[i].abs;
1031             brw_reg.negate = inst->src[i].negate;
1032 
1033             /* The combination of is_scalar for load_uniform, copy prop, and
1034              * lower_btd_logical_send can generate a MOV from a UNIFORM with
1035              * exec size 2 and stride of 1.
1036              */
1037             assert(inst->src[i].stride == 0 || inst->exec_size == 2);
1038             inst->src[i] = byte_offset(
1039                retype(brw_reg, inst->src[i].type),
1040                inst->src[i].offset % 4);
1041 	 }
1042       }
1043    }
1044 
1045    uint64_t want_zero = used & prog_data->zero_push_reg;
1046    if (want_zero) {
1047       brw_builder ubld = brw_builder(this, 8).exec_all().at(
1048          cfg->first_block(), cfg->first_block()->start());
1049 
1050       /* push_reg_mask_param is in 32-bit units */
1051       unsigned mask_param = prog_data->push_reg_mask_param;
1052       struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
1053                                                               mask_param % 8);
1054 
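      /* Expand each bit of the 16-bit mask into a full dword of 0 or ~0:
       * the two SHLs (with the 0x01234567 vector immediate providing
       * per-channel shift counts) move each mask bit into the sign bit of
       * its own word, and the ASR by 15 replicates it across the dword
       * that is ANDed with the push register below.
       */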
1055       brw_reg b32;
1056       for (unsigned i = 0; i < 64; i++) {
1057          if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1058             brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
1059             ubld.SHL(horiz_offset(shifted, 8),
1060                      byte_offset(retype(mask, BRW_TYPE_W), i / 8),
1061                      brw_imm_v(0x01234567));
1062             ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
1063 
1064             brw_builder ubld16 = ubld.group(16, 0);
1065             b32 = ubld16.vgrf(BRW_TYPE_D);
1066             ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
1067          }
1068 
1069          if (want_zero & BITFIELD64_BIT(i)) {
1070             assert(i < prog_data->curb_read_length);
1071             struct brw_reg push_reg =
1072                retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
1073 
1074             ubld.AND(push_reg, push_reg, component(b32, i % 16));
1075          }
1076       }
1077 
1078       invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1079    }
1080 
1081    /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1082    this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1083 }
1084 
1085 /*
1086  * Build up an array of indices into the urb_setup array that
1087  * references the active entries of the urb_setup array.
1088  * Used to accelerate walking the active entries of the urb_setup array
1089  * on each upload.
1090  */
1091 void
1092 brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1093 {
1094    /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
1095     * skip per-primitive attributes here.
1096     */
1097 
1098    /* Make sure uint8_t is sufficient */
1099    STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1100    uint8_t index = 0;
1101    for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1102       if (wm_prog_data->urb_setup[attr] >= 0) {
1103          wm_prog_data->urb_setup_attribs[index++] = attr;
1104       }
1105    }
1106    wm_prog_data->urb_setup_attribs_count = index;
1107 }
1108 
1109 void
1110 fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1111 {
1112    for (int i = 0; i < inst->sources; i++) {
1113       if (inst->src[i].file == ATTR) {
1114          assert(inst->src[i].nr == 0);
1115          int grf = payload().num_regs +
1116                    prog_data->curb_read_length +
1117                    inst->src[i].offset / REG_SIZE;
1118 
1119          /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
1120           *
1121           * VertStride must be used to cross GRF register boundaries. This
1122           * rule implies that elements within a 'Width' cannot cross GRF
1123           * boundaries.
1124           *
1125           * So, for registers that are large enough, we have to split the exec
1126           * size in two and trust the compression state to sort it out.
1127           */
1128          unsigned total_size = inst->exec_size *
1129                                inst->src[i].stride *
1130                                brw_type_size_bytes(inst->src[i].type);
1131 
1132          assert(total_size <= 2 * REG_SIZE);
1133          const unsigned exec_size =
1134             (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1135 
1136          unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1137          struct brw_reg reg =
1138             stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1139                                inst->src[i].offset % REG_SIZE),
1140                    exec_size * inst->src[i].stride,
1141                    width, inst->src[i].stride);
1142          reg.abs = inst->src[i].abs;
1143          reg.negate = inst->src[i].negate;
1144 
1145          inst->src[i] = reg;
1146       }
1147    }
1148 }
1149 
1150 int
1151 brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
1152                                 const brw_stage_prog_data *prog_data)
1153 {
1154    if (prog_data->nr_params == 0)
1155       return -1;
1156 
1157    if (devinfo->verx10 >= 125)
1158       return -1;
1159 
1160    /* The local thread id is always the last parameter in the list */
1161    uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1162    if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
1163       return prog_data->nr_params - 1;
1164 
1165    return -1;
1166 }
1167 
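/* Choose the render target write message control value, based on whether
 * dual-source blending is enabled and on the execution size and group of
 * the framebuffer write instruction.
 */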
1168 uint32_t
1169 brw_fb_write_msg_control(const fs_inst *inst,
1170                          const struct brw_wm_prog_data *prog_data)
1171 {
1172    uint32_t mctl;
1173 
1174    if (prog_data->dual_src_blend) {
1175       assert(inst->exec_size < 32);
1176 
1177       if (inst->group % 16 == 0)
1178          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
1179       else if (inst->group % 16 == 8)
1180          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
1181       else
1182          unreachable("Invalid dual-source FB write instruction group");
1183    } else {
1184       assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
1185 
1186       if (inst->exec_size == 16)
1187          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
1188       else if (inst->exec_size == 8)
1189          mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
1190       else if (inst->exec_size == 32)
1191          mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
1192       else
1193          unreachable("Invalid FB write execution size");
1194    }
1195 
1196    return mctl;
1197 }
1198 
1199 brw::register_pressure::register_pressure(const fs_visitor *v)
1200 {
1201    const fs_live_variables &live = v->live_analysis.require();
1202    const unsigned num_instructions = v->cfg->num_blocks ?
1203       v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
1204 
1205    regs_live_at_ip = new unsigned[num_instructions]();
1206 
1207    for (unsigned reg = 0; reg < v->alloc.count; reg++) {
1208       for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
1209          regs_live_at_ip[ip] += v->alloc.sizes[reg];
1210    }
1211 
1212    const unsigned payload_count = v->first_non_payload_grf;
1213 
1214    int *payload_last_use_ip = new int[payload_count];
1215    v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
1216 
1217    for (unsigned reg = 0; reg < payload_count; reg++) {
1218       for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
1219          ++regs_live_at_ip[ip];
1220    }
1221 
1222    delete[] payload_last_use_ip;
1223 }
1224 
1225 brw::register_pressure::~register_pressure()
1226 {
1227    delete[] regs_live_at_ip;
1228 }
1229 
1230 void
1231 fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
1232 {
1233    live_analysis.invalidate(c);
1234    regpressure_analysis.invalidate(c);
1235    performance_analysis.invalidate(c);
1236    idom_analysis.invalidate(c);
1237    def_analysis.invalidate(c);
1238 }
1239 
1240 void
1241 fs_visitor::debug_optimizer(const nir_shader *nir,
1242                             const char *pass_name,
1243                             int iteration, int pass_num) const
1244 {
1245    if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
1246       return;
1247 
1248    char *filename;
1249    int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
1250                       debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
1251                       _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
1252                       iteration, pass_num, pass_name);
1253    if (ret == -1)
1254       return;
1255 
1256    FILE *file = stderr;
1257    if (__normal_user()) {
1258       file = fopen(filename, "w");
1259       if (!file)
1260          file = stderr;
1261    }
1262 
1263    brw_print_instructions(*this, file);
1264 
1265    if (file != stderr)
1266       fclose(file);
1267 
1268    free(filename);
1269 }
1270 
1271 static uint32_t
1272 brw_compute_max_register_pressure(fs_visitor &s)
1273 {
1274    const register_pressure &rp = s.regpressure_analysis.require();
1275    uint32_t ip = 0, max_pressure = 0;
1276    foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
1277       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
1278       ip++;
1279    }
1280    return max_pressure;
1281 }
1282 
1283 static fs_inst **
1284 save_instruction_order(const struct cfg_t *cfg)
1285 {
1286    /* Before we schedule anything, stash off the instruction order as an array
1287     * of fs_inst *.  This way, we can reset it between scheduling passes to
1288     * prevent dependencies between the different scheduling modes.
1289     */
1290    int num_insts = cfg->last_block()->end_ip + 1;
1291    fs_inst **inst_arr = new fs_inst * [num_insts];
1292 
1293    int ip = 0;
1294    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1295       assert(ip >= block->start_ip && ip <= block->end_ip);
1296       inst_arr[ip++] = inst;
1297    }
1298    assert(ip == num_insts);
1299 
1300    return inst_arr;
1301 }
1302 
1303 static void
1304 restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
1305 {
1306    ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
1307 
1308    int ip = 0;
1309    foreach_block (block, cfg) {
1310       block->instructions.make_empty();
1311 
1312       assert(ip == block->start_ip);
1313       for (; ip <= block->end_ip; ip++)
1314          block->instructions.push_tail(inst_arr[ip]);
1315    }
1316    assert(ip == num_insts);
1317 }
1318 
1319 /* Per-thread scratch space is a power-of-two multiple of 1KB. */
1320 static inline unsigned
1321 brw_get_scratch_size(int size)
1322 {
1323    return MAX2(1024, util_next_power_of_two(size));
1324 }
1325 
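/* Register allocation driver: try each pre-RA scheduling heuristic until
 * one register-allocates without spilling, otherwise fall back to the
 * ordering with the lowest register pressure and allow spilling.  Then run
 * the post-RA passes (bank conflict mitigation, post-RA scheduling, VGRF
 * lowering, SWSB/scoreboard lowering) and size the scratch space.
 */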
1326 void
1327 brw_allocate_registers(fs_visitor &s, bool allow_spilling)
1328 {
1329    const struct intel_device_info *devinfo = s.devinfo;
1330    const nir_shader *nir = s.nir;
1331    bool allocated;
1332 
1333    static const enum brw_instruction_scheduler_mode pre_modes[] = {
1334       BRW_SCHEDULE_PRE,
1335       BRW_SCHEDULE_PRE_NON_LIFO,
1336       BRW_SCHEDULE_NONE,
1337       BRW_SCHEDULE_PRE_LIFO,
1338    };
1339 
1340    static const char *scheduler_mode_name[] = {
1341       [BRW_SCHEDULE_PRE] = "top-down",
1342       [BRW_SCHEDULE_PRE_NON_LIFO] = "non-lifo",
1343       [BRW_SCHEDULE_PRE_LIFO] = "lifo",
1344       [BRW_SCHEDULE_POST] = "post",
1345       [BRW_SCHEDULE_NONE] = "none",
1346    };
1347 
1348    uint32_t best_register_pressure = UINT32_MAX;
1349    enum brw_instruction_scheduler_mode best_sched = BRW_SCHEDULE_NONE;
1350 
1351    brw_opt_compact_virtual_grfs(s);
1352 
1353    if (s.needs_register_pressure)
1354       s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
1355 
1356    s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
1357 
1358    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
1359 
1360    /* Before we schedule anything, stash off the instruction order as an array
1361     * of fs_inst *.  This way, we can reset it between scheduling passes to
1362     * prevent dependencies between the different scheduling modes.
1363     */
1364    fs_inst **orig_order = save_instruction_order(s.cfg);
1365    fs_inst **best_pressure_order = NULL;
1366 
1367    void *scheduler_ctx = ralloc_context(NULL);
1368    brw_instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
1369 
1370    /* Try each scheduling heuristic to see if it can successfully register
1371     * allocate without spilling.  They should be ordered by decreasing
1372     * performance but increasing likelihood of allocating.
1373     */
1374    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
1375       enum brw_instruction_scheduler_mode sched_mode = pre_modes[i];
1376 
1377       brw_schedule_instructions_pre_ra(s, sched, sched_mode);
1378       s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
1379 
1380       s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
1381 
1382       if (0) {
1383          brw_assign_regs_trivial(s);
1384          allocated = true;
1385          break;
1386       }
1387 
1388       /* We should only spill registers on the last scheduling. */
1389       assert(!s.spilled_any_registers);
1390 
1391       allocated = brw_assign_regs(s, false, spill_all);
1392       if (allocated)
1393          break;
1394 
1395       /* Save the maximum register pressure */
1396       uint32_t this_pressure = brw_compute_max_register_pressure(s);
1397 
1398       if (0) {
1399          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
1400                  scheduler_mode_name[sched_mode], this_pressure);
1401       }
1402 
1403       if (this_pressure < best_register_pressure) {
1404          best_register_pressure = this_pressure;
1405          best_sched = sched_mode;
1406          delete[] best_pressure_order;
1407          best_pressure_order = save_instruction_order(s.cfg);
1408       }
1409 
1410       /* Reset back to the original order before trying the next mode */
1411       restore_instruction_order(s.cfg, orig_order);
1412       s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1413    }
1414 
1415    ralloc_free(scheduler_ctx);
1416 
1417    if (!allocated) {
1418       if (0) {
1419          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
1420                  scheduler_mode_name[best_sched]);
1421       }
1422       restore_instruction_order(s.cfg, best_pressure_order);
1423       s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
1424 
1425       allocated = brw_assign_regs(s, allow_spilling, spill_all);
1426    }
1427 
1428    delete[] orig_order;
1429    delete[] best_pressure_order;
1430 
1431    if (!allocated) {
1432       s.fail("Failure to register allocate.  Reduce number of "
1433            "live scalar values to avoid this.");
1434    } else if (s.spilled_any_registers) {
1435       brw_shader_perf_log(s.compiler, s.log_data,
1436                           "%s shader triggered register spilling.  "
1437                           "Try reducing the number of live scalar "
1438                           "values to improve performance.\n",
1439                           _mesa_shader_stage_to_string(s.stage));
1440    }
1441 
1442    if (s.failed)
1443       return;
1444 
1445    int pass_num = 0;
1446 
1447    s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);
1448 
1449    brw_opt_bank_conflicts(s);
1450 
1451    s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);
1452 
1453    brw_schedule_instructions_post_ra(s);
1454 
1455    s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);
1456 
1457    /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
1458     * of part of assign_regs since both bank conflicts optimization and post
1459     * RA scheduling take advantage of distinguishing references to registers
1460     * that were allocated from references that were already fixed.
1461     *
1462     * TODO: Change the passes above, then move this lowering to be part of
1463     * assign_regs.
1464     */
1465    brw_lower_vgrfs_to_fixed_grfs(s);
1466 
1467    s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
1468 
1469    if (s.devinfo->ver >= 30) {
1470       brw_lower_send_gather(s);
1471       s.debug_optimizer(nir, "lower_send_gather", 96, pass_num++);
1472    }
1473 
1474    brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);
1475 
1476    if (s.last_scratch > 0) {
1477       /* We currently only support up to 2MB of scratch space.  If we
1478        * need to support more eventually, the documentation suggests
1479        * that we could allocate a larger buffer, and partition it out
1480        * ourselves.  We'd just have to undo the hardware's address
1481        * calculation by subtracting (FFTID * Per Thread Scratch Space)
1482        * and then add FFTID * (Larger Per Thread Scratch Space).
1483        *
1484        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
1485        * Thread Group Tracking > Local Memory/Scratch Space.
1486        */
1487       if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
1488          /* Take the max of any previously compiled variant of the shader. In the
1489           * case of bindless shaders with return parts, this will also take the
1490           * max of all parts.
1491           */
1492          s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
1493                                            s.prog_data->total_scratch);
1494       } else {
1495          s.fail("Scratch space required is larger than supported");
1496       }
1497    }
1498 
1499    if (s.failed)
1500       return;
1501 
1502    brw_lower_scoreboard(s);
1503 
1504    s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
1505 }
1506 
1507 unsigned
1508 brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
1509                              unsigned threads)
1510 {
1511    assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
1512    assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
1513    return cs_prog_data->push.per_thread.size * threads +
1514           cs_prog_data->push.cross_thread.size;
1515 }
1516 
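/* Compute workgroup dispatch parameters: the SIMD width selected for this
 * workgroup size, the number of HW threads required to cover the group,
 * and the channel enable mask for the final, possibly partial, thread.
 */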
1517 struct intel_cs_dispatch_info
1518 brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
1519                          const struct brw_cs_prog_data *prog_data,
1520                          const unsigned *override_local_size)
1521 {
1522    struct intel_cs_dispatch_info info = {};
1523 
1524    const unsigned *sizes =
1525       override_local_size ? override_local_size :
1526                             prog_data->local_size;
1527 
1528    const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
1529    assert(simd >= 0 && simd < 3);
1530 
1531    info.group_size = sizes[0] * sizes[1] * sizes[2];
1532    info.simd_size = 8u << simd;
1533    info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
1534 
1535    const uint32_t remainder = info.group_size & (info.simd_size - 1);
1536    if (remainder > 0)
1537       info.right_mask = ~0u >> (32 - remainder);
1538    else
1539       info.right_mask = ~0u >> (32 - info.simd_size);
1540 
1541    return info;
1542 }
1543 
1544 void
1545 brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
1546 {
1547    assert(phase == s.phase + 1);
1548    s.phase = phase;
1549    brw_validate(s);
1550 }
1551 
1552 bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
1553 {
1554    return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
1555 }
1556