1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file elk_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "elk_eu.h"
32 #include "elk_fs.h"
33 #include "elk_fs_builder.h"
34 #include "elk_fs_live_variables.h"
35 #include "elk_nir.h"
36 #include "elk_vec4_gs_visitor.h"
37 #include "elk_cfg.h"
38 #include "elk_dead_control_flow.h"
39 #include "elk_private.h"
40 #include "../intel_nir.h"
41 #include "shader_enums.h"
42 #include "dev/intel_debug.h"
43 #include "dev/intel_wa.h"
44 #include "compiler/glsl_types.h"
45 #include "compiler/nir/nir_builder.h"
46 #include "util/u_math.h"
47
48 #include <memory>
49
50 using namespace elk;
51
52 static unsigned get_lowered_simd_width(const elk_fs_visitor *shader,
53 const elk_fs_inst *inst);
54
55 void
56 elk_fs_inst::init(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
57 const elk_fs_reg *src, unsigned sources)
58 {
59 memset((void*)this, 0, sizeof(*this));
60
61 this->src = new elk_fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69 this->base_mrf = -1;
70
71 assert(dst.file != IMM && dst.file != UNIFORM);
72
73 assert(this->exec_size != 0);
74
75 this->conditional_mod = ELK_CONDITIONAL_NONE;
76
77 /* This will be the case for almost all instructions. */
78 switch (dst.file) {
79 case VGRF:
80 case ARF:
81 case FIXED_GRF:
82 case MRF:
83 case ATTR:
84 this->size_written = dst.component_size(exec_size);
85 break;
86 case BAD_FILE:
87 this->size_written = 0;
88 break;
89 case IMM:
90 case UNIFORM:
91 unreachable("Invalid destination register file");
92 }
93
94 this->writes_accumulator = false;
95 }
96
97 elk_fs_inst::elk_fs_inst()
98 {
99 init(ELK_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101
102 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size)
103 {
104 init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106
107 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst)
108 {
109 init(opcode, exec_size, dst, NULL, 0);
110 }
111
112 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
113 const elk_fs_reg &src0)
114 {
115 const elk_fs_reg src[1] = { src0 };
116 init(opcode, exec_size, dst, src, 1);
117 }
118
119 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
120 const elk_fs_reg &src0, const elk_fs_reg &src1)
121 {
122 const elk_fs_reg src[2] = { src0, src1 };
123 init(opcode, exec_size, dst, src, 2);
124 }
125
126 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
127 const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2)
128 {
129 const elk_fs_reg src[3] = { src0, src1, src2 };
130 init(opcode, exec_size, dst, src, 3);
131 }
132
133 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
134 const elk_fs_reg src[], unsigned sources)
135 {
136 init(opcode, exec_width, dst, src, sources);
137 }
138
139 elk_fs_inst::elk_fs_inst(const elk_fs_inst &that)
140 {
141 memcpy((void*)this, &that, sizeof(that));
142
143 this->src = new elk_fs_reg[MAX2(that.sources, 3)];
144
145 for (unsigned i = 0; i < that.sources; i++)
146 this->src[i] = that.src[i];
147 }
148
149 elk_fs_inst::~elk_fs_inst()
150 {
151 delete[] this->src;
152 }
153
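/* Grow or shrink the source array to num_sources entries, preserving any
 * existing sources that remain in range.
 */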
154 void
155 elk_fs_inst::resize_sources(uint8_t num_sources)
156 {
157 if (this->sources != num_sources) {
158 elk_fs_reg *src = new elk_fs_reg[MAX2(num_sources, 3)];
159
160 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161 src[i] = this->src[i];
162
163 delete[] this->src;
164 this->src = src;
165 this->sources = num_sources;
166 }
167 }
168
169 void
170 elk_fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171 const elk_fs_reg &dst,
172 const elk_fs_reg &surface,
173 const elk_fs_reg &surface_handle,
174 const elk_fs_reg &varying_offset,
175 uint32_t const_offset,
176 uint8_t alignment,
177 unsigned components)
178 {
179 assert(components <= 4);
180
181 /* Our constant surface uses a pitch of 4 bytes, so our index can be any
182 * component of a vector, and we then load 4 contiguous components
183 * starting from it. TODO: Support loading fewer than 4.
184 */
185 elk_fs_reg total_offset = vgrf(glsl_uint_type());
186 bld.ADD(total_offset, varying_offset, elk_imm_ud(const_offset));
187
188 /* The pull load message will load a vec4 (16 bytes). If we are loading
189 * a double this means we are only loading 2 elements worth of data.
190 * We also want to use a 32-bit data type for the dst of the load operation
191 * so other parts of the driver don't get confused about the size of the
192 * result.
193 */
194 elk_fs_reg vec4_result = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
195
196 elk_fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
197 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
198 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
199 srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset;
200 srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = elk_imm_ud(alignment);
201
202 elk_fs_inst *inst = bld.emit(ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
203 vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
204 inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
205
206 elk_shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
207 }
208
209 /**
210 * A helper for MOV generation for fixing up broken hardware SEND dependency
211 * handling.
212 */
213 void
214 elk_fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
215 {
216 /* The caller always wants uncompressed to emit the minimal extra
217 * dependencies, and to avoid having to deal with aligning its regs to 2.
218 */
219 const fs_builder ubld = bld.annotate("send dependency resolve")
220 .quarter(0);
221
222 ubld.MOV(ubld.null_reg_f(), elk_fs_reg(VGRF, grf, ELK_REGISTER_TYPE_F));
223 }
224
225 bool
226 elk_fs_inst::is_send_from_grf() const
227 {
228 switch (opcode) {
229 case ELK_SHADER_OPCODE_SEND:
230 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
231 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
232 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
233 case ELK_SHADER_OPCODE_INTERLOCK:
234 case ELK_SHADER_OPCODE_MEMORY_FENCE:
235 case ELK_SHADER_OPCODE_BARRIER:
236 return true;
237 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238 return src[1].file == VGRF;
239 case ELK_FS_OPCODE_FB_WRITE:
240 case ELK_FS_OPCODE_FB_READ:
241 return src[0].file == VGRF;
242 default:
243 return false;
244 }
245 }
246
247 bool
248 elk_fs_inst::is_control_source(unsigned arg) const
249 {
250 switch (opcode) {
251 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
252 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
253 return arg == 0;
254
255 case ELK_SHADER_OPCODE_BROADCAST:
256 case ELK_SHADER_OPCODE_SHUFFLE:
257 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
258 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
259 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
260 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
261 return arg == 1;
262
263 case ELK_SHADER_OPCODE_MOV_INDIRECT:
264 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
265 case ELK_SHADER_OPCODE_TEX:
266 case ELK_FS_OPCODE_TXB:
267 case ELK_SHADER_OPCODE_TXD:
268 case ELK_SHADER_OPCODE_TXF:
269 case ELK_SHADER_OPCODE_TXF_LZ:
270 case ELK_SHADER_OPCODE_TXF_CMS:
271 case ELK_SHADER_OPCODE_TXF_CMS_W:
272 case ELK_SHADER_OPCODE_TXF_UMS:
273 case ELK_SHADER_OPCODE_TXF_MCS:
274 case ELK_SHADER_OPCODE_TXL:
275 case ELK_SHADER_OPCODE_TXL_LZ:
276 case ELK_SHADER_OPCODE_TXS:
277 case ELK_SHADER_OPCODE_LOD:
278 case ELK_SHADER_OPCODE_TG4:
279 case ELK_SHADER_OPCODE_TG4_OFFSET:
280 case ELK_SHADER_OPCODE_SAMPLEINFO:
281 return arg == 1 || arg == 2;
282
283 case ELK_SHADER_OPCODE_SEND:
284 return arg == 0 || arg == 1;
285
286 default:
287 return false;
288 }
289 }
290
291 bool
292 elk_fs_inst::is_payload(unsigned arg) const
293 {
294 switch (opcode) {
295 case ELK_FS_OPCODE_FB_WRITE:
296 case ELK_FS_OPCODE_FB_READ:
297 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
298 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
299 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
300 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
301 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
302 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
303 case ELK_SHADER_OPCODE_INTERLOCK:
304 case ELK_SHADER_OPCODE_MEMORY_FENCE:
305 case ELK_SHADER_OPCODE_BARRIER:
306 case ELK_SHADER_OPCODE_TEX:
307 case ELK_FS_OPCODE_TXB:
308 case ELK_SHADER_OPCODE_TXD:
309 case ELK_SHADER_OPCODE_TXF:
310 case ELK_SHADER_OPCODE_TXF_LZ:
311 case ELK_SHADER_OPCODE_TXF_CMS:
312 case ELK_SHADER_OPCODE_TXF_CMS_W:
313 case ELK_SHADER_OPCODE_TXF_UMS:
314 case ELK_SHADER_OPCODE_TXF_MCS:
315 case ELK_SHADER_OPCODE_TXL:
316 case ELK_SHADER_OPCODE_TXL_LZ:
317 case ELK_SHADER_OPCODE_TXS:
318 case ELK_SHADER_OPCODE_LOD:
319 case ELK_SHADER_OPCODE_TG4:
320 case ELK_SHADER_OPCODE_TG4_OFFSET:
321 case ELK_SHADER_OPCODE_SAMPLEINFO:
322 return arg == 0;
323
324 case ELK_SHADER_OPCODE_SEND:
325 return arg == 2 || arg == 3;
326
327 default:
328 return false;
329 }
330 }
331
332 /**
333 * Returns true if this instruction's sources and destinations cannot
334 * safely be the same register.
335 *
336 * In most cases, a register can be written over safely by the same
337 * instruction that is its last use. For a single instruction, the
338 * sources are dereferenced before writing of the destination starts
339 * (naturally).
340 *
341 * However, there are a few cases where this can be problematic:
342 *
343 * - Virtual opcodes that translate to multiple instructions in the
344 * code generator: if src == dst and one instruction writes the
345 * destination before a later instruction reads the source, then
346 * src will have been clobbered.
347 *
348 * - SIMD16 compressed instructions with certain regioning (see below).
349 *
350 * The register allocator uses this information to set up conflicts between
351 * GRF sources and the destination.
352 */
353 bool
354 elk_fs_inst::has_source_and_destination_hazard() const
355 {
356 switch (opcode) {
357 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
358 /* Multiple partial writes to the destination */
359 return true;
360 case ELK_SHADER_OPCODE_SHUFFLE:
361 /* This instruction returns an arbitrary channel from the source and
362 * gets split into smaller instructions in the generator. It's possible
363 * that one of the instructions will read from a channel corresponding
364 * to an earlier instruction.
365 */
366 case ELK_SHADER_OPCODE_SEL_EXEC:
367 /* This is implemented as
368 *
369 * mov(16) g4<1>D 0D { align1 WE_all 1H };
370 * mov(16) g4<1>D g5<8,8,1>D { align1 1H }
371 *
372 * Because the source is only read in the second instruction, the first
373 * may stomp all over it.
374 */
375 return true;
376 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
377 switch (src[1].ud) {
378 case ELK_SWIZZLE_XXXX:
379 case ELK_SWIZZLE_YYYY:
380 case ELK_SWIZZLE_ZZZZ:
381 case ELK_SWIZZLE_WWWW:
382 case ELK_SWIZZLE_XXZZ:
383 case ELK_SWIZZLE_YYWW:
384 case ELK_SWIZZLE_XYXY:
385 case ELK_SWIZZLE_ZWZW:
386 /* These can be implemented as a single Align1 region on all
387 * platforms, so there's never a hazard between source and
388 * destination. C.f. elk_fs_generator::generate_quad_swizzle().
389 */
390 return false;
391 default:
392 return !is_uniform(src[0]);
393 }
394 case ELK_OPCODE_DPAS:
395 /* This is overly conservative. The actual hazard is more complicated to
396 * describe. When the repeat count is N, the single instruction behaves
397 * like N instructions with a repeat count of one, but the destination
398 * and source registers are incremented (in somewhat complex ways) for
399 * each instruction.
400 *
401 * This means the source and destination registers are actually ranges of
402 * registers. The hazard exists if an earlier iteration would write a
403 * register that should be read by a later iteration.
404 *
405 * There may be some advantage to properly modeling this, but for now,
406 * be overly conservative.
407 */
408 return rcount > 1;
409 default:
410 /* The SIMD16 compressed instruction
411 *
412 * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
413 *
414 * is actually decoded in hardware as:
415 *
416 * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
417 * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
418 *
419 * Which is safe. However, if we have uniform accesses
420 * happening, we get into trouble:
421 *
422 * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
423 * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
424 *
425 * Now our destination for the first instruction overwrote the
426 * second instruction's src0, and we get garbage for those 8
427 * pixels. There's a similar issue for the pre-gfx6
428 * pixel_x/pixel_y, which are registers of 16-bit values and thus
429 * would get stomped by the first decode as well.
430 */
431 if (exec_size == 16) {
432 for (int i = 0; i < sources; i++) {
433 if (src[i].file == VGRF && (src[i].stride == 0 ||
434 src[i].type == ELK_REGISTER_TYPE_UW ||
435 src[i].type == ELK_REGISTER_TYPE_W ||
436 src[i].type == ELK_REGISTER_TYPE_UB ||
437 src[i].type == ELK_REGISTER_TYPE_B)) {
438 return true;
439 }
440 }
441 }
442 return false;
443 }
444 }
445
446 bool
447 elk_fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
448 {
449 if (devinfo->ver == 6 && is_math())
450 return false;
451
452 if (is_send_from_grf())
453 return false;
454
455 /* From Wa_1604601757:
456 *
457 * "When multiplying a DW and any lower precision integer, source modifier
458 * is not supported."
459 */
460 if (devinfo->ver >= 12 && (opcode == ELK_OPCODE_MUL ||
461 opcode == ELK_OPCODE_MAD)) {
462 const elk_reg_type exec_type = get_exec_type(this);
463 const unsigned min_type_sz = opcode == ELK_OPCODE_MAD ?
464 MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
465 MIN2(type_sz(src[0].type), type_sz(src[1].type));
466
467 if (elk_reg_type_is_integer(exec_type) &&
468 type_sz(exec_type) >= 4 &&
469 type_sz(exec_type) != min_type_sz)
470 return false;
471 }
472
473 if (!elk_backend_instruction::can_do_source_mods())
474 return false;
475
476 return true;
477 }
478
479 bool
480 elk_fs_inst::can_do_cmod()
481 {
482 if (!elk_backend_instruction::can_do_cmod())
483 return false;
484
485 /* The accumulator result appears to get used for the conditional modifier
486 * generation. When negating a UD value, there is a 33rd bit generated for
487 * the sign in the accumulator value, so now you can't check, for example,
488 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
489 */
490 for (unsigned i = 0; i < sources; i++) {
491 if (elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
492 return false;
493 }
494
495 return true;
496 }
497
498 bool
499 elk_fs_inst::can_change_types() const
500 {
501 return dst.type == src[0].type &&
502 !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
503 (opcode == ELK_OPCODE_MOV ||
504 (opcode == ELK_OPCODE_SEL &&
505 dst.type == src[1].type &&
506 predicate != ELK_PREDICATE_NONE &&
507 !src[1].abs && !src[1].negate && src[1].file != ATTR));
508 }
509
510 void
511 elk_fs_reg::init()
512 {
513 memset((void*)this, 0, sizeof(*this));
514 type = ELK_REGISTER_TYPE_UD;
515 stride = 1;
516 }
517
518 /** Generic unset register constructor. */
519 elk_fs_reg::elk_fs_reg()
520 {
521 init();
522 this->file = BAD_FILE;
523 }
524
525 elk_fs_reg::elk_fs_reg(struct ::elk_reg reg) :
526 elk_backend_reg(reg)
527 {
528 this->offset = 0;
529 this->stride = 1;
530 if (this->file == IMM &&
531 (this->type != ELK_REGISTER_TYPE_V &&
532 this->type != ELK_REGISTER_TYPE_UV &&
533 this->type != ELK_REGISTER_TYPE_VF)) {
534 this->stride = 0;
535 }
536 }
537
538 bool
539 elk_fs_reg::equals(const elk_fs_reg &r) const
540 {
541 return (this->elk_backend_reg::equals(r) &&
542 stride == r.stride);
543 }
544
545 bool
546 elk_fs_reg::negative_equals(const elk_fs_reg &r) const
547 {
548 return (this->elk_backend_reg::negative_equals(r) &&
549 stride == r.stride);
550 }
551
552 bool
553 elk_fs_reg::is_contiguous() const
554 {
555 switch (file) {
556 case ARF:
557 case FIXED_GRF:
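/* A fixed region is contiguous when its rows tile seamlessly, i.e. the
 * encoded vstride equals the encoded width plus the encoded hstride.
 * E.g. (illustrative) <8;8,1> encodes as vstride 4, width 3, hstride 1,
 * and 4 == 3 + 1.
 */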
558 return hstride == ELK_HORIZONTAL_STRIDE_1 &&
559 vstride == width + hstride;
560 case MRF:
561 case VGRF:
562 case ATTR:
563 return stride == 1;
564 case UNIFORM:
565 case IMM:
566 case BAD_FILE:
567 return true;
568 }
569
570 unreachable("Invalid register file");
571 }
572
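/* Size in bytes of the region read or written when this register is
 * accessed with the given execution width.
 *
 * For example (illustrative): a FIXED_GRF <8;8,1>:F region accessed at
 * width 16 decodes to w = 8, h = 2, vs = 8, hs = 1, giving
 * ((2 - 1) * 8 + (8 - 1) * 1 + 1) * 4 = 64 bytes; a VGRF with stride 1
 * and type F at width 16 is simply 16 * 1 * 4 = 64 bytes.
 */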
573 unsigned
574 elk_fs_reg::component_size(unsigned width) const
575 {
576 if (file == ARF || file == FIXED_GRF) {
577 const unsigned w = MIN2(width, 1u << this->width);
578 const unsigned h = width >> this->width;
579 const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
580 const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
581 assert(w > 0);
582 return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
583 } else {
584 return MAX2(width * stride, 1) * type_sz(type);
585 }
586 }
587
588 void
589 elk_fs_visitor::vfail(const char *format, va_list va)
590 {
591 char *msg;
592
593 if (failed)
594 return;
595
596 failed = true;
597
598 msg = ralloc_vasprintf(mem_ctx, format, va);
599 msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
600 dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
601
602 this->fail_msg = msg;
603
604 if (unlikely(debug_enabled)) {
605 fprintf(stderr, "%s", msg);
606 }
607 }
608
609 void
610 elk_fs_visitor::fail(const char *format, ...)
611 {
612 va_list va;
613
614 va_start(va, format);
615 vfail(format, va);
616 va_end(va);
617 }
618
619 /**
620 * Mark this program as impossible to compile with dispatch width greater
621 * than n.
622 *
623 * During the SIMD8 compile (which happens first), we can detect and flag
624 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
625 * SIMD16+ compile altogether.
626 *
627 * During a compile of dispatch width greater than n (if one happens anyway),
628 * this just calls fail().
629 */
630 void
631 elk_fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
632 {
633 if (dispatch_width > n) {
634 fail("%s", msg);
635 } else {
636 max_dispatch_width = MIN2(max_dispatch_width, n);
637 elk_shader_perf_log(compiler, log_data,
638 "Shader dispatch width limited to SIMD%d: %s\n",
639 n, msg);
640 }
641 }
642
643 /**
644 * Returns true if the instruction has a flag that means it won't
645 * update an entire destination register.
646 *
647 * For example, dead code elimination and live variable analysis want to know
648 * when a write to a variable screens off any preceding values that were in
649 * it.
650 */
651 bool
652 elk_fs_inst::is_partial_write() const
653 {
654 if (this->predicate && !this->predicate_trivial &&
655 this->opcode != ELK_OPCODE_SEL)
656 return true;
657
658 if (this->dst.offset % REG_SIZE != 0)
659 return true;
660
661 /* SEND instructions always write whole registers */
662 if (this->opcode == ELK_SHADER_OPCODE_SEND)
663 return false;
664
665 /* Special case UNDEF since a lot of places in the backend do things like this:
666 *
667 * fs_builder ubld = bld.exec_all().group(1, 0);
668 * elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
669 * ubld.UNDEF(tmp); <- partial write, even if the whole register is concerned
670 */
671 if (this->opcode == ELK_SHADER_OPCODE_UNDEF) {
672 assert(this->dst.is_contiguous());
673 return this->size_written < 32;
674 }
675
676 return this->exec_size * type_sz(this->dst.type) < 32 ||
677 !this->dst.is_contiguous();
678 }
679
680 unsigned
681 elk_fs_inst::components_read(unsigned i) const
682 {
683 /* Return zero if the source is not present. */
684 if (src[i].file == BAD_FILE)
685 return 0;
686
687 switch (opcode) {
688 case ELK_FS_OPCODE_LINTERP:
689 if (i == 0)
690 return 2;
691 else
692 return 1;
693
694 case ELK_FS_OPCODE_PIXEL_X:
695 case ELK_FS_OPCODE_PIXEL_Y:
696 assert(i < 2);
697 if (i == 0)
698 return 2;
699 else
700 return 1;
701
702 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
703 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
704 /* First/second FB write color. */
705 if (i < 2)
706 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
707 else
708 return 1;
709
710 case ELK_SHADER_OPCODE_TEX_LOGICAL:
711 case ELK_SHADER_OPCODE_TXD_LOGICAL:
712 case ELK_SHADER_OPCODE_TXF_LOGICAL:
713 case ELK_SHADER_OPCODE_TXL_LOGICAL:
714 case ELK_SHADER_OPCODE_TXS_LOGICAL:
715 case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
716 case ELK_FS_OPCODE_TXB_LOGICAL:
717 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
718 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
719 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
720 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
721 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
722 case ELK_SHADER_OPCODE_LOD_LOGICAL:
723 case ELK_SHADER_OPCODE_TG4_LOGICAL:
724 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
725 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
726 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
727 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
728 src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
729 /* Texture coordinates. */
730 if (i == TEX_LOGICAL_SRC_COORDINATE)
731 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
732 /* Texture derivatives. */
733 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
734 opcode == ELK_SHADER_OPCODE_TXD_LOGICAL)
735 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
736 /* Texture offset. */
737 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
738 return 2;
739 /* MCS */
740 else if (i == TEX_LOGICAL_SRC_MCS) {
741 if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL)
742 return 2;
743 else if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
744 return 4;
745 else
746 return 1;
747 } else
748 return 1;
749
750 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
751 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
752 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
753 /* Surface coordinates. */
754 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
755 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
756 /* Surface operation source (ignored for reads). */
757 else if (i == SURFACE_LOGICAL_SRC_DATA)
758 return 0;
759 else
760 return 1;
761
762 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
763 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
764 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
765 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
766 /* Surface coordinates. */
767 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
768 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
769 /* Surface operation source. */
770 else if (i == SURFACE_LOGICAL_SRC_DATA)
771 return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
772 else
773 return 1;
774
775 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
776 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
777 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
778 assert(src[A64_LOGICAL_ARG].file == IMM);
779 return 1;
780
781 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
782 assert(src[A64_LOGICAL_ARG].file == IMM);
783 if (i == A64_LOGICAL_SRC) { /* data to write */
784 const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
785 assert(comps > 0);
786 return comps;
787 } else {
788 return 1;
789 }
790
791 case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
792 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
793 return 1;
794
795 case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
796 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
797 if (i == SURFACE_LOGICAL_SRC_DATA) {
798 const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
799 assert(comps > 0);
800 return comps;
801 } else {
802 return 1;
803 }
804
805 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
806 assert(src[A64_LOGICAL_ARG].file == IMM);
807 return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
808
809 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
810 assert(src[A64_LOGICAL_ARG].file == IMM);
811 return i == A64_LOGICAL_SRC ?
812 lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
813
814 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
815 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
816 /* Scattered logical opcodes use the following params:
817 * src[0] Surface coordinates
818 * src[1] Surface operation source (ignored for reads)
819 * src[2] Surface
820 * src[3] IMM with the number of dimensions (always 1).
821 * src[4] IMM with the bit size (8, 16 or 32) for the scattered read/write.
822 */
823 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
824 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
825 return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
826
827 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
828 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
829 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
830 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
831 return 1;
832
833 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
834 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
835 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
836 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
837 const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
838 /* Surface coordinates. */
839 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
840 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
841 /* Surface operation source. */
842 else if (i == SURFACE_LOGICAL_SRC_DATA)
843 return lsc_op_num_data_values(op);
844 else
845 return 1;
846 }
847 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
848 return (i == 0 ? 2 : 1);
849
850 case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
851 assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
852
853 if (i == URB_LOGICAL_SRC_DATA)
854 return src[URB_LOGICAL_SRC_COMPONENTS].ud;
855 else
856 return 1;
857
858 case ELK_OPCODE_DPAS:
859 unreachable("Do not use components_read() for DPAS.");
860
861 default:
862 return 1;
863 }
864 }
865
866 unsigned
867 elk_fs_inst::size_read(int arg) const
868 {
869 switch (opcode) {
870 case ELK_SHADER_OPCODE_SEND:
871 if (arg == 2) {
872 return mlen * REG_SIZE;
873 } else if (arg == 3) {
874 return ex_mlen * REG_SIZE;
875 }
876 break;
877
878 case ELK_FS_OPCODE_FB_WRITE:
879 case ELK_FS_OPCODE_REP_FB_WRITE:
880 if (arg == 0) {
881 if (base_mrf >= 0)
882 return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
883 else
884 return mlen * REG_SIZE;
885 }
886 break;
887
888 case ELK_FS_OPCODE_FB_READ:
889 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
890 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
891 if (arg == 0)
892 return mlen * REG_SIZE;
893 break;
894
895 case ELK_FS_OPCODE_SET_SAMPLE_ID:
896 if (arg == 1)
897 return 1;
898 break;
899
900 case ELK_FS_OPCODE_LINTERP:
901 if (arg == 1)
902 return 16;
903 break;
904
905 case ELK_SHADER_OPCODE_LOAD_PAYLOAD:
906 if (arg < this->header_size)
907 return retype(src[arg], ELK_REGISTER_TYPE_UD).component_size(8);
908 break;
909
910 case ELK_CS_OPCODE_CS_TERMINATE:
911 case ELK_SHADER_OPCODE_BARRIER:
912 return REG_SIZE;
913
914 case ELK_SHADER_OPCODE_MOV_INDIRECT:
915 if (arg == 0) {
916 assert(src[2].file == IMM);
917 return src[2].ud;
918 }
919 break;
920
921 case ELK_OPCODE_DPAS:
922 switch (arg) {
923 case 0:
924 if (src[0].type == ELK_REGISTER_TYPE_HF) {
925 return rcount * REG_SIZE / 2;
926 } else {
927 return rcount * REG_SIZE;
928 }
929 case 1:
930 return sdepth * REG_SIZE;
931 case 2:
932 /* This is simpler than the formula described in the Bspec, but it
933 * covers all of the cases that we support on DG2.
934 */
935 return rcount * REG_SIZE;
936 default:
937 unreachable("Invalid source number.");
938 }
939 break;
940
941 case ELK_SHADER_OPCODE_TEX:
942 case ELK_FS_OPCODE_TXB:
943 case ELK_SHADER_OPCODE_TXD:
944 case ELK_SHADER_OPCODE_TXF:
945 case ELK_SHADER_OPCODE_TXF_LZ:
946 case ELK_SHADER_OPCODE_TXF_CMS:
947 case ELK_SHADER_OPCODE_TXF_CMS_W:
948 case ELK_SHADER_OPCODE_TXF_UMS:
949 case ELK_SHADER_OPCODE_TXF_MCS:
950 case ELK_SHADER_OPCODE_TXL:
951 case ELK_SHADER_OPCODE_TXL_LZ:
952 case ELK_SHADER_OPCODE_TXS:
953 case ELK_SHADER_OPCODE_LOD:
954 case ELK_SHADER_OPCODE_TG4:
955 case ELK_SHADER_OPCODE_TG4_OFFSET:
956 case ELK_SHADER_OPCODE_SAMPLEINFO:
957 if (arg == 0 && src[0].file == VGRF)
958 return mlen * REG_SIZE;
959 break;
960
961 default:
962 break;
963 }
964
965 switch (src[arg].file) {
966 case UNIFORM:
967 case IMM:
968 return components_read(arg) * type_sz(src[arg].type);
969 case BAD_FILE:
970 case ARF:
971 case FIXED_GRF:
972 case VGRF:
973 case ATTR:
974 return components_read(arg) * src[arg].component_size(exec_size);
975 case MRF:
976 unreachable("MRF registers are not allowed as sources");
977 }
978 return 0;
979 }
980
981 namespace {
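/* Number of channels whose flag bits are evaluated together by the given
 * predication mode (e.g. ALIGN1_ANY4H and ALL4H consider groups of four
 * channels).
 */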
982 unsigned
983 predicate_width(const intel_device_info *devinfo, elk_predicate predicate)
984 {
985 if (devinfo->ver >= 20) {
986 return 1;
987 } else {
988 switch (predicate) {
989 case ELK_PREDICATE_NONE: return 1;
990 case ELK_PREDICATE_NORMAL: return 1;
991 case ELK_PREDICATE_ALIGN1_ANY2H: return 2;
992 case ELK_PREDICATE_ALIGN1_ALL2H: return 2;
993 case ELK_PREDICATE_ALIGN1_ANY4H: return 4;
994 case ELK_PREDICATE_ALIGN1_ALL4H: return 4;
995 case ELK_PREDICATE_ALIGN1_ANY8H: return 8;
996 case ELK_PREDICATE_ALIGN1_ALL8H: return 8;
997 case ELK_PREDICATE_ALIGN1_ANY16H: return 16;
998 case ELK_PREDICATE_ALIGN1_ALL16H: return 16;
999 case ELK_PREDICATE_ALIGN1_ANY32H: return 32;
1000 case ELK_PREDICATE_ALIGN1_ALL32H: return 32;
1001 default: unreachable("Unsupported predicate");
1002 }
1003 }
1004 }
1005
1006 /* Return the subset of flag registers that an instruction could
1007 * potentially read or write based on the execution controls and flag
1008 * subregister number of the instruction.
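*
* Each bit of the returned mask covers one byte (8 bits) of the flag
* register file. For instance (illustrative), a SIMD8 instruction in the
* second quarter with a normal predicate on f0.0 gives start = 8 and
* end = 16, i.e. mask 0x2, the upper half of f0.0.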
1009 */
1010 unsigned
1011 flag_mask(const elk_fs_inst *inst, unsigned width)
1012 {
1013 assert(util_is_power_of_two_nonzero(width));
1014 const unsigned start = (inst->flag_subreg * 16 + inst->group) &
1015 ~(width - 1);
1016 const unsigned end = start + ALIGN(inst->exec_size, width);
1017 return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
1018 }
1019
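/* Return a mask with the low n bits set, saturating to all ones when n is
 * at least the number of bits in the mask.
 */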
1020 unsigned
1021 bit_mask(unsigned n)
1022 {
1023 return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
1024 }
1025
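/* Return the bytes of the flag register file touched when sz bytes are
 * accessed starting at register r, or zero for registers outside the ARF
 * file.
 */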
1026 unsigned
1027 flag_mask(const elk_fs_reg &r, unsigned sz)
1028 {
1029 if (r.file == ARF) {
1030 const unsigned start = (r.nr - ELK_ARF_FLAG) * 4 + r.subnr;
1031 const unsigned end = start + sz;
1032 return bit_mask(end) & ~bit_mask(start);
1033 } else {
1034 return 0;
1035 }
1036 }
1037 }
1038
1039 unsigned
1040 elk_fs_inst::flags_read(const intel_device_info *devinfo) const
1041 {
1042 if (devinfo->ver < 20 && (predicate == ELK_PREDICATE_ALIGN1_ANYV ||
1043 predicate == ELK_PREDICATE_ALIGN1_ALLV)) {
1044 /* The vertical predication modes combine corresponding bits from
1045 * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
1046 */
1047 const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
1048 return flag_mask(this, 1) << shift | flag_mask(this, 1);
1049 } else if (predicate) {
1050 return flag_mask(this, predicate_width(devinfo, predicate));
1051 } else {
1052 unsigned mask = 0;
1053 for (int i = 0; i < sources; i++) {
1054 mask |= flag_mask(src[i], size_read(i));
1055 }
1056 return mask;
1057 }
1058 }
1059
1060 unsigned
1061 elk_fs_inst::flags_written(const intel_device_info *devinfo) const
1062 {
1063 /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
1064 * using a separate cmpn and sel instruction. This lowering occurs in
1065 * fs_visitor::lower_minmax, which is called very, very late.
1066 */
1067 if ((conditional_mod && ((opcode != ELK_OPCODE_SEL || devinfo->ver <= 5) &&
1068 opcode != ELK_OPCODE_CSEL &&
1069 opcode != ELK_OPCODE_IF &&
1070 opcode != ELK_OPCODE_WHILE)) ||
1071 opcode == ELK_FS_OPCODE_FB_WRITE) {
1072 return flag_mask(this, 1);
1073 } else if (opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1074 opcode == ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1075 opcode == ELK_FS_OPCODE_LOAD_LIVE_CHANNELS) {
1076 return flag_mask(this, 32);
1077 } else {
1078 return flag_mask(dst, size_written);
1079 }
1080 }
1081
1082 /**
1083 * Returns how many MRFs an FS opcode will write over.
1084 *
1085 * Note that this is not the 0 or 1 implied writes in an actual gen
1086 * instruction -- the FS opcodes often generate MOVs in addition.
1087 */
1088 unsigned
1089 elk_fs_inst::implied_mrf_writes() const
1090 {
1091 if (mlen == 0)
1092 return 0;
1093
1094 if (base_mrf == -1)
1095 return 0;
1096
1097 switch (opcode) {
1098 case ELK_SHADER_OPCODE_RCP:
1099 case ELK_SHADER_OPCODE_RSQ:
1100 case ELK_SHADER_OPCODE_SQRT:
1101 case ELK_SHADER_OPCODE_EXP2:
1102 case ELK_SHADER_OPCODE_LOG2:
1103 case ELK_SHADER_OPCODE_SIN:
1104 case ELK_SHADER_OPCODE_COS:
1105 return 1 * exec_size / 8;
1106 case ELK_SHADER_OPCODE_POW:
1107 case ELK_SHADER_OPCODE_INT_QUOTIENT:
1108 case ELK_SHADER_OPCODE_INT_REMAINDER:
1109 return 2 * exec_size / 8;
1110 case ELK_SHADER_OPCODE_TEX:
1111 case ELK_FS_OPCODE_TXB:
1112 case ELK_SHADER_OPCODE_TXD:
1113 case ELK_SHADER_OPCODE_TXF:
1114 case ELK_SHADER_OPCODE_TXF_CMS:
1115 case ELK_SHADER_OPCODE_TXF_MCS:
1116 case ELK_SHADER_OPCODE_TG4:
1117 case ELK_SHADER_OPCODE_TG4_OFFSET:
1118 case ELK_SHADER_OPCODE_TXL:
1119 case ELK_SHADER_OPCODE_TXS:
1120 case ELK_SHADER_OPCODE_LOD:
1121 case ELK_SHADER_OPCODE_SAMPLEINFO:
1122 return 1;
1123 case ELK_FS_OPCODE_FB_WRITE:
1124 case ELK_FS_OPCODE_REP_FB_WRITE:
1125 return src[0].file == BAD_FILE ? 0 : 2;
1126 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1127 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1128 return 1;
1129 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1130 return mlen;
1131 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1132 return mlen;
1133 default:
1134 unreachable("not reached");
1135 }
1136 }
1137
1138 bool
1139 elk_fs_inst::has_sampler_residency() const
1140 {
1141 switch (opcode) {
1142 case ELK_SHADER_OPCODE_TEX_LOGICAL:
1143 case ELK_FS_OPCODE_TXB_LOGICAL:
1144 case ELK_SHADER_OPCODE_TXL_LOGICAL:
1145 case ELK_SHADER_OPCODE_TXD_LOGICAL:
1146 case ELK_SHADER_OPCODE_TXF_LOGICAL:
1147 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1148 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1149 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1150 case ELK_SHADER_OPCODE_TXS_LOGICAL:
1151 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1152 case ELK_SHADER_OPCODE_TG4_LOGICAL:
1153 assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1154 return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1155 default:
1156 return false;
1157 }
1158 }
1159
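/* Allocate a virtual GRF large enough to hold one value of the given GLSL
 * type per channel at the current dispatch width.
 */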
1160 elk_fs_reg
1161 elk_fs_visitor::vgrf(const glsl_type *const type)
1162 {
1163 int reg_width = dispatch_width / 8;
1164 return elk_fs_reg(VGRF,
1165 alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1166 elk_type_for_base_type(type));
1167 }
1168
1169 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr)
1170 {
1171 init();
1172 this->file = file;
1173 this->nr = nr;
1174 this->type = ELK_REGISTER_TYPE_F;
1175 this->stride = (file == UNIFORM ? 0 : 1);
1176 }
1177
1178 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type)
1179 {
1180 init();
1181 this->file = file;
1182 this->nr = nr;
1183 this->type = type;
1184 this->stride = (file == UNIFORM ? 0 : 1);
1185 }
1186
1187 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1188 * This brings in those uniform definitions.
1189 */
1190 void
1191 elk_fs_visitor::import_uniforms(elk_fs_visitor *v)
1192 {
1193 this->push_constant_loc = v->push_constant_loc;
1194 this->uniforms = v->uniforms;
1195 }
1196
1197 enum elk_barycentric_mode
1198 elk_barycentric_mode(nir_intrinsic_instr *intr)
1199 {
1200 const glsl_interp_mode mode =
1201 (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1202
1203 /* Barycentric modes don't make sense for flat inputs. */
1204 assert(mode != INTERP_MODE_FLAT);
1205
1206 unsigned bary;
1207 switch (intr->intrinsic) {
1208 case nir_intrinsic_load_barycentric_pixel:
1209 case nir_intrinsic_load_barycentric_at_offset:
1210 bary = ELK_BARYCENTRIC_PERSPECTIVE_PIXEL;
1211 break;
1212 case nir_intrinsic_load_barycentric_centroid:
1213 bary = ELK_BARYCENTRIC_PERSPECTIVE_CENTROID;
1214 break;
1215 case nir_intrinsic_load_barycentric_sample:
1216 case nir_intrinsic_load_barycentric_at_sample:
1217 bary = ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1218 break;
1219 default:
1220 unreachable("invalid intrinsic");
1221 }
1222
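/* The non-perspective barycentric modes come 3 entries after their
 * perspective counterparts in enum elk_barycentric_mode, so adding 3
 * switches the interpolation mode.
 */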
1223 if (mode == INTERP_MODE_NOPERSPECTIVE)
1224 bary += 3;
1225
1226 return (enum elk_barycentric_mode) bary;
1227 }
1228
1229 /**
1230 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1231 */
1232 static enum elk_barycentric_mode
1233 centroid_to_pixel(enum elk_barycentric_mode bary)
1234 {
1235 assert(bary == ELK_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1236 bary == ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1237 return (enum elk_barycentric_mode) ((unsigned) bary - 1);
1238 }
1239
1240 /**
1241 * Walk backwards from the end of the program looking for a URB write that
1242 * isn't in control flow, and mark it with EOT.
1243 *
1244 * Return true if successful or false if a separate EOT write is needed.
1245 */
1246 bool
1247 elk_fs_visitor::mark_last_urb_write_with_eot()
1248 {
1249 foreach_in_list_reverse(elk_fs_inst, prev, &this->instructions) {
1250 if (prev->opcode == ELK_SHADER_OPCODE_URB_WRITE_LOGICAL) {
1251 prev->eot = true;
1252
1253 /* Delete now dead instructions. */
1254 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1255 if (dead == prev)
1256 break;
1257 dead->remove();
1258 }
1259 return true;
1260 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1261 break;
1262 }
1263 }
1264
1265 return false;
1266 }
1267
1268 void
1269 elk_fs_visitor::emit_gs_thread_end()
1270 {
1271 assert(stage == MESA_SHADER_GEOMETRY);
1272
1273 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
1274
1275 if (gs_compile->control_data_header_size_bits > 0) {
1276 emit_gs_control_data_bits(this->final_gs_vertex_count);
1277 }
1278
1279 const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1280 elk_fs_inst *inst;
1281
1282 if (gs_prog_data->static_vertex_count != -1) {
1283 /* Try and tag the last URB write with EOT instead of emitting a whole
1284 * separate write just to finish the thread.
1285 */
1286 if (mark_last_urb_write_with_eot())
1287 return;
1288
1289 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1290 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1291 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(0);
1292 inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1293 srcs, ARRAY_SIZE(srcs));
1294 } else {
1295 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1296 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1297 srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1298 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
1299 inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1300 srcs, ARRAY_SIZE(srcs));
1301 }
1302 inst->eot = true;
1303 inst->offset = 0;
1304 }
1305
1306 void
1307 elk_fs_visitor::assign_curb_setup()
1308 {
1309 unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1310
1311 unsigned ubo_push_length = 0;
1312 unsigned ubo_push_start[4];
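/* ubo_push_start[] is in 32-bit units (8 dwords per GRF), matching the
 * units used for constant_nr below.
 */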
1313 for (int i = 0; i < 4; i++) {
1314 ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1315 ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1316 }
1317
1318 prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1319
1320 uint64_t used = 0;
1321 bool is_compute = gl_shader_stage_is_compute(stage);
1322
1323 if (is_compute && elk_cs_prog_data(prog_data)->uses_inline_data) {
1324 /* With COMPUTE_WALKER, we can push up to one register worth of data via
1325 * the inline data parameter in the COMPUTE_WALKER command itself.
1326 *
1327 * TODO: Support inline data and push at the same time.
1328 */
1329 assert(devinfo->verx10 >= 125);
1330 assert(uniform_push_length <= reg_unit(devinfo));
1331 } else if (is_compute && devinfo->verx10 >= 125) {
1332 assert(devinfo->has_lsc);
1333 fs_builder ubld = fs_builder(this, 1).exec_all().at(
1334 cfg->first_block(), cfg->first_block()->start());
1335
1336 /* The base offset for our push data is passed in as R0.0[31:6]. We have
1337 * to mask off the bottom 6 bits.
1338 */
1339 elk_fs_reg base_addr = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1340 ubld.AND(base_addr,
1341 retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UD),
1342 elk_imm_ud(INTEL_MASK(31, 6)));
1343
1344 /* On Gfx12-HP we load constants at the start of the program using A32
1345 * stateless messages.
1346 */
1347 for (unsigned i = 0; i < uniform_push_length;) {
1348 /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
1349 unsigned num_regs = MIN2(uniform_push_length - i, 8);
1350 assert(num_regs > 0);
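/* Round the chunk size down to a power of two, so each block load
 * transfers 1, 2, 4 or 8 whole GRFs.
 */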
1351 num_regs = 1 << util_logbase2(num_regs);
1352
1353 elk_fs_reg addr = ubld.vgrf(ELK_REGISTER_TYPE_UD);
1354 ubld.ADD(addr, base_addr, elk_imm_ud(i * REG_SIZE));
1355
1356 elk_fs_reg srcs[4] = {
1357 elk_imm_ud(0), /* desc */
1358 elk_imm_ud(0), /* ex_desc */
1359 addr, /* payload */
1360 elk_fs_reg(), /* payload2 */
1361 };
1362
1363 elk_fs_reg dest = retype(elk_vec8_grf(payload().num_regs + i, 0),
1364 ELK_REGISTER_TYPE_UD);
1365 elk_fs_inst *send = ubld.emit(ELK_SHADER_OPCODE_SEND, dest, srcs, 4);
1366
1367 send->sfid = GFX12_SFID_UGM;
1368 send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
1369 1 /* exec_size */,
1370 LSC_ADDR_SURFTYPE_FLAT,
1371 LSC_ADDR_SIZE_A32,
1372 1 /* num_coordinates */,
1373 LSC_DATA_SIZE_D32,
1374 num_regs * 8 /* num_channels */,
1375 true /* transpose */,
1376 LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
1377 true /* has_dest */);
1378 send->header_size = 0;
1379 send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc);
1380 send->size_written =
1381 lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE;
1382 send->send_is_volatile = true;
1383
1384 i += num_regs;
1385 }
1386
1387 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1388 }
1389
1390 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1391 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1392 for (unsigned int i = 0; i < inst->sources; i++) {
1393 if (inst->src[i].file == UNIFORM) {
1394 int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1395 int constant_nr;
1396 if (inst->src[i].nr >= UBO_START) {
1397 /* constant_nr is in 32-bit units, the rest are in bytes */
1398 constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1399 inst->src[i].offset / 4;
1400 } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1401 constant_nr = push_constant_loc[uniform_nr];
1402 } else {
1403 /* Section 5.11 of the OpenGL 4.1 spec says:
1404 * "Out-of-bounds reads return undefined values, which include
1405 * values from other variables of the active program or zero."
1406 * Just return the first push constant.
1407 */
1408 constant_nr = 0;
1409 }
1410
1411 assert(constant_nr / 8 < 64);
1412 used |= BITFIELD64_BIT(constant_nr / 8);
1413
1414 struct elk_reg elk_reg = elk_vec1_grf(payload().num_regs +
1415 constant_nr / 8,
1416 constant_nr % 8);
1417 elk_reg.abs = inst->src[i].abs;
1418 elk_reg.negate = inst->src[i].negate;
1419
1420 assert(inst->src[i].stride == 0);
1421 inst->src[i] = byte_offset(
1422 retype(elk_reg, inst->src[i].type),
1423 inst->src[i].offset % 4);
1424 }
1425 }
1426 }
1427
1428 uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1429 if (want_zero) {
1430 fs_builder ubld = fs_builder(this, 8).exec_all().at(
1431 cfg->first_block(), cfg->first_block()->start());
1432
1433 /* push_reg_mask_param is in 32-bit units */
1434 unsigned mask_param = stage_prog_data->push_reg_mask_param;
1435 struct elk_reg mask = elk_vec1_grf(payload().num_regs + mask_param / 8,
1436 mask_param % 8);
1437
1438 elk_fs_reg b32;
1439 for (unsigned i = 0; i < 64; i++) {
1440 if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
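/* Expand the next 16 bits of the runtime push-register mask into a
 * 16-channel vector of 0/~0 dwords: shift so that mask bit j lands in
 * the sign bit of word channel j, then arithmetically shift right by 15
 * to replicate it across the channel.
 */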
1441 elk_fs_reg shifted = ubld.vgrf(ELK_REGISTER_TYPE_W, 2);
1442 ubld.SHL(horiz_offset(shifted, 8),
1443 byte_offset(retype(mask, ELK_REGISTER_TYPE_W), i / 8),
1444 elk_imm_v(0x01234567));
1445 ubld.SHL(shifted, horiz_offset(shifted, 8), elk_imm_w(8));
1446
1447 fs_builder ubld16 = ubld.group(16, 0);
1448 b32 = ubld16.vgrf(ELK_REGISTER_TYPE_D);
1449 ubld16.group(16, 0).ASR(b32, shifted, elk_imm_w(15));
1450 }
1451
1452 if (want_zero & BITFIELD64_BIT(i)) {
1453 assert(i < prog_data->curb_read_length);
1454 struct elk_reg push_reg =
1455 retype(elk_vec8_grf(payload().num_regs + i, 0),
1456 ELK_REGISTER_TYPE_D);
1457
1458 ubld.AND(push_reg, push_reg, component(b32, i % 16));
1459 }
1460 }
1461
1462 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1463 }
1464
1465 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1466 this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1467 }
1468
1469 /*
1470 * Build up an array of indices into the urb_setup array that
1471 * references the active entries of the urb_setup array.
1472 * Used to accelerate walking the active entries of the urb_setup array
1473 * on each upload.
1474 */
1475 void
1476 elk_compute_urb_setup_index(struct elk_wm_prog_data *wm_prog_data)
1477 {
1478 /* Make sure uint8_t is sufficient */
1479 STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1480 uint8_t index = 0;
1481 for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1482 if (wm_prog_data->urb_setup[attr] >= 0) {
1483 wm_prog_data->urb_setup_attribs[index++] = attr;
1484 }
1485 }
1486 wm_prog_data->urb_setup_attribs_count = index;
1487 }
1488
1489 static void
1490 calculate_urb_setup(const struct intel_device_info *devinfo,
1491 const struct elk_wm_prog_key *key,
1492 struct elk_wm_prog_data *prog_data,
1493 const nir_shader *nir)
1494 {
1495 memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1496 memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1497
1498 int urb_next = 0; /* in vec4s */
1499
1500 const uint64_t inputs_read =
1501 nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1502
1503 /* Figure out where each of the incoming setup attributes lands. */
1504 if (devinfo->ver >= 6) {
1505 assert(!nir->info.per_primitive_inputs);
1506
1507 uint64_t vue_header_bits =
1508 VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1509
1510 uint64_t unique_fs_attrs = inputs_read & ELK_FS_VARYING_INPUT_MASK;
1511
1512 /* VUE header fields all live in the same URB slot, so we pass them
1513 * as a single FS input attribute. We want to only count them once.
1514 */
1515 if (inputs_read & vue_header_bits) {
1516 unique_fs_attrs &= ~vue_header_bits;
1517 unique_fs_attrs |= VARYING_BIT_PSIZ;
1518 }
1519
1520 if (util_bitcount64(unique_fs_attrs) <= 16) {
1521 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1522 * first 16 varying inputs, so we can put them wherever we want.
1523 * Just put them in order.
1524 *
1525 * This is useful because it means that (a) inputs not used by the
1526 * fragment shader won't take up valuable register space, and (b) we
1527 * won't have to recompile the fragment shader if it gets paired with
1528 * a different vertex (or geometry) shader.
1529 *
1530 * VUE header fields share the same FS input attribute.
1531 */
1532 if (inputs_read & vue_header_bits) {
1533 if (inputs_read & VARYING_BIT_PSIZ)
1534 prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1535 if (inputs_read & VARYING_BIT_LAYER)
1536 prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1537 if (inputs_read & VARYING_BIT_VIEWPORT)
1538 prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1539
1540 urb_next++;
1541 }
1542
1543 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1544 if (inputs_read & ELK_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1545 BITFIELD64_BIT(i)) {
1546 prog_data->urb_setup[i] = urb_next++;
1547 }
1548 }
1549 } else {
1550 /* We have enough input varyings that the SF/SBE pipeline stage can't
1551 * arbitrarily rearrange them to suit our whim; we have to put them
1552 * in an order that matches the output of the previous pipeline stage
1553 * (geometry or vertex shader).
1554 */
1555
1556 /* Re-compute the VUE map here in the case that the one coming from
1557 * geometry has more than one position slot (used for Primitive
1558 * Replication).
1559 */
1560 struct intel_vue_map prev_stage_vue_map;
1561 elk_compute_vue_map(devinfo, &prev_stage_vue_map,
1562 key->input_slots_valid,
1563 nir->info.separate_shader, 1);
1564
1565 int first_slot =
1566 elk_compute_first_urb_slot_required(inputs_read,
1567 &prev_stage_vue_map);
1568
1569 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1570 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1571 slot++) {
1572 int varying = prev_stage_vue_map.slot_to_varying[slot];
1573 if (varying != ELK_VARYING_SLOT_PAD &&
1574 (inputs_read & ELK_FS_VARYING_INPUT_MASK &
1575 BITFIELD64_BIT(varying))) {
1576 prog_data->urb_setup[varying] = slot - first_slot;
1577 }
1578 }
1579 urb_next = prev_stage_vue_map.num_slots - first_slot;
1580 }
1581 } else {
1582 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1583 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1584 /* Point size is packed into the header, not as a general attribute */
1585 if (i == VARYING_SLOT_PSIZ)
1586 continue;
1587
1588 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1589 /* The back color slot is skipped when the front color is
1590 * also written to. In addition, some slots can be
1591 * written in the vertex shader and not read in the
1592 * fragment shader. So the register number must always be
1593 * incremented, mapped or not.
1594 */
1595 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1596 prog_data->urb_setup[i] = urb_next;
1597 urb_next++;
1598 }
1599 }
1600
1601 /*
1602 * It's an FS-only attribute, and we did the interpolation for this
1603 * attribute in the SF thread. So count it here, too.
1604 *
1605 * See compile_sf_prog() for more info.
1606 */
1607 if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1608 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1609 }
1610
1611 prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1612 prog_data->inputs = inputs_read;
1613
1614 elk_compute_urb_setup_index(prog_data);
1615 }
1616
1617 void
1618 elk_fs_visitor::assign_urb_setup()
1619 {
1620 assert(stage == MESA_SHADER_FRAGMENT);
1621 struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
1622
1623 int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1624
1625 /* Offset all the urb_setup[] index by the actual position of the
1626 * setup regs, now that the location of the constants has been chosen.
1627 */
1628 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1629 for (int i = 0; i < inst->sources; i++) {
1630 if (inst->src[i].file == ATTR) {
1631 /* ATTR elk_fs_reg::nr in the FS is in units of logical scalar
1632 * inputs each of which consumes 16B on Gfx4-Gfx12. In
1633 * single polygon mode this leads to the following layout
1634 * of the vertex setup plane parameters in the ATTR
1635 * register file:
1636 *
1637 * elk_fs_reg::nr Input Comp0 Comp1 Comp2 Comp3
1638 * 0 Attr0.x a1-a0 a2-a0 N/A a0
1639 * 1 Attr0.y a1-a0 a2-a0 N/A a0
1640 * 2 Attr0.z a1-a0 a2-a0 N/A a0
1641 * 3 Attr0.w a1-a0 a2-a0 N/A a0
1642 * 4 Attr1.x a1-a0 a2-a0 N/A a0
1643 * ...
1644 *
1645 * In multipolygon mode that no longer works since
1646 * different channels may be processing polygons with
1647 * different plane parameters, so each parameter above is
1648 * represented as a dispatch_width-wide vector:
1649 *
1650 * elk_fs_reg::nr elk_fs_reg::offset Input Comp0 ... CompN
1651 * 0 0 Attr0.x a1[0]-a0[0] ... a1[N]-a0[N]
1652 * 0 4 * dispatch_width Attr0.x a2[0]-a0[0] ... a2[N]-a0[N]
1653 * 0 8 * dispatch_width Attr0.x N/A ... N/A
1654 * 0 12 * dispatch_width Attr0.x a0[0] ... a0[N]
1655 * 1 0 Attr0.y a1[0]-a0[0] ... a1[N]-a0[N]
1656 * ...
1657 *
1658 * Note that many of the components on a single row above
1659 * are likely to be replicated multiple times (if, say, a
1660 * single SIMD thread is only processing 2 different
1661 * polygons), so plane parameters aren't actually stored
1662 * in GRF memory with that layout to avoid wasting space.
1663 * Instead we compose ATTR register regions with a 2D
1664 * region that walks through the parameters of each
1665 * polygon with the correct stride, reading the parameter
1666 * corresponding to each channel directly from the PS
1667 * thread payload.
1668 *
1669 * The latter layout corresponds to a param_width equal to
1670 * dispatch_width, while the former (scalar parameter)
1671 * layout has a param_width of 1.
1672 *
1673 * Gfx20+ represent plane parameters in a format similar
1674 * to the above, except the parameters are packed in 12B
1675 * and ordered like "a0, a1-a0, a2-a0" instead of the
1676 * above vec4 representation with a missing component.
1677 */
1678 const unsigned param_width = (max_polygons > 1 ? dispatch_width : 1);
1679
1680 /* Size of a single scalar component of a plane parameter
1681 * in bytes.
1682 */
1683 const unsigned chan_sz = 4;
1684 struct elk_reg reg;
1685 assert(max_polygons > 0);
1686
1687 /* Calculate the base register on the thread payload of
1688 * either the block of vertex setup data or the block of
1689 * per-primitive constant data depending on whether we're
1690 * accessing a primitive or vertex input. Also calculate
1691 * the index of the input within that block.
1692 */
1693 const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1694 const unsigned base = urb_start +
1695 (per_prim ? 0 :
1696 ALIGN(prog_data->num_per_primitive_inputs / 2,
1697 reg_unit(devinfo)) * max_polygons);
1698 const unsigned idx = per_prim ? inst->src[i].nr :
1699 inst->src[i].nr - prog_data->num_per_primitive_inputs;
1700
1701 /* Translate the offset within the param_width-wide
1702 * representation described above into an offset and a
1703 * grf, which contains the plane parameters for the first
1704 * polygon processed by the thread.
1705 */
1706 if (devinfo->ver >= 20 && !per_prim) {
1707 /* Gfx20+ is able to pack 5 logical input components
1708 * per 64B register for vertex setup data.
1709 */
1710 const unsigned grf = base + idx / 5 * 2 * max_polygons;
1711 assert(inst->src[i].offset / param_width < 12);
1712 const unsigned delta = idx % 5 * 12 +
1713 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1714 inst->src[i].offset % chan_sz;
1715 reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1716 delta);
1717 } else {
1718 /* Earlier platforms and per-primitive block pack 2 logical
1719 * input components per 32B register.
1720 */
1721 const unsigned grf = base + idx / 2 * max_polygons;
1722 assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1723 const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1724 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1725 inst->src[i].offset % chan_sz;
1726 reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1727 delta);
1728 }
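/* Illustrative example for the pre-Gfx20 branch above (single-polygon
 * layout, so param_width == 1): for idx == 3 with a source byte offset
 * of 4 (the a2-a0 component of Attr0.w), grf works out to
 * base + 3 / 2 * max_polygons and delta to (3 % 2) * 16 + 4 == 20,
 * i.e. the second half of that GRF, matching the "2 logical inputs per
 * 32B register" packing described earlier.
 */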
1729
1730 if (max_polygons > 1) {
1731 assert(devinfo->ver >= 12);
1732 /* Misaligned channel strides that would lead to
1733 * cross-channel access in the representation above are
1734 * disallowed.
1735 */
1736 assert(inst->src[i].stride * type_sz(inst->src[i].type) == chan_sz);
1737
1738 /* Number of channels processing the same polygon. */
1739 const unsigned poly_width = dispatch_width / max_polygons;
1740 assert(dispatch_width % max_polygons == 0);
1741
1742 /* Accessing a subset of channels of a parameter vector
1743 * starting from "chan" is necessary to handle
1744 * SIMD-lowered instructions though.
1745 */
1746 const unsigned chan = inst->src[i].offset %
1747 (param_width * chan_sz) / chan_sz;
1748 assert(chan < dispatch_width);
1749 assert(chan % poly_width == 0);
1750 const unsigned reg_size = reg_unit(devinfo) * REG_SIZE;
1751 reg = byte_offset(reg, chan / poly_width * reg_size);
1752
1753 if (inst->exec_size > poly_width) {
1754 /* Accessing the parameters for multiple polygons.
1755 * Corresponding parameters for different polygons
1756 * are stored a GRF apart on the thread payload, so
1757 * use that as vertical stride.
1758 */
1759 const unsigned vstride = reg_size / type_sz(inst->src[i].type);
1760 assert(vstride <= 32);
1761 assert(chan % poly_width == 0);
1762 reg = stride(reg, vstride, poly_width, 0);
1763 } else {
1764 /* Accessing one parameter for a single polygon --
1765 * Translate to a scalar region.
1766 */
1767 assert(chan % poly_width + inst->exec_size <= poly_width);
1768 reg = stride(reg, 0, 1, 0);
1769 }
1770
1771 } else {
1772 const unsigned width = inst->src[i].stride == 0 ?
1773 1 : MIN2(inst->exec_size, 8);
1774 reg = stride(reg, width * inst->src[i].stride,
1775 width, inst->src[i].stride);
1776 }
1777
1778 reg.abs = inst->src[i].abs;
1779 reg.negate = inst->src[i].negate;
1780 inst->src[i] = reg;
1781 }
1782 }
1783 }
1784
1785 /* Each attribute is 4 setup channels, each of which is half a reg,
1786 * but they may be replicated multiple times for multipolygon
1787 * dispatch.
1788 */
1789 this->first_non_payload_grf += prog_data->num_varying_inputs * 2 * max_polygons;
1790
1791 /* Unlike regular attributes, per-primitive attributes have all 4 channels
1792 * in the same slot, so each GRF can store two slots.
1793 */
1794 assert(prog_data->num_per_primitive_inputs % 2 == 0);
1795 this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2 * max_polygons;
1796 }
1797
1798 void
convert_attr_sources_to_hw_regs(elk_fs_inst * inst)1799 elk_fs_visitor::convert_attr_sources_to_hw_regs(elk_fs_inst *inst)
1800 {
1801 for (int i = 0; i < inst->sources; i++) {
1802 if (inst->src[i].file == ATTR) {
1803 assert(inst->src[i].nr == 0);
1804 int grf = payload().num_regs +
1805 prog_data->curb_read_length +
1806 inst->src[i].offset / REG_SIZE;
1807
1808 /* As explained in elk_reg_from_fs_reg, from the Haswell PRM:
1809 *
1810 * VertStride must be used to cross GRF register boundaries. This
1811 * rule implies that elements within a 'Width' cannot cross GRF
1812 * boundaries.
1813 *
1814 * So, for registers that are large enough, we have to split the exec
1815 * size in two and trust the compression state to sort it out.
1816 */
1817 unsigned total_size = inst->exec_size *
1818 inst->src[i].stride *
1819 type_sz(inst->src[i].type);
1820
1821 assert(total_size <= 2 * REG_SIZE);
1822 const unsigned exec_size =
1823 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
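/* Illustrative numbers: a SIMD16 float source with stride 1 reads
 * 16 * 1 * 4 == 64B, which spans two GRFs here, so exec_size is halved
 * to 8 and the region built below becomes <8;8,1>, leaving instruction
 * compression to handle the second register.
 */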
1824
1825 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1826 struct elk_reg reg =
1827 stride(byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1828 inst->src[i].offset % REG_SIZE),
1829 exec_size * inst->src[i].stride,
1830 width, inst->src[i].stride);
1831 reg.abs = inst->src[i].abs;
1832 reg.negate = inst->src[i].negate;
1833
1834 inst->src[i] = reg;
1835 }
1836 }
1837 }
1838
1839 void
assign_vs_urb_setup()1840 elk_fs_visitor::assign_vs_urb_setup()
1841 {
1842 struct elk_vs_prog_data *vs_prog_data = elk_vs_prog_data(prog_data);
1843
1844 assert(stage == MESA_SHADER_VERTEX);
1845
1846 /* Each attribute is 4 regs. */
1847 this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1848
1849 assert(vs_prog_data->base.urb_read_length <= 15);
1850
1851 /* Rewrite all ATTR file references to the hw grf that they land in. */
1852 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1853 convert_attr_sources_to_hw_regs(inst);
1854 }
1855 }
1856
1857 void
assign_tcs_urb_setup()1858 elk_fs_visitor::assign_tcs_urb_setup()
1859 {
1860 assert(stage == MESA_SHADER_TESS_CTRL);
1861
1862 /* Rewrite all ATTR file references to HW_REGs. */
1863 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1864 convert_attr_sources_to_hw_regs(inst);
1865 }
1866 }
1867
1868 void
assign_tes_urb_setup()1869 elk_fs_visitor::assign_tes_urb_setup()
1870 {
1871 assert(stage == MESA_SHADER_TESS_EVAL);
1872
1873 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1874
1875 first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1876
1877 /* Rewrite all ATTR file references to HW_REGs. */
1878 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1879 convert_attr_sources_to_hw_regs(inst);
1880 }
1881 }
1882
1883 void
assign_gs_urb_setup()1884 elk_fs_visitor::assign_gs_urb_setup()
1885 {
1886 assert(stage == MESA_SHADER_GEOMETRY);
1887
1888 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1889
1890 first_non_payload_grf +=
1891 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1892
1893 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1894 /* Rewrite all ATTR file references to GRFs. */
1895 convert_attr_sources_to_hw_regs(inst);
1896 }
1897 }
1898
1899
1900 /**
1901 * Split large virtual GRFs into separate components if we can.
1902 *
1903 * This pass aggressively splits VGRFs into chunks as small as possible,
1904 * down to single registers if it can. If no VGRFs can be split, we return
1905 * false so this pass can safely be used inside an optimization loop. We
1906 * want to split, because virtual GRFs are what we register allocate and
1907 * spill (due to contiguousness requirements for some instructions), and
1908 * they're what we naturally generate in the codegen process, but most
1909 * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
1910 * split, we'll end up with reduced live intervals and better dead code
1911 * elimination and coalescing.
1912 */
1913 bool
split_virtual_grfs()1914 elk_fs_visitor::split_virtual_grfs()
1915 {
1916 /* Compact the register file so we eliminate dead vgrfs. This
1917 * pass only defines split points for live registers, so overly
1918 * large dead registers would hit assertions later.
1919 */
1920 compact_virtual_grfs();
1921
1922 unsigned num_vars = this->alloc.count;
1923
1924 /* Count the total number of registers */
1925 unsigned reg_count = 0;
1926 unsigned vgrf_to_reg[num_vars];
1927 for (unsigned i = 0; i < num_vars; i++) {
1928 vgrf_to_reg[i] = reg_count;
1929 reg_count += alloc.sizes[i];
1930 }
1931
1932 /* An array of "split points". For each register slot, this indicates
1933 * if this slot can be separated from the previous slot. Every time an
1934 * instruction uses multiple elements of a register (as a source or
1935 * destination), we mark the used slots as inseparable. Then we go
1936 * through and split the registers into the smallest pieces we can.
1937 */
1938 bool *split_points = new bool[reg_count];
1939 memset(split_points, 0, reg_count * sizeof(*split_points));
1940
1941 /* Mark all used registers as fully splittable */
1942 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1943 if (inst->dst.file == VGRF) {
1944 unsigned reg = vgrf_to_reg[inst->dst.nr];
1945 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1946 split_points[reg + j] = true;
1947 }
1948
1949 for (unsigned i = 0; i < inst->sources; i++) {
1950 if (inst->src[i].file == VGRF) {
1951 unsigned reg = vgrf_to_reg[inst->src[i].nr];
1952 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1953 split_points[reg + j] = true;
1954 }
1955 }
1956 }
1957
1958 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1959 /* We fix up undef instructions later */
1960 if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1961 assert(inst->dst.file == VGRF);
1962 continue;
1963 }
1964
1965 if (inst->dst.file == VGRF) {
1966 unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1967 for (unsigned j = 1; j < regs_written(inst); j++)
1968 split_points[reg + j] = false;
1969 }
1970 for (unsigned i = 0; i < inst->sources; i++) {
1971 if (inst->src[i].file == VGRF) {
1972 unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1973 for (unsigned j = 1; j < regs_read(inst, i); j++)
1974 split_points[reg + j] = false;
1975 }
1976 }
1977 }
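/* Illustrative sketch of the two passes above: a 4-reg VGRF that is
 * otherwise only touched one register at a time starts with split
 * points at slots 1, 2 and 3; a single instruction writing 2 regs at
 * offset 2 then clears the split point at slot 3, so the VGRF is later
 * split into pieces of 1, 1 and 2 registers.
 */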
1978
1979 /* Bitset of which registers have been split */
1980 bool *vgrf_has_split = new bool[num_vars];
1981 memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
1982
1983 unsigned *new_virtual_grf = new unsigned[reg_count];
1984 unsigned *new_reg_offset = new unsigned[reg_count];
1985
1986 unsigned reg = 0;
1987 bool has_splits = false;
1988 for (unsigned i = 0; i < num_vars; i++) {
1989 /* The first one should always be 0 as a quick sanity check. */
1990 assert(split_points[reg] == false);
1991
1992 /* j = 0 case */
1993 new_reg_offset[reg] = 0;
1994 reg++;
1995 unsigned offset = 1;
1996
1997 /* j > 0 case */
1998 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1999 /* If this is a split point, reset the offset to 0 and allocate a
2000 * new virtual GRF for the previous "offset" registers.
2001 */
2002 if (split_points[reg]) {
2003 has_splits = true;
2004 vgrf_has_split[i] = true;
2005 assert(offset <= MAX_VGRF_SIZE(devinfo));
2006 unsigned grf = alloc.allocate(offset);
2007 for (unsigned k = reg - offset; k < reg; k++)
2008 new_virtual_grf[k] = grf;
2009 offset = 0;
2010 }
2011 new_reg_offset[reg] = offset;
2012 offset++;
2013 reg++;
2014 }
2015
2016 /* The last one gets the original register number */
2017 assert(offset <= MAX_VGRF_SIZE(devinfo));
2018 alloc.sizes[i] = offset;
2019 for (unsigned k = reg - offset; k < reg; k++)
2020 new_virtual_grf[k] = i;
2021 }
2022 assert(reg == reg_count);
2023
2024 bool progress;
2025 if (!has_splits) {
2026 progress = false;
2027 goto cleanup;
2028 }
2029
2030 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2031 if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
2032 assert(inst->dst.file == VGRF);
2033 if (vgrf_has_split[inst->dst.nr]) {
2034 const fs_builder ibld(this, block, inst);
2035 assert(inst->size_written % REG_SIZE == 0);
2036 unsigned reg_offset = inst->dst.offset / REG_SIZE;
2037 unsigned size_written = 0;
2038 while (size_written < inst->size_written) {
2039 reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
2040 elk_fs_inst *undef =
2041 ibld.UNDEF(
2042 byte_offset(elk_fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
2043 new_reg_offset[reg] * REG_SIZE));
2044 undef->size_written =
2045 MIN2(inst->size_written - size_written, undef->size_written);
2046 assert(undef->size_written % REG_SIZE == 0);
2047 size_written += undef->size_written;
2048 }
2049 inst->remove(block);
2050 } else {
2051 reg = vgrf_to_reg[inst->dst.nr];
2052 assert(new_reg_offset[reg] == 0);
2053 assert(new_virtual_grf[reg] == inst->dst.nr);
2054 }
2055 continue;
2056 }
2057
2058 if (inst->dst.file == VGRF) {
2059 reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
2060 if (vgrf_has_split[inst->dst.nr]) {
2061 inst->dst.nr = new_virtual_grf[reg];
2062 inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
2063 inst->dst.offset % REG_SIZE;
2064 assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2065 } else {
2066 assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
2067 assert(new_virtual_grf[reg] == inst->dst.nr);
2068 }
2069 }
2070 for (unsigned i = 0; i < inst->sources; i++) {
2071 if (inst->src[i].file != VGRF)
2072 continue;
2073
2074 reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
2075 if (vgrf_has_split[inst->src[i].nr]) {
2076 inst->src[i].nr = new_virtual_grf[reg];
2077 inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
2078 inst->src[i].offset % REG_SIZE;
2079 assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2080 } else {
2081 assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
2082 assert(new_virtual_grf[reg] == inst->src[i].nr);
2083 }
2084 }
2085 }
2086 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
2087
2088 progress = true;
2089
2090 cleanup:
2091 delete[] split_points;
2092 delete[] vgrf_has_split;
2093 delete[] new_virtual_grf;
2094 delete[] new_reg_offset;
2095
2096 return progress;
2097 }
2098
2099 /**
2100 * Remove unused virtual GRFs and compact the vgrf_* arrays.
2101 *
2102 * During code generation, we create tons of temporary variables, many of
2103 * which get immediately killed and are never used again. Yet, in later
2104 * optimization and analysis passes, such as compute_live_intervals, we need
2105 * to loop over all the virtual GRFs. Compacting them can save a lot of
2106 * overhead.
2107 */
2108 bool
compact_virtual_grfs()2109 elk_fs_visitor::compact_virtual_grfs()
2110 {
2111 bool progress = false;
2112 int *remap_table = new int[this->alloc.count];
2113 memset(remap_table, -1, this->alloc.count * sizeof(int));
2114
2115 /* Mark which virtual GRFs are used. */
2116 foreach_block_and_inst(block, const elk_fs_inst, inst, cfg) {
2117 if (inst->dst.file == VGRF)
2118 remap_table[inst->dst.nr] = 0;
2119
2120 for (int i = 0; i < inst->sources; i++) {
2121 if (inst->src[i].file == VGRF)
2122 remap_table[inst->src[i].nr] = 0;
2123 }
2124 }
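/* Illustrative: with alloc.count == 4 and only VGRFs 0 and 2 referenced
 * above, remap_table ends up as { 0, -1, 1, -1 } after the loop below
 * and alloc.count drops to 2.
 */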
2125
2126 /* Compact the GRF arrays. */
2127 int new_index = 0;
2128 for (unsigned i = 0; i < this->alloc.count; i++) {
2129 if (remap_table[i] == -1) {
2130 /* We just found an unused register. This means that we are
2131 * actually going to compact something.
2132 */
2133 progress = true;
2134 } else {
2135 remap_table[i] = new_index;
2136 alloc.sizes[new_index] = alloc.sizes[i];
2137 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
2138 ++new_index;
2139 }
2140 }
2141
2142 this->alloc.count = new_index;
2143
2144 /* Patch all the instructions to use the newly renumbered registers */
2145 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2146 if (inst->dst.file == VGRF)
2147 inst->dst.nr = remap_table[inst->dst.nr];
2148
2149 for (int i = 0; i < inst->sources; i++) {
2150 if (inst->src[i].file == VGRF)
2151 inst->src[i].nr = remap_table[inst->src[i].nr];
2152 }
2153 }
2154
2155 /* Patch all the references to delta_xy, since they're used in register
2156 * allocation. If they're unused, switch them to BAD_FILE so we don't
2157 * think some random VGRF is delta_xy.
2158 */
2159 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2160 if (delta_xy[i].file == VGRF) {
2161 if (remap_table[delta_xy[i].nr] != -1) {
2162 delta_xy[i].nr = remap_table[delta_xy[i].nr];
2163 } else {
2164 delta_xy[i].file = BAD_FILE;
2165 }
2166 }
2167 }
2168
2169 delete[] remap_table;
2170
2171 return progress;
2172 }
2173
2174 int
elk_get_subgroup_id_param_index(const intel_device_info * devinfo,const elk_stage_prog_data * prog_data)2175 elk_get_subgroup_id_param_index(const intel_device_info *devinfo,
2176 const elk_stage_prog_data *prog_data)
2177 {
2178 if (prog_data->nr_params == 0)
2179 return -1;
2180
2181 if (devinfo->verx10 >= 125)
2182 return -1;
2183
2184 /* The local thread id is always the last parameter in the list */
2185 uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
2186 if (last_param == ELK_PARAM_BUILTIN_SUBGROUP_ID)
2187 return prog_data->nr_params - 1;
2188
2189 return -1;
2190 }
2191
2192 /**
2193 * Assign UNIFORM file registers to either push constants or pull constants.
2194 *
2195 * We allow a fragment shader to have more than the specified minimum
2196 * maximum number of fragment shader uniform components (64). If
2197 * there are too many of these, they'd fill up all of the register space.
2198 * So, this will push some of them out to the pull constant buffer and
2199 * update the program to load them.
2200 */
2201 void
assign_constant_locations()2202 elk_fs_visitor::assign_constant_locations()
2203 {
2204 /* Only the first compile gets to decide on locations. */
2205 if (push_constant_loc)
2206 return;
2207
2208 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2209 for (unsigned u = 0; u < uniforms; u++)
2210 push_constant_loc[u] = u;
2211
2212 /* Now that we know how many regular uniforms we'll push, reduce the
2213 * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
2214 */
2215 /* For gen4/5:
2216 * Only allow 16 registers (128 uniform components) as push constants.
2217 *
2218 * If changing this value, note the limitation about total_regs in
2219 * elk_curbe.c/crocus_state.c
2220 */
2221 const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
2222 unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
2223 for (int i = 0; i < 4; i++) {
2224 struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
2225
2226 if (push_length + range->length > max_push_length)
2227 range->length = max_push_length - push_length;
2228
2229 push_length += range->length;
2230 }
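/* Illustrative: on gfx6+ with 40 registers of pushed params and two UBO
 * ranges of 16 registers each, the first range fits (40 + 16 <= 64) but
 * the second is trimmed to 8 so the total stays at the 64-register
 * limit checked below.
 */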
2231 assert(push_length <= max_push_length);
2232 }
2233
2234 bool
get_pull_locs(const elk_fs_reg & src,unsigned * out_surf_index,unsigned * out_pull_index)2235 elk_fs_visitor::get_pull_locs(const elk_fs_reg &src,
2236 unsigned *out_surf_index,
2237 unsigned *out_pull_index)
2238 {
2239 assert(src.file == UNIFORM);
2240
2241 if (src.nr < UBO_START)
2242 return false;
2243
2244 const struct elk_ubo_range *range =
2245 &prog_data->ubo_ranges[src.nr - UBO_START];
2246
2247 /* If this access is in our (reduced) range, use the push data. */
2248 if (src.offset / 32 < range->length)
2249 return false;
2250
2251 *out_surf_index = range->block;
2252 *out_pull_index = (32 * range->start + src.offset) / 4;
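/* e.g. (illustrative): for a range with start == 2 and length == 2, an
 * access at src.offset == 72 is past the 64B of pushed data
 * (72 / 32 >= 2), so it becomes a pull from dword (32 * 2 + 72) / 4 == 34
 * of the bound block.
 */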
2253
2254 prog_data->has_ubo_pull = true;
2255
2256 return true;
2257 }
2258
2259 /**
2260 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2261 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2262 */
2263 bool
lower_constant_loads()2264 elk_fs_visitor::lower_constant_loads()
2265 {
2266 unsigned index, pull_index;
2267 bool progress = false;
2268
2269 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
2270 /* Set up the annotation tracking for newly generated instructions. */
2271 const fs_builder ibld(this, block, inst);
2272
2273 for (int i = 0; i < inst->sources; i++) {
2274 if (inst->src[i].file != UNIFORM)
2275 continue;
2276
2277 /* We'll handle this case later */
2278 if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT && i == 0)
2279 continue;
2280
2281 if (!get_pull_locs(inst->src[i], &index, &pull_index))
2282 continue;
2283
2284 assert(inst->src[i].stride == 0);
2285
2286 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2287 const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2288 const elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2289 const unsigned base = pull_index * 4;
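/* Illustrative: continuing the pull_index == 34 example, base == 136,
 * so the 64B block fetched below starts at byte 128 of the surface and
 * the rewritten source reads 8 bytes into the temporary VGRF.
 */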
2290
2291 elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
2292 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = elk_imm_ud(index);
2293 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = elk_imm_ud(base & ~(block_sz - 1));
2294 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = elk_imm_ud(block_sz);
2295
2296
2297 ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
2298 srcs, PULL_UNIFORM_CONSTANT_SRCS);
2299
2300 /* Rewrite the instruction to use the temporary VGRF. */
2301 inst->src[i].file = VGRF;
2302 inst->src[i].nr = dst.nr;
2303 inst->src[i].offset = (base & (block_sz - 1)) +
2304 inst->src[i].offset % 4;
2305
2306 progress = true;
2307 }
2308
2309 if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT &&
2310 inst->src[0].file == UNIFORM) {
2311
2312 if (!get_pull_locs(inst->src[0], &index, &pull_index))
2313 continue;
2314
2315 VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2316 elk_imm_ud(index),
2317 elk_fs_reg() /* surface_handle */,
2318 inst->src[1],
2319 pull_index * 4, 4, 1);
2320 inst->remove(block);
2321
2322 progress = true;
2323 }
2324 }
2325 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2326
2327 return progress;
2328 }
2329
2330 static uint64_t
src_as_uint(const elk_fs_reg & src)2331 src_as_uint(const elk_fs_reg &src)
2332 {
2333 assert(src.file == IMM);
2334
2335 switch (src.type) {
2336 case ELK_REGISTER_TYPE_W:
2337 return (uint64_t)(int16_t)(src.ud & 0xffff);
2338
2339 case ELK_REGISTER_TYPE_UW:
2340 return (uint64_t)(uint16_t)(src.ud & 0xffff);
2341
2342 case ELK_REGISTER_TYPE_D:
2343 return (uint64_t)src.d;
2344
2345 case ELK_REGISTER_TYPE_UD:
2346 return (uint64_t)src.ud;
2347
2348 case ELK_REGISTER_TYPE_Q:
2349 return src.d64;
2350
2351 case ELK_REGISTER_TYPE_UQ:
2352 return src.u64;
2353
2354 default:
2355 unreachable("Invalid integer type.");
2356 }
2357 }
2358
2359 static elk_fs_reg
elk_imm_for_type(uint64_t value,enum elk_reg_type type)2360 elk_imm_for_type(uint64_t value, enum elk_reg_type type)
2361 {
2362 switch (type) {
2363 case ELK_REGISTER_TYPE_W:
2364 return elk_imm_w(value);
2365
2366 case ELK_REGISTER_TYPE_UW:
2367 return elk_imm_uw(value);
2368
2369 case ELK_REGISTER_TYPE_D:
2370 return elk_imm_d(value);
2371
2372 case ELK_REGISTER_TYPE_UD:
2373 return elk_imm_ud(value);
2374
2375 case ELK_REGISTER_TYPE_Q:
2376 return elk_imm_d(value);
2377
2378 case ELK_REGISTER_TYPE_UQ:
2379 return elk_imm_uq(value);
2380
2381 default:
2382 unreachable("Invalid integer type.");
2383 }
2384 }
2385
2386 bool
opt_algebraic()2387 elk_fs_visitor::opt_algebraic()
2388 {
2389 bool progress = false;
2390
2391 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2392 switch (inst->opcode) {
2393 case ELK_OPCODE_MOV:
2394 if (!devinfo->has_64bit_float &&
2395 inst->dst.type == ELK_REGISTER_TYPE_DF) {
2396 assert(inst->dst.type == inst->src[0].type);
2397 assert(!inst->saturate);
2398 assert(!inst->src[0].abs);
2399 assert(!inst->src[0].negate);
2400 const elk::fs_builder ibld(this, block, inst);
2401
2402 if (!inst->is_partial_write())
2403 ibld.emit_undef_for_dst(inst);
2404
2405 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 1),
2406 subscript(inst->src[0], ELK_REGISTER_TYPE_F, 1));
2407 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 0),
2408 subscript(inst->src[0], ELK_REGISTER_TYPE_F, 0));
2409
2410 inst->remove(block);
2411 progress = true;
2412 }
2413
2414 if (!devinfo->has_64bit_int &&
2415 (inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2416 inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2417 assert(inst->dst.type == inst->src[0].type);
2418 assert(!inst->saturate);
2419 assert(!inst->src[0].abs);
2420 assert(!inst->src[0].negate);
2421 const elk::fs_builder ibld(this, block, inst);
2422
2423 if (!inst->is_partial_write())
2424 ibld.emit_undef_for_dst(inst);
2425
2426 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2427 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1));
2428 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2429 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0));
2430
2431 inst->remove(block);
2432 progress = true;
2433 }
2434
2435 if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2436 inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2437 inst->dst.is_null() &&
2438 (inst->src[0].abs || inst->src[0].negate)) {
2439 inst->src[0].abs = false;
2440 inst->src[0].negate = false;
2441 progress = true;
2442 break;
2443 }
2444
2445 if (inst->src[0].file != IMM)
2446 break;
2447
2448 if (inst->saturate) {
2449 /* Full mixed-type saturates don't happen. However, we can end up
2450 * with things like:
2451 *
2452 * mov.sat(8) g21<1>DF -1F
2453 *
2454 * Other mixed-size-but-same-base-type cases may also be possible.
2455 */
2456 if (inst->dst.type != inst->src[0].type &&
2457 inst->dst.type != ELK_REGISTER_TYPE_DF &&
2458 inst->src[0].type != ELK_REGISTER_TYPE_F)
2459 assert(!"unimplemented: saturate mixed types");
2460
2461 if (elk_saturate_immediate(inst->src[0].type,
2462 &inst->src[0].as_elk_reg())) {
2463 inst->saturate = false;
2464 progress = true;
2465 }
2466 }
2467 break;
2468
2469 case ELK_OPCODE_MUL:
2470 if (inst->src[1].file != IMM)
2471 continue;
2472
2473 if (elk_reg_type_is_floating_point(inst->src[1].type))
2474 break;
2475
2476 /* From the BDW PRM, Vol 2a, "mul - Multiply":
2477 *
2478 * "When multiplying integer datatypes, if src0 is DW and src1
2479 * is W, irrespective of the destination datatype, the
2480 * accumulator maintains full 48-bit precision."
2481 * ...
2482 * "When multiplying integer data types, if one of the sources
2483 * is a DW, the resulting full precision data is stored in
2484 * the accumulator."
2485 *
2486 * There are also similar notes in earlier PRMs.
2487 *
2488 * The MOV instruction can copy the bits of the source, but it
2489 * does not clear the higher bits of the accumulator. So, because
2490 * we might use the full accumulator in the MUL/MACH macro, we
2491 * shouldn't replace such MULs with MOVs.
2492 */
2493 if ((elk_reg_type_to_size(inst->src[0].type) == 4 ||
2494 elk_reg_type_to_size(inst->src[1].type) == 4) &&
2495 (inst->dst.is_accumulator() ||
2496 inst->writes_accumulator_implicitly(devinfo)))
2497 break;
2498
2499 /* a * 1.0 = a */
2500 if (inst->src[1].is_one()) {
2501 inst->opcode = ELK_OPCODE_MOV;
2502 inst->sources = 1;
2503 inst->src[1] = reg_undef;
2504 progress = true;
2505 break;
2506 }
2507
2508 /* a * -1.0 = -a */
2509 if (inst->src[1].is_negative_one()) {
2510 inst->opcode = ELK_OPCODE_MOV;
2511 inst->sources = 1;
2512 inst->src[0].negate = !inst->src[0].negate;
2513 inst->src[1] = reg_undef;
2514 progress = true;
2515 break;
2516 }
2517
2518 break;
2519 case ELK_OPCODE_ADD:
2520 if (inst->src[1].file != IMM)
2521 continue;
2522
2523 if (elk_reg_type_is_integer(inst->src[1].type) &&
2524 inst->src[1].is_zero()) {
2525 inst->opcode = ELK_OPCODE_MOV;
2526 inst->sources = 1;
2527 inst->src[1] = reg_undef;
2528 progress = true;
2529 break;
2530 }
2531
2532 if (inst->src[0].file == IMM) {
2533 assert(inst->src[0].type == ELK_REGISTER_TYPE_F);
2534 inst->opcode = ELK_OPCODE_MOV;
2535 inst->sources = 1;
2536 inst->src[0].f += inst->src[1].f;
2537 inst->src[1] = reg_undef;
2538 progress = true;
2539 break;
2540 }
2541 break;
2542
2543 case ELK_OPCODE_AND:
2544 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2545 const uint64_t src0 = src_as_uint(inst->src[0]);
2546 const uint64_t src1 = src_as_uint(inst->src[1]);
2547
2548 inst->opcode = ELK_OPCODE_MOV;
2549 inst->sources = 1;
2550 inst->src[0] = elk_imm_for_type(src0 & src1, inst->dst.type);
2551 inst->src[1] = reg_undef;
2552 progress = true;
2553 break;
2554 }
2555
2556 break;
2557
2558 case ELK_OPCODE_OR:
2559 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2560 const uint64_t src0 = src_as_uint(inst->src[0]);
2561 const uint64_t src1 = src_as_uint(inst->src[1]);
2562
2563 inst->opcode = ELK_OPCODE_MOV;
2564 inst->sources = 1;
2565 inst->src[0] = elk_imm_for_type(src0 | src1, inst->dst.type);
2566 inst->src[1] = reg_undef;
2567 progress = true;
2568 break;
2569 }
2570
2571 if (inst->src[0].equals(inst->src[1]) ||
2572 inst->src[1].is_zero()) {
2573 /* On Gfx8+, the OR instruction can have a source modifier that
2574 * performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
2575 * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2576 */
2577 if (inst->src[0].negate) {
2578 inst->opcode = ELK_OPCODE_NOT;
2579 inst->sources = 1;
2580 inst->src[0].negate = false;
2581 } else {
2582 inst->opcode = ELK_OPCODE_MOV;
2583 inst->sources = 1;
2584 }
2585 inst->src[1] = reg_undef;
2586 progress = true;
2587 break;
2588 }
2589 break;
2590 case ELK_OPCODE_CMP:
2591 if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2592 inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2593 inst->src[1].is_zero() &&
2594 (inst->src[0].abs || inst->src[0].negate)) {
2595 inst->src[0].abs = false;
2596 inst->src[0].negate = false;
2597 progress = true;
2598 break;
2599 }
2600 break;
2601 case ELK_OPCODE_SEL:
2602 if (!devinfo->has_64bit_float &&
2603 !devinfo->has_64bit_int &&
2604 (inst->dst.type == ELK_REGISTER_TYPE_DF ||
2605 inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2606 inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2607 assert(inst->dst.type == inst->src[0].type);
2608 assert(!inst->saturate);
2609 assert(!inst->src[0].abs && !inst->src[0].negate);
2610 assert(!inst->src[1].abs && !inst->src[1].negate);
2611 const elk::fs_builder ibld(this, block, inst);
2612
2613 if (!inst->is_partial_write())
2614 ibld.emit_undef_for_dst(inst);
2615
2616 set_predicate(inst->predicate,
2617 ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2618 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
2619 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0)));
2620 set_predicate(inst->predicate,
2621 ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2622 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
2623 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1)));
2624
2625 inst->remove(block);
2626 progress = true;
2627 }
2628 if (inst->src[0].equals(inst->src[1])) {
2629 inst->opcode = ELK_OPCODE_MOV;
2630 inst->sources = 1;
2631 inst->src[1] = reg_undef;
2632 inst->predicate = ELK_PREDICATE_NONE;
2633 inst->predicate_inverse = false;
2634 progress = true;
2635 } else if (inst->saturate && inst->src[1].file == IMM) {
2636 switch (inst->conditional_mod) {
2637 case ELK_CONDITIONAL_LE:
2638 case ELK_CONDITIONAL_L:
2639 switch (inst->src[1].type) {
2640 case ELK_REGISTER_TYPE_F:
2641 if (inst->src[1].f >= 1.0f) {
2642 inst->opcode = ELK_OPCODE_MOV;
2643 inst->sources = 1;
2644 inst->src[1] = reg_undef;
2645 inst->conditional_mod = ELK_CONDITIONAL_NONE;
2646 progress = true;
2647 }
2648 break;
2649 default:
2650 break;
2651 }
2652 break;
2653 case ELK_CONDITIONAL_GE:
2654 case ELK_CONDITIONAL_G:
2655 switch (inst->src[1].type) {
2656 case ELK_REGISTER_TYPE_F:
2657 if (inst->src[1].f <= 0.0f) {
2658 inst->opcode = ELK_OPCODE_MOV;
2659 inst->sources = 1;
2660 inst->src[1] = reg_undef;
2661 inst->conditional_mod = ELK_CONDITIONAL_NONE;
2662 progress = true;
2663 }
2664 break;
2665 default:
2666 break;
2667 }
2668 default:
2669 break;
2670 }
2671 }
2672 break;
2673 case ELK_OPCODE_MAD:
2674 if (inst->src[0].type != ELK_REGISTER_TYPE_F ||
2675 inst->src[1].type != ELK_REGISTER_TYPE_F ||
2676 inst->src[2].type != ELK_REGISTER_TYPE_F)
2677 break;
2678 if (inst->src[1].is_one()) {
2679 inst->opcode = ELK_OPCODE_ADD;
2680 inst->sources = 2;
2681 inst->src[1] = inst->src[2];
2682 inst->src[2] = reg_undef;
2683 progress = true;
2684 } else if (inst->src[2].is_one()) {
2685 inst->opcode = ELK_OPCODE_ADD;
2686 inst->sources = 2;
2687 inst->src[2] = reg_undef;
2688 progress = true;
2689 }
2690 break;
2691 case ELK_OPCODE_SHL:
2692 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2693 /* It's not currently possible to generate this, and this constant
2694 * folding does not handle it.
2695 */
2696 assert(!inst->saturate);
2697
2698 elk_fs_reg result;
2699
2700 switch (type_sz(inst->src[0].type)) {
2701 case 2:
2702 result = elk_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
2703 break;
2704 case 4:
2705 result = elk_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
2706 break;
2707 case 8:
2708 result = elk_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
2709 break;
2710 default:
2711 /* Just in case a future platform re-enables B or UB types. */
2712 unreachable("Invalid source size.");
2713 }
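/* e.g. (illustrative): an SHL of UD 0xffff0001 by 4 folds to a MOV of
 * 0xfff00010 here, with the shifted-out high bits dropped just as the
 * hardware would drop them.
 */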
2714
2715 inst->opcode = ELK_OPCODE_MOV;
2716 inst->src[0] = retype(result, inst->dst.type);
2717 inst->src[1] = reg_undef;
2718 inst->sources = 1;
2719
2720 progress = true;
2721 }
2722 break;
2723
2724 case ELK_SHADER_OPCODE_BROADCAST:
2725 if (is_uniform(inst->src[0])) {
2726 inst->opcode = ELK_OPCODE_MOV;
2727 inst->sources = 1;
2728 inst->force_writemask_all = true;
2729 progress = true;
2730 } else if (inst->src[1].file == IMM) {
2731 inst->opcode = ELK_OPCODE_MOV;
2732 /* It's possible that the selected component will be too large and
2733 * overflow the register. This can happen if someone does a
2734 * readInvocation() from GLSL or SPIR-V and provides an OOB
2735 * invocationIndex. If this happens and we somehow manage
2736 * to constant fold it in and get here, then component() may cause
2737 * us to start reading outside of the VGRF, which will lead to an
2738 * assert later. Instead, just let it wrap around if it goes over
2739 * exec_size.
2740 */
2741 const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
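/* e.g. with exec_size == 8, an out-of-range index of 9 wraps to
 * component 1 (illustrative).
 */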
2742 inst->src[0] = component(inst->src[0], comp);
2743 inst->sources = 1;
2744 inst->force_writemask_all = true;
2745 progress = true;
2746 }
2747 break;
2748
2749 case ELK_SHADER_OPCODE_SHUFFLE:
2750 if (is_uniform(inst->src[0])) {
2751 inst->opcode = ELK_OPCODE_MOV;
2752 inst->sources = 1;
2753 progress = true;
2754 } else if (inst->src[1].file == IMM) {
2755 inst->opcode = ELK_OPCODE_MOV;
2756 inst->src[0] = component(inst->src[0],
2757 inst->src[1].ud);
2758 inst->sources = 1;
2759 progress = true;
2760 }
2761 break;
2762
2763 default:
2764 break;
2765 }
2766
2767 /* Ensure that the correct source has the immediate value. 2-source
2768 * instructions must have the immediate in src[1]. On Gfx12 and later,
2769 * some 3-source instructions can have the immediate in src[0] or
2770 * src[2]. It's complicated, so don't mess with 3-source instructions
2771 * here.
2772 */
2773 if (progress && inst->sources == 2 && inst->is_commutative()) {
2774 if (inst->src[0].file == IMM) {
2775 elk_fs_reg tmp = inst->src[1];
2776 inst->src[1] = inst->src[0];
2777 inst->src[0] = tmp;
2778 }
2779 }
2780 }
2781
2782 if (progress)
2783 invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
2784 DEPENDENCY_INSTRUCTION_DETAIL);
2785
2786 return progress;
2787 }
2788
2789 static unsigned
load_payload_sources_read_for_size(elk_fs_inst * lp,unsigned size_read)2790 load_payload_sources_read_for_size(elk_fs_inst *lp, unsigned size_read)
2791 {
2792 assert(lp->opcode == ELK_SHADER_OPCODE_LOAD_PAYLOAD);
2793 assert(size_read >= lp->header_size * REG_SIZE);
2794
2795 unsigned i;
2796 unsigned size = lp->header_size * REG_SIZE;
2797 for (i = lp->header_size; size < size_read && i < lp->sources; i++)
2798 size += lp->exec_size * type_sz(lp->src[i].type);
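/* Illustrative: a one-register header plus two SIMD8 float parameters
 * accumulates size == 32 + 32 + 32 == 96B, so a send with mlen == 3
 * reads exactly the first three sources and i == 3 is returned.
 */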
2799
2800 /* Size read must cover exactly a prefix of the sources. */
2801 assert(size == size_read);
2802 return i;
2803 }
2804
2805 /**
2806 * Optimize sample messages that have constant zero values for the trailing
2807 * parameters. We can just reduce the message length for these
2808 * instructions instead of reserving a register for it. Trailing parameters
2809 * that aren't sent default to zero anyway. This will cause the dead code
2810 * eliminator to remove the MOV instruction that would otherwise be emitted to
2811 * set up the zero value.
2812 */
2813 bool
opt_zero_samples()2814 elk_fs_visitor::opt_zero_samples()
2815 {
2816 /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
2817 assert(devinfo->ver >= 7);
2818
2819 bool progress = false;
2820
2821 foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2822 if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2823 send->sfid != ELK_SFID_SAMPLER)
2824 continue;
2825
2826 /* Wa_14012688258:
2827 *
2828 * Don't trim zeros at the end of the payload for sample operations
2829 * on cubes and cube arrays.
2830 */
2831 if (send->keep_payload_trailing_zeros)
2832 continue;
2833
2834 /* This pass works on SENDs before splitting. */
2835 if (send->ex_mlen > 0)
2836 continue;
2837
2838 elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2839
2840 if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2841 continue;
2842
2843 /* How much of the payload are actually read by this SEND. */
2844 const unsigned params =
2845 load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2846
2847 /* We don't want to remove the message header or the first parameter.
2848 * Removing the first parameter is not allowed, see the Haswell PRM
2849 * volume 7, page 149:
2850 *
2851 * "Parameter 0 is required except for the sampleinfo message, which
2852 * has no parameter 0"
2853 */
2854 const unsigned first_param_idx = lp->header_size;
2855 unsigned zero_size = 0;
2856 for (unsigned i = params - 1; i > first_param_idx; i--) {
2857 if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
2858 break;
2859 zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
2860 }
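/* Illustrative: for a SIMD8 sample of header + u, v, r, lod where r and
 * lod are immediate zeros (and the payload is packed), the loop above
 * accumulates zero_size == 2 * 8 * 4 == 64B, so mlen shrinks by two
 * GRFs and dead code elimination can then drop the zeroing MOVs.
 */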
2861
2862 const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE);
2863 if (zero_len > 0) {
2864 send->mlen -= zero_len;
2865 progress = true;
2866 }
2867 }
2868
2869 if (progress)
2870 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2871
2872 return progress;
2873 }
2874
2875 /**
2876 * Opportunistically split SEND message payloads.
2877 *
2878 * Gfx9+ supports "split" SEND messages, which take two payloads that are
2879 * implicitly concatenated. If we find a SEND message with a single payload,
2880 * we can split that payload in two. This results in smaller contiguous
2881 * register blocks for us to allocate. But it can help beyond that, too.
2882 *
2883 * We try to split a LOAD_PAYLOAD between sources that change registers.
2884 * For example, a sampler message often contains an x/y/z coordinate that may
2885 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
2886 * or array index, which comes from elsewhere. In this case, the first few
2887 * sources will be different offsets of the same VGRF, then a later source
2888 * will be a different VGRF. So we split there, possibly eliminating the
2889 * payload concatenation altogether.
2890 */
2891 bool
opt_split_sends()2892 elk_fs_visitor::opt_split_sends()
2893 {
2894 if (devinfo->ver < 9)
2895 return false;
2896
2897 bool progress = false;
2898
2899 foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2900 if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2901 send->mlen <= reg_unit(devinfo) || send->ex_mlen > 0)
2902 continue;
2903
2904 assert(send->src[2].file == VGRF);
2905
2906 /* Currently don't split sends that reuse a previously used payload. */
2907 elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2908
2909 if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2910 continue;
2911
2912 if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
2913 continue;
2914
2915 /* Split either after the header (if present), or when consecutive
2916 * sources switch from one VGRF to a different one.
2917 */
2918 unsigned mid = lp->header_size;
2919 if (mid == 0) {
2920 for (mid = 1; mid < lp->sources; mid++) {
2921 if (lp->src[mid].file == BAD_FILE)
2922 continue;
2923
2924 if (lp->src[0].file != lp->src[mid].file ||
2925 lp->src[0].nr != lp->src[mid].nr)
2926 break;
2927 }
2928 }
2929
2930 /* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
2931 * find out how many sources from the payload it really needs.
2932 */
2933 const unsigned end =
2934 load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2935
2936 /* Nothing to split. */
2937 if (end <= mid)
2938 continue;
2939
2940 const fs_builder ibld(this, block, lp);
2941 elk_fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
2942 elk_fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);
2943
2944 assert(lp1->size_written % REG_SIZE == 0);
2945 assert(lp2->size_written % REG_SIZE == 0);
2946 assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);
2947
2948 lp1->dst = elk_fs_reg(VGRF, alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
2949 lp2->dst = elk_fs_reg(VGRF, alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);
2950
2951 send->resize_sources(4);
2952 send->src[2] = lp1->dst;
2953 send->src[3] = lp2->dst;
2954 send->ex_mlen = lp2->size_written / REG_SIZE;
2955 send->mlen -= send->ex_mlen;
2956
2957 progress = true;
2958 }
2959
2960 if (progress)
2961 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2962
2963 return progress;
2964 }
2965
2966 /**
2967 * Remove redundant or useless halts.
2968 *
2969 * For example, we can eliminate halts in the following sequence:
2970 *
2971 * halt (redundant with the next halt)
2972 * halt (useless; jumps to the next instruction)
2973 * halt-target
2974 */
2975 bool
opt_redundant_halt()2976 elk_fs_visitor::opt_redundant_halt()
2977 {
2978 bool progress = false;
2979
2980 unsigned halt_count = 0;
2981 elk_fs_inst *halt_target = NULL;
2982 elk_bblock_t *halt_target_block = NULL;
2983 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2984 if (inst->opcode == ELK_OPCODE_HALT)
2985 halt_count++;
2986
2987 if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
2988 halt_target = inst;
2989 halt_target_block = block;
2990 break;
2991 }
2992 }
2993
2994 if (!halt_target) {
2995 assert(halt_count == 0);
2996 return false;
2997 }
2998
2999 /* Delete any HALTs immediately before the halt target. */
3000 for (elk_fs_inst *prev = (elk_fs_inst *) halt_target->prev;
3001 !prev->is_head_sentinel() && prev->opcode == ELK_OPCODE_HALT;
3002 prev = (elk_fs_inst *) halt_target->prev) {
3003 prev->remove(halt_target_block);
3004 halt_count--;
3005 progress = true;
3006 }
3007
3008 if (halt_count == 0) {
3009 halt_target->remove(halt_target_block);
3010 progress = true;
3011 }
3012
3013 if (progress)
3014 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3015
3016 return progress;
3017 }
3018
3019 /**
3020 * Compute a bitmask with GRF granularity with a bit set for each GRF starting
3021 * from \p r.offset which overlaps the region starting at \p s.offset and
3022 * spanning \p ds bytes.
3023 */
3024 static inline unsigned
mask_relative_to(const elk_fs_reg & r,const elk_fs_reg & s,unsigned ds)3025 mask_relative_to(const elk_fs_reg &r, const elk_fs_reg &s, unsigned ds)
3026 {
3027 const int rel_offset = reg_offset(s) - reg_offset(r);
3028 const int shift = rel_offset / REG_SIZE;
3029 const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
3030 assert(reg_space(r) == reg_space(s) &&
3031 shift >= 0 && shift < int(8 * sizeof(unsigned)));
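/* Illustrative: with s starting 40 bytes past r and ds == 24, shift == 1
 * and n == DIV_ROUND_UP(8 + 24, 32) == 1, giving a mask of 0b10 -- only
 * the second GRF of r's region overlaps s.
 */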
3032 return ((1 << n) - 1) << shift;
3033 }
3034
3035 bool
compute_to_mrf()3036 elk_fs_visitor::compute_to_mrf()
3037 {
3038 bool progress = false;
3039 int next_ip = 0;
3040
3041 /* No MRFs on Gen >= 7. */
3042 if (devinfo->ver >= 7)
3043 return false;
3044
3045 const fs_live_variables &live = live_analysis.require();
3046
3047 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3048 int ip = next_ip;
3049 next_ip++;
3050
3051 if (inst->opcode != ELK_OPCODE_MOV ||
3052 inst->is_partial_write() ||
3053 inst->dst.file != MRF || inst->src[0].file != VGRF ||
3054 inst->dst.type != inst->src[0].type ||
3055 inst->src[0].abs || inst->src[0].negate ||
3056 !inst->src[0].is_contiguous() ||
3057 inst->src[0].offset % REG_SIZE != 0)
3058 continue;
3059
3060 /* Can't compute-to-MRF this GRF if someone else was going to
3061 * read it later.
3062 */
3063 if (live.vgrf_end[inst->src[0].nr] > ip)
3064 continue;
3065
3066 /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
3067 * things that computed the value of all GRFs of the source region. The
3068 * regs_left bitset keeps track of the registers we haven't yet found a
3069 * generating instruction for.
3070 */
3071 unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
3072
3073 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3074 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3075 inst->src[0], inst->size_read(0))) {
3076 /* Found the last thing to write our reg we want to turn
3077 * into a compute-to-MRF.
3078 */
3079
3080 /* If this one instruction didn't populate all the
3081 * channels, bail. We might be able to rewrite everything
3082 * that writes that reg, but it would require smarter
3083 * tracking.
3084 */
3085 if (scan_inst->is_partial_write())
3086 break;
3087
3088 /* Handling things not fully contained in the source of the copy
3089 * would need us to understand coalescing out more than one MOV at
3090 * a time.
3091 */
3092 if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
3093 inst->src[0], inst->size_read(0)))
3094 break;
3095
3096 /* SEND instructions can't have MRF as a destination. */
3097 if (scan_inst->mlen)
3098 break;
3099
3100 if (devinfo->ver == 6) {
3101 /* gfx6 math instructions must have the destination be
3102 * GRF, so no compute-to-MRF for them.
3103 */
3104 if (scan_inst->is_math()) {
3105 break;
3106 }
3107 }
3108
3109 /* Clear the bits for any registers this instruction overwrites. */
3110 regs_left &= ~mask_relative_to(
3111 inst->src[0], scan_inst->dst, scan_inst->size_written);
3112 if (!regs_left)
3113 break;
3114 }
3115
3116 /* We don't handle control flow here. Most computation of
3117 * values that end up in MRFs happens shortly before the MRF
3118 * write anyway.
3119 */
3120 if (block->start() == scan_inst)
3121 break;
3122
3123 /* You can't read from an MRF, so if someone else reads our
3124 * MRF's source GRF that we wanted to rewrite, that stops us.
3125 */
3126 bool interfered = false;
3127 for (int i = 0; i < scan_inst->sources; i++) {
3128 if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
3129 inst->src[0], inst->size_read(0))) {
3130 interfered = true;
3131 }
3132 }
3133 if (interfered)
3134 break;
3135
3136 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3137 inst->dst, inst->size_written)) {
3138 /* If somebody else writes our MRF here, we can't
3139 * compute-to-MRF before that.
3140 */
3141 break;
3142 }
3143
3144 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
3145 regions_overlap(elk_fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
3146 inst->dst, inst->size_written)) {
3147 /* Found a SEND instruction, which means that there are
3148 * live values in MRFs from base_mrf to base_mrf +
3149 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3150 * above it.
3151 */
3152 break;
3153 }
3154 }
3155
3156 if (regs_left)
3157 continue;
3158
3159 /* Found all generating instructions of our MRF's source value, so it
3160 * should be safe to rewrite them to point to the MRF directly.
3161 */
3162 regs_left = (1 << regs_read(inst, 0)) - 1;
3163
3164 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3165 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
3166 inst->src[0], inst->size_read(0))) {
3167 /* Clear the bits for any registers this instruction overwrites. */
3168 regs_left &= ~mask_relative_to(
3169 inst->src[0], scan_inst->dst, scan_inst->size_written);
3170
3171 const unsigned rel_offset = reg_offset(scan_inst->dst) -
3172 reg_offset(inst->src[0]);
3173
3174 if (inst->dst.nr & ELK_MRF_COMPR4) {
3175 /* Apply the same address transformation done by the hardware
3176 * for COMPR4 MRF writes.
3177 */
3178 assert(rel_offset < 2 * REG_SIZE);
3179 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
3180
3181 /* Clear the COMPR4 bit if the generating instruction is not
3182 * compressed.
3183 */
3184 if (scan_inst->size_written < 2 * REG_SIZE)
3185 scan_inst->dst.nr &= ~ELK_MRF_COMPR4;
3186
3187 } else {
3188 /* Calculate the MRF number the result of this instruction is
3189 * ultimately written to.
3190 */
3191 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
3192 }
3193
3194 scan_inst->dst.file = MRF;
3195 scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
3196 scan_inst->saturate |= inst->saturate;
3197 if (!regs_left)
3198 break;
3199 }
3200 }
3201
3202 assert(!regs_left);
3203 inst->remove(block);
3204 progress = true;
3205 }
3206
3207 if (progress)
3208 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3209
3210 return progress;
3211 }
3212
3213 /**
3214 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3215 * flow. We could probably do better here with some form of divergence
3216 * analysis.
3217 */
3218 bool
eliminate_find_live_channel()3219 elk_fs_visitor::eliminate_find_live_channel()
3220 {
3221 bool progress = false;
3222 unsigned depth = 0;
3223
3224 if (!elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
3225 stage_prog_data)) {
3226 /* The optimization below assumes that channel zero is live on thread
3227 * dispatch, which may not be the case if the fixed function dispatches
3228 * threads sparsely.
3229 */
3230 return false;
3231 }
3232
3233 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3234 switch (inst->opcode) {
3235 case ELK_OPCODE_IF:
3236 case ELK_OPCODE_DO:
3237 depth++;
3238 break;
3239
3240 case ELK_OPCODE_ENDIF:
3241 case ELK_OPCODE_WHILE:
3242 depth--;
3243 break;
3244
3245 case ELK_OPCODE_HALT:
3246 /* This can potentially make control flow non-uniform until the end
3247 * of the program.
3248 */
3249 goto out;
3250
3251 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
3252 if (depth == 0) {
3253 inst->opcode = ELK_OPCODE_MOV;
3254 inst->src[0] = elk_imm_ud(0u);
3255 inst->sources = 1;
3256 inst->force_writemask_all = true;
3257 progress = true;
3258 }
3259 break;
3260
3261 default:
3262 break;
3263 }
3264 }
3265
3266 out:
3267 if (progress)
3268 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3269
3270 return progress;
3271 }
3272
3273 /**
3274 * Once we've generated code, try to convert normal ELK_FS_OPCODE_FB_WRITE
3275 * instructions to ELK_FS_OPCODE_REP_FB_WRITE.
3276 */
3277 void
emit_repclear_shader()3278 elk_fs_visitor::emit_repclear_shader()
3279 {
3280 elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
3281 elk_fs_inst *write = NULL;
3282
3283 assert(uniforms == 0);
3284 assume(key->nr_color_regions > 0);
3285
3286 elk_fs_reg color_output, header;
3287 if (devinfo->ver >= 7) {
3288 color_output = retype(elk_vec4_grf(127, 0), ELK_REGISTER_TYPE_UD);
3289 header = retype(elk_vec8_grf(125, 0), ELK_REGISTER_TYPE_UD);
3290 } else {
3291 color_output = retype(elk_vec4_reg(MRF, 2, 0), ELK_REGISTER_TYPE_UD);
3292 header = retype(elk_vec8_reg(MRF, 0, 0), ELK_REGISTER_TYPE_UD);
3293 }
3294
3295 /* We pass the clear color as a flat input. Copy it to the output. */
3296 elk_fs_reg color_input =
3297 elk_reg(ELK_GENERAL_REGISTER_FILE, 2, 3, 0, 0, ELK_REGISTER_TYPE_UD,
3298 ELK_VERTICAL_STRIDE_8, ELK_WIDTH_2, ELK_HORIZONTAL_STRIDE_4,
3299 ELK_SWIZZLE_XYZW, WRITEMASK_XYZW);
3300
3301 const fs_builder bld = fs_builder(this).at_end();
3302 bld.exec_all().group(4, 0).MOV(color_output, color_input);
3303
3304 if (key->nr_color_regions > 1) {
3305 /* Copy g0..g1 as the message header */
3306 bld.exec_all().group(16, 0)
3307 .MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
3308 }
3309
3310 for (int i = 0; i < key->nr_color_regions; ++i) {
3311 if (i > 0)
3312 bld.exec_all().group(1, 0).MOV(component(header, 2), elk_imm_ud(i));
3313
3314 if (devinfo->ver >= 7) {
3315 write = bld.emit(ELK_SHADER_OPCODE_SEND);
3316 write->resize_sources(3);
3317 write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
3318 write->src[0] = elk_imm_ud(0);
3319 write->src[1] = elk_imm_ud(0);
3320 write->src[2] = i == 0 ? color_output : header;
3321 write->check_tdr = true;
3322 write->send_has_side_effects = true;
3323 write->desc = elk_fb_write_desc(devinfo, i,
3324 ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
3325 i == key->nr_color_regions - 1, false);
3326 } else {
3327 write = bld.emit(ELK_FS_OPCODE_REP_FB_WRITE);
3328 write->target = i;
3329 write->base_mrf = i == 0 ? color_output.nr : header.nr;
3330 }
3331
3332 /* We can use a headerless message for the first render target */
3333 write->header_size = i == 0 ? 0 : 2;
3334 write->mlen = 1 + write->header_size;
3335 }
3336 write->eot = true;
3337 write->last_rt = true;
3338
3339 calculate_cfg();
3340
3341 this->first_non_payload_grf = payload().num_regs;
3342 }
3343
3344 /**
3345 * Walks through basic blocks, looking for repeated MRF writes and
3346 * removing the later ones.
3347 */
3348 bool
remove_duplicate_mrf_writes()3349 elk_fs_visitor::remove_duplicate_mrf_writes()
3350 {
3351 elk_fs_inst *last_mrf_move[ELK_MAX_MRF(devinfo->ver)];
3352 bool progress = false;
3353
3354 /* We would need to update the MRF tracking for compressed instructions. */
3355 if (dispatch_width >= 16)
3356 return false;
3357
3358 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3359
3360 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3361 if (inst->is_control_flow()) {
3362 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3363 }
3364
3365 if (inst->opcode == ELK_OPCODE_MOV &&
3366 inst->dst.file == MRF) {
3367 elk_fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3368 if (prev_inst && prev_inst->opcode == ELK_OPCODE_MOV &&
3369 inst->dst.equals(prev_inst->dst) &&
3370 inst->src[0].equals(prev_inst->src[0]) &&
3371 inst->saturate == prev_inst->saturate &&
3372 inst->predicate == prev_inst->predicate &&
3373 inst->conditional_mod == prev_inst->conditional_mod &&
3374 inst->exec_size == prev_inst->exec_size) {
3375 inst->remove(block);
3376 progress = true;
3377 continue;
3378 }
3379 }
3380
3381 /* Clear out the last-write records for MRFs that were overwritten. */
3382 if (inst->dst.file == MRF) {
3383 last_mrf_move[inst->dst.nr] = NULL;
3384 }
3385
3386 if (inst->mlen > 0 && inst->base_mrf != -1) {
3387 /* Found a SEND instruction, which will include two or fewer
3388 * implied MRF writes. We could do better here.
3389 */
3390 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3391 last_mrf_move[inst->base_mrf + i] = NULL;
3392 }
3393 }
3394
3395 /* Clear out any MRF move records whose sources got overwritten. */
3396 for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3397 if (last_mrf_move[i] &&
3398 regions_overlap(inst->dst, inst->size_written,
3399 last_mrf_move[i]->src[0],
3400 last_mrf_move[i]->size_read(0))) {
3401 last_mrf_move[i] = NULL;
3402 }
3403 }
3404
3405 if (inst->opcode == ELK_OPCODE_MOV &&
3406 inst->dst.file == MRF &&
3407 inst->src[0].file != ARF &&
3408 !inst->is_partial_write()) {
3409 last_mrf_move[inst->dst.nr] = inst;
3410 }
3411 }
3412
3413 if (progress)
3414 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3415
3416 return progress;
3417 }
3418
3419 /**
3420 * Rounding modes for conversion instructions are emitted for each
3421 * conversion, but the rounding mode is really a piece of state, so
3422 * once it is set we don't need to set it again for subsequent conversions.
3423 *
3424 * This is useful for vector/matrix conversions, as setting the
3425 * mode once is enough for the full vector/matrix.
3426 */
3427 bool
remove_extra_rounding_modes()3428 elk_fs_visitor::remove_extra_rounding_modes()
3429 {
3430 bool progress = false;
3431 unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3432
3433 elk_rnd_mode base_mode = ELK_RND_MODE_UNSPECIFIED;
3434 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3435 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3436 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3437 execution_mode)
3438 base_mode = ELK_RND_MODE_RTNE;
3439 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3440 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3441 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3442 execution_mode)
3443 base_mode = ELK_RND_MODE_RTZ;
3444
3445 foreach_block (block, cfg) {
3446 elk_rnd_mode prev_mode = base_mode;
3447
3448 foreach_inst_in_block_safe (elk_fs_inst, inst, block) {
3449 if (inst->opcode == ELK_SHADER_OPCODE_RND_MODE) {
3450 assert(inst->src[0].file == ELK_IMMEDIATE_VALUE);
3451 const elk_rnd_mode mode = (elk_rnd_mode) inst->src[0].d;
3452 if (mode == prev_mode) {
3453 inst->remove(block);
3454 progress = true;
3455 } else {
3456 prev_mode = mode;
3457 }
3458 }
3459 }
3460 }
3461
3462 if (progress)
3463 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3464
3465 return progress;
3466 }
3467
3468 static void
3469 clear_deps_for_inst_src(elk_fs_inst *inst, bool *deps, int first_grf, int grf_len)
3470 {
3471 /* Clear the flag for registers that actually got read (as expected). */
3472 for (int i = 0; i < inst->sources; i++) {
3473 int grf;
3474 if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3475 grf = inst->src[i].nr;
3476 } else {
3477 continue;
3478 }
3479
3480 if (grf >= first_grf &&
3481 grf < first_grf + grf_len) {
3482 deps[grf - first_grf] = false;
3483 if (inst->exec_size == 16)
3484 deps[grf - first_grf + 1] = false;
3485 }
3486 }
3487 }
3488
3489 /**
3490 * Implements this workaround for the original 965:
3491 *
3492 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3493 * check for post destination dependencies on this instruction, software
3494 * must ensure that there is no destination hazard for the case of ‘write
3495 * followed by a posted write’ shown in the following example.
3496 *
3497 * 1. mov r3 0
3498 * 2. send r3.xy <rest of send instruction>
3499 * 3. mov r2 r3
3500 *
3501 * Due to no post-destination dependency check on the ‘send’, the above
3502 * code sequence could have two instructions (1 and 2) in flight at the
3503 * same time that both consider ‘r3’ as the target of their final writes.
3504 */
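/* The resolution used below, DEP_RESOLVE_MOV, is conceptually an extra
 * instruction that merely sources the hazardous GRF with a throwaway
 * destination (think "mov null r3" placed before instruction 2 above), so
 * the posted write from instruction 1 has to retire before the send can
 * issue. The exact shape of the emitted MOV is an implementation detail of
 * the macro.
 */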
3505 void
3506 elk_fs_visitor::insert_gfx4_pre_send_dependency_workarounds(elk_bblock_t *block,
3507 elk_fs_inst *inst)
3508 {
3509 int write_len = regs_written(inst);
3510 int first_write_grf = inst->dst.nr;
3511 bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3512 assert(write_len < (int)sizeof(needs_dep) - 1);
3513
3514 memset(needs_dep, false, sizeof(needs_dep));
3515 memset(needs_dep, true, write_len);
3516
3517 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3518
3519 /* Walk backwards looking for writes to registers we're writing which
3520 * aren't read since being written. If we hit the start of the program,
3521 * we assume that there are no outstanding dependencies on entry to the
3522 * program.
3523 */
3524 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3525 /* If we hit control flow, assume that there *are* outstanding
3526 * dependencies, and force their cleanup before our instruction.
3527 */
3528 if (block->start() == scan_inst && block->num != 0) {
3529 for (int i = 0; i < write_len; i++) {
3530 if (needs_dep[i])
3531 DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3532 first_write_grf + i);
3533 }
3534 return;
3535 }
3536
3537 /* We insert our reads as late as possible on the assumption that any
3538 * instruction but a MOV that might have left us an outstanding
3539 * dependency has more latency than a MOV.
3540 */
3541 if (scan_inst->dst.file == VGRF) {
3542 for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3543 int reg = scan_inst->dst.nr + i;
3544
3545 if (reg >= first_write_grf &&
3546 reg < first_write_grf + write_len &&
3547 needs_dep[reg - first_write_grf]) {
3548 DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3549 needs_dep[reg - first_write_grf] = false;
3550 if (scan_inst->exec_size == 16)
3551 needs_dep[reg - first_write_grf + 1] = false;
3552 }
3553 }
3554 }
3555
3556 /* Clear the flag for registers that actually got read (as expected). */
3557 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3558
3559 /* Continue the loop only if we haven't resolved all the dependencies */
3560 int i;
3561 for (i = 0; i < write_len; i++) {
3562 if (needs_dep[i])
3563 break;
3564 }
3565 if (i == write_len)
3566 return;
3567 }
3568 }
3569
3570 /**
3571 * Implements this workaround for the original 965:
3572 *
3573 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3574 * used as a destination register until after it has been sourced by an
3575 * instruction with a different destination register.
3576 */
3577 void
3578 elk_fs_visitor::insert_gfx4_post_send_dependency_workarounds(elk_bblock_t *block, elk_fs_inst *inst)
3579 {
3580 int write_len = regs_written(inst);
3581 unsigned first_write_grf = inst->dst.nr;
3582 bool needs_dep[ELK_MAX_MRF(devinfo->ver)];
3583 assert(write_len < (int)sizeof(needs_dep) - 1);
3584
3585 memset(needs_dep, false, sizeof(needs_dep));
3586 memset(needs_dep, true, write_len);
3587 /* Walk forwards looking for writes to registers we're writing which aren't
3588 * read before being written.
3589 */
3590 foreach_inst_in_block_starting_from(elk_fs_inst, scan_inst, inst) {
3591 /* If we hit control flow, force resolve all remaining dependencies. */
3592 if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3593 for (int i = 0; i < write_len; i++) {
3594 if (needs_dep[i])
3595 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3596 first_write_grf + i);
3597 }
3598 return;
3599 }
3600
3601 /* Clear the flag for registers that actually got read (as expected). */
3602 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3603
3604 /* We insert our reads as late as possible since they're reading the
3605 * result of a SEND, which has massive latency.
3606 */
3607 if (scan_inst->dst.file == VGRF &&
3608 scan_inst->dst.nr >= first_write_grf &&
3609 scan_inst->dst.nr < first_write_grf + write_len &&
3610 needs_dep[scan_inst->dst.nr - first_write_grf]) {
3611 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3612 scan_inst->dst.nr);
3613 needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3614 }
3615
3616 /* Continue the loop only if we haven't resolved all the dependencies */
3617 int i;
3618 for (i = 0; i < write_len; i++) {
3619 if (needs_dep[i])
3620 break;
3621 }
3622 if (i == write_len)
3623 return;
3624 }
3625 }
3626
3627 void
3628 elk_fs_visitor::insert_gfx4_send_dependency_workarounds()
3629 {
3630 if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
3631 return;
3632
3633 bool progress = false;
3634
3635 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
3636 if (inst->mlen != 0 && inst->dst.file == VGRF) {
3637 insert_gfx4_pre_send_dependency_workarounds(block, inst);
3638 insert_gfx4_post_send_dependency_workarounds(block, inst);
3639 progress = true;
3640 }
3641 }
3642
3643 if (progress)
3644 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3645 }
3646
3647 bool
3648 elk_fs_visitor::lower_load_payload()
3649 {
3650 bool progress = false;
3651
3652 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3653 if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
3654 continue;
3655
3656 assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3657 assert(inst->saturate == false);
3658 elk_fs_reg dst = inst->dst;
3659
3660 /* Get rid of COMPR4. We'll add it back in if we need it */
3661 if (dst.file == MRF)
3662 dst.nr = dst.nr & ~ELK_MRF_COMPR4;
3663
3664 const fs_builder ibld(this, block, inst);
3665 const fs_builder ubld = ibld.exec_all();
3666
3667 for (uint8_t i = 0; i < inst->header_size;) {
3668 /* Number of header GRFs to initialize at once with a single MOV
3669 * instruction.
3670 */
3671 const unsigned n =
3672 (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3673 inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3674 2 : 1;
3675
3676 if (inst->src[i].file != BAD_FILE)
3677 ubld.group(8 * n, 0).MOV(retype(dst, ELK_REGISTER_TYPE_UD),
3678 retype(inst->src[i], ELK_REGISTER_TYPE_UD));
3679
3680 dst = byte_offset(dst, n * REG_SIZE);
3681 i += n;
3682 }
3683
3684 if (inst->dst.file == MRF && (inst->dst.nr & ELK_MRF_COMPR4) &&
3685 inst->exec_size > 8) {
3686 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3687 * a straightforward copy. Instead, the result of the
3688 * LOAD_PAYLOAD is treated as interleaved and the first four
3689 * non-header sources are unpacked as:
3690 *
3691 * m + 0: r0
3692 * m + 1: g0
3693 * m + 2: b0
3694 * m + 3: a0
3695 * m + 4: r1
3696 * m + 5: g1
3697 * m + 6: b1
3698 * m + 7: a1
3699 *
3700 * This is used for gen <= 5 fb writes.
3701 */
3702 assert(inst->exec_size == 16);
3703 assert(inst->header_size + 4 <= inst->sources);
3704 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3705 if (inst->src[i].file != BAD_FILE) {
3706 if (devinfo->has_compr4) {
3707 elk_fs_reg compr4_dst = retype(dst, inst->src[i].type);
3708 compr4_dst.nr |= ELK_MRF_COMPR4;
3709 ibld.MOV(compr4_dst, inst->src[i]);
3710 } else {
3711 /* Platform doesn't have COMPR4. We have to fake it */
3712 elk_fs_reg mov_dst = retype(dst, inst->src[i].type);
3713 ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
3714 mov_dst.nr += 4;
3715 ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
3716 }
3717 }
3718
3719 dst.nr++;
3720 }
3721
3722 /* The loop above only ever incremented us through the first set
3723 * of 4 registers. However, thanks to the magic of COMPR4, we
3724 * actually wrote to the first 8 registers, so we need to take
3725 * that into account now.
3726 */
3727 dst.nr += 4;
3728
3729 /* The COMPR4 code took care of the first 4 sources. We'll let
3730 * the regular path handle any remaining sources. Yes, we are
3731 * modifying the instruction but we're about to delete it so
3732 * this really doesn't hurt anything.
3733 */
3734 inst->header_size += 4;
3735 }
3736
3737 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3738 dst.type = inst->src[i].type;
3739 if (inst->src[i].file != BAD_FILE) {
3740 ibld.MOV(dst, inst->src[i]);
3741 }
3742 dst = offset(dst, ibld, 1);
3743 }
3744
3745 inst->remove(block);
3746 progress = true;
3747 }
3748
3749 if (progress)
3750 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3751
3752 return progress;
3753 }
3754
3755 /**
3756 * Factor an unsigned 32-bit integer.
3757 *
3758 * Attempts to factor \c x into two values that are at most 0xFFFF. If no
3759 * such factorization is possible, either because the value is too large or is
3760 * prime, both \c result_a and \c result_b will be zero.
3761 */
3762 static void
3763 factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
3764 {
3765 /* This is necessary to prevent various opportunities for division by zero
3766 * below.
3767 */
3768 assert(x > 0xffff);
3769
3770 /* This represents the actual expected constraints on the input. Namely,
3771 * both the upper and lower words should be > 1.
3772 */
3773 assert(x >= 0x00020002);
3774
3775 *result_a = 0;
3776 *result_b = 0;
3777
3778 /* The value is too large to factor with the constraints. */
3779 if (x > (0xffffu * 0xffffu))
3780 return;
3781
3782 /* A non-prime number will have the form p*q*d where p is some prime
3783 * number, q > 1, and 1 <= d <= q. To meet the constraints of this
3784 * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p).
3785 * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally,
3786 * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
3787 *
3788 * The observation is finding the largest possible value of p reduces the
3789 * possible range of d. After selecting p, all values of d in this range
3790 * are tested until a factorization is found. The size of the range of
3791 * possible values of d sets an upper bound on the run time of the
3792 * function.
3793 */
3794 static const uint16_t primes[256] = {
3795 2, 3, 5, 7, 11, 13, 17, 19,
3796 23, 29, 31, 37, 41, 43, 47, 53,
3797 59, 61, 67, 71, 73, 79, 83, 89,
3798 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */
3799 137, 139, 149, 151, 157, 163, 167, 173,
3800 179, 181, 191, 193, 197, 199, 211, 223,
3801 227, 229, 233, 239, 241, 251, 257, 263,
3802 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */
3803 313, 317, 331, 337, 347, 349, 353, 359,
3804 367, 373, 379, 383, 389, 397, 401, 409,
3805 419, 421, 431, 433, 439, 443, 449, 457,
3806 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */
3807 509, 521, 523, 541, 547, 557, 563, 569,
3808 571, 577, 587, 593, 599, 601, 607, 613,
3809 617, 619, 631, 641, 643, 647, 653, 659,
3810 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */
3811 727, 733, 739, 743, 751, 757, 761, 769,
3812 773, 787, 797, 809, 811, 821, 823, 827,
3813 829, 839, 853, 857, 859, 863, 877, 881,
3814 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */
3815 947, 953, 967, 971, 977, 983, 991, 997,
3816 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
3817 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
3818 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */
3819 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
3820 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
3821 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
3822 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */
3823 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
3824 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
3825 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
3826 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */
3827 };
3828
3829 unsigned p;
3830 unsigned x_div_p;
3831
3832 for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
3833 p = primes[i];
3834 x_div_p = x / p;
3835
3836 if ((x_div_p * p) == x)
3837 break;
3838 }
3839
3840 /* A prime factor was not found. */
3841 if (x_div_p * p != x)
3842 return;
3843
3844 /* Terminate early if d=1 is a solution. */
3845 if (x_div_p < 0x10000) {
3846 *result_a = x_div_p;
3847 *result_b = p;
3848 return;
3849 }
3850
3851 /* Pick the maximum possible value for 'd'. It's important that the loop
3852 * below execute while d <= max_d because max_d is a valid value. Having
3853 * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
3854 * incorrectly reported as not being factorable. The problem would occur
3855 * with any value that is a factor of two primes in the table and one prime
3856 * not in the table.
3857 */
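   /* Worked example (illustrative) using the value quoted above,
    * x = 0x063b0c83 = 1627 * 1367 * 47: the scan over the prime table picks
    * p = 1367 (1627 is not in the table), giving x_div_p = 1627 * 47 = 76469,
    * which is >= 0x10000, and max_d = 0xffff / 1367 = 47. The loop below
    * starts at d = DIV_ROUND_UP(76469, 0xffff) = 2 and only succeeds at
    * d = 47 == max_d, yielding *result_a = q = 1627 and
    * *result_b = p * d = 64249, both below 0x10000.
    */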
3858 const unsigned max_d = 0xffff / p;
3859
3860 /* Pick an initial value of 'd' that (combined with rejecting too large
3861 * values above) guarantees that 'q' will always be small enough.
3862 * DIV_ROUND_UP is used to prevent 'd' from being zero.
3863 */
3864 for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
3865 unsigned q = x_div_p / d;
3866
3867 if ((q * d) == x_div_p) {
3868 assert(p * d * q == x);
3869 assert((p * d) < 0x10000);
3870
3871 *result_a = q;
3872 *result_b = p * d;
3873 break;
3874 }
3875
3876 /* Since every value of 'd' is tried, as soon as 'd' is larger
3877 * than 'q', we're just re-testing combinations that have
3878 * already been tested.
3879 */
3880 if (d > q)
3881 break;
3882 }
3883 }
3884
3885 void
3886 elk_fs_visitor::lower_mul_dword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3887 {
3888 const fs_builder ibld(this, block, inst);
3889
3890 /* It is correct to use inst->src[1].d at both ends of the comparison.
3891 * Using .ud in the UINT16_MAX comparison would cause any negative value to
3892 * fail the check.
3893 */
3894 if (inst->src[1].file == IMM &&
3895 (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
3896 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3897 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3898 * src1 are used.
3899 *
3900 * If multiplying by an immediate value that fits in 16-bits, do a
3901 * single MUL instruction with that value in the proper location.
3902 */
3903 const bool ud = (inst->src[1].d >= 0);
3904 if (devinfo->ver < 7) {
3905 elk_fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
3906 ibld.MOV(imm, inst->src[1]);
3907 ibld.MUL(inst->dst, imm, inst->src[0]);
3908 } else {
3909 ibld.MUL(inst->dst, inst->src[0],
3910 ud ? elk_imm_uw(inst->src[1].ud)
3911 : elk_imm_w(inst->src[1].d));
3912 }
3913 } else {
3914 /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
3915 * do 32-bit integer multiplication in one instruction, but instead
3916 * must do a sequence (which actually calculates a 64-bit result):
3917 *
3918 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3919 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3920 * mov(8) g2<1>D acc0<8,8,1>D
3921 *
3922 * But on Gen > 6, the ability to use second accumulator register
3923 * (acc1) for non-float data types was removed, preventing a simple
3924 * implementation in SIMD16. A 16-channel result can be calculated by
3925 * executing the three instructions twice in SIMD8, once with quarter
3926 * control of 1Q for the first eight channels and again with 2Q for
3927 * the second eight channels.
3928 *
3929 * Which accumulator register is implicitly accessed (by AccWrEnable
3930 * for instance) is determined by the quarter control. Unfortunately
3931 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3932 * implicit accumulator access by an instruction with 2Q will access
3933 * acc1 regardless of whether the data type is usable in acc1.
3934 *
3935 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3936 * integer data types.
3937 *
3938 * Since we only want the low 32-bits of the result, we can do two
3939 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3940 * adjust the high result and add them (like the mach is doing):
3941 *
3942 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3943 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3944 * shl(8) g9<1>D g8<8,8,1>D 16D
3945 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3946 *
3947 * We avoid the shl instruction by realizing that we only want to add
3948 * the low 16-bits of the "high" result to the high 16-bits of the
3949 * "low" result and using proper regioning on the add:
3950 *
3951 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3952 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3953 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3954 *
3955 * Since it does not use the (single) accumulator register, we can
3956 * schedule multi-component multiplications much better.
3957 */
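      /* Put differently (illustrative arithmetic, not hardware-specific):
       * splitting one 32-bit operand as b = (b_hi << 16) + b_lo, the low
       * 32 bits of a * b are
       *
       *    lo32(a * b_lo) + (lo16(a * b_hi) << 16)
       *
       * which is exactly what the two MULs plus the regioned ADD emitted
       * below compute.
       */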
3958
3959 bool needs_mov = false;
3960 elk_fs_reg orig_dst = inst->dst;
3961
3962 /* Get a new VGRF for the "low" 32x16-bit multiplication result if
3963 * reusing the original destination is impossible due to hardware
3964 * restrictions, source/destination overlap, or it being the null
3965 * register.
3966 */
3967 elk_fs_reg low = inst->dst;
3968 if (orig_dst.is_null() || orig_dst.file == MRF ||
3969 regions_overlap(inst->dst, inst->size_written,
3970 inst->src[0], inst->size_read(0)) ||
3971 regions_overlap(inst->dst, inst->size_written,
3972 inst->src[1], inst->size_read(1)) ||
3973 inst->dst.stride >= 4) {
3974 needs_mov = true;
3975 low = elk_fs_reg(VGRF, alloc.allocate(regs_written(inst)),
3976 inst->dst.type);
3977 }
3978
3979 /* Get a new VGRF but keep the same stride as inst->dst */
3980 elk_fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
3981 high.stride = inst->dst.stride;
3982 high.offset = inst->dst.offset % REG_SIZE;
3983
3984 bool do_addition = true;
3985 if (devinfo->ver >= 7) {
3986 /* From Wa_1604601757:
3987 *
3988 * "When multiplying a DW and any lower precision integer, source modifier
3989 * is not supported."
3990 *
3991 * An unsupported negate modifier on src[1] would ordinarily be
3992 * lowered by the subsequent lower_regioning pass. In this case that
3993 * pass would spawn another dword multiply. Instead, lower the
3994 * modifier first.
3995 */
3996 const bool source_mods_unsupported = (devinfo->ver >= 12);
3997
3998 if (inst->src[1].abs || (inst->src[1].negate &&
3999 source_mods_unsupported))
4000 lower_src_modifiers(this, block, inst, 1);
4001
4002 if (inst->src[1].file == IMM) {
4003 unsigned a;
4004 unsigned b;
4005
4006 /* If the immediate value can be factored into two values, A and
4007 * B, that each fit in 16-bits, the multiplication result can
4008 * instead be calculated as (src1 * (A * B)) = ((src1 * A) * B).
4009 * This saves an operation (the addition) and a temporary register
4010 * (high).
4011 *
4012 * Skip the optimization if either the high word or the low word
4013 * is 0 or 1. In these conditions, at least one of the
4014 * multiplications generated by the straightforward method will be
4015 * eliminated anyway.
4016 */
4017 if (inst->src[1].ud > 0x0001ffff &&
4018 (inst->src[1].ud & 0xffff) > 1) {
4019 factor_uint32(inst->src[1].ud, &a, &b);
4020
4021 if (a != 0) {
4022 ibld.MUL(low, inst->src[0], elk_imm_uw(a));
4023 ibld.MUL(low, low, elk_imm_uw(b));
4024 do_addition = false;
4025 }
4026 }
4027
4028 if (do_addition) {
4029 ibld.MUL(low, inst->src[0],
4030 elk_imm_uw(inst->src[1].ud & 0xffff));
4031 ibld.MUL(high, inst->src[0],
4032 elk_imm_uw(inst->src[1].ud >> 16));
4033 }
4034 } else {
4035 ibld.MUL(low, inst->src[0],
4036 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
4037 ibld.MUL(high, inst->src[0],
4038 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 1));
4039 }
4040 } else {
4041 if (inst->src[0].abs)
4042 lower_src_modifiers(this, block, inst, 0);
4043
4044 ibld.MUL(low, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 0),
4045 inst->src[1]);
4046 ibld.MUL(high, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 1),
4047 inst->src[1]);
4048 }
4049
4050 if (do_addition) {
4051 ibld.ADD(subscript(low, ELK_REGISTER_TYPE_UW, 1),
4052 subscript(low, ELK_REGISTER_TYPE_UW, 1),
4053 subscript(high, ELK_REGISTER_TYPE_UW, 0));
4054 }
4055
4056 if (needs_mov || inst->conditional_mod)
4057 set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
4058 }
4059 }
4060
4061 void
4062 elk_fs_visitor::lower_mul_qword_inst(elk_fs_inst *inst, elk_bblock_t *block)
4063 {
4064 const fs_builder ibld(this, block, inst);
4065
4066 /* Considering two 64-bit integers ab and cd where each letter ab
4067 * corresponds to 32 bits, we get a 128-bit result WXYZ. We * cd
4068 * only need to provide the YZ part of the result. -------
4069 * BD
4070 * Only BD needs to be 64 bits. For AD and BC we only care + AD
4071 * about the lower 32 bits (since they are part of the upper + BC
4072 * 32 bits of our result). AC is not needed since it starts + AC
4073 * on the 65th bit of the result. -------
4074 * WXYZ
4075 */
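   /* In other words (illustrative identity): with src0 = (a << 32) | b and
    * src1 = (c << 32) | d, the low 64 bits of the product are
    *
    *    lo64(b * d + ((lo32(a * d) + lo32(b * c)) << 32))
    *
    * which is what the MUL/MACH/ADD sequence below assembles.
    */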
4076 unsigned int q_regs = regs_written(inst);
4077 unsigned int d_regs = (q_regs + 1) / 2;
4078
4079 elk_fs_reg bd(VGRF, alloc.allocate(q_regs), ELK_REGISTER_TYPE_UQ);
4080 elk_fs_reg ad(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4081 elk_fs_reg bc(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4082
4083 /* Here we need the full 64 bit result for 32b * 32b. */
4084 if (devinfo->has_integer_dword_mul) {
4085 ibld.MUL(bd, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4086 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
4087 } else {
4088 elk_fs_reg bd_high(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4089 elk_fs_reg bd_low(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
4090 const unsigned acc_width = reg_unit(devinfo) * 8;
4091 elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), ELK_REGISTER_TYPE_UD),
4092 inst->group % acc_width);
4093
4094 elk_fs_inst *mul = ibld.MUL(acc,
4095 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4096 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
4097 mul->writes_accumulator = true;
4098
4099 ibld.MACH(bd_high, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4100 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
4101 ibld.MOV(bd_low, acc);
4102
4103 ibld.UNDEF(bd);
4104 ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 0), bd_low);
4105 ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 1), bd_high);
4106 }
4107
4108 ibld.MUL(ad, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
4109 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
4110 ibld.MUL(bc, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
4111 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1));
4112
4113 ibld.ADD(ad, ad, bc);
4114 ibld.ADD(subscript(bd, ELK_REGISTER_TYPE_UD, 1),
4115 subscript(bd, ELK_REGISTER_TYPE_UD, 1), ad);
4116
4117 if (devinfo->has_64bit_int) {
4118 ibld.MOV(inst->dst, bd);
4119 } else {
4120 if (!inst->is_partial_write())
4121 ibld.emit_undef_for_dst(inst);
4122 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
4123 subscript(bd, ELK_REGISTER_TYPE_UD, 0));
4124 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
4125 subscript(bd, ELK_REGISTER_TYPE_UD, 1));
4126 }
4127 }
4128
4129 void
4130 elk_fs_visitor::lower_mulh_inst(elk_fs_inst *inst, elk_bblock_t *block)
4131 {
4132 const fs_builder ibld(this, block, inst);
4133
4134 /* According to the BDW+ BSpec page for the "Multiply Accumulate
4135 * High" instruction:
4136 *
4137 * "An added preliminary mov is required for source modification on
4138 * src1:
4139 * mov (8) r3.0<1>:d -r3<8;8,1>:d
4140 * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
4141 * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
4142 */
4143 if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
4144 lower_src_modifiers(this, block, inst, 1);
4145
4146 /* Should have been lowered to 8-wide. */
4147 assert(inst->exec_size <= get_lowered_simd_width(this, inst));
4148 const unsigned acc_width = reg_unit(devinfo) * 8;
4149 const elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), inst->dst.type),
4150 inst->group % acc_width);
4151 elk_fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
4152 elk_fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
4153
4154 if (devinfo->ver >= 8) {
4155 /* Until Gfx8, integer multiplies read 32 bits from one source and
4156 * 16 bits from the other, relying on the MACH instruction to
4157 * generate the high bits of the result.
4158 *
4159 * On Gfx8, the multiply instruction does a full 32x32-bit
4160 * multiply, but in order to do a 64-bit multiply we can simulate
4161 * the previous behavior and then use a MACH instruction.
4162 */
4163 assert(mul->src[1].type == ELK_REGISTER_TYPE_D ||
4164 mul->src[1].type == ELK_REGISTER_TYPE_UD);
4165 mul->src[1].type = ELK_REGISTER_TYPE_UW;
4166 mul->src[1].stride *= 2;
4167
4168 if (mul->src[1].file == IMM) {
4169 mul->src[1] = elk_imm_uw(mul->src[1].ud);
4170 }
4171 } else if (devinfo->verx10 == 70 &&
4172 inst->group > 0) {
4173 /* Among other things the quarter control bits influence which
4174 * accumulator register is used by the hardware for instructions
4175 * that access the accumulator implicitly (e.g. MACH). A
4176 * second-half instruction would normally map to acc1, which
4177 * doesn't exist on Gfx7 and up (the hardware does emulate it for
4178 * floating-point instructions *only* by taking advantage of the
4179 * extra precision of acc0 not normally used for floating point
4180 * arithmetic).
4181 *
4182 * HSW and up are careful enough not to try to access an
4183 * accumulator register that doesn't exist, but on earlier Gfx7
4184 * hardware we need to make sure that the quarter control bits are
4185 * zero to avoid non-deterministic behaviour and emit an extra MOV
4186 * to get the result masked correctly according to the current
4187 * channel enables.
4188 */
4189 mach->group = 0;
4190 mach->force_writemask_all = true;
4191 mach->dst = ibld.vgrf(inst->dst.type);
4192 ibld.MOV(inst->dst, mach->dst);
4193 }
4194 }
4195
4196 bool
4197 elk_fs_visitor::lower_integer_multiplication()
4198 {
4199 bool progress = false;
4200
4201 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4202 if (inst->opcode == ELK_OPCODE_MUL) {
4203 /* If the instruction is already in a form that does not need lowering,
4204 * skip it.
4205 */
4206 if (devinfo->ver >= 7) {
4207 if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
4208 continue;
4209 } else {
4210 if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
4211 continue;
4212 }
4213
4214 if ((inst->dst.type == ELK_REGISTER_TYPE_Q ||
4215 inst->dst.type == ELK_REGISTER_TYPE_UQ) &&
4216 (inst->src[0].type == ELK_REGISTER_TYPE_Q ||
4217 inst->src[0].type == ELK_REGISTER_TYPE_UQ) &&
4218 (inst->src[1].type == ELK_REGISTER_TYPE_Q ||
4219 inst->src[1].type == ELK_REGISTER_TYPE_UQ)) {
4220 lower_mul_qword_inst(inst, block);
4221 inst->remove(block);
4222 progress = true;
4223 } else if (!inst->dst.is_accumulator() &&
4224 (inst->dst.type == ELK_REGISTER_TYPE_D ||
4225 inst->dst.type == ELK_REGISTER_TYPE_UD) &&
4226 (!devinfo->has_integer_dword_mul ||
4227 devinfo->verx10 >= 125)) {
4228 lower_mul_dword_inst(inst, block);
4229 inst->remove(block);
4230 progress = true;
4231 }
4232 } else if (inst->opcode == ELK_SHADER_OPCODE_MULH) {
4233 lower_mulh_inst(inst, block);
4234 inst->remove(block);
4235 progress = true;
4236 }
4237
4238 }
4239
4240 if (progress)
4241 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4242
4243 return progress;
4244 }
4245
4246 bool
4247 elk_fs_visitor::lower_minmax()
4248 {
4249 assert(devinfo->ver < 6);
4250
4251 bool progress = false;
4252
4253 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4254 const fs_builder ibld(this, block, inst);
4255
4256 if (inst->opcode == ELK_OPCODE_SEL &&
4257 inst->predicate == ELK_PREDICATE_NONE) {
4258 /* If src1 is an immediate value that is not NaN, then it can't be
4259 * NaN. In that case, emit CMP because it is much better for cmod
4260 * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
4261 * support HF or DF, so it is not necessary to check for those.
4262 */
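      /* Illustrative sketch of the lowering (register names made up): a
       * pre-Gfx6 MIN expressed as "sel.l dst, a, b" becomes
       *
       *    cmp.l.f0  null, a, b      (or cmpn.l when b might be NaN)
       *    (+f0) sel dst, a, b
       */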
4263 if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
4264 (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
4265 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4266 inst->conditional_mod);
4267 } else {
4268 ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
4269 inst->conditional_mod);
4270 }
4271 inst->predicate = ELK_PREDICATE_NORMAL;
4272 inst->conditional_mod = ELK_CONDITIONAL_NONE;
4273
4274 progress = true;
4275 }
4276 }
4277
4278 if (progress)
4279 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4280
4281 return progress;
4282 }
4283
4284 bool
4285 elk_fs_visitor::lower_sub_sat()
4286 {
4287 bool progress = false;
4288
4289 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4290 const fs_builder ibld(this, block, inst);
4291
4292 if (inst->opcode == ELK_SHADER_OPCODE_USUB_SAT ||
4293 inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4294 /* The fundamental problem is the hardware performs source negation
4295 * at the bit width of the source. If the source is 0x80000000D, the
4296 * negation is 0x80000000D. As a result, subtractSaturate(0,
4297 * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
4298 * are at least three ways to resolve this:
4299 *
4300 * 1. Use the accumulator for the negated source. The accumulator is
4301 * 33 bits, so our source 0x80000000 is sign-extended to
4302 * 0x180000000, the negation of which is 0x080000000. This
4303 * doesn't help for 64-bit integers (which are already bigger than
4304 * 33 bits). There are also only 8 accumulators, so SIMD16 or
4305 * SIMD32 instructions would have to be split into multiple SIMD8
4306 * instructions.
4307 *
4308 * 2. Use slightly different math. For any n-bit value x, we know (x
4309 * >> 1) != -(x >> 1). We can use this fact to only do
4310 * subtractions involving (x >> 1). subtractSaturate(a, b) ==
4311 * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
4312 *
4313 * 3. For unsigned sources, it is sufficient to replace the
4314 * subtractSaturate with (a > b) ? a - b : 0.
4315 *
4316 * It may also be possible to use the SUBB instruction. This
4317 * implicitly writes the accumulator, so it could only be used in the
4318 * same situations as #1 above. It is further limited by only
4319 * allowing UD sources.
4320 */
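      /* Tiny worked example for approach #3 (illustrative): for
       * usubSat(1, 3) the CMP in that path leaves the flag cleared since
       * 1 > 3 is false, the unsaturated ADD wraps to 0xfffffffe, and the
       * predicated SEL then picks the immediate 0 instead.
       */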
4321 if (inst->exec_size == 8 && inst->src[0].type != ELK_REGISTER_TYPE_Q &&
4322 inst->src[0].type != ELK_REGISTER_TYPE_UQ) {
4323 elk_fs_reg acc(ARF, ELK_ARF_ACCUMULATOR, inst->src[1].type);
4324
4325 ibld.MOV(acc, inst->src[1]);
4326 elk_fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
4327 add->saturate = true;
4328 add->src[0].negate = true;
4329 } else if (inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4330 /* tmp = src1 >> 1;
4331 * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
4332 */
4333 elk_fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
4334 elk_fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
4335 elk_fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
4336 elk_fs_inst *add;
4337
4338 ibld.SHR(tmp1, inst->src[1], elk_imm_d(1));
4339
4340 add = ibld.ADD(tmp2, inst->src[1], tmp1);
4341 add->src[1].negate = true;
4342
4343 add = ibld.ADD(tmp3, inst->src[0], tmp1);
4344 add->src[1].negate = true;
4345 add->saturate = true;
4346
4347 add = ibld.ADD(inst->dst, tmp3, tmp2);
4348 add->src[1].negate = true;
4349 add->saturate = true;
4350 } else {
4351 /* a > b ? a - b : 0 */
4352 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4353 ELK_CONDITIONAL_G);
4354
4355 elk_fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4356 add->src[1].negate = !add->src[1].negate;
4357
4358 ibld.SEL(inst->dst, inst->dst, elk_imm_ud(0))
4359 ->predicate = ELK_PREDICATE_NORMAL;
4360 }
4361
4362 inst->remove(block);
4363 progress = true;
4364 }
4365 }
4366
4367 if (progress)
4368 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4369
4370 return progress;
4371 }
4372
4373 /**
4374 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
4375 * by discard. Due to the layout of the sample mask in the fragment shader
4376 * thread payload, \p bld is required to have a dispatch_width() not greater
4377 * than 16 for fragment shaders.
4378 */
4379 elk_fs_reg
4380 elk_sample_mask_reg(const fs_builder &bld)
4381 {
4382 const elk_fs_visitor &s = *bld.shader;
4383
4384 if (s.stage != MESA_SHADER_FRAGMENT) {
4385 return elk_imm_ud(0xffffffff);
4386 } else if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4387 assert(bld.dispatch_width() <= 16);
4388 return elk_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
4389 } else {
4390 assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4391 assert(s.devinfo->ver < 20);
4392 return retype(elk_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4393 ELK_REGISTER_TYPE_UW);
4394 }
4395 }
4396
4397 uint32_t
4398 elk_fb_write_msg_control(const elk_fs_inst *inst,
4399 const struct elk_wm_prog_data *prog_data)
4400 {
4401 uint32_t mctl;
4402
4403 if (inst->opcode == ELK_FS_OPCODE_REP_FB_WRITE) {
4404 assert(inst->group == 0 && inst->exec_size == 16);
4405 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4406 } else if (prog_data->dual_src_blend) {
4407 assert(inst->exec_size == 8);
4408
4409 if (inst->group % 16 == 0)
4410 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4411 else if (inst->group % 16 == 8)
4412 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4413 else
4414 unreachable("Invalid dual-source FB write instruction group");
4415 } else {
4416 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4417
4418 if (inst->exec_size == 16)
4419 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4420 else if (inst->exec_size == 8)
4421 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4422 else
4423 unreachable("Invalid FB write execution size");
4424 }
4425
4426 return mctl;
4427 }
4428
4429 /**
4430 * Predicate the specified instruction on the sample mask.
4431 */
4432 void
4433 elk_emit_predicate_on_sample_mask(const fs_builder &bld, elk_fs_inst *inst)
4434 {
4435 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
4436 bld.group() == inst->group &&
4437 bld.dispatch_width() == inst->exec_size);
4438
4439 const elk_fs_visitor &s = *bld.shader;
4440 const elk_fs_reg sample_mask = elk_sample_mask_reg(bld);
4441 const unsigned subreg = sample_mask_flag_subreg(s);
4442
4443 if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4444 assert(sample_mask.file == ARF &&
4445 sample_mask.nr == elk_flag_subreg(subreg).nr &&
4446 sample_mask.subnr == elk_flag_subreg(
4447 subreg + inst->group / 16).subnr);
4448 } else {
4449 bld.group(1, 0).exec_all()
4450 .MOV(elk_flag_subreg(subreg + inst->group / 16), sample_mask);
4451 }
4452
4453 if (inst->predicate) {
4454 assert(inst->predicate == ELK_PREDICATE_NORMAL);
4455 assert(!inst->predicate_inverse);
4456 assert(inst->flag_subreg == 0);
4457 assert(s.devinfo->ver < 20);
4458 /* Combine the sample mask with the existing predicate by using a
4459 * vertical predication mode.
4460 */
4461 inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
4462 } else {
4463 inst->flag_subreg = subreg;
4464 inst->predicate = ELK_PREDICATE_NORMAL;
4465 inst->predicate_inverse = false;
4466 }
4467 }
4468
4469 static bool
4470 is_mixed_float_with_fp32_dst(const elk_fs_inst *inst)
4471 {
4472 /* This opcode sometimes uses :W type on the source even if the operand is
4473 * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
4474 */
4475 if (inst->opcode == ELK_OPCODE_F16TO32)
4476 return true;
4477
4478 if (inst->dst.type != ELK_REGISTER_TYPE_F)
4479 return false;
4480
4481 for (int i = 0; i < inst->sources; i++) {
4482 if (inst->src[i].type == ELK_REGISTER_TYPE_HF)
4483 return true;
4484 }
4485
4486 return false;
4487 }
4488
4489 static bool
4490 is_mixed_float_with_packed_fp16_dst(const elk_fs_inst *inst)
4491 {
4492 /* This opcode sometimes uses :W type on the destination even if the
4493 * destination is a :HF, because in gfx7 there is no support for :HF, and
4494 * thus it uses :W.
4495 */
4496 if (inst->opcode == ELK_OPCODE_F32TO16 &&
4497 inst->dst.stride == 1)
4498 return true;
4499
4500 if (inst->dst.type != ELK_REGISTER_TYPE_HF ||
4501 inst->dst.stride != 1)
4502 return false;
4503
4504 for (int i = 0; i < inst->sources; i++) {
4505 if (inst->src[i].type == ELK_REGISTER_TYPE_F)
4506 return true;
4507 }
4508
4509 return false;
4510 }
4511
4512 /**
4513 * Get the closest allowed SIMD width for instruction \p inst accounting for
4514 * some common regioning and execution control restrictions that apply to FPU
4515 * instructions. These restrictions don't necessarily have any relevance to
4516 * instructions not executed by the FPU pipeline like extended math, control
4517 * flow or send message instructions.
4518 *
4519 * For virtual opcodes it's really up to the instruction -- In some cases
4520 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
4521 * instructions) it may simplify virtual instruction lowering if we can
4522 * enforce FPU-like regioning restrictions already on the virtual instruction,
4523 * in other cases (e.g. virtual send-like instructions) this may be
4524 * excessively restrictive.
4525 */
4526 static unsigned
4527 get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
4528 const elk_fs_inst *inst)
4529 {
4530 const struct elk_compiler *compiler = shader->compiler;
4531 const struct intel_device_info *devinfo = compiler->devinfo;
4532
4533 /* Maximum execution size representable in the instruction controls. */
4534 unsigned max_width = MIN2(32, inst->exec_size);
4535
4536 /* Number of channels per polygon handled by a multipolygon PS shader. */
4537 const unsigned poly_width = shader->dispatch_width /
4538 MAX2(1, shader->max_polygons);
4539
4540 /* Number of registers that will be read by an ATTR source if
4541 * present for multipolygon PS shaders, since the PS vertex setup
4542 * data for each polygon is stored in different contiguous GRFs.
4543 */
4544 const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
4545 shader->max_polygons < 2 ? 0 :
4546 DIV_ROUND_UP(inst->exec_size,
4547 poly_width) * reg_unit(devinfo));
4548
4549 /* According to the PRMs:
4550 * "A. In Direct Addressing mode, a source cannot span more than 2
4551 * adjacent GRF registers.
4552 * B. A destination cannot span more than 2 adjacent GRF registers."
4553 *
4554 * Look for the source or destination with the largest register region
4555 * which is the one that is going to limit the overall execution size of
4556 * the instruction due to this rule.
4557 */
4558 unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4559
4560 for (unsigned i = 0; i < inst->sources; i++)
4561 reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE),
4562 (inst->src[i].file == ATTR ? attr_reg_count : 0));
4563
4564 /* Calculate the maximum execution size of the instruction based on the
4565 * factor by which it goes over the hardware limit of 2 GRFs.
4566 */
4567 const unsigned max_reg_count = 2 * reg_unit(devinfo);
4568 if (reg_count > max_reg_count)
4569 max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
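   /* For example (illustrative): a SIMD16 operation on 64-bit data writes
    * 16 * 8 = 128 bytes, i.e. reg_count = 4 on a platform with 32-byte GRFs
    * and max_reg_count = 2, so the adjustment above halves the execution
    * size to SIMD8.
    */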
4570
4571 /* According to the IVB PRMs:
4572 * "When destination spans two registers, the source MUST span two
4573 * registers. The exception to the above rule:
4574 *
4575 * - When source is scalar, the source registers are not incremented.
4576 * - When source is packed integer Word and destination is packed
4577 * integer DWord, the source register is not incremented but the
4578 * source sub register is incremented."
4579 *
4580 * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
4581 * restrictions. The code below intentionally doesn't check whether the
4582 * destination type is integer because empirically the hardware doesn't
4583 * seem to care what the actual type is as long as it's dword-aligned.
4584 *
4585 * HSW PRMs also add a note to the second exception:
4586 * "When lower 8 channels are disabled, the sub register of source1
4587 * operand is not incremented. If the lower 8 channels are expected
4588 * to be disabled, say by predication, the instruction must be split
4589 * into pair of simd8 operations."
4590 *
4591 * We can't reliably know if the channels won't be disabled due to,
4592 * for example, IMASK. So, play it safe and disallow packed-word exception
4593 * for src1.
4594 */
4595 if (devinfo->ver < 8) {
4596 for (unsigned i = 0; i < inst->sources; i++) {
4597 /* IVB implements DF scalars as <0;2,1> regions. */
4598 const bool is_scalar_exception = is_uniform(inst->src[i]) &&
4599 (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
4600 const bool is_packed_word_exception = i != 1 &&
4601 type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
4602 type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
4603
4604 /* We check size_read(i) against size_written instead of REG_SIZE
4605 * because we want to properly handle SIMD32. In SIMD32, you can end
4606 * up with writes to 4 registers and a source that reads 2 registers
4607 * and we may still need to lower all the way to SIMD8 in that case.
4608 */
4609 if (inst->size_written > REG_SIZE &&
4610 inst->size_read(i) != 0 &&
4611 inst->size_read(i) < inst->size_written &&
4612 !is_scalar_exception && !is_packed_word_exception) {
4613 const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4614 max_width = MIN2(max_width, inst->exec_size / reg_count);
4615 }
4616 }
4617 }
4618
4619 if (devinfo->ver < 6) {
4620 /* From the G45 PRM, Volume 4 Page 361:
4621 *
4622 * "Operand Alignment Rule: With the exceptions listed below, a
4623 * source/destination operand in general should be aligned to even
4624 * 256-bit physical register with a region size equal to two 256-bit
4625 * physical registers."
4626 *
4627 * Normally we enforce this by allocating virtual registers to the
4628 * even-aligned class. But we need to handle payload registers.
4629 */
4630 for (unsigned i = 0; i < inst->sources; i++) {
4631 if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
4632 inst->size_read(i) > REG_SIZE) {
4633 max_width = MIN2(max_width, 8);
4634 }
4635 }
4636 }
4637
4638 /* From the IVB PRMs:
4639 * "When an instruction is SIMD32, the low 16 bits of the execution mask
4640 * are applied for both halves of the SIMD32 instruction. If different
4641 * execution mask channels are required, split the instruction into two
4642 * SIMD16 instructions."
4643 *
4644 * There is similar text in the HSW PRMs. Gfx4-6 don't even implement
4645 * 32-wide control flow support in hardware and will behave similarly.
4646 */
4647 if (devinfo->ver < 8 && !inst->force_writemask_all)
4648 max_width = MIN2(max_width, 16);
4649
4650 /* From the IVB PRMs (applies to HSW too):
4651 * "Instructions with condition modifiers must not use SIMD32."
4652 *
4653 * From the BDW PRMs (applies to later hardware too):
4654 * "Ternary instruction with condition modifiers must not use SIMD32."
4655 */
4656 if (inst->conditional_mod && (devinfo->ver < 8 ||
4657 (inst->elk_is_3src(compiler) && devinfo->ver < 12)))
4658 max_width = MIN2(max_width, 16);
4659
4660 /* From the IVB PRMs (applies to other devices that don't have the
4661 * intel_device_info::supports_simd16_3src flag set):
4662 * "In Align16 access mode, SIMD16 is not allowed for DW operations and
4663 * SIMD8 is not allowed for DF operations."
4664 */
4665 if (inst->elk_is_3src(compiler) && !devinfo->supports_simd16_3src)
4666 max_width = MIN2(max_width, inst->exec_size / reg_count);
4667
4668 /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
4669 * the 8-bit quarter of the execution mask signals specified in the
4670 * instruction control fields) for the second compressed half of any
4671 * single-precision instruction (for double-precision instructions
4672 * it's hardwired to use NibCtrl+1, at least on HSW), which means that
4673 * the EU will apply the wrong execution controls for the second
4674 * sequential GRF write if the number of channels per GRF is not exactly
4675 * eight in single-precision mode (or four in double-float mode).
4676 *
4677 * In this situation we calculate the maximum size of the split
4678 * instructions so they only ever write to a single register.
4679 */
4680 if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
4681 !inst->force_writemask_all) {
4682 const unsigned channels_per_grf = inst->exec_size /
4683 DIV_ROUND_UP(inst->size_written, REG_SIZE);
4684 const unsigned exec_type_size = get_exec_type_size(inst);
4685 assert(exec_type_size);
4686
4687 /* The hardware shifts exactly 8 channels per compressed half of the
4688 * instruction in single-precision mode and exactly 4 in double-precision.
4689 */
4690 if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
4691 max_width = MIN2(max_width, channels_per_grf);
4692
4693 /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
4694 * because HW applies the same channel enable signals to both halves of
4695 * the compressed instruction which will be just wrong under
4696 * non-uniform control flow.
4697 */
4698 if (devinfo->verx10 == 70 &&
4699 (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
4700 max_width = MIN2(max_width, 4);
4701 }
4702
4703 /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4704 * Float Operations:
4705 *
4706 * "No SIMD16 in mixed mode when destination is f32. Instruction
4707 * execution size must be no more than 8."
4708 *
4709 * FIXME: the simulator doesn't seem to complain if we don't do this and
4710 * empirical testing with existing CTS tests show that they pass just fine
4711 * without implementing this, however, since our interpretation of the PRM
4712 * is that conversion MOVs between HF and F are still mixed-float
4713 * instructions (and therefore subject to this restriction) we decided to
4714 * split them to be safe. Might be useful to do additional investigation to
4715 * lift the restriction if we can ensure that it is safe though, since these
4716 * conversions are common when half-float types are involved since many
4717 * instructions do not support HF types and conversions from/to F are
4718 * required.
4719 */
4720 if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20)
4721 max_width = MIN2(max_width, 8);
4722
4723 /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4724 * Float Operations:
4725 *
4726 * "No SIMD16 in mixed mode when destination is packed f16 for both
4727 * Align1 and Align16."
4728 */
4729 if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20)
4730 max_width = MIN2(max_width, 8);
4731
4732 /* Only power-of-two execution sizes are representable in the instruction
4733 * control fields.
4734 */
4735 return 1 << util_logbase2(max_width);
4736 }
4737
4738 /**
4739 * Get the maximum allowed SIMD width for instruction \p inst accounting for
4740 * various payload size restrictions that apply to sampler message
4741 * instructions.
4742 *
4743 * This is only intended to provide a maximum theoretical bound for the
4744 * execution size of the message based on the number of argument components
4745 * alone, which in most cases will determine whether the SIMD8 or SIMD16
4746 * variant of the message can be used, though some messages may have
4747 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
4748 * the message length to determine the exact SIMD width and argument count,
4749 * which makes a number of sampler message combinations impossible to
4750 * represent).
4751 *
4752 * Note: On platforms with monolithic SIMD16, the possible SIMD widths
4753 * double, changing from (SIMD8, SIMD16) to (SIMD16, SIMD32).
4754 */
4755 static unsigned
4756 get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
4757 const elk_fs_inst *inst)
4758 {
4759 /* If we have a min_lod parameter on anything other than a simple sample
4760 * message, it will push it over 5 arguments and we have to fall back to
4761 * SIMD8.
4762 */
4763 if (inst->opcode != ELK_SHADER_OPCODE_TEX &&
4764 inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
4765 return devinfo->ver < 20 ? 8 : 16;
4766
4767 /* Calculate the number of coordinate components that have to be present
4768 * assuming that additional arguments follow the texel coordinates in the
4769 * message payload. On IVB+ there is no need for padding, on ILK-SNB we
4770 * need to pad to four or three components depending on the message,
4771 * pre-ILK we need to pad to at most three components.
4772 */
4773 const unsigned req_coord_components =
4774 (devinfo->ver >= 7 ||
4775 !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
4776 (devinfo->ver >= 5 && inst->opcode != ELK_SHADER_OPCODE_TXF_LOGICAL &&
4777 inst->opcode != ELK_SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
4778 3;
4779
4780 /* On Gfx9+ the LOD argument is for free if we're able to use the LZ
4781 * variant of the TXL or TXF message.
4782 */
4783 const bool implicit_lod = devinfo->ver >= 9 &&
4784 (inst->opcode == ELK_SHADER_OPCODE_TXL ||
4785 inst->opcode == ELK_SHADER_OPCODE_TXF) &&
4786 inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
4787
4788 /* Calculate the total number of argument components that need to be passed
4789 * to the sampler unit.
4790 */
4791 const unsigned num_payload_components =
4792 MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
4793 req_coord_components) +
4794 inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
4795 (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
4796 inst->components_read(TEX_LOGICAL_SRC_LOD2) +
4797 inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
4798 (inst->opcode == ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
4799 inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
4800 inst->components_read(TEX_LOGICAL_SRC_MCS);
4801
4802 const unsigned simd_limit = reg_unit(devinfo) *
4803 (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
4804
4805 /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
4806 * maximum message size supported by the sampler, regardless of whether a
4807 * header is provided or not.
4808 */
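   /* For example (illustrative): a SIMD16 textureGrad() on a 3D image needs
    * 3 coordinate components plus 3 + 3 gradient components, i.e. 9 payload
    * components, which is more than the five-argument limit noted above, so
    * simd_limit computed above is 8 (or 16 when reg_unit(devinfo) == 2) and
    * the return below lowers the message accordingly.
    */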
4809 return MIN2(inst->exec_size, simd_limit);
4810 }
4811
4812 /**
4813 * Get the closest native SIMD width supported by the hardware for instruction
4814 * \p inst. The instruction will be left untouched by
4815 * elk_fs_visitor::lower_simd_width() if the returned value is equal to the
4816 * original execution size.
4817 */
4818 static unsigned
4819 get_lowered_simd_width(const elk_fs_visitor *shader, const elk_fs_inst *inst)
4820 {
4821 const struct elk_compiler *compiler = shader->compiler;
4822 const struct intel_device_info *devinfo = compiler->devinfo;
4823
4824 switch (inst->opcode) {
4825 case ELK_OPCODE_DP4A:
4826 case ELK_OPCODE_MOV:
4827 case ELK_OPCODE_SEL:
4828 case ELK_OPCODE_NOT:
4829 case ELK_OPCODE_AND:
4830 case ELK_OPCODE_OR:
4831 case ELK_OPCODE_XOR:
4832 case ELK_OPCODE_SHR:
4833 case ELK_OPCODE_SHL:
4834 case ELK_OPCODE_ASR:
4835 case ELK_OPCODE_ROR:
4836 case ELK_OPCODE_ROL:
4837 case ELK_OPCODE_CMPN:
4838 case ELK_OPCODE_CSEL:
4839 case ELK_OPCODE_F32TO16:
4840 case ELK_OPCODE_F16TO32:
4841 case ELK_OPCODE_BFREV:
4842 case ELK_OPCODE_BFE:
4843 case ELK_OPCODE_ADD:
4844 case ELK_OPCODE_MUL:
4845 case ELK_OPCODE_AVG:
4846 case ELK_OPCODE_FRC:
4847 case ELK_OPCODE_RNDU:
4848 case ELK_OPCODE_RNDD:
4849 case ELK_OPCODE_RNDE:
4850 case ELK_OPCODE_RNDZ:
4851 case ELK_OPCODE_LZD:
4852 case ELK_OPCODE_FBH:
4853 case ELK_OPCODE_FBL:
4854 case ELK_OPCODE_CBIT:
4855 case ELK_OPCODE_SAD2:
4856 case ELK_OPCODE_MAD:
4857 case ELK_OPCODE_LRP:
4858 case ELK_OPCODE_ADD3:
4859 case ELK_FS_OPCODE_PACK:
4860 case ELK_SHADER_OPCODE_SEL_EXEC:
4861 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
4862 case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
4863 return get_fpu_lowered_simd_width(shader, inst);
4864
4865 case ELK_OPCODE_CMP: {
4866 /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
4867 * when the destination is a GRF the dependency-clear bit on the flag
4868 * register is cleared early.
4869 *
4870 * Suggested workarounds are to disable coissuing CMP instructions
4871 * or to split CMP(16) instructions into two CMP(8) instructions.
4872 *
4873 * We choose to split into CMP(8) instructions since disabling
4874 * coissuing would affect CMP instructions not otherwise affected by
4875 * the errata.
4876 */
4877 const unsigned max_width = (devinfo->verx10 == 70 &&
4878 !inst->dst.is_null() ? 8 : ~0);
4879 return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst));
4880 }
4881 case ELK_OPCODE_BFI1:
4882 case ELK_OPCODE_BFI2:
4883 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
4884 * should
4885 * "Force BFI instructions to be executed always in SIMD8."
4886 */
4887 return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
4888 get_fpu_lowered_simd_width(shader, inst));
4889
4890 case ELK_OPCODE_IF:
4891 assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
4892 return inst->exec_size;
4893
4894 case ELK_SHADER_OPCODE_RCP:
4895 case ELK_SHADER_OPCODE_RSQ:
4896 case ELK_SHADER_OPCODE_SQRT:
4897 case ELK_SHADER_OPCODE_EXP2:
4898 case ELK_SHADER_OPCODE_LOG2:
4899 case ELK_SHADER_OPCODE_SIN:
4900 case ELK_SHADER_OPCODE_COS: {
4901 /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
4902 * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
4903 */
4904 if (devinfo->ver == 6 || devinfo->verx10 == 40)
4905 return MIN2(8, inst->exec_size);
4906 if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4907 return MIN2(8, inst->exec_size);
4908 return MIN2(16, inst->exec_size);
4909 }
4910
4911 case ELK_SHADER_OPCODE_POW: {
4912 /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
4913 * to SIMD8 with half-float
4914 */
4915 if (devinfo->ver < 7)
4916 return MIN2(8, inst->exec_size);
4917 if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4918 return MIN2(8, inst->exec_size);
4919 return MIN2(16, inst->exec_size);
4920 }
4921
4922 case ELK_SHADER_OPCODE_USUB_SAT:
4923 case ELK_SHADER_OPCODE_ISUB_SAT:
4924 return get_fpu_lowered_simd_width(shader, inst);
4925
4926 case ELK_SHADER_OPCODE_INT_QUOTIENT:
4927 case ELK_SHADER_OPCODE_INT_REMAINDER:
4928 /* Integer division is limited to SIMD8 on all generations. */
4929 return MIN2(8, inst->exec_size);
4930
4931 case ELK_FS_OPCODE_LINTERP:
4932 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
4933 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
4934 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
4935 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
4936 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
4937 return MIN2(16, inst->exec_size);
4938
4939 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4940 /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
4941 * message used to implement varying pull constant loads, so expand it
4942 * to SIMD16. An alternative with longer message payload length but
4943 * shorter return payload would be to use the SIMD8 sampler message that
4944 * takes (header, u, v, r) as parameters instead of (header, u).
4945 */
4946 return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
4947
4948 case ELK_FS_OPCODE_DDX_COARSE:
4949 case ELK_FS_OPCODE_DDX_FINE:
4950 case ELK_FS_OPCODE_DDY_COARSE:
4951 case ELK_FS_OPCODE_DDY_FINE:
4952 /* The implementation of this virtual opcode may require emitting
4953 * compressed Align16 instructions, which are severely limited on some
4954 * generations.
4955 *
4956 * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
4957 * Region Restrictions):
4958 *
4959 * "In Align16 access mode, SIMD16 is not allowed for DW operations
4960 * and SIMD8 is not allowed for DF operations."
4961 *
4962 * In this context, "DW operations" means "operations acting on 32-bit
4963 * values", so it includes operations on floats.
4964 *
4965 * Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3
4966 * (Instruction Compression -> Rules and Restrictions):
4967 *
4968 * "A compressed instruction must be in Align1 access mode. Align16
4969 * mode instructions cannot be compressed."
4970 *
4971 * Similar text exists in the g45 PRM.
4972 *
4973 * Empirically, compressed align16 instructions using odd register
4974 * numbers don't appear to work on Sandybridge either.
4975 */
4976 return (devinfo->ver == 4 || devinfo->ver == 6 ||
4977 (devinfo->verx10 == 70) ?
4978 MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
4979
4980 case ELK_SHADER_OPCODE_MULH:
4981 /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4982 * is 8-wide on Gfx7+.
4983 */
4984 return (devinfo->ver >= 20 ? 16 :
4985 devinfo->ver >= 7 ? 8 :
4986 get_fpu_lowered_simd_width(shader, inst));
4987
4988 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
4989 /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
4990 * here.
4991 */
4992 assert(devinfo->ver != 6 ||
4993 inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4994 inst->exec_size == 8);
4995 /* Dual-source FB writes are unsupported in SIMD16 mode. */
4996 return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4997 8 : MIN2(16, inst->exec_size));
4998
4999 case ELK_FS_OPCODE_FB_READ_LOGICAL:
5000 return MIN2(16, inst->exec_size);
5001
5002 case ELK_SHADER_OPCODE_TEX_LOGICAL:
5003 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
5004 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
5005 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
5006 case ELK_SHADER_OPCODE_LOD_LOGICAL:
5007 case ELK_SHADER_OPCODE_TG4_LOGICAL:
5008 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
5009 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
5010 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
5011 return get_sampler_lowered_simd_width(devinfo, inst);
5012
5013 /* On Gfx12 the parameters are fixed to 16-bit values and therefore they
5014 * always fit regardless of the execution size.
5015 */
5016 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
5017 return MIN2(16, inst->exec_size);
5018
5019 case ELK_SHADER_OPCODE_TXD_LOGICAL:
5020 /* TXD is unsupported in SIMD16 mode prior to Xe2. SIMD32 is still
5021 * unsupported on Xe2.
5022 */
5023 return devinfo->ver < 20 ? 8 : 16;
5024
5025 case ELK_SHADER_OPCODE_TXL_LOGICAL:
5026 case ELK_FS_OPCODE_TXB_LOGICAL:
5027 /* Only one execution size is representable pre-ILK depending on whether
5028 * the shadow reference argument is present.
5029 */
5030 if (devinfo->ver == 4)
5031 return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
5032 else
5033 return get_sampler_lowered_simd_width(devinfo, inst);
5034
5035 case ELK_SHADER_OPCODE_TXF_LOGICAL:
5036 case ELK_SHADER_OPCODE_TXS_LOGICAL:
5037 /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
5038 * messages. Use SIMD16 instead.
5039 */
5040 if (devinfo->ver == 4)
5041 return 16;
5042 else
5043 return get_sampler_lowered_simd_width(devinfo, inst);
5044
5045 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5046 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5047 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5048 return 8;
5049
5050 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5051 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5052 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5053 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5054 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5055 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5056 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5057 return MIN2(16, inst->exec_size);
5058
5059 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
5060 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
5061 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
5062 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
5063 return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
5064
5065 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
5066 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
5067 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
5068 assert(inst->exec_size <= 16);
5069 return inst->exec_size;
5070
5071 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
5072 return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
5073
5074 case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
5075 case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
5076 return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size);
5077
5078 case ELK_SHADER_OPCODE_QUAD_SWIZZLE: {
5079 const unsigned swiz = inst->src[1].ud;
5080 return (is_uniform(inst->src[0]) ?
5081 get_fpu_lowered_simd_width(shader, inst) :
5082 devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
5083 swiz == ELK_SWIZZLE_XYXY || swiz == ELK_SWIZZLE_ZWZW ? 4 :
5084 get_fpu_lowered_simd_width(shader, inst));
5085 }
5086 case ELK_SHADER_OPCODE_MOV_INDIRECT: {
5087 /* From IVB and HSW PRMs:
5088 *
5089 * "2. When the destination requires two registers and the sources are
5090 * indirect, the sources must use 1x1 regioning mode.
5091 *
5092 * In case of DF instructions in HSW/IVB, the exec_size is limited by
5093 * the EU decompression logic not handling VxH indirect addressing
5094 * correctly.
5095 */
5096 const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
5097 /* Prior to Broadwell, we only have 8 address subregisters. */
5098 return MIN3(devinfo->ver >= 8 ? 16 : 8,
5099 max_size / (inst->dst.stride * type_sz(inst->dst.type)),
5100 inst->exec_size);
5101 }
5102
5103 case ELK_SHADER_OPCODE_LOAD_PAYLOAD: {
5104 const unsigned reg_count =
5105 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
5106
5107 if (reg_count > 2) {
5108 /* Only LOAD_PAYLOAD instructions with per-channel destination region
5109 * can be easily lowered (which excludes headers and heterogeneous
5110 * types).
5111 */
5112 assert(!inst->header_size);
5113 for (unsigned i = 0; i < inst->sources; i++)
5114 assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
5115 inst->src[i].file == BAD_FILE);
5116
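/* Lower the width so that each resulting LOAD_PAYLOAD writes at most
 * two registers.
 */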
5117 return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
5118 } else {
5119 return inst->exec_size;
5120 }
5121 }
5122 default:
5123 return inst->exec_size;
5124 }
5125 }
5126
5127 /**
5128 * Return true if splitting out the group of channels of instruction \p inst
5129 * given by lbld.group() requires allocating a temporary for the i-th source
5130 * of the lowered instruction.
5131 */
5132 static inline bool
5133 needs_src_copy(const fs_builder &lbld, const elk_fs_inst *inst, unsigned i)
5134 {
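/* A temporary copy is needed unless the source is periodic across the
 * lowered channel group (so it can be used unchanged), or only a single
 * component is read and the lowered width doesn't exceed the original
 * execution size.  A copy is also needed when the instruction writes a
 * flag register that overlaps this source.
 */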
5135 return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
5136 (inst->components_read(i) == 1 &&
5137 lbld.dispatch_width() <= inst->exec_size)) ||
5138 (inst->flags_written(lbld.shader->devinfo) &
5139 flag_mask(inst->src[i], type_sz(inst->src[i].type)));
5140 }
5141
5142 /**
5143 * Extract the data that would be consumed by the channel group given by
5144 * lbld.group() from the i-th source region of instruction \p inst and return
5145 * it as result in packed form.
5146 */
5147 static elk_fs_reg
5148 emit_unzip(const fs_builder &lbld, elk_fs_inst *inst, unsigned i)
5149 {
5150 assert(lbld.group() >= inst->group);
5151
5152 /* Specified channel group from the source region. */
5153 const elk_fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
5154
5155 if (needs_src_copy(lbld, inst, i)) {
5156 /* Builder of the right width to perform the copy avoiding uninitialized
5157 * data if the lowered execution size is greater than the original
5158 * execution size of the instruction.
5159 */
5160 const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
5161 inst->exec_size), 0);
5162 const elk_fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
5163
5164 for (unsigned k = 0; k < inst->components_read(i); ++k)
5165 cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
5166
5167 return tmp;
5168
5169 } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
5170 /* The source is invariant for all dispatch_width-wide groups of the
5171 * original region.
5172 */
5173 return inst->src[i];
5174
5175 } else {
5176 /* We can just point the lowered instruction at the right channel group
5177 * from the original region.
5178 */
5179 return src;
5180 }
5181 }
5182
5183 /**
5184 * Return true if splitting out the group of channels of instruction \p inst
5185 * given by lbld.group() requires allocating a temporary for the destination
5186 * of the lowered instruction and copying the data back to the original
5187 * destination region.
5188 */
5189 static inline bool
5190 needs_dst_copy(const fs_builder &lbld, const elk_fs_inst *inst)
5191 {
5192 if (inst->dst.is_null())
5193 return false;
5194
5195 /* If the instruction writes more than one component we'll have to shuffle
5196 * the results of multiple lowered instructions in order to make sure that
5197 * they end up arranged correctly in the original destination region.
5198 */
5199 if (inst->size_written > inst->dst.component_size(inst->exec_size))
5200 return true;
5201
5202 /* If the lowered execution size is larger than the original the result of
5203 * the instruction won't fit in the original destination, so we'll have to
5204 * allocate a temporary in any case.
5205 */
5206 if (lbld.dispatch_width() > inst->exec_size)
5207 return true;
5208
5209 for (unsigned i = 0; i < inst->sources; i++) {
5210 /* If we already made a copy of the source for other reasons there won't
5211 * be any overlap with the destination.
5212 */
5213 if (needs_src_copy(lbld, inst, i))
5214 continue;
5215
5216 /* In order to keep the logic simple we emit a copy whenever the
5217 * destination region doesn't exactly match an overlapping source. Such a
5218 * mismatch may indicate that source and destination aren't aligned group
5219 * by group, which could cause one of the lowered instructions to overwrite
5220 * the data read from the same source by other lowered instructions.
5221 */
5222 if (regions_overlap(inst->dst, inst->size_written,
5223 inst->src[i], inst->size_read(i)) &&
5224 !inst->dst.equals(inst->src[i]))
5225 return true;
5226 }
5227
5228 return false;
5229 }
5230
5231 /**
5232 * Insert data from a packed temporary into the channel group given by
5233 * lbld.group() of the destination region of instruction \p inst and return
5234 * the temporary as result. Any copy instructions that are required for
5235 * unzipping the previous value (in the case of partial writes) will be
5236 * inserted using \p lbld_before and any copy instructions required for
5237 * zipping up the destination of \p inst will be inserted using \p lbld_after.
5238 */
5239 static elk_fs_reg
5240 emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
5241 elk_fs_inst *inst)
5242 {
5243 assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
5244 assert(lbld_before.group() == lbld_after.group());
5245 assert(lbld_after.group() >= inst->group);
5246
5247 const struct intel_device_info *devinfo = lbld_before.shader->devinfo;
5248
5249 /* Specified channel group from the destination region. */
5250 const elk_fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
5251
5252 if (!needs_dst_copy(lbld_after, inst)) {
5253 /* No need to allocate a temporary for the lowered instruction, just
5254 * take the right group of channels from the original region.
5255 */
5256 return dst;
5257 }
5258
5259 /* Deal with the residency data part later */
5260 const unsigned residency_size = inst->has_sampler_residency() ?
5261 (reg_unit(devinfo) * REG_SIZE) : 0;
5262 const unsigned dst_size = (inst->size_written - residency_size) /
5263 inst->dst.component_size(inst->exec_size);
5264
5265 const elk_fs_reg tmp = lbld_after.vgrf(inst->dst.type,
5266 dst_size + inst->has_sampler_residency());
5267
5268 if (inst->predicate) {
5269 /* Handle predication by copying the original contents of the
5270 * destination into the temporary before emitting the lowered
5271 * instruction.
5272 */
5273 const fs_builder gbld_before =
5274 lbld_before.group(MIN2(lbld_before.dispatch_width(),
5275 inst->exec_size), 0);
5276 for (unsigned k = 0; k < dst_size; ++k) {
5277 gbld_before.MOV(offset(tmp, lbld_before, k),
5278 offset(dst, inst->exec_size, k));
5279 }
5280 }
5281
5282 const fs_builder gbld_after =
5283 lbld_after.group(MIN2(lbld_after.dispatch_width(),
5284 inst->exec_size), 0);
5285 for (unsigned k = 0; k < dst_size; ++k) {
5286 /* Use a builder of the right width to perform the copy avoiding
5287 * uninitialized data if the lowered execution size is greater than the
5288 * original execution size of the instruction.
5289 */
5290 gbld_after.MOV(offset(dst, inst->exec_size, k),
5291 offset(tmp, lbld_after, k));
5292 }
5293
5294 if (inst->has_sampler_residency()) {
5295 /* Sampler messages with residency need special attention. The first
5296 * lane of the last component holds the Pixel Null Mask (bits 0:15)
5297 * along with some upper bits we need to discard (bits 16:31). We have
5298 * to build a single 32-bit value for the SIMD32 message out of two
5299 * SIMD16 16-bit values.
5300 */
5301 const fs_builder rbld = gbld_after.exec_all().group(1, 0);
5302 elk_fs_reg local_res_reg = component(
5303 retype(offset(tmp, lbld_before, dst_size),
5304 ELK_REGISTER_TYPE_UW), 0);
5305 elk_fs_reg final_res_reg =
5306 retype(byte_offset(inst->dst,
5307 inst->size_written - residency_size +
5308 gbld_after.group() / 8),
5309 ELK_REGISTER_TYPE_UW);
5310 rbld.MOV(final_res_reg, local_res_reg);
5311 }
5312
5313 return tmp;
5314 }
5315
5316 bool
5317 elk_fs_visitor::lower_simd_width()
5318 {
5319 bool progress = false;
5320
5321 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5322 const unsigned lower_width = get_lowered_simd_width(this, inst);
5323
5324 if (lower_width != inst->exec_size) {
5325 /* Builder matching the original instruction. We may also need to
5326 * emit an instruction of width larger than the original, set the
5327 * execution size of the builder to the highest of both for now so
5328 * we're sure that both cases can be handled.
5329 */
5330 const unsigned max_width = MAX2(inst->exec_size, lower_width);
5331
5332 const fs_builder bld = fs_builder(this).at_end();
5333 const fs_builder ibld = bld.at(block, inst)
5334 .exec_all(inst->force_writemask_all)
5335 .group(max_width, inst->group / max_width);
5336
5337 /* Split the copies in chunks of the execution width of either the
5338 * original or the lowered instruction, whichever is lower.
5339 */
5340 const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
5341 const unsigned residency_size = inst->has_sampler_residency() ?
5342 (reg_unit(devinfo) * REG_SIZE) : 0;
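/* Number of whole destination components written, not counting any
 * trailing sampler residency data.
 */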
5343 const unsigned dst_size =
5344 (inst->size_written - residency_size) /
5345 inst->dst.component_size(inst->exec_size);
5346
5347 assert(!inst->writes_accumulator && !inst->mlen);
5348
5349 /* Inserting the zip, unzip, and duplicated instructions in all of
5350 * the right spots is somewhat tricky. All of the unzip and any
5351 * instructions from the zip which unzip the destination prior to
5352 * writing need to happen before all of the per-group instructions
5353 * and the zip instructions need to happen after. In order to sort
5354 * this all out, we insert the unzip instructions before \p inst,
5355 * insert the per-group instructions after \p inst (i.e. before
5356 * inst->next), and insert the zip instructions before the
5357 * instruction after \p inst. Since we are inserting instructions
5358 * after \p inst, inst->next is a moving target and we need to save
5359 * it off here so that we insert the zip instructions in the right
5360 * place.
5361 *
5362 * Since we're inserting split instructions after after_inst, the
5363 * instructions will end up in the reverse order that we insert them.
5364 * However, certain render target writes require that the low group
5365 * instructions come before the high group. From the Ivy Bridge PRM
5366 * Vol. 4, Pt. 1, Section 3.9.11:
5367 *
5368 * "If multiple SIMD8 Dual Source messages are delivered by the
5369 * pixel shader thread, each SIMD8_DUALSRC_LO message must be
5370 * issued before the SIMD8_DUALSRC_HI message with the same Slot
5371 * Group Select setting."
5372 *
5373 * And, from Section 3.9.11.1 of the same PRM:
5374 *
5375 * "When SIMD32 or SIMD16 PS threads send render target writes
5376 * with multiple SIMD8 and SIMD16 messages, the following must
5377 * hold:
5378 *
5379 * All the slots (as described above) must have a corresponding
5380 * render target write irrespective of the slot's validity. A slot
5381 * is considered valid when at least one sample is enabled. For
5382 * example, a SIMD16 PS thread must send two SIMD8 render target
5383 * writes to cover all the slots.
5384 *
5385 * PS thread must send SIMD render target write messages with
5386 * increasing slot numbers. For example, SIMD16 thread has
5387 * Slot[15:0] and if two SIMD8 render target writes are used, the
5388 * first SIMD8 render target write must send Slot[7:0] and the
5389 * next one must send Slot[15:8]."
5390 *
5391 * In order to make low group instructions come before high group
5392 * instructions (this is required for some render target writes), we
5393 * split from the highest group to lowest.
5394 */
5395 exec_node *const after_inst = inst->next;
5396 for (int i = n - 1; i >= 0; i--) {
5397 /* Emit a copy of the original instruction with the lowered width.
5398 * If the EOT flag was set, drop it from all but the last
5399 * instruction to avoid killing the thread prematurely.
5400 */
5401 elk_fs_inst split_inst = *inst;
5402 split_inst.exec_size = lower_width;
5403 split_inst.eot = inst->eot && i == int(n - 1);
5404
5405 /* Select the correct channel enables for the i-th group, then
5406 * transform the sources and destination and emit the lowered
5407 * instruction.
5408 */
5409 const fs_builder lbld = ibld.group(lower_width, i);
5410
5411 for (unsigned j = 0; j < inst->sources; j++)
5412 split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
5413
5414 split_inst.dst = emit_zip(lbld.at(block, inst),
5415 lbld.at(block, after_inst), inst);
5416 split_inst.size_written =
5417 split_inst.dst.component_size(lower_width) * dst_size +
5418 residency_size;
5419
5420 lbld.at(block, inst->next).emit(split_inst);
5421 }
5422
5423 inst->remove(block);
5424 progress = true;
5425 }
5426 }
5427
5428 if (progress)
5429 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5430
5431 return progress;
5432 }
5433
5434 /**
5435 * Transform barycentric vectors into the interleaved form expected by the PLN
5436 * instruction and returned by the Gfx7+ PI shared function.
5437 *
5438 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
5439 * follows in the register file:
5440 *
5441 * rN+0: X[0-7]
5442 * rN+1: Y[0-7]
5443 * rN+2: X[8-15]
5444 * rN+3: Y[8-15]
5445 *
5446 * There is no need to handle SIMD32 here -- This is expected to be run after
5447 * SIMD lowering, since SIMD lowering relies on vectors having the standard
5448 * component layout.
5449 */
5450 bool
5451 elk_fs_visitor::lower_barycentrics()
5452 {
5453 const bool has_interleaved_layout = devinfo->has_pln ||
5454 (devinfo->ver >= 7 && devinfo->ver < 20);
5455 bool progress = false;
5456
5457 if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
5458 return false;
5459
5460 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5461 if (inst->exec_size < 16)
5462 continue;
5463
5464 const fs_builder ibld(this, block, inst);
5465 const fs_builder ubld = ibld.exec_all().group(8, 0);
5466
5467 switch (inst->opcode) {
5468 case ELK_FS_OPCODE_LINTERP : {
5469 assert(inst->exec_size == 16);
5470 const elk_fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
5471 elk_fs_reg srcs[4];
5472
5473 for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
5474 srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
5475 8 * (i / 2));
5476
5477 ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
5478
5479 inst->src[0] = tmp;
5480 progress = true;
5481 break;
5482 }
5483 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
5484 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
5485 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
5486 assert(inst->exec_size == 16);
5487 const elk_fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
5488
5489 for (unsigned i = 0; i < 2; i++) {
5490 for (unsigned g = 0; g < inst->exec_size / 8; g++) {
5491 elk_fs_inst *mov = ibld.at(block, inst->next).group(8, g)
5492 .MOV(horiz_offset(offset(inst->dst, ibld, i),
5493 8 * g),
5494 offset(tmp, ubld, 2 * g + i));
5495 mov->predicate = inst->predicate;
5496 mov->predicate_inverse = inst->predicate_inverse;
5497 mov->flag_subreg = inst->flag_subreg;
5498 }
5499 }
5500
5501 inst->dst = tmp;
5502 progress = true;
5503 break;
5504 }
5505 default:
5506 break;
5507 }
5508 }
5509
5510 if (progress)
5511 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5512
5513 return progress;
5514 }
5515
5516 /**
5517 * Lower a derivative instruction as the floating-point difference of two
5518 * swizzles of the source, specified as \p swz0 and \p swz1.
5519 */
5520 static bool
5521 lower_derivative(elk_fs_visitor *v, elk_bblock_t *block, elk_fs_inst *inst,
5522 unsigned swz0, unsigned swz1)
5523 {
5524 const fs_builder ubld = fs_builder(v, block, inst).exec_all();
5525 const elk_fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
5526 const elk_fs_reg tmp1 = ubld.vgrf(inst->src[0].type);
5527
5528 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], elk_imm_ud(swz0));
5529 ubld.emit(ELK_SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], elk_imm_ud(swz1));
5530
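/* Rewrite the derivative as the difference of the two swizzles:
 * dst = swz1(src) - swz0(src), computed as ADD(-tmp0, tmp1).
 */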
5531 inst->resize_sources(2);
5532 inst->src[0] = negate(tmp0);
5533 inst->src[1] = tmp1;
5534 inst->opcode = ELK_OPCODE_ADD;
5535
5536 return true;
5537 }
5538
5539 /**
5540 * Lower derivative instructions on platforms where codegen cannot implement
5541 * them efficiently (i.e. XeHP).
5542 */
5543 bool
5544 elk_fs_visitor::lower_derivatives()
5545 {
5546 bool progress = false;
5547
5548 if (devinfo->verx10 < 125)
5549 return false;
5550
5551 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
5552 if (inst->opcode == ELK_FS_OPCODE_DDX_COARSE)
5553 progress |= lower_derivative(this, block, inst,
5554 ELK_SWIZZLE_XXXX, ELK_SWIZZLE_YYYY);
5555
5556 else if (inst->opcode == ELK_FS_OPCODE_DDX_FINE)
5557 progress |= lower_derivative(this, block, inst,
5558 ELK_SWIZZLE_XXZZ, ELK_SWIZZLE_YYWW);
5559
5560 else if (inst->opcode == ELK_FS_OPCODE_DDY_COARSE)
5561 progress |= lower_derivative(this, block, inst,
5562 ELK_SWIZZLE_XXXX, ELK_SWIZZLE_ZZZZ);
5563
5564 else if (inst->opcode == ELK_FS_OPCODE_DDY_FINE)
5565 progress |= lower_derivative(this, block, inst,
5566 ELK_SWIZZLE_XYXY, ELK_SWIZZLE_ZWZW);
5567 }
5568
5569 if (progress)
5570 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5571
5572 return progress;
5573 }
5574
5575 bool
5576 elk_fs_visitor::lower_find_live_channel()
5577 {
5578 bool progress = false;
5579
5580 if (devinfo->ver < 8)
5581 return false;
5582
5583 bool packed_dispatch =
5584 elk_stage_has_packed_dispatch(devinfo, stage, max_polygons,
5585 stage_prog_data);
5586 bool vmask =
5587 stage == MESA_SHADER_FRAGMENT &&
5588 elk_wm_prog_data(stage_prog_data)->uses_vmask;
5589
5590 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5591 if (inst->opcode != ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL &&
5592 inst->opcode != ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
5593 continue;
5594
5595 bool first = inst->opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL;
5596
5597 /* Getting the first active channel index is easy on Gfx8: Just find
5598 * the first bit set in the execution mask. The register exists on
5599 * HSW already but it reads back as all ones when the current
5600 * instruction has execution masking disabled, so it's kind of
5601 * useless there.
5602 */
5603 elk_fs_reg exec_mask(retype(elk_mask_reg(0), ELK_REGISTER_TYPE_UD));
5604
5605 const fs_builder ibld(this, block, inst);
5606 if (!inst->is_partial_write())
5607 ibld.emit_undef_for_dst(inst);
5608
5609 const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);
5610
5611 /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
5612 * so combine the execution and dispatch masks to obtain the true mask.
5613 *
5614 * If we're looking for the first live channel, and we have packed
5615 * dispatch, we can skip this step, as we know all dispatched channels
5616 * will appear at the front of the mask.
5617 */
5618 if (!(first && packed_dispatch)) {
5619 elk_fs_reg mask = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5620 ubld.UNDEF(mask);
5621 ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, mask, elk_imm_ud(vmask ? 3 : 2));
5622
5623 /* Quarter control has the effect of magically shifting the value of
5624 * ce0 so you'll get the first/last active channel relative to the
5625 * specified quarter control as result.
5626 */
5627 if (inst->group > 0)
5628 ubld.SHR(mask, mask, elk_imm_ud(ALIGN(inst->group, 8)));
5629
5630 ubld.AND(mask, exec_mask, mask);
5631 exec_mask = mask;
5632 }
5633
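/* FBL returns the index of the least significant set bit.  For the
 * last live channel, convert the leading-zero count into a bit index
 * counted from the LSB: 31 - LZD(mask).
 */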
5634 if (first) {
5635 ubld.FBL(inst->dst, exec_mask);
5636 } else {
5637 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 1);
5638 ubld.UNDEF(tmp);
5639 ubld.LZD(tmp, exec_mask);
5640 ubld.ADD(inst->dst, negate(tmp), elk_imm_uw(31));
5641 }
5642
5643 inst->remove(block);
5644 progress = true;
5645 }
5646
5647 if (progress)
5648 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5649
5650 return progress;
5651 }
5652
5653 void
5654 elk_fs_visitor::dump_instructions_to_file(FILE *file) const
5655 {
5656 if (cfg) {
5657 const register_pressure &rp = regpressure_analysis.require();
5658 unsigned ip = 0, max_pressure = 0;
5659 unsigned cf_count = 0;
5660 foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5661 if (inst->is_control_flow_end())
5662 cf_count -= 1;
5663
5664 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5665 fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
5666 for (unsigned i = 0; i < cf_count; i++)
5667 fprintf(file, " ");
5668 dump_instruction(inst, file);
5669 ip++;
5670
5671 if (inst->is_control_flow_begin())
5672 cf_count += 1;
5673 }
5674 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
5675 } else {
5676 int ip = 0;
5677 foreach_in_list(elk_backend_instruction, inst, &instructions) {
5678 fprintf(file, "%4d: ", ip++);
5679 dump_instruction(inst, file);
5680 }
5681 }
5682 }
5683
5684 void
5685 elk_fs_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
5686 {
5687 const elk_fs_inst *inst = (const elk_fs_inst *)be_inst;
5688
5689 if (inst->predicate) {
5690 fprintf(file, "(%cf%d.%d) ",
5691 inst->predicate_inverse ? '-' : '+',
5692 inst->flag_subreg / 2,
5693 inst->flag_subreg % 2);
5694 }
5695
5696 fprintf(file, "%s", elk_instruction_name(&compiler->isa, inst->opcode));
5697 if (inst->saturate)
5698 fprintf(file, ".sat");
5699 if (inst->conditional_mod) {
5700 fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
5701 if (!inst->predicate &&
5702 (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
5703 inst->opcode != ELK_OPCODE_CSEL &&
5704 inst->opcode != ELK_OPCODE_IF &&
5705 inst->opcode != ELK_OPCODE_WHILE))) {
5706 fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
5707 inst->flag_subreg % 2);
5708 }
5709 }
5710 fprintf(file, "(%d) ", inst->exec_size);
5711
5712 if (inst->mlen) {
5713 fprintf(file, "(mlen: %d) ", inst->mlen);
5714 }
5715
5716 if (inst->ex_mlen) {
5717 fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
5718 }
5719
5720 if (inst->eot) {
5721 fprintf(file, "(EOT) ");
5722 }
5723
5724 switch (inst->dst.file) {
5725 case VGRF:
5726 fprintf(file, "vgrf%d", inst->dst.nr);
5727 break;
5728 case FIXED_GRF:
5729 fprintf(file, "g%d", inst->dst.nr);
5730 break;
5731 case MRF:
5732 fprintf(file, "m%d", inst->dst.nr);
5733 break;
5734 case BAD_FILE:
5735 fprintf(file, "(null)");
5736 break;
5737 case UNIFORM:
5738 fprintf(file, "***u%d***", inst->dst.nr);
5739 break;
5740 case ATTR:
5741 fprintf(file, "***attr%d***", inst->dst.nr);
5742 break;
5743 case ARF:
5744 switch (inst->dst.nr) {
5745 case ELK_ARF_NULL:
5746 fprintf(file, "null");
5747 break;
5748 case ELK_ARF_ADDRESS:
5749 fprintf(file, "a0.%d", inst->dst.subnr);
5750 break;
5751 case ELK_ARF_ACCUMULATOR:
5752 fprintf(file, "acc%d", inst->dst.subnr);
5753 break;
5754 case ELK_ARF_FLAG:
5755 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5756 break;
5757 default:
5758 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5759 break;
5760 }
5761 break;
5762 case IMM:
5763 unreachable("not reached");
5764 }
5765
5766 if (inst->dst.offset ||
5767 (inst->dst.file == VGRF &&
5768 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
5769 const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
5770 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
5771 inst->dst.offset % reg_size);
5772 }
5773
5774 if (inst->dst.stride != 1)
5775 fprintf(file, "<%u>", inst->dst.stride);
5776 fprintf(file, ":%s, ", elk_reg_type_to_letters(inst->dst.type));
5777
5778 for (int i = 0; i < inst->sources; i++) {
5779 if (inst->src[i].negate)
5780 fprintf(file, "-");
5781 if (inst->src[i].abs)
5782 fprintf(file, "|");
5783 switch (inst->src[i].file) {
5784 case VGRF:
5785 fprintf(file, "vgrf%d", inst->src[i].nr);
5786 break;
5787 case FIXED_GRF:
5788 fprintf(file, "g%d", inst->src[i].nr);
5789 break;
5790 case MRF:
5791 fprintf(file, "***m%d***", inst->src[i].nr);
5792 break;
5793 case ATTR:
5794 fprintf(file, "attr%d", inst->src[i].nr);
5795 break;
5796 case UNIFORM:
5797 fprintf(file, "u%d", inst->src[i].nr);
5798 break;
5799 case BAD_FILE:
5800 fprintf(file, "(null)");
5801 break;
5802 case IMM:
5803 switch (inst->src[i].type) {
5804 case ELK_REGISTER_TYPE_HF:
5805 fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
5806 break;
5807 case ELK_REGISTER_TYPE_F:
5808 fprintf(file, "%-gf", inst->src[i].f);
5809 break;
5810 case ELK_REGISTER_TYPE_DF:
5811 fprintf(file, "%fdf", inst->src[i].df);
5812 break;
5813 case ELK_REGISTER_TYPE_W:
5814 case ELK_REGISTER_TYPE_D:
5815 fprintf(file, "%dd", inst->src[i].d);
5816 break;
5817 case ELK_REGISTER_TYPE_UW:
5818 case ELK_REGISTER_TYPE_UD:
5819 fprintf(file, "%uu", inst->src[i].ud);
5820 break;
5821 case ELK_REGISTER_TYPE_Q:
5822 fprintf(file, "%" PRId64 "q", inst->src[i].d64);
5823 break;
5824 case ELK_REGISTER_TYPE_UQ:
5825 fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
5826 break;
5827 case ELK_REGISTER_TYPE_VF:
5828 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
5829 elk_vf_to_float((inst->src[i].ud >> 0) & 0xff),
5830 elk_vf_to_float((inst->src[i].ud >> 8) & 0xff),
5831 elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
5832 elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
5833 break;
5834 case ELK_REGISTER_TYPE_V:
5835 case ELK_REGISTER_TYPE_UV:
5836 fprintf(file, "%08x%s", inst->src[i].ud,
5837 inst->src[i].type == ELK_REGISTER_TYPE_V ? "V" : "UV");
5838 break;
5839 default:
5840 fprintf(file, "???");
5841 break;
5842 }
5843 break;
5844 case ARF:
5845 switch (inst->src[i].nr) {
5846 case ELK_ARF_NULL:
5847 fprintf(file, "null");
5848 break;
5849 case ELK_ARF_ADDRESS:
5850 fprintf(file, "a0.%d", inst->src[i].subnr);
5851 break;
5852 case ELK_ARF_ACCUMULATOR:
5853 fprintf(file, "acc%d", inst->src[i].subnr);
5854 break;
5855 case ELK_ARF_FLAG:
5856 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5857 break;
5858 default:
5859 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5860 break;
5861 }
5862 break;
5863 }
5864
5865 if (inst->src[i].offset ||
5866 (inst->src[i].file == VGRF &&
5867 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
5868 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
5869 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
5870 inst->src[i].offset % reg_size);
5871 }
5872
5873 if (inst->src[i].abs)
5874 fprintf(file, "|");
5875
5876 if (inst->src[i].file != IMM) {
5877 unsigned stride;
5878 if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
5879 unsigned hstride = inst->src[i].hstride;
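/* ARF/FIXED_GRF horizontal strides are encoded logarithmically:
 * 0 means a stride of 0, otherwise the stride is 1 << (hstride - 1).
 */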
5880 stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
5881 } else {
5882 stride = inst->src[i].stride;
5883 }
5884 if (stride != 1)
5885 fprintf(file, "<%u>", stride);
5886
5887 fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
5888 }
5889
5890 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
5891 fprintf(file, ", ");
5892 }
5893
5894 fprintf(file, " ");
5895
5896 if (inst->force_writemask_all)
5897 fprintf(file, "NoMask ");
5898
5899 if (inst->exec_size != dispatch_width)
5900 fprintf(file, "group%d ", inst->group);
5901
5902 fprintf(file, "\n");
5903 }
5904
5905 elk::register_pressure::register_pressure(const elk_fs_visitor *v)
5906 {
5907 const fs_live_variables &live = v->live_analysis.require();
5908 const unsigned num_instructions = v->cfg->num_blocks ?
5909 v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
5910
5911 regs_live_at_ip = new unsigned[num_instructions]();
5912
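/* Accumulate each VGRF's register size over every IP where it is live. */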
5913 for (unsigned reg = 0; reg < v->alloc.count; reg++) {
5914 for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
5915 regs_live_at_ip[ip] += v->alloc.sizes[reg];
5916 }
5917
5918 const unsigned payload_count = v->first_non_payload_grf;
5919
5920 int *payload_last_use_ip = new int[payload_count];
5921 v->calculate_payload_ranges(payload_count, payload_last_use_ip);
5922
5923 for (unsigned reg = 0; reg < payload_count; reg++) {
5924 for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
5925 ++regs_live_at_ip[ip];
5926 }
5927
5928 delete[] payload_last_use_ip;
5929 }
5930
5931 elk::register_pressure::~register_pressure()
5932 {
5933 delete[] regs_live_at_ip;
5934 }
5935
5936 void
5937 elk_fs_visitor::invalidate_analysis(elk::analysis_dependency_class c)
5938 {
5939 elk_backend_shader::invalidate_analysis(c);
5940 live_analysis.invalidate(c);
5941 regpressure_analysis.invalidate(c);
5942 }
5943
5944 void
5945 elk_fs_visitor::debug_optimizer(const nir_shader *nir,
5946 const char *pass_name,
5947 int iteration, int pass_num) const
5948 {
5949 if (!elk_should_print_shader(nir, DEBUG_OPTIMIZER))
5950 return;
5951
5952 char *filename;
5953 int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
5954 debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
5955 _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
5956 iteration, pass_num, pass_name);
5957 if (ret == -1)
5958 return;
5959 dump_instructions(filename);
5960 free(filename);
5961 }
5962
5963 void
5964 elk_fs_visitor::optimize()
5965 {
5966 debug_optimizer(nir, "start", 0, 0);
5967
5968 /* Start by validating the shader we currently have. */
5969 validate();
5970
5971 bool progress = false;
5972 int iteration = 0;
5973 int pass_num = 0;
5974
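/* Run a pass, dump the IR if it made progress, validate the result and
 * accumulate the progress flag.  Evaluates to whether the pass made
 * progress.
 */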
5975 #define OPT(pass, args...) ({ \
5976 pass_num++; \
5977 bool this_progress = pass(args); \
5978 \
5979 if (this_progress) \
5980 debug_optimizer(nir, #pass, iteration, pass_num); \
5981 \
5982 validate(); \
5983 \
5984 progress = progress || this_progress; \
5985 this_progress; \
5986 })
5987
5988 assign_constant_locations();
5989 OPT(lower_constant_loads);
5990
5991 validate();
5992
5993 OPT(split_virtual_grfs);
5994
5995 /* Before anything else, eliminate dead code. The results of some NIR
5996 * instructions may effectively be calculated twice. Once when the
5997 * instruction is encountered, and again when the user of that result is
5998 * encountered. Wipe those away before algebraic optimizations and
5999 * especially copy propagation can mix things up.
6000 */
6001 OPT(dead_code_eliminate);
6002
6003 OPT(remove_extra_rounding_modes);
6004
6005 do {
6006 progress = false;
6007 pass_num = 0;
6008 iteration++;
6009
6010 OPT(remove_duplicate_mrf_writes);
6011
6012 OPT(opt_algebraic);
6013 OPT(opt_cse);
6014 OPT(opt_copy_propagation);
6015 OPT(elk_opt_predicated_break, this);
6016 OPT(opt_cmod_propagation);
6017 OPT(dead_code_eliminate);
6018 OPT(opt_peephole_sel);
6019 OPT(elk_dead_control_flow_eliminate, this);
6020 OPT(opt_saturate_propagation);
6021 OPT(register_coalesce);
6022 OPT(compute_to_mrf);
6023 OPT(eliminate_find_live_channel);
6024
6025 OPT(compact_virtual_grfs);
6026 } while (progress);
6027
6028 progress = false;
6029 pass_num = 0;
6030
6031 if (OPT(lower_pack)) {
6032 OPT(register_coalesce);
6033 OPT(dead_code_eliminate);
6034 }
6035
6036 OPT(lower_simd_width);
6037 OPT(lower_barycentrics);
6038 OPT(lower_logical_sends);
6039
6040 /* After logical SEND lowering. */
6041
6042 if (OPT(opt_copy_propagation))
6043 OPT(opt_algebraic);
6044
6045 /* Identify LOAD_PAYLOADs of sampler messages with trailing zeros.
6046 * Do this before splitting SENDs.
6047 */
6048 if (devinfo->ver >= 7) {
6049 if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
6050 OPT(opt_algebraic);
6051 }
6052
6053 OPT(opt_split_sends);
6054 OPT(fixup_nomask_control_flow);
6055
6056 if (progress) {
6057 if (OPT(opt_copy_propagation))
6058 OPT(opt_algebraic);
6059
6060 /* Run after logical send lowering to give it a chance to CSE the
6061 * LOAD_PAYLOAD instructions created to construct the payloads of
6062 * e.g. texturing messages in cases where it wasn't possible to CSE the
6063 * whole logical instruction.
6064 */
6065 OPT(opt_cse);
6066 OPT(register_coalesce);
6067 OPT(compute_to_mrf);
6068 OPT(dead_code_eliminate);
6069 OPT(remove_duplicate_mrf_writes);
6070 OPT(opt_peephole_sel);
6071 }
6072
6073 OPT(opt_redundant_halt);
6074
6075 if (OPT(lower_load_payload)) {
6076 OPT(split_virtual_grfs);
6077
6078 /* Lower 64 bit MOVs generated by payload lowering. */
6079 if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
6080 OPT(opt_algebraic);
6081
6082 OPT(register_coalesce);
6083 OPT(lower_simd_width);
6084 OPT(compute_to_mrf);
6085 OPT(dead_code_eliminate);
6086 }
6087
6088 OPT(opt_combine_constants);
6089 if (OPT(lower_integer_multiplication)) {
6090 /* If lower_integer_multiplication made progress, it may have produced
6091 * some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
6092 * one more time to clean those up if they exist.
6093 */
6094 OPT(lower_integer_multiplication);
6095 }
6096 OPT(lower_sub_sat);
6097
6098 if (devinfo->ver <= 5 && OPT(lower_minmax)) {
6099 OPT(opt_cmod_propagation);
6100 OPT(opt_cse);
6101 if (OPT(opt_copy_propagation))
6102 OPT(opt_algebraic);
6103 OPT(dead_code_eliminate);
6104 }
6105
6106 progress = false;
6107 OPT(lower_derivatives);
6108 OPT(lower_regioning);
6109 if (progress) {
6110 if (OPT(opt_copy_propagation))
6111 OPT(opt_algebraic);
6112 OPT(dead_code_eliminate);
6113 OPT(lower_simd_width);
6114 }
6115
6116 OPT(fixup_sends_duplicate_payload);
6117
6118 OPT(lower_uniform_pull_constant_loads);
6119
6120 OPT(lower_find_live_channel);
6121
6122 validate();
6123 }
6124
6125 /**
6126 * From the Skylake PRM Vol. 2a docs for sends:
6127 *
6128 * "It is required that the second block of GRFs does not overlap with the
6129 * first block."
6130 *
6131 * There are plenty of cases where we may accidentally violate this due to
6132 * having, for instance, both sources be the constant 0. This little pass
6133 * just adds a new vgrf for the second payload and copies it over.
6134 */
6135 bool
6136 elk_fs_visitor::fixup_sends_duplicate_payload()
6137 {
6138 bool progress = false;
6139
6140 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
6141 if (inst->opcode == ELK_SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
6142 regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
6143 inst->src[3], inst->ex_mlen * REG_SIZE)) {
6144 elk_fs_reg tmp = elk_fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
6145 ELK_REGISTER_TYPE_UD);
6146 /* Sadly, we've lost all notion of channels and bit sizes at this
6147 * point. Just WE_all it.
6148 */
6149 const fs_builder ibld = fs_builder(this, block, inst).exec_all().group(16, 0);
6150 elk_fs_reg copy_src = retype(inst->src[3], ELK_REGISTER_TYPE_UD);
6151 elk_fs_reg copy_dst = tmp;
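/* Copy two GRFs per iteration with SIMD16 MOVs, falling back to a
 * SIMD8 MOV for an odd trailing register.
 */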
6152 for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
6153 if (inst->ex_mlen == i + 1) {
6154 /* Only one register left; do SIMD8 */
6155 ibld.group(8, 0).MOV(copy_dst, copy_src);
6156 } else {
6157 ibld.MOV(copy_dst, copy_src);
6158 }
6159 copy_src = offset(copy_src, ibld, 1);
6160 copy_dst = offset(copy_dst, ibld, 1);
6161 }
6162 inst->src[3] = tmp;
6163 progress = true;
6164 }
6165 }
6166
6167 if (progress)
6168 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6169
6170 return progress;
6171 }
6172
6173 /**
6174 * Three-source instructions must have a GRF/MRF destination register.
6175 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
6176 */
6177 void
6178 elk_fs_visitor::fixup_3src_null_dest()
6179 {
6180 bool progress = false;
6181
6182 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
6183 if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
6184 inst->dst = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
6185 inst->dst.type);
6186 progress = true;
6187 }
6188 }
6189
6190 if (progress)
6191 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
6192 DEPENDENCY_VARIABLES);
6193 }
6194
6195 static bool
6196 needs_dummy_fence(const intel_device_info *devinfo, elk_fs_inst *inst)
6197 {
6198 /* This workaround is about making sure that any instruction writing
6199 * through UGM has completed before we hit EOT.
6200 */
6201 if (inst->sfid != GFX12_SFID_UGM)
6202 return false;
6203
6204 /* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
6205 * where the L1-cache override is NOT among {WB, WS, WT}
6206 */
6207 enum elk_lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
6208 if (elk_lsc_opcode_is_store(opcode)) {
6209 switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
6210 case LSC_CACHE_STORE_L1STATE_L3MOCS:
6211 case LSC_CACHE_STORE_L1WB_L3WB:
6212 case LSC_CACHE_STORE_L1S_L3UC:
6213 case LSC_CACHE_STORE_L1S_L3WB:
6214 case LSC_CACHE_STORE_L1WT_L3UC:
6215 case LSC_CACHE_STORE_L1WT_L3WB:
6216 return false;
6217
6218 default:
6219 return true;
6220 }
6221 }
6222
6223 /* Any UGM Atomic message WITHOUT return value */
6224 if (elk_lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
6225 return true;
6226
6227 return false;
6228 }
6229
6230 /* Wa_14015360517
6231 *
6232 * The first instruction of any kernel should have non-zero emask.
6233 * Make sure this happens by introducing a dummy mov instruction.
6234 */
6235 void
6236 elk_fs_visitor::emit_dummy_mov_instruction()
6237 {
6238 if (!intel_needs_workaround(devinfo, 14015360517))
6239 return;
6240
6241 struct elk_backend_instruction *first_inst =
6242 cfg->first_block()->start();
6243
6244 /* We can skip the WA if first instruction is marked with
6245 * force_writemask_all or exec_size equals dispatch_width.
6246 */
6247 if (first_inst->force_writemask_all ||
6248 first_inst->exec_size == dispatch_width)
6249 return;
6250
6251 /* Insert dummy mov as first instruction. */
6252 const fs_builder ubld =
6253 fs_builder(this, cfg->first_block(), (elk_fs_inst *)first_inst).exec_all().group(8, 0);
6254 ubld.MOV(ubld.null_reg_ud(), elk_imm_ud(0u));
6255
6256 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6257 }
6258
6259 /* Wa_22013689345
6260 *
6261 * We need to emit UGM fence message before EOT, if shader has any UGM write
6262 * or atomic message.
6263 *
6264 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
6265 * We probably need a better criteria in needs_dummy_fence().
6266 */
6267 void
6268 elk_fs_visitor::emit_dummy_memory_fence_before_eot()
6269 {
6270 bool progress = false;
6271 bool has_ugm_write_or_atomic = false;
6272
6273 if (!intel_needs_workaround(devinfo, 22013689345))
6274 return;
6275
6276 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
6277 if (!inst->eot) {
6278 if (needs_dummy_fence(devinfo, inst))
6279 has_ugm_write_or_atomic = true;
6280 continue;
6281 }
6282
6283 if (!has_ugm_write_or_atomic)
6284 break;
6285
6286 const fs_builder ibld(this, block, inst);
6287 const fs_builder ubld = ibld.exec_all().group(1, 0);
6288
6289 elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
6290 elk_fs_inst *dummy_fence = ubld.emit(ELK_SHADER_OPCODE_MEMORY_FENCE,
6291 dst, elk_vec8_grf(0, 0),
6292 /* commit enable */ elk_imm_ud(1),
6293 /* bti */ elk_imm_ud(0));
6294 dummy_fence->sfid = GFX12_SFID_UGM;
6295 dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
6296 LSC_FLUSH_TYPE_NONE_6, false);
6297 ubld.emit(ELK_FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
6298 progress = true;
6299 /* TODO: remove this break if we ever have shader with multiple EOT. */
6300 break;
6301 }
6302
6303 if (progress) {
6304 invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
6305 DEPENDENCY_VARIABLES);
6306 }
6307 }
6308
6309 /**
6310 * Find the first instruction in the program that might start a region of
6311 * divergent control flow due to a HALT jump. There is no
6312 * find_halt_control_flow_region_end(), the region of divergence extends until
6313 * the only ELK_SHADER_OPCODE_HALT_TARGET in the program.
6314 */
6315 static const elk_fs_inst *
6316 find_halt_control_flow_region_start(const elk_fs_visitor *v)
6317 {
6318 foreach_block_and_inst(block, elk_fs_inst, inst, v->cfg) {
6319 if (inst->opcode == ELK_OPCODE_HALT ||
6320 inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET)
6321 return inst;
6322 }
6323
6324 return NULL;
6325 }
6326
6327 /**
6328 * Work around the Gfx12 hardware bug filed as Wa_1407528679. EU fusion
6329 * can cause a BB to be executed with all channels disabled, which will lead
6330 * to the execution of any NoMask instructions in it, even though any
6331 * execution-masked instructions will be correctly shot down. This may break
6332 * assumptions of some NoMask SEND messages whose descriptor depends on data
6333 * generated by live invocations of the shader.
6334 *
6335 * This avoids the problem by predicating certain instructions on an ANY
6336 * horizontal predicate that makes sure that their execution is omitted when
6337 * all channels of the program are disabled.
6338 */
6339 bool
6340 elk_fs_visitor::fixup_nomask_control_flow()
6341 {
6342 if (devinfo->ver != 12)
6343 return false;
6344
6345 const elk_predicate pred = dispatch_width > 16 ? ELK_PREDICATE_ALIGN1_ANY32H :
6346 dispatch_width > 8 ? ELK_PREDICATE_ALIGN1_ANY16H :
6347 ELK_PREDICATE_ALIGN1_ANY8H;
6348 const elk_fs_inst *halt_start = find_halt_control_flow_region_start(this);
6349 unsigned depth = 0;
6350 bool progress = false;
6351
6352 const fs_live_variables &live_vars = live_analysis.require();
6353
6354 /* Scan the program backwards in order to be able to easily determine
6355 * whether the flag register is live at any point.
6356 */
6357 foreach_block_reverse_safe(block, cfg) {
6358 BITSET_WORD flag_liveout = live_vars.block_data[block->num]
6359 .flag_liveout[0];
6360 STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);
6361
6362 foreach_inst_in_block_reverse_safe(elk_fs_inst, inst, block) {
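/* Unpredicated instructions at least 8 channels wide completely
 * redefine the flag bits they write, so those bits are not live
 * above this point.
 */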
6363 if (!inst->predicate && inst->exec_size >= 8)
6364 flag_liveout &= ~inst->flags_written(devinfo);
6365
6366 switch (inst->opcode) {
6367 case ELK_OPCODE_DO:
6368 case ELK_OPCODE_IF:
6369 /* Note that this doesn't handle ELK_OPCODE_HALT since only
6370 * the first one in the program closes the region of divergent
6371 * control flow due to any HALT instructions -- Instead this is
6372 * handled with the halt_start check below.
6373 */
6374 depth--;
6375 break;
6376
6377 case ELK_OPCODE_WHILE:
6378 case ELK_OPCODE_ENDIF:
6379 case ELK_SHADER_OPCODE_HALT_TARGET:
6380 depth++;
6381 break;
6382
6383 default:
6384 /* Note that the vast majority of NoMask SEND instructions in the
6385 * program are harmless while executed in a block with all
6386 * channels disabled, since any instructions with side effects we
6387 * could hit here should be execution-masked.
6388 *
6389 * The main concern is NoMask SEND instructions where the message
6390 * descriptor or header depends on data generated by live
6391 * invocations of the shader (RESINFO and
6392 * ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
6393 * computed surface index seem to be the only examples right now
6394 * where this could easily lead to GPU hangs). Unfortunately we
6395 * have no straightforward way to detect that currently, so just
6396 * predicate any NoMask SEND instructions we find under control
6397 * flow.
6398 *
6399 * If this proves to have a measurable performance impact it can
6400 * be easily extended with a whitelist of messages we know we can
6401 * safely omit the predication for.
6402 */
6403 if (depth && inst->force_writemask_all &&
6404 is_send(inst) && !inst->predicate) {
6405 /* We need to load the execution mask into the flag register by
6406 * using a builder with channel group matching the whole shader
6407 * (rather than the default which is derived from the original
6408 * instruction), in order to avoid getting a right-shifted
6409 * value.
6410 */
6411 const fs_builder ubld = fs_builder(this, block, inst)
6412 .exec_all().group(dispatch_width, 0);
6413 const elk_fs_reg flag = retype(elk_flag_reg(0, 0),
6414 ELK_REGISTER_TYPE_UD);
6415
6416 /* Due to the lack of flag register allocation we need to save
6417 * and restore the flag register if it's live.
6418 */
6419 const bool save_flag = flag_liveout &
6420 flag_mask(flag, dispatch_width / 8);
6421 const elk_fs_reg tmp = ubld.group(8, 0).vgrf(flag.type);
6422
6423 if (save_flag) {
6424 ubld.group(8, 0).UNDEF(tmp);
6425 ubld.group(1, 0).MOV(tmp, flag);
6426 }
6427
6428 ubld.emit(ELK_FS_OPCODE_LOAD_LIVE_CHANNELS);
6429
6430 set_predicate(pred, inst);
6431 inst->flag_subreg = 0;
6432 inst->predicate_trivial = true;
6433
6434 if (save_flag)
6435 ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
6436
6437 progress = true;
6438 }
6439 break;
6440 }
6441
6442 if (inst == halt_start)
6443 depth--;
6444
6445 flag_liveout |= inst->flags_read(devinfo);
6446 }
6447 }
6448
6449 if (progress)
6450 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
6451
6452 return progress;
6453 }
6454
6455 uint32_t
6456 elk_fs_visitor::compute_max_register_pressure()
6457 {
6458 const register_pressure &rp = regpressure_analysis.require();
6459 uint32_t ip = 0, max_pressure = 0;
6460 foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
6461 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
6462 ip++;
6463 }
6464 return max_pressure;
6465 }
6466
6467 static elk_fs_inst **
6468 save_instruction_order(const struct elk_cfg_t *cfg)
6469 {
6470 /* Before we schedule anything, stash off the instruction order as an array
6471 * of elk_fs_inst *. This way, we can reset it between scheduling passes to
6472 * prevent dependencies between the different scheduling modes.
6473 */
6474 int num_insts = cfg->last_block()->end_ip + 1;
6475 elk_fs_inst **inst_arr = new elk_fs_inst * [num_insts];
6476
6477 int ip = 0;
6478 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
6479 assert(ip >= block->start_ip && ip <= block->end_ip);
6480 inst_arr[ip++] = inst;
6481 }
6482 assert(ip == num_insts);
6483
6484 return inst_arr;
6485 }
6486
6487 static void
6488 restore_instruction_order(struct elk_cfg_t *cfg, elk_fs_inst **inst_arr)
6489 {
6490 ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
6491
6492 int ip = 0;
6493 foreach_block (block, cfg) {
6494 block->instructions.make_empty();
6495
6496 assert(ip == block->start_ip);
6497 for (; ip <= block->end_ip; ip++)
6498 block->instructions.push_tail(inst_arr[ip]);
6499 }
6500 assert(ip == num_insts);
6501 }
6502
6503 void
6504 elk_fs_visitor::allocate_registers(bool allow_spilling)
6505 {
6506 bool allocated;
6507
6508 static const enum instruction_scheduler_mode pre_modes[] = {
6509 SCHEDULE_PRE,
6510 SCHEDULE_PRE_NON_LIFO,
6511 SCHEDULE_NONE,
6512 SCHEDULE_PRE_LIFO,
6513 };
6514
6515 static const char *scheduler_mode_name[] = {
6516 [SCHEDULE_PRE] = "top-down",
6517 [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
6518 [SCHEDULE_PRE_LIFO] = "lifo",
6519 [SCHEDULE_POST] = "post",
6520 [SCHEDULE_NONE] = "none",
6521 };
6522
6523 uint32_t best_register_pressure = UINT32_MAX;
6524 enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
6525
6526 compact_virtual_grfs();
6527
6528 if (needs_register_pressure)
6529 shader_stats.max_register_pressure = compute_max_register_pressure();
6530
6531 debug_optimizer(nir, "pre_register_allocate", 90, 90);
6532
6533 bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
6534
6535 /* Before we schedule anything, stash off the instruction order as an array
6536 * of elk_fs_inst *. This way, we can reset it between scheduling passes to
6537 * prevent dependencies between the different scheduling modes.
6538 */
6539 elk_fs_inst **orig_order = save_instruction_order(cfg);
6540 elk_fs_inst **best_pressure_order = NULL;
6541
6542 void *scheduler_ctx = ralloc_context(NULL);
6543 elk_fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
6544
6545 /* Try each scheduling heuristic to see if it can successfully register
6546 * allocate without spilling. They should be ordered by decreasing
6547 * performance but increasing likelihood of allocating.
6548 */
6549 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
6550 enum instruction_scheduler_mode sched_mode = pre_modes[i];
6551
6552 schedule_instructions_pre_ra(sched, sched_mode);
6553 this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
6554
6555 debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
6556
6557 if (0) {
6558 assign_regs_trivial();
6559 allocated = true;
6560 break;
6561 }
6562
6563 /* We should only spill registers on the last scheduling. */
6564 assert(!spilled_any_registers);
6565
6566 allocated = assign_regs(false, spill_all);
6567 if (allocated)
6568 break;
6569
6570       /* Compute the maximum register pressure for this scheduling mode */
6571 uint32_t this_pressure = compute_max_register_pressure();
6572
6573 if (0) {
6574 fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
6575 scheduler_mode_name[sched_mode], this_pressure);
6576 }
6577
6578 if (this_pressure < best_register_pressure) {
6579 best_register_pressure = this_pressure;
6580 best_sched = sched_mode;
6581 delete[] best_pressure_order;
6582 best_pressure_order = save_instruction_order(cfg);
6583 }
6584
6585 /* Reset back to the original order before trying the next mode */
6586 restore_instruction_order(cfg, orig_order);
6587 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
6588 }
6589
6590 ralloc_free(scheduler_ctx);
6591
6592 if (!allocated) {
6593 if (0) {
6594 fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
6595 scheduler_mode_name[best_sched]);
6596 }
6597 restore_instruction_order(cfg, best_pressure_order);
6598 shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
6599
6600 allocated = assign_regs(allow_spilling, spill_all);
6601 }
6602
6603 delete[] orig_order;
6604 delete[] best_pressure_order;
6605
6606 if (!allocated) {
6607 fail("Failure to register allocate. Reduce number of "
6608 "live scalar values to avoid this.");
6609 } else if (spilled_any_registers) {
6610 elk_shader_perf_log(compiler, log_data,
6611 "%s shader triggered register spilling. "
6612 "Try reducing the number of live scalar "
6613 "values to improve performance.\n",
6614 _mesa_shader_stage_to_string(stage));
6615 }
6616
6617 /* This must come after all optimization and register allocation, since
6618 * it inserts dead code that happens to have side effects, and it does
6619 * so based on the actual physical registers in use.
6620 */
6621 insert_gfx4_send_dependency_workarounds();
6622
6623 if (failed)
6624 return;
6625
6626 opt_bank_conflicts();
6627
6628 schedule_instructions_post_ra();
6629
6630 if (last_scratch > 0) {
6631 ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
6632
6633 /* Take the max of any previously compiled variant of the shader. In the
6634 * case of bindless shaders with return parts, this will also take the
6635 * max of all parts.
6636 */
6637 prog_data->total_scratch = MAX2(elk_get_scratch_size(last_scratch),
6638 prog_data->total_scratch);
6639
6640 if (gl_shader_stage_is_compute(stage)) {
6641 if (devinfo->platform == INTEL_PLATFORM_HSW) {
6642 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
6643 * field documentation, Haswell supports a minimum of 2kB of
6644 * scratch space for compute shaders, unlike every other stage
6645 * and platform.
6646 */
6647 prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
6648 } else if (devinfo->ver <= 7) {
6649 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
6650 * field documentation, platforms prior to Haswell measure scratch
6651 * size linearly with a range of [1kB, 12kB] and 1kB granularity.
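             * For example, last_scratch = 1536 bytes is rounded up to
             * 2048 bytes here.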
6652 */
6653 prog_data->total_scratch = ALIGN(last_scratch, 1024);
6654 max_scratch_size = 12 * 1024;
6655 }
6656 }
6657
6658 /* We currently only support up to 2MB of scratch space. If we
6659 * need to support more eventually, the documentation suggests
6660 * that we could allocate a larger buffer, and partition it out
6661 * ourselves. We'd just have to undo the hardware's address
6662 * calculation by subtracting (FFTID * Per Thread Scratch Space)
6663 * and then add FFTID * (Larger Per Thread Scratch Space).
6664 *
6665 * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
6666 * Thread Group Tracking > Local Memory/Scratch Space.
6667 */
6668 assert(prog_data->total_scratch < max_scratch_size);
6669 }
6670 }
6671
6672 bool
6673 elk_fs_visitor::run_vs()
6674 {
6675 assert(stage == MESA_SHADER_VERTEX);
6676
6677 payload_ = new elk_vs_thread_payload(*this);
6678
6679 nir_to_elk(this);
6680
6681 if (failed)
6682 return false;
6683
6684 emit_urb_writes();
6685
6686 calculate_cfg();
6687
6688 optimize();
6689
6690 assign_curb_setup();
6691 assign_vs_urb_setup();
6692
6693 fixup_3src_null_dest();
6694 emit_dummy_memory_fence_before_eot();
6695
6696 /* Wa_14015360517 */
6697 emit_dummy_mov_instruction();
6698
6699 allocate_registers(true /* allow_spilling */);
6700
6701 return !failed;
6702 }
6703
6704 void
6705 elk_fs_visitor::set_tcs_invocation_id()
6706 {
6707 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(prog_data);
6708 struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
6709 const fs_builder bld = fs_builder(this).at_end();
6710
6711 const unsigned instance_id_mask =
6712 (devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
6713 (devinfo->ver >= 11) ? INTEL_MASK(22, 16) :
6714 INTEL_MASK(23, 17);
6715 const unsigned instance_id_shift =
6716 (devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;
6717
6718 /* Get instance number from g0.2 bits:
6719 * * 7:0 on DG2+
6720 * * 22:16 on gfx11+
6721 * * 23:17 otherwise
6722 */
6723 elk_fs_reg t = bld.vgrf(ELK_REGISTER_TYPE_UD);
6724 bld.AND(t, elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD)),
6725 elk_imm_ud(instance_id_mask));
6726
6727 invocation_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
6728
6729 if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
6730 /* gl_InvocationID is just the thread number */
6731 bld.SHR(invocation_id, t, elk_imm_ud(instance_id_shift));
6732 return;
6733 }
6734
6735 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
6736
6737 elk_fs_reg channels_uw = bld.vgrf(ELK_REGISTER_TYPE_UW);
6738 elk_fs_reg channels_ud = bld.vgrf(ELK_REGISTER_TYPE_UD);
6739 bld.MOV(channels_uw, elk_fs_reg(elk_imm_uv(0x76543210)));
6740 bld.MOV(channels_ud, channels_uw);
6741
6742 if (tcs_prog_data->instances == 1) {
6743 invocation_id = channels_ud;
6744 } else {
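      /* Each instance covers 8 channels, so gl_InvocationID is
       * instance * 8 + channel.  Shifting the instance field right by
       * (instance_id_shift - 3) yields instance * 8 in a single step.
       */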
6745 elk_fs_reg instance_times_8 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6746 bld.SHR(instance_times_8, t, elk_imm_ud(instance_id_shift - 3));
6747 bld.ADD(invocation_id, instance_times_8, channels_ud);
6748 }
6749 }
6750
6751 void
6752 elk_fs_visitor::emit_tcs_thread_end()
6753 {
6754 /* Try and tag the last URB write with EOT instead of emitting a whole
6755 * separate write just to finish the thread. There isn't guaranteed to
6756 * be one, so this may not succeed.
6757 */
6758 if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
6759 return;
6760
6761 const fs_builder bld = fs_builder(this).at_end();
6762
6763 /* Emit a URB write to end the thread. On Broadwell, we use this to write
6764 * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
6765 * algorithm to set it optimally). On other platforms, we simply write
6766 * zero to a reserved/MBZ patch header DWord which has no consequence.
6767 */
6768 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
6769 srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
6770 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = elk_imm_ud(WRITEMASK_X << 16);
6771 srcs[URB_LOGICAL_SRC_DATA] = elk_imm_ud(0);
6772 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
6773 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL,
6774 reg_undef, srcs, ARRAY_SIZE(srcs));
6775 inst->eot = true;
6776 }
6777
6778 bool
6779 elk_fs_visitor::run_tcs()
6780 {
6781 assert(stage == MESA_SHADER_TESS_CTRL);
6782
6783 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
6784 const fs_builder bld = fs_builder(this).at_end();
6785
6786 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
6787 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
6788
6789 payload_ = new elk_tcs_thread_payload(*this);
6790
6791 /* Initialize gl_InvocationID */
6792 set_tcs_invocation_id();
6793
6794 const bool fix_dispatch_mask =
6795 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
6796 (nir->info.tess.tcs_vertices_out % 8) != 0;
6797
6798    /* Fix the dispatch mask */
6799 if (fix_dispatch_mask) {
6800 bld.CMP(bld.null_reg_ud(), invocation_id,
6801 elk_imm_ud(nir->info.tess.tcs_vertices_out), ELK_CONDITIONAL_L);
6802 bld.IF(ELK_PREDICATE_NORMAL);
6803 }
6804
6805 nir_to_elk(this);
6806
6807 if (fix_dispatch_mask) {
6808 bld.emit(ELK_OPCODE_ENDIF);
6809 }
6810
6811 emit_tcs_thread_end();
6812
6813 if (failed)
6814 return false;
6815
6816 calculate_cfg();
6817
6818 optimize();
6819
6820 assign_curb_setup();
6821 assign_tcs_urb_setup();
6822
6823 fixup_3src_null_dest();
6824 emit_dummy_memory_fence_before_eot();
6825
6826 /* Wa_14015360517 */
6827 emit_dummy_mov_instruction();
6828
6829 allocate_registers(true /* allow_spilling */);
6830
6831 return !failed;
6832 }
6833
6834 bool
6835 elk_fs_visitor::run_tes()
6836 {
6837 assert(stage == MESA_SHADER_TESS_EVAL);
6838
6839 payload_ = new elk_tes_thread_payload(*this);
6840
6841 nir_to_elk(this);
6842
6843 if (failed)
6844 return false;
6845
6846 emit_urb_writes();
6847
6848 calculate_cfg();
6849
6850 optimize();
6851
6852 assign_curb_setup();
6853 assign_tes_urb_setup();
6854
6855 fixup_3src_null_dest();
6856 emit_dummy_memory_fence_before_eot();
6857
6858 /* Wa_14015360517 */
6859 emit_dummy_mov_instruction();
6860
6861 allocate_registers(true /* allow_spilling */);
6862
6863 return !failed;
6864 }
6865
6866 bool
6867 elk_fs_visitor::run_gs()
6868 {
6869 assert(stage == MESA_SHADER_GEOMETRY);
6870
6871 payload_ = new elk_gs_thread_payload(*this);
6872
6873 this->final_gs_vertex_count = vgrf(glsl_uint_type());
6874
6875 if (gs_compile->control_data_header_size_bits > 0) {
6876 /* Create a VGRF to store accumulated control data bits. */
6877 this->control_data_bits = vgrf(glsl_uint_type());
6878
6879 /* If we're outputting more than 32 control data bits, then EmitVertex()
6880 * will set control_data_bits to 0 after emitting the first vertex.
6881 * Otherwise, we need to initialize it to 0 here.
6882 */
6883 if (gs_compile->control_data_header_size_bits <= 32) {
6884 const fs_builder bld = fs_builder(this).at_end();
6885 const fs_builder abld = bld.annotate("initialize control data bits");
6886 abld.MOV(this->control_data_bits, elk_imm_ud(0u));
6887 }
6888 }
6889
6890 nir_to_elk(this);
6891
6892 emit_gs_thread_end();
6893
6894 if (failed)
6895 return false;
6896
6897 calculate_cfg();
6898
6899 optimize();
6900
6901 assign_curb_setup();
6902 assign_gs_urb_setup();
6903
6904 fixup_3src_null_dest();
6905 emit_dummy_memory_fence_before_eot();
6906
6907 /* Wa_14015360517 */
6908 emit_dummy_mov_instruction();
6909
6910 allocate_registers(true /* allow_spilling */);
6911
6912 return !failed;
6913 }
6914
6915 /* From the SKL PRM, Volume 16, Workarounds:
6916 *
6917 * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
6918 * only header phases (R0-R2)
6919 *
6920 * WA: Enable a non-header phase (e.g. push constant) when dispatch would
6921 * have been header only.
6922 *
6923 * Instead of enabling push constants one can alternatively enable one of the
6924 * inputs. Here one simply chooses "layer" which shouldn't impose much
6925 * overhead.
6926 */
6927 static void
6928 gfx9_ps_header_only_workaround(struct elk_wm_prog_data *wm_prog_data)
6929 {
6930 if (wm_prog_data->num_varying_inputs)
6931 return;
6932
6933 if (wm_prog_data->base.curb_read_length)
6934 return;
6935
6936 wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
6937 wm_prog_data->num_varying_inputs = 1;
6938
6939 elk_compute_urb_setup_index(wm_prog_data);
6940 }
6941
6942 bool
6943 elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
6944 {
6945 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(this->prog_data);
6946 elk_wm_prog_key *wm_key = (elk_wm_prog_key *) this->key;
6947 const fs_builder bld = fs_builder(this).at_end();
6948
6949 assert(stage == MESA_SHADER_FRAGMENT);
6950
6951 payload_ = new elk_fs_thread_payload(*this, source_depth_to_render_target,
6952 runtime_check_aads_emit);
6953
6954 if (do_rep_send) {
6955 assert(dispatch_width == 16);
6956 emit_repclear_shader();
6957 } else {
6958 if (nir->info.inputs_read > 0 ||
6959 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
6960 (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
6961 if (devinfo->ver < 6)
6962 emit_interpolation_setup_gfx4();
6963 else
6964 emit_interpolation_setup_gfx6();
6965 }
6966
6967 /* We handle discards by keeping track of the still-live pixels in f0.1.
6968 * Initialize it with the dispatched pixels.
6969 */
6970 if (wm_prog_data->uses_kill) {
6971 const unsigned lower_width = MIN2(dispatch_width, 16);
6972 for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
6973 /* According to the "PS Thread Payload for Normal
6974 * Dispatch" pages on the BSpec, the dispatch mask is
6975 * stored in R0.15/R1.15 on gfx20+ and in R1.7/R2.7 on
6976 * gfx6+.
6977 */
6978 const elk_fs_reg dispatch_mask =
6979 devinfo->ver >= 20 ? xe2_vec1_grf(i, 15) :
6980 devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) :
6981 elk_vec1_grf(0, 0);
6982 bld.exec_all().group(1, 0)
6983 .MOV(elk_sample_mask_reg(bld.group(lower_width, i)),
6984 retype(dispatch_mask, ELK_REGISTER_TYPE_UW));
6985 }
6986 }
6987
6988 if (nir->info.writes_memory)
6989 wm_prog_data->has_side_effects = true;
6990
6991 nir_to_elk(this);
6992
6993 if (failed)
6994 return false;
6995
6996 if (wm_key->emit_alpha_test)
6997 emit_alpha_test();
6998
6999 emit_fb_writes();
7000
7001 calculate_cfg();
7002
7003 optimize();
7004
7005 assign_curb_setup();
7006
7007 if (devinfo->ver == 9)
7008 gfx9_ps_header_only_workaround(wm_prog_data);
7009
7010 assign_urb_setup();
7011
7012 fixup_3src_null_dest();
7013 emit_dummy_memory_fence_before_eot();
7014
7015 /* Wa_14015360517 */
7016 emit_dummy_mov_instruction();
7017
7018 allocate_registers(allow_spilling);
7019 }
7020
7021 return !failed;
7022 }
7023
7024 bool
7025 elk_fs_visitor::run_cs(bool allow_spilling)
7026 {
7027 assert(gl_shader_stage_is_compute(stage));
7028 assert(devinfo->ver >= 7);
7029 const fs_builder bld = fs_builder(this).at_end();
7030
7031 payload_ = new elk_cs_thread_payload(*this);
7032
7033 if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
7034 /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
7035 const fs_builder abld = bld.exec_all().group(1, 0);
7036 abld.MOV(retype(elk_sr0_reg(1), ELK_REGISTER_TYPE_UW),
7037 suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW), 1));
7038 }
7039
7040 nir_to_elk(this);
7041
7042 if (failed)
7043 return false;
7044
7045 emit_cs_terminate();
7046
7047 calculate_cfg();
7048
7049 optimize();
7050
7051 assign_curb_setup();
7052
7053 fixup_3src_null_dest();
7054 emit_dummy_memory_fence_before_eot();
7055
7056 /* Wa_14015360517 */
7057 emit_dummy_mov_instruction();
7058
7059 allocate_registers(allow_spilling);
7060
7061 return !failed;
7062 }
7063
7064 static bool
7065 is_used_in_not_interp_frag_coord(nir_def *def)
7066 {
7067 nir_foreach_use_including_if(src, def) {
7068 if (nir_src_is_if(src))
7069 return true;
7070
7071 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
7072 return true;
7073
7074 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
7075 if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
7076 return true;
7077 }
7078
7079 return false;
7080 }
7081
7082 /**
7083 * Return a bitfield where bit n is set if barycentric interpolation mode n
7084 * (see enum elk_barycentric_mode) is needed by the fragment shader.
7085 *
7086 * We examine the load_barycentric intrinsics rather than looking at input
7087 * variables so that we catch interpolateAtCentroid() messages too, which
7088 * also need the ELK_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
7089 */
7090 static unsigned
7091 elk_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
7092 const nir_shader *shader)
7093 {
7094 unsigned barycentric_interp_modes = 0;
7095
7096 nir_foreach_function_impl(impl, shader) {
7097 nir_foreach_block(block, impl) {
7098 nir_foreach_instr(instr, block) {
7099 if (instr->type != nir_instr_type_intrinsic)
7100 continue;
7101
7102 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
7103 switch (intrin->intrinsic) {
7104 case nir_intrinsic_load_barycentric_pixel:
7105 case nir_intrinsic_load_barycentric_centroid:
7106 case nir_intrinsic_load_barycentric_sample:
7107 case nir_intrinsic_load_barycentric_at_sample:
7108 case nir_intrinsic_load_barycentric_at_offset:
7109 break;
7110 default:
7111 continue;
7112 }
7113
7114 /* Ignore WPOS; it doesn't require interpolation. */
7115 if (!is_used_in_not_interp_frag_coord(&intrin->def))
7116 continue;
7117
7118 nir_intrinsic_op bary_op = intrin->intrinsic;
7119 enum elk_barycentric_mode bary =
7120 elk_barycentric_mode(intrin);
7121
7122 barycentric_interp_modes |= 1 << bary;
7123
7124 if (devinfo->needs_unlit_centroid_workaround &&
7125 bary_op == nir_intrinsic_load_barycentric_centroid)
7126 barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
7127 }
7128 }
7129 }
7130
7131 return barycentric_interp_modes;
7132 }
7133
7134 static void
7135 elk_compute_flat_inputs(struct elk_wm_prog_data *prog_data,
7136 const nir_shader *shader)
7137 {
7138 prog_data->flat_inputs = 0;
7139
7140 nir_foreach_shader_in_variable(var, shader) {
7141 /* flat shading */
7142 if (var->data.interpolation != INTERP_MODE_FLAT)
7143 continue;
7144
7145 if (var->data.per_primitive)
7146 continue;
7147
7148 unsigned slots = glsl_count_attribute_slots(var->type, false);
7149 for (unsigned s = 0; s < slots; s++) {
7150 int input_index = prog_data->urb_setup[var->data.location + s];
7151
7152 if (input_index >= 0)
7153 prog_data->flat_inputs |= 1 << input_index;
7154 }
7155 }
7156 }
7157
7158 static uint8_t
7159 computed_depth_mode(const nir_shader *shader)
7160 {
7161 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
7162 switch (shader->info.fs.depth_layout) {
7163 case FRAG_DEPTH_LAYOUT_NONE:
7164 case FRAG_DEPTH_LAYOUT_ANY:
7165 return ELK_PSCDEPTH_ON;
7166 case FRAG_DEPTH_LAYOUT_GREATER:
7167 return ELK_PSCDEPTH_ON_GE;
7168 case FRAG_DEPTH_LAYOUT_LESS:
7169 return ELK_PSCDEPTH_ON_LE;
7170 case FRAG_DEPTH_LAYOUT_UNCHANGED:
7171 /* We initially set this to OFF, but having the shader write the
7172 * depth means we allocate register space in the SEND message. The
7173 * difference between the SEND register count and the OFF state
7174 * programming makes the HW hang.
7175 *
7176 * Removing the depth writes also leads to test failures. So use
7177 * LesserThanOrEqual, which fits writing the same value
7178 * (unchanged/equal).
7179 *
7180 */
7181 return ELK_PSCDEPTH_ON_LE;
7182 }
7183 }
7184 return ELK_PSCDEPTH_OFF;
7185 }
7186
7187 /**
7188 * Move load_interpolated_input with simple (payload-based) barycentric modes
7189 * to the top of the program so we don't emit multiple PLNs for the same input.
7190 *
7191 * This works around CSE not being able to handle non-dominating cases
7192 * such as:
7193 *
7194 * if (...) {
7195 * interpolate input
7196 * } else {
7197 * interpolate the same exact input
7198 * }
7199 *
7200 * This should be replaced by global value numbering someday.
7201 */
7202 bool
7203 elk_nir_move_interpolation_to_top(nir_shader *nir)
7204 {
7205 bool progress = false;
7206
7207 nir_foreach_function_impl(impl, nir) {
7208 nir_block *top = nir_start_block(impl);
7209 nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
7210 bool impl_progress = false;
7211
7212 for (nir_block *block = nir_block_cf_tree_next(top);
7213 block != NULL;
7214 block = nir_block_cf_tree_next(block)) {
7215
7216 nir_foreach_instr_safe(instr, block) {
7217 if (instr->type != nir_instr_type_intrinsic)
7218 continue;
7219
7220 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
7221 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
7222 continue;
7223 nir_intrinsic_instr *bary_intrinsic =
7224 nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
7225 nir_intrinsic_op op = bary_intrinsic->intrinsic;
7226
7227 /* Leave interpolateAtSample/Offset() where they are. */
7228 if (op == nir_intrinsic_load_barycentric_at_sample ||
7229 op == nir_intrinsic_load_barycentric_at_offset)
7230 continue;
7231
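            /* Hoist the barycentric intrinsic, the offset source and the
             * load itself, in that (dependency) order, so the moved load
             * still has its sources defined above it.
             */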
7232 nir_instr *move[3] = {
7233 &bary_intrinsic->instr,
7234 intrin->src[1].ssa->parent_instr,
7235 instr
7236 };
7237
7238 for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
7239 if (move[i]->block != top) {
7240 nir_instr_move(cursor, move[i]);
7241 impl_progress = true;
7242 }
7243 }
7244 }
7245 }
7246
7247 progress = progress || impl_progress;
7248
7249 nir_metadata_preserve(impl, impl_progress ? (nir_metadata_block_index |
7250 nir_metadata_dominance)
7251 : nir_metadata_all);
7252 }
7253
7254 return progress;
7255 }
7256
7257 static void
7258 elk_nir_populate_wm_prog_data(nir_shader *shader,
7259 const struct intel_device_info *devinfo,
7260 const struct elk_wm_prog_key *key,
7261 struct elk_wm_prog_data *prog_data)
7262 {
7263 /* key->alpha_test_func means simulating alpha testing via discards,
7264 * so the shader definitely kills pixels.
7265 */
7266 prog_data->uses_kill = shader->info.fs.uses_discard ||
7267 shader->info.fs.uses_demote ||
7268 key->emit_alpha_test;
7269 prog_data->uses_omask = !key->ignore_sample_mask_out &&
7270 (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
7271 prog_data->color_outputs_written = key->color_outputs_valid;
7272 prog_data->max_polygons = 1;
7273 prog_data->computed_depth_mode = computed_depth_mode(shader);
7274 prog_data->computed_stencil =
7275 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
7276
7277 prog_data->sample_shading =
7278 shader->info.fs.uses_sample_shading ||
7279 shader->info.outputs_read;
7280
7281 assert(key->multisample_fbo != ELK_NEVER ||
7282 key->persample_interp == ELK_NEVER);
7283
7284 prog_data->persample_dispatch = key->persample_interp;
7285 if (prog_data->sample_shading)
7286 prog_data->persample_dispatch = ELK_ALWAYS;
7287
7288 /* We can only persample dispatch if we have a multisample FBO */
7289 prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
7290 key->multisample_fbo);
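   /* MIN2 acts as a clamp here, given the tri-state values order
    * NEVER < SOMETIMES < ALWAYS.
    */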
7291
7292 /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
7293 * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
7294 * to definitively tell whether alpha_to_coverage is on or off.
7295 */
7296 prog_data->alpha_to_coverage = key->alpha_to_coverage;
7297 assert(prog_data->alpha_to_coverage != ELK_SOMETIMES ||
7298 prog_data->persample_dispatch == ELK_SOMETIMES);
7299
7300 if (devinfo->ver >= 6) {
7301 prog_data->uses_sample_mask =
7302 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
7303
7304 /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
7305 *
7306 * "MSDISPMODE_PERSAMPLE is required in order to select
7307 * POSOFFSET_SAMPLE"
7308 *
7309 * So we can only really get sample positions if we are doing real
7310 * per-sample dispatch. If we need gl_SamplePosition and we don't have
7311 * persample dispatch, we hard-code it to 0.5.
7312 */
7313 prog_data->uses_pos_offset =
7314 prog_data->persample_dispatch != ELK_NEVER &&
7315 (BITSET_TEST(shader->info.system_values_read,
7316 SYSTEM_VALUE_SAMPLE_POS) ||
7317 BITSET_TEST(shader->info.system_values_read,
7318 SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
7319 }
7320
7321 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
7322 prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
7323 prog_data->inner_coverage = shader->info.fs.inner_coverage;
7324
7325 prog_data->barycentric_interp_modes =
7326 elk_compute_barycentric_interp_modes(devinfo, shader);
7327
7328 /* From the BDW PRM documentation for 3DSTATE_WM:
7329 *
7330 * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
7331 * Sample or Non- perspective Sample barycentric coordinates."
7332 *
7333    * So clean up any potentially set sample barycentric mode when not in
7334    * per-sample dispatch.
7335 */
7336 if (prog_data->persample_dispatch == ELK_NEVER) {
7337 prog_data->barycentric_interp_modes &=
7338 ~BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
7339 }
7340
7341 prog_data->uses_nonperspective_interp_modes |=
7342 (prog_data->barycentric_interp_modes &
7343 ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
7344
7345 /* The current VK_EXT_graphics_pipeline_library specification requires
7346    * coarse to be specified at compile time. But per-sample interpolation can be
7347 * dynamic. So we should never be in a situation where coarse &
7348 * persample_interp are both respectively true & ELK_ALWAYS.
7349 *
7350    * Coarse will be dynamically turned off when persample_interp is active.
7351 */
7352 assert(!key->coarse_pixel || key->persample_interp != ELK_ALWAYS);
7353
7354 prog_data->coarse_pixel_dispatch =
7355 elk_sometimes_invert(prog_data->persample_dispatch);
7356 if (!key->coarse_pixel ||
7357 prog_data->uses_omask ||
7358 prog_data->sample_shading ||
7359 prog_data->uses_sample_mask ||
7360 (prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF) ||
7361 prog_data->computed_stencil) {
7362 prog_data->coarse_pixel_dispatch = ELK_NEVER;
7363 }
7364
7365 /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
7366 * Message Descriptor :
7367 *
7368 * "Message Type. Specifies the type of message being sent when
7369 * pixel-rate evaluation is requested :
7370 *
7371 * Format = U2
7372 * 0: Per Message Offset (eval_snapped with immediate offset)
7373 * 1: Sample Position Offset (eval_sindex)
7374 * 2: Centroid Position Offset (eval_centroid)
7375 * 3: Per Slot Offset (eval_snapped with register offset)
7376 *
7377 * Message Type. Specifies the type of message being sent when
7378 * coarse-rate evaluation is requested :
7379 *
7380 * Format = U2
7381 * 0: Coarse to Pixel Mapping Message (internal message)
7382 * 1: Reserved
7383 * 2: Coarse Centroid Position (eval_centroid)
7384 * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
7385 *
7386 * The Sample Position Offset is marked as reserved for coarse rate
7387 * evaluation and leads to hangs if we try to use it. So disable coarse
7388 * pixel shading if we have any intrinsic that will result in a pixel
7389 * interpolater message at sample.
7390 */
7391 if (intel_nir_pulls_at_sample(shader))
7392 prog_data->coarse_pixel_dispatch = ELK_NEVER;
7393
7394 /* We choose to always enable VMask prior to XeHP, as it would cause
7395 * us to lose out on the eliminate_find_live_channel() optimization.
7396 */
7397 prog_data->uses_vmask = devinfo->verx10 < 125 ||
7398 shader->info.fs.needs_quad_helper_invocations ||
7399 shader->info.uses_wide_subgroup_intrinsics ||
7400 prog_data->coarse_pixel_dispatch != ELK_NEVER;
7401
7402 prog_data->uses_src_w =
7403 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
7404 prog_data->uses_src_depth =
7405 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
7406 prog_data->coarse_pixel_dispatch != ELK_ALWAYS;
7407 prog_data->uses_depth_w_coefficients =
7408 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
7409 prog_data->coarse_pixel_dispatch != ELK_NEVER;
7410
7411 calculate_urb_setup(devinfo, key, prog_data, shader);
7412 elk_compute_flat_inputs(prog_data, shader);
7413 }
7414
7415 /**
7416 * Pre-gfx6, the register file of the EUs was shared between threads,
7417 * and each thread used some subset allocated on a 16-register block
7418 * granularity. The unit states wanted these block counts.
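 * For example, a shader using 40 registers reports
 * ALIGN(40, 16) / 16 - 1 = 2.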
7419 */
7420 static inline int
7421 elk_register_blocks(int reg_count)
7422 {
7423 return ALIGN(reg_count, 16) / 16 - 1;
7424 }
7425
7426 const unsigned *
7427 elk_compile_fs(const struct elk_compiler *compiler,
7428 struct elk_compile_fs_params *params)
7429 {
7430 struct nir_shader *nir = params->base.nir;
7431 const struct elk_wm_prog_key *key = params->key;
7432 struct elk_wm_prog_data *prog_data = params->prog_data;
7433 bool allow_spilling = params->allow_spilling;
7434 const bool debug_enabled =
7435 elk_should_print_shader(nir, params->base.debug_flag ?
7436 params->base.debug_flag : DEBUG_WM);
7437
7438 prog_data->base.stage = MESA_SHADER_FRAGMENT;
7439 prog_data->base.ray_queries = nir->info.ray_queries;
7440 prog_data->base.total_scratch = 0;
7441
7442 const struct intel_device_info *devinfo = compiler->devinfo;
7443 const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
7444
7445 elk_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
7446 elk_nir_lower_fs_inputs(nir, devinfo, key);
7447 elk_nir_lower_fs_outputs(nir);
7448
7449 if (devinfo->ver < 6)
7450 elk_setup_vue_interpolation(params->vue_map, nir, prog_data);
7451
7452 /* From the SKL PRM, Volume 7, "Alpha Coverage":
7453 * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
7454 * hardware, regardless of the state setting for this feature."
7455 */
7456 if (devinfo->ver > 6 && key->alpha_to_coverage != ELK_NEVER) {
7457 /* Run constant fold optimization in order to get the correct source
7458 * offset to determine render target 0 store instruction in
7459 * emit_alpha_to_coverage pass.
7460 */
7461 NIR_PASS(_, nir, nir_opt_constant_folding);
7462 NIR_PASS(_, nir, elk_nir_lower_alpha_to_coverage, key, prog_data);
7463 }
7464
7465 NIR_PASS(_, nir, elk_nir_move_interpolation_to_top);
7466 elk_postprocess_nir(nir, compiler, debug_enabled,
7467 key->base.robust_flags);
7468
7469 elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
7470
7471 std::unique_ptr<elk_fs_visitor> v8, v16, v32, vmulti;
7472 elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL,
7473 *multi_cfg = NULL;
7474 float throughput = 0;
7475 bool has_spilled = false;
7476
7477 if (devinfo->ver < 20) {
7478       v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7479 prog_data, nir, 8, 1,
7480 params->base.stats != NULL,
7481 debug_enabled);
7482 if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
7483 params->base.error_str = ralloc_strdup(params->base.mem_ctx,
7484 v8->fail_msg);
7485 return NULL;
7486 } else if (INTEL_SIMD(FS, 8)) {
7487 simd8_cfg = v8->cfg;
7488
7489 assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
7490 prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
7491
7492 prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
7493 const performance &perf = v8->performance_analysis.require();
7494 throughput = MAX2(throughput, perf.throughput);
7495 has_spilled = v8->spilled_any_registers;
7496 allow_spilling = false;
7497 }
7498 }
7499
7500 /* Limit dispatch width to simd8 with dual source blending on gfx8.
7501 * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
7502 */
7503 if (devinfo->ver == 8 && prog_data->dual_src_blend &&
7504 INTEL_SIMD(FS, 8)) {
7505 assert(!params->use_rep_send);
7506 v8->limit_dispatch_width(8, "gfx8 workaround: "
7507 "using SIMD8 when dual src blending.\n");
7508 }
7509
7510 if (key->coarse_pixel && devinfo->ver < 20) {
7511 if (prog_data->dual_src_blend) {
7512 v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
7513 " use SIMD8 messages.\n");
7514 }
7515 v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
7516 " pixel shading.\n");
7517 }
7518
7519 if (nir->info.ray_queries > 0 && v8)
7520 v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n");
7521
7522 if (!has_spilled &&
7523 (!v8 || v8->max_dispatch_width >= 16) &&
7524 (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
7525 /* Try a SIMD16 compile */
7526       v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7527 prog_data, nir, 16, 1,
7528 params->base.stats != NULL,
7529 debug_enabled);
7530 if (v8)
7531 v16->import_uniforms(v8.get());
7532 if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
7533 elk_shader_perf_log(compiler, params->base.log_data,
7534 "SIMD16 shader failed to compile: %s\n",
7535 v16->fail_msg);
7536 } else {
7537 simd16_cfg = v16->cfg;
7538
7539 assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
7540 prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
7541
7542 prog_data->reg_blocks_16 = elk_register_blocks(v16->grf_used);
7543 const performance &perf = v16->performance_analysis.require();
7544 throughput = MAX2(throughput, perf.throughput);
7545 has_spilled = v16->spilled_any_registers;
7546 allow_spilling = false;
7547 }
7548 }
7549
7550 const bool simd16_failed = v16 && !simd16_cfg;
7551
7552 /* Currently, the compiler only supports SIMD32 on SNB+ */
7553 if (!has_spilled &&
7554 (!v8 || v8->max_dispatch_width >= 32) &&
7555 (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
7556 devinfo->ver >= 6 && !simd16_failed &&
7557 INTEL_SIMD(FS, 32)) {
7558 /* Try a SIMD32 compile */
7559       v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7560 prog_data, nir, 32, 1,
7561 params->base.stats != NULL,
7562 debug_enabled);
7563 if (v8)
7564 v32->import_uniforms(v8.get());
7565 else if (v16)
7566 v32->import_uniforms(v16.get());
7567
7568 if (!v32->run_fs(allow_spilling, false)) {
7569 elk_shader_perf_log(compiler, params->base.log_data,
7570 "SIMD32 shader failed to compile: %s\n",
7571 v32->fail_msg);
7572 } else {
7573 const performance &perf = v32->performance_analysis.require();
7574
7575 if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
7576 elk_shader_perf_log(compiler, params->base.log_data,
7577 "SIMD32 shader inefficient\n");
7578 } else {
7579 simd32_cfg = v32->cfg;
7580
7581 assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
7582 prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
7583
7584 prog_data->reg_blocks_32 = elk_register_blocks(v32->grf_used);
7585 throughput = MAX2(throughput, perf.throughput);
7586 }
7587 }
7588 }
7589
7590 if (devinfo->ver >= 12 && !has_spilled &&
7591 params->max_polygons >= 2 && !key->coarse_pixel) {
7592 elk_fs_visitor *vbase = v8 ? v8.get() : v16 ? v16.get() : v32.get();
7593 assert(vbase);
7594
7595 if (devinfo->ver >= 20 &&
7596 params->max_polygons >= 4 &&
7597 vbase->max_dispatch_width >= 32 &&
7598 4 * prog_data->num_varying_inputs <= MAX_VARYING &&
7599 INTEL_SIMD(FS, 4X8)) {
7600 /* Try a quad-SIMD8 compile */
7601          vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7602 prog_data, nir, 32, 4,
7603 params->base.stats != NULL,
7604 debug_enabled);
7605 vmulti->import_uniforms(vbase);
7606 if (!vmulti->run_fs(false, params->use_rep_send)) {
7607 elk_shader_perf_log(compiler, params->base.log_data,
7608 "Quad-SIMD8 shader failed to compile: %s\n",
7609 vmulti->fail_msg);
7610 } else {
7611 multi_cfg = vmulti->cfg;
7612 assert(!vmulti->spilled_any_registers);
7613 }
7614 }
7615
7616 if (!multi_cfg && devinfo->ver >= 20 &&
7617 vbase->max_dispatch_width >= 32 &&
7618 2 * prog_data->num_varying_inputs <= MAX_VARYING &&
7619 INTEL_SIMD(FS, 2X16)) {
7620 /* Try a dual-SIMD16 compile */
7621          vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7622 prog_data, nir, 32, 2,
7623 params->base.stats != NULL,
7624 debug_enabled);
7625 vmulti->import_uniforms(vbase);
7626 if (!vmulti->run_fs(false, params->use_rep_send)) {
7627 elk_shader_perf_log(compiler, params->base.log_data,
7628 "Dual-SIMD16 shader failed to compile: %s\n",
7629 vmulti->fail_msg);
7630 } else {
7631 multi_cfg = vmulti->cfg;
7632 assert(!vmulti->spilled_any_registers);
7633 }
7634 }
7635
7636 if (!multi_cfg && vbase->max_dispatch_width >= 16 &&
7637 2 * prog_data->num_varying_inputs <= MAX_VARYING &&
7638 INTEL_SIMD(FS, 2X8)) {
7639 /* Try a dual-SIMD8 compile */
7640          vmulti = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
7641 prog_data, nir, 16, 2,
7642 params->base.stats != NULL,
7643 debug_enabled);
7644 vmulti->import_uniforms(vbase);
7645 if (!vmulti->run_fs(allow_spilling, params->use_rep_send)) {
7646 elk_shader_perf_log(compiler, params->base.log_data,
7647 "Dual-SIMD8 shader failed to compile: %s\n",
7648 vmulti->fail_msg);
7649 } else {
7650 multi_cfg = vmulti->cfg;
7651 }
7652 }
7653
7654 if (multi_cfg) {
7655 assert(vmulti->payload().num_regs % reg_unit(devinfo) == 0);
7656 prog_data->base.dispatch_grf_start_reg = vmulti->payload().num_regs / reg_unit(devinfo);
7657
7658 prog_data->reg_blocks_8 = elk_register_blocks(vmulti->grf_used);
7659 }
7660 }
7661
7662 /* When the caller requests a repclear shader, they want SIMD16-only */
7663 if (params->use_rep_send)
7664 simd8_cfg = NULL;
7665
7666 /* Prior to Iron Lake, the PS had a single shader offset with a jump table
7667 * at the top to select the shader. We've never implemented that.
7668 * Instead, we just give them exactly one shader and we pick the widest one
7669 * available.
7670 */
7671 if (compiler->devinfo->ver < 5) {
7672 if (simd32_cfg || simd16_cfg)
7673 simd8_cfg = NULL;
7674 if (simd32_cfg)
7675 simd16_cfg = NULL;
7676 }
7677
7678 /* If computed depth is enabled SNB only allows SIMD8. */
7679 if (compiler->devinfo->ver == 6 &&
7680 prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF)
7681 assert(simd16_cfg == NULL && simd32_cfg == NULL);
7682
7683 if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
7684       /* Iron Lake and earlier only have one Dispatch GRF start field. Make
7685 * the data available in the base prog data struct for convenience.
7686 */
7687 if (simd16_cfg) {
7688 prog_data->base.dispatch_grf_start_reg =
7689 prog_data->dispatch_grf_start_reg_16;
7690 } else if (simd32_cfg) {
7691 prog_data->base.dispatch_grf_start_reg =
7692 prog_data->dispatch_grf_start_reg_32;
7693 }
7694 }
7695
7696    elk_fs_generator g(compiler, &params->base, &prog_data->base,
7697 v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
7698
7699 if (unlikely(debug_enabled)) {
7700 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
7701 "%s fragment shader %s",
7702 nir->info.label ?
7703 nir->info.label : "unnamed",
7704 nir->info.name));
7705 }
7706
7707 struct elk_compile_stats *stats = params->base.stats;
7708 uint32_t max_dispatch_width = 0;
7709
7710 if (multi_cfg) {
7711 prog_data->dispatch_multi = vmulti->dispatch_width;
7712 prog_data->max_polygons = vmulti->max_polygons;
7713 g.generate_code(multi_cfg, vmulti->dispatch_width, vmulti->shader_stats,
7714 vmulti->performance_analysis.require(),
7715 stats, vmulti->max_polygons);
7716 stats = stats ? stats + 1 : NULL;
7717 max_dispatch_width = vmulti->dispatch_width;
7718
7719 } else if (simd8_cfg) {
7720 prog_data->dispatch_8 = true;
7721 g.generate_code(simd8_cfg, 8, v8->shader_stats,
7722 v8->performance_analysis.require(), stats, 1);
7723 stats = stats ? stats + 1 : NULL;
7724 max_dispatch_width = 8;
7725 }
7726
7727 if (simd16_cfg) {
7728 prog_data->dispatch_16 = true;
7729 prog_data->prog_offset_16 = g.generate_code(
7730 simd16_cfg, 16, v16->shader_stats,
7731 v16->performance_analysis.require(), stats, 1);
7732 stats = stats ? stats + 1 : NULL;
7733 max_dispatch_width = 16;
7734 }
7735
7736 if (simd32_cfg) {
7737 prog_data->dispatch_32 = true;
7738 prog_data->prog_offset_32 = g.generate_code(
7739 simd32_cfg, 32, v32->shader_stats,
7740 v32->performance_analysis.require(), stats, 1);
7741 stats = stats ? stats + 1 : NULL;
7742 max_dispatch_width = 32;
7743 }
7744
7745 for (struct elk_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
7746 s->max_dispatch_width = max_dispatch_width;
7747
7748 g.add_const_data(nir->constant_data, nir->constant_data_size);
7749 return g.get_assembly();
7750 }
7751
7752 unsigned
7753 elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
7754 unsigned threads)
7755 {
7756 assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
7757 assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
7758 return cs_prog_data->push.per_thread.size * threads +
7759 cs_prog_data->push.cross_thread.size;
7760 }
7761
7762 static void
7763 fill_push_const_block_info(struct elk_push_const_block *block, unsigned dwords)
7764 {
7765 block->dwords = dwords;
7766 block->regs = DIV_ROUND_UP(dwords, 8);
7767 block->size = block->regs * 32;
7768 }
7769
7770 static void
7771 cs_fill_push_const_info(const struct intel_device_info *devinfo,
7772 struct elk_cs_prog_data *cs_prog_data)
7773 {
7774 const struct elk_stage_prog_data *prog_data = &cs_prog_data->base;
7775 int subgroup_id_index = elk_get_subgroup_id_param_index(devinfo, prog_data);
7776 bool cross_thread_supported = devinfo->verx10 >= 75;
7777
7778 /* The thread ID should be stored in the last param dword */
7779 assert(subgroup_id_index == -1 ||
7780 subgroup_id_index == (int)prog_data->nr_params - 1);
7781
7782 unsigned cross_thread_dwords, per_thread_dwords;
7783 if (!cross_thread_supported) {
7784 cross_thread_dwords = 0u;
7785 per_thread_dwords = prog_data->nr_params;
7786 } else if (subgroup_id_index >= 0) {
7787 /* Fill all but the last register with cross-thread payload */
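      /* For example, nr_params == 10 with the subgroup ID at index 9 yields
       * 8 cross-thread dwords and 2 per-thread dwords.
       */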
7788 cross_thread_dwords = 8 * (subgroup_id_index / 8);
7789 per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
7790 assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
7791 } else {
7792 /* Fill all data using cross-thread payload */
7793 cross_thread_dwords = prog_data->nr_params;
7794 per_thread_dwords = 0u;
7795 }
7796
7797 fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
7798 fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
7799
7800 assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
7801 cs_prog_data->push.per_thread.size == 0);
7802 assert(cs_prog_data->push.cross_thread.dwords +
7803 cs_prog_data->push.per_thread.dwords ==
7804 prog_data->nr_params);
7805 }
7806
7807 static bool
7808 filter_simd(const nir_instr *instr, const void * /* options */)
7809 {
7810 if (instr->type != nir_instr_type_intrinsic)
7811 return false;
7812
7813 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
7814 case nir_intrinsic_load_simd_width_intel:
7815 case nir_intrinsic_load_subgroup_id:
7816 return true;
7817
7818 default:
7819 return false;
7820 }
7821 }
7822
7823 static nir_def *
7824 lower_simd(nir_builder *b, nir_instr *instr, void *options)
7825 {
7826 uintptr_t simd_width = (uintptr_t)options;
7827
7828 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
7829 case nir_intrinsic_load_simd_width_intel:
7830 return nir_imm_int(b, simd_width);
7831
7832 case nir_intrinsic_load_subgroup_id:
7833 /* If the whole workgroup fits in one thread, we can lower subgroup_id
7834 * to a constant zero.
7835 */
7836 if (!b->shader->info.workgroup_size_variable) {
7837 unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
7838 b->shader->info.workgroup_size[1] *
7839 b->shader->info.workgroup_size[2];
7840 if (local_workgroup_size <= simd_width)
7841 return nir_imm_int(b, 0);
7842 }
7843 return NULL;
7844
7845 default:
7846 return NULL;
7847 }
7848 }
7849
7850 bool
7851 elk_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
7852 {
7853 return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
7854 (void *)(uintptr_t)dispatch_width);
7855 }
7856
7857 const unsigned *
7858 elk_compile_cs(const struct elk_compiler *compiler,
7859 struct elk_compile_cs_params *params)
7860 {
7861 const nir_shader *nir = params->base.nir;
7862 const struct elk_cs_prog_key *key = params->key;
7863 struct elk_cs_prog_data *prog_data = params->prog_data;
7864
7865 const bool debug_enabled =
7866 elk_should_print_shader(nir, params->base.debug_flag ?
7867 params->base.debug_flag : DEBUG_CS);
7868
7869 prog_data->base.stage = MESA_SHADER_COMPUTE;
7870 prog_data->base.total_shared = nir->info.shared_size;
7871 prog_data->base.ray_queries = nir->info.ray_queries;
7872 prog_data->base.total_scratch = 0;
7873
7874 if (!nir->info.workgroup_size_variable) {
7875 prog_data->local_size[0] = nir->info.workgroup_size[0];
7876 prog_data->local_size[1] = nir->info.workgroup_size[1];
7877 prog_data->local_size[2] = nir->info.workgroup_size[2];
7878 }
7879
7880 elk_simd_selection_state simd_state{
7881 .devinfo = compiler->devinfo,
7882 .prog_data = prog_data,
7883 .required_width = elk_required_dispatch_width(&nir->info),
7884 };
7885
7886 std::unique_ptr<elk_fs_visitor> v[3];
7887
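   /* Indices 0/1/2 correspond to SIMD8/SIMD16/SIMD32;
    * elk_simd_should_compile() decides which of those widths are worth
    * attempting for this workgroup.
    */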
7888 for (unsigned simd = 0; simd < 3; simd++) {
7889 if (!elk_simd_should_compile(simd_state, simd))
7890 continue;
7891
7892 const unsigned dispatch_width = 8u << simd;
7893
7894 nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
7895 elk_nir_apply_key(shader, compiler, &key->base,
7896 dispatch_width);
7897
7898 NIR_PASS(_, shader, elk_nir_lower_simd, dispatch_width);
7899
7900 /* Clean up after the local index and ID calculations. */
7901 NIR_PASS(_, shader, nir_opt_constant_folding);
7902 NIR_PASS(_, shader, nir_opt_dce);
7903
7904 elk_postprocess_nir(shader, compiler, debug_enabled,
7905 key->base.robust_flags);
7906
7907       v[simd] = std::make_unique<elk_fs_visitor>(compiler, &params->base,
7908 &key->base,
7909 &prog_data->base,
7910 shader, dispatch_width,
7911 params->base.stats != NULL,
7912 debug_enabled);
7913
7914 const int first = elk_simd_first_compiled(simd_state);
7915 if (first >= 0)
7916 v[simd]->import_uniforms(v[first].get());
7917
7918 const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
7919
7920 if (v[simd]->run_cs(allow_spilling)) {
7921 cs_fill_push_const_info(compiler->devinfo, prog_data);
7922
7923 elk_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
7924 } else {
7925 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
7926 if (simd > 0) {
7927 elk_shader_perf_log(compiler, params->base.log_data,
7928 "SIMD%u shader failed to compile: %s\n",
7929 dispatch_width, v[simd]->fail_msg);
7930 }
7931 }
7932 }
7933
7934 const int selected_simd = elk_simd_select(simd_state);
7935 if (selected_simd < 0) {
7936 params->base.error_str =
7937 ralloc_asprintf(params->base.mem_ctx,
7938 "Can't compile shader: "
7939 "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
7940 simd_state.error[0], simd_state.error[1],
7941 simd_state.error[2]);
7942 return NULL;
7943 }
7944
7945 assert(selected_simd < 3);
7946 elk_fs_visitor *selected = v[selected_simd].get();
7947
7948 if (!nir->info.workgroup_size_variable)
7949 prog_data->prog_mask = 1 << selected_simd;
7950
7951    elk_fs_generator g(compiler, &params->base, &prog_data->base,
7952 selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
7953 if (unlikely(debug_enabled)) {
7954 char *name = ralloc_asprintf(params->base.mem_ctx,
7955 "%s compute shader %s",
7956 nir->info.label ?
7957 nir->info.label : "unnamed",
7958 nir->info.name);
7959 g.enable_debug(name);
7960 }
7961
7962 uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
7963
7964 struct elk_compile_stats *stats = params->base.stats;
7965 for (unsigned simd = 0; simd < 3; simd++) {
7966 if (prog_data->prog_mask & (1u << simd)) {
7967 assert(v[simd]);
7968 prog_data->prog_offset[simd] =
7969 g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
7970 v[simd]->performance_analysis.require(), stats);
7971 if (stats)
7972 stats->max_dispatch_width = max_dispatch_width;
7973 stats = stats ? stats + 1 : NULL;
7974 max_dispatch_width = 8u << simd;
7975 }
7976 }
7977
7978 g.add_const_data(nir->constant_data, nir->constant_data_size);
7979
7980 return g.get_assembly();
7981 }
7982
7983 struct intel_cs_dispatch_info
7984 elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
7985 const struct elk_cs_prog_data *prog_data,
7986 const unsigned *override_local_size)
7987 {
7988 struct intel_cs_dispatch_info info = {};
7989
7990 const unsigned *sizes =
7991 override_local_size ? override_local_size :
7992 prog_data->local_size;
7993
7994 const int simd = elk_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
7995 assert(simd >= 0 && simd < 3);
7996
7997 info.group_size = sizes[0] * sizes[1] * sizes[2];
7998 info.simd_size = 8u << simd;
7999 info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
8000
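   /* Execution mask for the last, possibly partial, thread: for example a
    * group of 20 invocations at SIMD16 leaves a remainder of 4, so
    * right_mask = 0xf; with no remainder the full SIMD-width mask is used.
    */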
8001 const uint32_t remainder = info.group_size & (info.simd_size - 1);
8002 if (remainder > 0)
8003 info.right_mask = ~0u >> (32 - remainder);
8004 else
8005 info.right_mask = ~0u >> (32 - info.simd_size);
8006
8007 return info;
8008 }
8009
8010 uint64_t
8011 elk_bsr(const struct intel_device_info *devinfo,
8012 uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
8013 {
8014 assert(offset % 64 == 0);
8015 assert(simd_size == 8 || simd_size == 16);
8016 assert(local_arg_offset % 8 == 0);
8017
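   /* offset is 64B-aligned, so its low bits are free to carry the extra
    * fields: bit 4 flags SIMD8 dispatch and bits 2:0 hold the local argument
    * offset in 8-byte units.  For example, elk_bsr(devinfo, 128, 16, 8)
    * returns 128 | 0 | 1 == 129.
    */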
8018 return offset |
8019 SET_BITS(simd_size == 8, 4, 4) |
8020 SET_BITS(local_arg_offset / 8, 2, 0);
8021 }
8022
8023 /**
8024 * Test the dispatch mask packing assumptions of
8025 * elk_stage_has_packed_dispatch(). Call this from e.g. the top of
8026 * elk_fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
8027 * executed with an unexpected dispatch mask.
8028 */
8029 static UNUSED void
8030 elk_fs_test_dispatch_packing(const fs_builder &bld)
8031 {
8032 const elk_fs_visitor *shader = static_cast<const elk_fs_visitor *>(bld.shader);
8033 const gl_shader_stage stage = shader->stage;
8034 const bool uses_vmask =
8035 stage == MESA_SHADER_FRAGMENT &&
8036 elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;
8037
8038 if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
8039 shader->max_polygons,
8040 shader->stage_prog_data)) {
8041 const fs_builder ubld = bld.exec_all().group(1, 0);
8042 const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);
8043 const elk_fs_reg mask = uses_vmask ? elk_vmask_reg() : elk_dmask_reg();
8044
8045 ubld.ADD(tmp, mask, elk_imm_ud(1));
8046 ubld.AND(tmp, mask, tmp);
8047
8048 /* This will loop forever if the dispatch mask doesn't have the expected
8049 * form '2^n-1', in which case tmp will be non-zero.
8050 */
8051 bld.emit(ELK_OPCODE_DO);
8052 bld.CMP(bld.null_reg_ud(), tmp, elk_imm_ud(0), ELK_CONDITIONAL_NZ);
8053 set_predicate(ELK_PREDICATE_NORMAL, bld.emit(ELK_OPCODE_WHILE));
8054 }
8055 }
8056
8057 unsigned
8058 elk_fs_visitor::workgroup_size() const
8059 {
8060 assert(gl_shader_stage_uses_workgroup(stage));
8061 const struct elk_cs_prog_data *cs = elk_cs_prog_data(prog_data);
8062 return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
8063 }
8064
8065 bool elk_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
8066 {
8067 return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
8068 }
8069
8070 namespace elk {
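/* Gather a thread payload register range into a VGRF.  For dispatch widths
 * above 16 the payload comes as two SIMD16 halves starting at regs[0] and
 * regs[1], which are stitched together per component with a LOAD_PAYLOAD;
 * otherwise the fixed GRF at regs[0] can be returned directly.
 */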
8071 elk_fs_reg
8072 fetch_payload_reg(const elk::fs_builder &bld, uint8_t regs[2],
8073 elk_reg_type type, unsigned n)
8074 {
8075 if (!regs[0])
8076 return elk_fs_reg();
8077
8078 if (bld.dispatch_width() > 16) {
8079 const elk_fs_reg tmp = bld.vgrf(type, n);
8080 const elk::fs_builder hbld = bld.exec_all().group(16, 0);
8081 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
8082 elk_fs_reg *const components = new elk_fs_reg[m * n];
8083
8084 for (unsigned c = 0; c < n; c++) {
8085 for (unsigned g = 0; g < m; g++)
8086 components[c * m + g] =
8087 offset(retype(elk_vec8_grf(regs[g], 0), type), hbld, c);
8088 }
8089
8090 hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
8091
8092 delete[] components;
8093 return tmp;
8094
8095 } else {
8096 return elk_fs_reg(retype(elk_vec8_grf(regs[0], 0), type));
8097 }
8098 }
8099
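/* Like fetch_payload_reg(), but for the pre-Xe2 barycentric payload layout,
 * where the X and Y coordinates of each SIMD8 half are interleaved
 * (X0, Y0, X1, Y1, ...) rather than stored as two contiguous planes.
 */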
8100 elk_fs_reg
8101 fetch_barycentric_reg(const elk::fs_builder &bld, uint8_t regs[2])
8102 {
8103 if (!regs[0])
8104 return elk_fs_reg();
8105 else if (bld.shader->devinfo->ver >= 20)
8106 return fetch_payload_reg(bld, regs, ELK_REGISTER_TYPE_F, 2);
8107
8108 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
8109 const elk::fs_builder hbld = bld.exec_all().group(8, 0);
8110 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
8111 elk_fs_reg *const components = new elk_fs_reg[2 * m];
8112
8113 for (unsigned c = 0; c < 2; c++) {
8114 for (unsigned g = 0; g < m; g++)
8115 components[c * m + g] = offset(elk_vec8_grf(regs[g / 2], 0),
8116 hbld, c + 2 * (g % 2));
8117 }
8118
8119 hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
8120
8121 delete[] components;
8122 return tmp;
8123 }
8124
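/* AND the dynamic MSAA flags against the given flag and set the NZ
 * conditional mod, so that a subsequent predicated instruction only executes
 * when the flag is set at run time.
 */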
8125 void
8126 check_dynamic_msaa_flag(const fs_builder &bld,
8127 const struct elk_wm_prog_data *wm_prog_data,
8128 enum intel_msaa_flags flag)
8129 {
8130 elk_fs_inst *inst = bld.AND(bld.null_reg_ud(),
8131 dynamic_msaa_flags(wm_prog_data),
8132 elk_imm_ud(flag));
8133 inst->conditional_mod = ELK_CONDITIONAL_NZ;
8134 }
8135 }
8136