1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file elk_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include "elk_eu.h"
32 #include "elk_fs.h"
33 #include "elk_fs_builder.h"
34 #include "elk_fs_live_variables.h"
35 #include "elk_nir.h"
36 #include "elk_vec4_gs_visitor.h"
37 #include "elk_cfg.h"
38 #include "elk_dead_control_flow.h"
39 #include "elk_private.h"
40 #include "../intel_nir.h"
41 #include "shader_enums.h"
42 #include "dev/intel_debug.h"
43 #include "dev/intel_wa.h"
44 #include "compiler/glsl_types.h"
45 #include "compiler/nir/nir_builder.h"
46 #include "util/u_math.h"
47
48 #include <memory>
49
50 using namespace elk;
51
52 static unsigned get_lowered_simd_width(const elk_fs_visitor *shader,
53 const elk_fs_inst *inst);
54
55 void
56 elk_fs_inst::init(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
57 const elk_fs_reg *src, unsigned sources)
58 {
59 memset((void*)this, 0, sizeof(*this));
60
61 this->src = new elk_fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69 this->base_mrf = -1;
70
71 assert(dst.file != IMM && dst.file != UNIFORM);
72
73 assert(this->exec_size != 0);
74
75 this->conditional_mod = ELK_CONDITIONAL_NONE;
76
77 /* This will be the case for almost all instructions. */
78 switch (dst.file) {
79 case VGRF:
80 case ARF:
81 case FIXED_GRF:
82 case MRF:
83 case ATTR:
84 this->size_written = dst.component_size(exec_size);
85 break;
86 case BAD_FILE:
87 this->size_written = 0;
88 break;
89 case IMM:
90 case UNIFORM:
91 unreachable("Invalid destination register file");
92 }
93
94 this->writes_accumulator = false;
95 }
96
97 elk_fs_inst::elk_fs_inst()
98 {
99 init(ELK_OPCODE_NOP, 8, dst, NULL, 0);
100 }
101
102 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size)
103 {
104 init(opcode, exec_size, reg_undef, NULL, 0);
105 }
106
107 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst)
108 {
109 init(opcode, exec_size, dst, NULL, 0);
110 }
111
112 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
113 const elk_fs_reg &src0)
114 {
115 const elk_fs_reg src[1] = { src0 };
116 init(opcode, exec_size, dst, src, 1);
117 }
118
119 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
120 const elk_fs_reg &src0, const elk_fs_reg &src1)
121 {
122 const elk_fs_reg src[2] = { src0, src1 };
123 init(opcode, exec_size, dst, src, 2);
124 }
125
126 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_size, const elk_fs_reg &dst,
127 const elk_fs_reg &src0, const elk_fs_reg &src1, const elk_fs_reg &src2)
128 {
129 const elk_fs_reg src[3] = { src0, src1, src2 };
130 init(opcode, exec_size, dst, src, 3);
131 }
132
133 elk_fs_inst::elk_fs_inst(enum elk_opcode opcode, uint8_t exec_width, const elk_fs_reg &dst,
134 const elk_fs_reg src[], unsigned sources)
135 {
136 init(opcode, exec_width, dst, src, sources);
137 }
138
139 elk_fs_inst::elk_fs_inst(const elk_fs_inst &that)
140 {
141 memcpy((void*)this, &that, sizeof(that));
142
143 this->src = new elk_fs_reg[MAX2(that.sources, 3)];
144
145 for (unsigned i = 0; i < that.sources; i++)
146 this->src[i] = that.src[i];
147 }
148
149 elk_fs_inst::~elk_fs_inst()
150 {
151 delete[] this->src;
152 }
153
154 void
155 elk_fs_inst::resize_sources(uint8_t num_sources)
156 {
157 if (this->sources != num_sources) {
158 elk_fs_reg *src = new elk_fs_reg[MAX2(num_sources, 3)];
159
160 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
161 src[i] = this->src[i];
162
163 delete[] this->src;
164 this->src = src;
165 this->sources = num_sources;
166 }
167 }
168
169 void
170 elk_fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
171 const elk_fs_reg &dst,
172 const elk_fs_reg &surface,
173 const elk_fs_reg &surface_handle,
174 const elk_fs_reg &varying_offset,
175 uint32_t const_offset,
176 uint8_t alignment,
177 unsigned components)
178 {
179 assert(components <= 4);
180
181 /* We have our constant surface use a pitch of 4 bytes, so our index can
182 * be any component of a vector, and then we load 4 contiguous
183 * components starting from that. TODO: Support loading fewer than 4.
184 */
185 elk_fs_reg total_offset = vgrf(glsl_uint_type());
186 bld.ADD(total_offset, varying_offset, elk_imm_ud(const_offset));
187
188 /* The pull load message will load a vec4 (16 bytes). If we are loading
189 * a double this means we are only loading 2 elements worth of data.
190 * We also want to use a 32-bit data type for the dst of the load operation
191 * so other parts of the driver don't get confused about the size of the
192 * result.
193 */
194 elk_fs_reg vec4_result = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
195
196 elk_fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
197 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
198 srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
199 srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset;
200 srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = elk_imm_ud(alignment);
201
202 elk_fs_inst *inst = bld.emit(ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
203 vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
204 inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
205
206 elk_shuffle_from_32bit_read(bld, dst, vec4_result, 0, components);
207 }
208
209 /**
210 * A helper for MOV generation for fixing up broken hardware SEND dependency
211 * handling.
212 */
213 void
214 elk_fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
215 {
216 /* The caller always wants uncompressed to emit the minimal extra
217 * dependencies, and to avoid having to deal with aligning its regs to 2.
218 */
219 const fs_builder ubld = bld.annotate("send dependency resolve")
220 .quarter(0);
221
222 ubld.MOV(ubld.null_reg_f(), elk_fs_reg(VGRF, grf, ELK_REGISTER_TYPE_F));
223 }
224
225 bool
226 elk_fs_inst::is_send_from_grf() const
227 {
228 switch (opcode) {
229 case ELK_SHADER_OPCODE_SEND:
230 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
231 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
232 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
233 case ELK_SHADER_OPCODE_INTERLOCK:
234 case ELK_SHADER_OPCODE_MEMORY_FENCE:
235 case ELK_SHADER_OPCODE_BARRIER:
236 return true;
237 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238 return src[1].file == VGRF;
239 case ELK_FS_OPCODE_FB_WRITE:
240 return src[0].file == VGRF;
241 default:
242 return false;
243 }
244 }
245
246 bool
247 elk_fs_inst::is_control_source(unsigned arg) const
248 {
249 switch (opcode) {
250 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
251 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
252 return arg == 0;
253
254 case ELK_SHADER_OPCODE_BROADCAST:
255 case ELK_SHADER_OPCODE_SHUFFLE:
256 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
257 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
258 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
259 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
260 return arg == 1;
261
262 case ELK_SHADER_OPCODE_MOV_INDIRECT:
263 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
264 case ELK_SHADER_OPCODE_TEX:
265 case ELK_FS_OPCODE_TXB:
266 case ELK_SHADER_OPCODE_TXD:
267 case ELK_SHADER_OPCODE_TXF:
268 case ELK_SHADER_OPCODE_TXF_LZ:
269 case ELK_SHADER_OPCODE_TXF_CMS:
270 case ELK_SHADER_OPCODE_TXF_CMS_W:
271 case ELK_SHADER_OPCODE_TXF_UMS:
272 case ELK_SHADER_OPCODE_TXF_MCS:
273 case ELK_SHADER_OPCODE_TXL:
274 case ELK_SHADER_OPCODE_TXL_LZ:
275 case ELK_SHADER_OPCODE_TXS:
276 case ELK_SHADER_OPCODE_LOD:
277 case ELK_SHADER_OPCODE_TG4:
278 case ELK_SHADER_OPCODE_TG4_OFFSET:
279 case ELK_SHADER_OPCODE_SAMPLEINFO:
280 return arg == 1 || arg == 2;
281
282 case ELK_SHADER_OPCODE_SEND:
283 return arg == 0;
284
285 default:
286 return false;
287 }
288 }
289
290 bool
291 elk_fs_inst::is_payload(unsigned arg) const
292 {
293 switch (opcode) {
294 case ELK_FS_OPCODE_FB_WRITE:
295 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
296 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
297 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
298 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
299 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
300 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
301 case ELK_SHADER_OPCODE_INTERLOCK:
302 case ELK_SHADER_OPCODE_MEMORY_FENCE:
303 case ELK_SHADER_OPCODE_BARRIER:
304 case ELK_SHADER_OPCODE_TEX:
305 case ELK_FS_OPCODE_TXB:
306 case ELK_SHADER_OPCODE_TXD:
307 case ELK_SHADER_OPCODE_TXF:
308 case ELK_SHADER_OPCODE_TXF_LZ:
309 case ELK_SHADER_OPCODE_TXF_CMS:
310 case ELK_SHADER_OPCODE_TXF_CMS_W:
311 case ELK_SHADER_OPCODE_TXF_UMS:
312 case ELK_SHADER_OPCODE_TXF_MCS:
313 case ELK_SHADER_OPCODE_TXL:
314 case ELK_SHADER_OPCODE_TXL_LZ:
315 case ELK_SHADER_OPCODE_TXS:
316 case ELK_SHADER_OPCODE_LOD:
317 case ELK_SHADER_OPCODE_TG4:
318 case ELK_SHADER_OPCODE_TG4_OFFSET:
319 case ELK_SHADER_OPCODE_SAMPLEINFO:
320 return arg == 0;
321
322 case ELK_SHADER_OPCODE_SEND:
323 return arg == 1;
324
325 default:
326 return false;
327 }
328 }
329
330 /**
331 * Returns true if this instruction's sources and destinations cannot
332 * safely be the same register.
333 *
334 * In most cases, a register can be written over safely by the same
335 * instruction that is its last use. For a single instruction, the
336 * sources are dereferenced before writing of the destination starts
337 * (naturally).
338 *
339 * However, there are a few cases where this can be problematic:
340 *
341 * - Virtual opcodes that translate to multiple instructions in the
342 * code generator: if src == dst and one instruction writes the
343 * destination before a later instruction reads the source, then
344 * src will have been clobbered.
345 *
346 * - SIMD16 compressed instructions with certain regioning (see below).
347 *
348 * The register allocator uses this information to set up conflicts between
349 * GRF sources and the destination.
350 */
351 bool
352 elk_fs_inst::has_source_and_destination_hazard() const
353 {
354 switch (opcode) {
355 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
356 /* Multiple partial writes to the destination */
357 return true;
358 case ELK_SHADER_OPCODE_SHUFFLE:
359 /* This instruction returns an arbitrary channel from the source and
360 * gets split into smaller instructions in the generator. It's possible
361 * that one of the instructions will read from a channel corresponding
362 * to an earlier instruction.
363 */
364 case ELK_SHADER_OPCODE_SEL_EXEC:
365 /* This is implemented as
366 *
367 * mov(16) g4<1>D 0D { align1 WE_all 1H };
368 * mov(16) g4<1>D g5<8,8,1>D { align1 1H }
369 *
370 * Because the source is only read in the second instruction, the first
371 * may stomp all over it.
372 */
373 return true;
374 case ELK_SHADER_OPCODE_QUAD_SWIZZLE:
375 switch (src[1].ud) {
376 case ELK_SWIZZLE_XXXX:
377 case ELK_SWIZZLE_YYYY:
378 case ELK_SWIZZLE_ZZZZ:
379 case ELK_SWIZZLE_WWWW:
380 case ELK_SWIZZLE_XXZZ:
381 case ELK_SWIZZLE_YYWW:
382 case ELK_SWIZZLE_XYXY:
383 case ELK_SWIZZLE_ZWZW:
384 /* These can be implemented as a single Align1 region on all
385 * platforms, so there's never a hazard between source and
386 * destination. C.f. elk_fs_generator::generate_quad_swizzle().
387 */
388 return false;
389 default:
390 return !is_uniform(src[0]);
391 }
392 default:
393 /* The SIMD16 compressed instruction
394 *
395 * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
396 *
397 * is actually decoded in hardware as:
398 *
399 * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
400 * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
401 *
402 * Which is safe. However, if we have uniform accesses
403 * happening, we get into trouble:
404 *
405 * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
406 * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
407 *
408 * Now our destination for the first instruction overwrote the
409 * second instruction's src0, and we get garbage for those 8
410 * pixels. There's a similar issue for the pre-gfx6
411 * pixel_x/pixel_y, which are registers of 16-bit values and thus
412 * would get stomped by the first decode as well.
413 */
414 if (exec_size == 16) {
415 for (int i = 0; i < sources; i++) {
416 if (src[i].file == VGRF && (src[i].stride == 0 ||
417 src[i].type == ELK_REGISTER_TYPE_UW ||
418 src[i].type == ELK_REGISTER_TYPE_W ||
419 src[i].type == ELK_REGISTER_TYPE_UB ||
420 src[i].type == ELK_REGISTER_TYPE_B)) {
421 return true;
422 }
423 }
424 }
425 return false;
426 }
427 }
428
429 bool
430 elk_fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
431 {
432 if (devinfo->ver == 6 && is_math())
433 return false;
434
435 if (is_send_from_grf())
436 return false;
437
438 return elk_backend_instruction::can_do_source_mods();
439 }
440
441 bool
442 elk_fs_inst::can_do_cmod()
443 {
444 if (!elk_backend_instruction::can_do_cmod())
445 return false;
446
447 /* The accumulator result appears to get used for the conditional modifier
448 * generation. When negating a UD value, there is a 33rd bit generated for
449 * the sign in the accumulator value, so now you can't check, for example,
450 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
451 */
452 for (unsigned i = 0; i < sources; i++) {
453 if (elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
454 return false;
455 }
456
457 return true;
458 }
459
460 bool
461 elk_fs_inst::can_change_types() const
462 {
463 return dst.type == src[0].type &&
464 !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
465 (opcode == ELK_OPCODE_MOV ||
466 (opcode == ELK_OPCODE_SEL &&
467 dst.type == src[1].type &&
468 predicate != ELK_PREDICATE_NONE &&
469 !src[1].abs && !src[1].negate && src[1].file != ATTR));
470 }
471
472 void
473 elk_fs_reg::init()
474 {
475 memset((void*)this, 0, sizeof(*this));
476 type = ELK_REGISTER_TYPE_UD;
477 stride = 1;
478 }
479
480 /** Generic unset register constructor. */
481 elk_fs_reg::elk_fs_reg()
482 {
483 init();
484 this->file = BAD_FILE;
485 }
486
487 elk_fs_reg::elk_fs_reg(struct ::elk_reg reg) :
488 elk_backend_reg(reg)
489 {
490 this->offset = 0;
491 this->stride = 1;
492 if (this->file == IMM &&
493 (this->type != ELK_REGISTER_TYPE_V &&
494 this->type != ELK_REGISTER_TYPE_UV &&
495 this->type != ELK_REGISTER_TYPE_VF)) {
496 this->stride = 0;
497 }
498 }
499
500 bool
501 elk_fs_reg::equals(const elk_fs_reg &r) const
502 {
503 return (this->elk_backend_reg::equals(r) &&
504 stride == r.stride);
505 }
506
507 bool
508 elk_fs_reg::negative_equals(const elk_fs_reg &r) const
509 {
510 return (this->elk_backend_reg::negative_equals(r) &&
511 stride == r.stride);
512 }
513
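/* Note on the ARF/FIXED_GRF case below (illustrative): hstride, vstride and
 * width hold the hardware encodings (roughly log2-based), so
 * "hstride == ELK_HORIZONTAL_STRIDE_1 && vstride == width + hstride" is the
 * encoded form of "unit horizontal stride with a vertical stride equal to
 * the row width", i.e. consecutive rows are packed back to back.
 */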
514 bool
515 elk_fs_reg::is_contiguous() const
516 {
517 switch (file) {
518 case ARF:
519 case FIXED_GRF:
520 return hstride == ELK_HORIZONTAL_STRIDE_1 &&
521 vstride == width + hstride;
522 case MRF:
523 case VGRF:
524 case ATTR:
525 return stride == 1;
526 case UNIFORM:
527 case IMM:
528 case BAD_FILE:
529 return true;
530 }
531
532 unreachable("Invalid register file");
533 }
534
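/* A worked example for the region case handled below (illustrative only):
 * a <8;8,1>:F fixed-GRF region (vstride encoding 4, width encoding 3,
 * hstride encoding 1) evaluated at width == 16 gives w = 8, h = 2, vs = 8,
 * hs = 1, so the component occupies
 * ((2 - 1) * 8 + (8 - 1) * 1 + 1) * 4 = 64 bytes, i.e. two full GRFs.
 */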
535 unsigned
536 elk_fs_reg::component_size(unsigned width) const
537 {
538 if (file == ARF || file == FIXED_GRF) {
539 const unsigned w = MIN2(width, 1u << this->width);
540 const unsigned h = width >> this->width;
541 const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
542 const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
543 assert(w > 0);
544 return ((MAX2(1, h) - 1) * vs + (w - 1) * hs + 1) * type_sz(type);
545 } else {
546 return MAX2(width * stride, 1) * type_sz(type);
547 }
548 }
549
550 void
551 elk_fs_visitor::vfail(const char *format, va_list va)
552 {
553 char *msg;
554
555 if (failed)
556 return;
557
558 failed = true;
559
560 msg = ralloc_vasprintf(mem_ctx, format, va);
561 msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
562 dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
563
564 this->fail_msg = msg;
565
566 if (unlikely(debug_enabled)) {
567 fprintf(stderr, "%s", msg);
568 }
569 }
570
571 void
572 elk_fs_visitor::fail(const char *format, ...)
573 {
574 va_list va;
575
576 va_start(va, format);
577 vfail(format, va);
578 va_end(va);
579 }
580
581 /**
582 * Mark this program as impossible to compile with dispatch width greater
583 * than n.
584 *
585 * During the SIMD8 compile (which happens first), we can detect and flag
586 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
587 * SIMD16+ compile altogether.
588 *
589 * During a compile of dispatch width greater than n (if one happens anyway),
590 * this just calls fail().
591 */
592 void
593 elk_fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
594 {
595 if (dispatch_width > n) {
596 fail("%s", msg);
597 } else {
598 max_dispatch_width = MIN2(max_dispatch_width, n);
599 elk_shader_perf_log(compiler, log_data,
600 "Shader dispatch width limited to SIMD%d: %s\n",
601 n, msg);
602 }
603 }
604
605 /**
606 * Returns true if the instruction has a flag that means it won't
607 * update an entire destination register.
608 *
609 * For example, dead code elimination and live variable analysis want to know
610 * when a write to a variable screens off any preceding values that were in
611 * it.
612 */
613 bool
614 elk_fs_inst::is_partial_write() const
615 {
616 if (this->predicate && !this->predicate_trivial &&
617 this->opcode != ELK_OPCODE_SEL)
618 return true;
619
620 if (this->dst.offset % REG_SIZE != 0)
621 return true;
622
623 /* SEND instructions always write whole registers */
624 if (this->opcode == ELK_SHADER_OPCODE_SEND)
625 return false;
626
627 /* Special case UNDEF since a lot of places in the backend do things like this:
628 *
629 * fs_builder ubld = bld.exec_all().group(1, 0);
630 * elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD);
631 * ubld.UNDEF(tmp); <- partial write, even if the whole register is concerned
632 */
633 if (this->opcode == ELK_SHADER_OPCODE_UNDEF) {
634 assert(this->dst.is_contiguous());
635 return this->size_written < 32;
636 }
637
638 return this->exec_size * type_sz(this->dst.type) < 32 ||
639 !this->dst.is_contiguous();
640 }
641
642 unsigned
643 elk_fs_inst::components_read(unsigned i) const
644 {
645 /* Return zero if the source is not present. */
646 if (src[i].file == BAD_FILE)
647 return 0;
648
649 switch (opcode) {
650 case ELK_FS_OPCODE_LINTERP:
651 if (i == 0)
652 return 2;
653 else
654 return 1;
655
656 case ELK_FS_OPCODE_PIXEL_X:
657 case ELK_FS_OPCODE_PIXEL_Y:
658 assert(i < 2);
659 if (i == 0)
660 return 2;
661 else
662 return 1;
663
664 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
665 assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
666 /* First/second FB write color. */
667 if (i < 2)
668 return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
669 else
670 return 1;
671
672 case ELK_SHADER_OPCODE_TEX_LOGICAL:
673 case ELK_SHADER_OPCODE_TXD_LOGICAL:
674 case ELK_SHADER_OPCODE_TXF_LOGICAL:
675 case ELK_SHADER_OPCODE_TXL_LOGICAL:
676 case ELK_SHADER_OPCODE_TXS_LOGICAL:
677 case ELK_SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
678 case ELK_FS_OPCODE_TXB_LOGICAL:
679 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
680 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
681 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
682 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
683 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
684 case ELK_SHADER_OPCODE_LOD_LOGICAL:
685 case ELK_SHADER_OPCODE_TG4_LOGICAL:
686 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
687 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
688 assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
689 src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
690 src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
691 /* Texture coordinates. */
692 if (i == TEX_LOGICAL_SRC_COORDINATE)
693 return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
694 /* Texture derivatives. */
695 else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
696 opcode == ELK_SHADER_OPCODE_TXD_LOGICAL)
697 return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
698 /* Texture offset. */
699 else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
700 return 2;
701 /* MCS */
702 else if (i == TEX_LOGICAL_SRC_MCS) {
703 if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL)
704 return 2;
705 else if (opcode == ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
706 return 4;
707 else
708 return 1;
709 } else
710 return 1;
711
712 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
713 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
714 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
715 /* Surface coordinates. */
716 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
717 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
718 /* Surface operation source (ignored for reads). */
719 else if (i == SURFACE_LOGICAL_SRC_DATA)
720 return 0;
721 else
722 return 1;
723
724 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
725 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
726 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
727 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
728 /* Surface coordinates. */
729 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
730 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
731 /* Surface operation source. */
732 else if (i == SURFACE_LOGICAL_SRC_DATA)
733 return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
734 else
735 return 1;
736
737 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
738 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
739 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
740 assert(src[A64_LOGICAL_ARG].file == IMM);
741 return 1;
742
743 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
744 assert(src[A64_LOGICAL_ARG].file == IMM);
745 if (i == A64_LOGICAL_SRC) { /* data to write */
746 const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
747 assert(comps > 0);
748 return comps;
749 } else {
750 return 1;
751 }
752
753 case ELK_SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
754 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
755 return 1;
756
757 case ELK_SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
758 assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
759 if (i == SURFACE_LOGICAL_SRC_DATA) {
760 const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
761 assert(comps > 0);
762 return comps;
763 } else {
764 return 1;
765 }
766
767 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
768 assert(src[A64_LOGICAL_ARG].file == IMM);
769 return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;
770
771 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
772 assert(src[A64_LOGICAL_ARG].file == IMM);
773 return i == A64_LOGICAL_SRC ?
774 lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;
775
776 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
777 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
778 /* Scattered logical opcodes use the following params:
779 * src[0] Surface coordinates
780 * src[1] Surface operation source (ignored for reads)
781 * src[2] Surface
782 * src[3] IMM with always 1 dimension.
783 * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
784 */
785 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
786 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
787 return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
788
789 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
790 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
791 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
792 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
793 return 1;
794
795 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
796 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
797 assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
798 src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
799 const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
800 /* Surface coordinates. */
801 if (i == SURFACE_LOGICAL_SRC_ADDRESS)
802 return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
803 /* Surface operation source. */
804 else if (i == SURFACE_LOGICAL_SRC_DATA)
805 return lsc_op_num_data_values(op);
806 else
807 return 1;
808 }
809 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
810 return (i == 0 ? 2 : 1);
811
812 case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
813 assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
814
815 if (i == URB_LOGICAL_SRC_DATA)
816 return src[URB_LOGICAL_SRC_COMPONENTS].ud;
817 else
818 return 1;
819
820 default:
821 return 1;
822 }
823 }
824
825 unsigned
826 elk_fs_inst::size_read(int arg) const
827 {
828 switch (opcode) {
829 case ELK_SHADER_OPCODE_SEND:
830 if (arg == 1) {
831 return mlen * REG_SIZE;
832 }
833 break;
834
835 case ELK_FS_OPCODE_FB_WRITE:
836 case ELK_FS_OPCODE_REP_FB_WRITE:
837 if (arg == 0) {
838 if (base_mrf >= 0)
839 return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
840 else
841 return mlen * REG_SIZE;
842 }
843 break;
844
845 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847 if (arg == 0)
848 return mlen * REG_SIZE;
849 break;
850
851 case ELK_FS_OPCODE_SET_SAMPLE_ID:
852 if (arg == 1)
853 return 1;
854 break;
855
856 case ELK_FS_OPCODE_LINTERP:
857 if (arg == 1)
858 return 16;
859 break;
860
861 case ELK_SHADER_OPCODE_LOAD_PAYLOAD:
862 if (arg < this->header_size)
863 return retype(src[arg], ELK_REGISTER_TYPE_UD).component_size(8);
864 break;
865
866 case ELK_CS_OPCODE_CS_TERMINATE:
867 case ELK_SHADER_OPCODE_BARRIER:
868 return REG_SIZE;
869
870 case ELK_SHADER_OPCODE_MOV_INDIRECT:
871 if (arg == 0) {
872 assert(src[2].file == IMM);
873 return src[2].ud;
874 }
875 break;
876
877 case ELK_SHADER_OPCODE_TEX:
878 case ELK_FS_OPCODE_TXB:
879 case ELK_SHADER_OPCODE_TXD:
880 case ELK_SHADER_OPCODE_TXF:
881 case ELK_SHADER_OPCODE_TXF_LZ:
882 case ELK_SHADER_OPCODE_TXF_CMS:
883 case ELK_SHADER_OPCODE_TXF_CMS_W:
884 case ELK_SHADER_OPCODE_TXF_UMS:
885 case ELK_SHADER_OPCODE_TXF_MCS:
886 case ELK_SHADER_OPCODE_TXL:
887 case ELK_SHADER_OPCODE_TXL_LZ:
888 case ELK_SHADER_OPCODE_TXS:
889 case ELK_SHADER_OPCODE_LOD:
890 case ELK_SHADER_OPCODE_TG4:
891 case ELK_SHADER_OPCODE_TG4_OFFSET:
892 case ELK_SHADER_OPCODE_SAMPLEINFO:
893 if (arg == 0 && src[0].file == VGRF)
894 return mlen * REG_SIZE;
895 break;
896
897 default:
898 break;
899 }
900
901 switch (src[arg].file) {
902 case UNIFORM:
903 case IMM:
904 return components_read(arg) * type_sz(src[arg].type);
905 case BAD_FILE:
906 case ARF:
907 case FIXED_GRF:
908 case VGRF:
909 case ATTR:
910 return components_read(arg) * src[arg].component_size(exec_size);
911 case MRF:
912 unreachable("MRF registers are not allowed as sources");
913 }
914 return 0;
915 }
916
917 namespace {
918 unsigned
919 predicate_width(const intel_device_info *devinfo, elk_predicate predicate)
920 {
921 switch (predicate) {
922 case ELK_PREDICATE_NONE: return 1;
923 case ELK_PREDICATE_NORMAL: return 1;
924 case ELK_PREDICATE_ALIGN1_ANY2H: return 2;
925 case ELK_PREDICATE_ALIGN1_ALL2H: return 2;
926 case ELK_PREDICATE_ALIGN1_ANY4H: return 4;
927 case ELK_PREDICATE_ALIGN1_ALL4H: return 4;
928 case ELK_PREDICATE_ALIGN1_ANY8H: return 8;
929 case ELK_PREDICATE_ALIGN1_ALL8H: return 8;
930 case ELK_PREDICATE_ALIGN1_ANY16H: return 16;
931 case ELK_PREDICATE_ALIGN1_ALL16H: return 16;
932 case ELK_PREDICATE_ALIGN1_ANY32H: return 32;
933 case ELK_PREDICATE_ALIGN1_ALL32H: return 32;
934 default: unreachable("Unsupported predicate");
935 }
936 }
937
938 /* Return the subset of flag registers that an instruction could
939 * potentially read or write based on the execution controls and flag
940 * subregister number of the instruction.
941 */
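/* For example (illustrative): a SIMD16 instruction with flag_subreg == 1
 * (f0.1) and channel group 0 covers flag bits 16..31, so this returns 0xc,
 * i.e. bytes 2 and 3 of the flag space, which is exactly f0.1.
 */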
942 unsigned
943 flag_mask(const elk_fs_inst *inst, unsigned width)
944 {
945 assert(util_is_power_of_two_nonzero(width));
946 const unsigned start = (inst->flag_subreg * 16 + inst->group) &
947 ~(width - 1);
948 const unsigned end = start + ALIGN(inst->exec_size, width);
949 return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
950 }
951
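/* Mask of the n lowest bits; the explicit check avoids the undefined
 * behaviour of shifting a 32-bit value by 32 or more when n spans the
 * whole word.
 */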
952 unsigned
953 bit_mask(unsigned n)
954 {
955 return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
956 }
957
958 unsigned
959 flag_mask(const elk_fs_reg &r, unsigned sz)
960 {
961 if (r.file == ARF) {
962 const unsigned start = (r.nr - ELK_ARF_FLAG) * 4 + r.subnr;
963 const unsigned end = start + sz;
964 return bit_mask(end) & ~bit_mask(start);
965 } else {
966 return 0;
967 }
968 }
969 }
970
971 unsigned
972 elk_fs_inst::flags_read(const intel_device_info *devinfo) const
973 {
974 if (predicate == ELK_PREDICATE_ALIGN1_ANYV ||
975 predicate == ELK_PREDICATE_ALIGN1_ALLV) {
976 /* The vertical predication modes combine corresponding bits from
977 * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
978 */
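/* The mask is in units of flag-register bytes, so the companion register
 * sits 4 bytes away (f1.0) on Gfx7+ and 2 bytes away (f0.1) before that.
 */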
979 const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
980 return flag_mask(this, 1) << shift | flag_mask(this, 1);
981 } else if (predicate) {
982 return flag_mask(this, predicate_width(devinfo, predicate));
983 } else {
984 unsigned mask = 0;
985 for (int i = 0; i < sources; i++) {
986 mask |= flag_mask(src[i], size_read(i));
987 }
988 return mask;
989 }
990 }
991
992 unsigned
993 elk_fs_inst::flags_written(const intel_device_info *devinfo) const
994 {
995 /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
996 * using a separate cmpn and sel instruction. This lowering occurs in
997 * fs_visitor::lower_minmax, which is called very, very late.
998 */
999 if ((conditional_mod && ((opcode != ELK_OPCODE_SEL || devinfo->ver <= 5) &&
1000 opcode != ELK_OPCODE_CSEL &&
1001 opcode != ELK_OPCODE_IF &&
1002 opcode != ELK_OPCODE_WHILE)) ||
1003 opcode == ELK_FS_OPCODE_FB_WRITE) {
1004 return flag_mask(this, 1);
1005 } else if (opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1006 opcode == ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
1007 opcode == ELK_FS_OPCODE_LOAD_LIVE_CHANNELS) {
1008 return flag_mask(this, 32);
1009 } else {
1010 return flag_mask(dst, size_written);
1011 }
1012 }
1013
1014 /**
1015 * Returns how many MRFs an FS opcode will write over.
1016 *
1017 * Note that this is not the 0 or 1 implied writes in an actual gen
1018 * instruction -- the FS opcodes often generate MOVs in addition.
1019 */
1020 unsigned
1021 elk_fs_inst::implied_mrf_writes() const
1022 {
1023 if (mlen == 0)
1024 return 0;
1025
1026 if (base_mrf == -1)
1027 return 0;
1028
1029 switch (opcode) {
1030 case ELK_SHADER_OPCODE_RCP:
1031 case ELK_SHADER_OPCODE_RSQ:
1032 case ELK_SHADER_OPCODE_SQRT:
1033 case ELK_SHADER_OPCODE_EXP2:
1034 case ELK_SHADER_OPCODE_LOG2:
1035 case ELK_SHADER_OPCODE_SIN:
1036 case ELK_SHADER_OPCODE_COS:
1037 return 1 * exec_size / 8;
1038 case ELK_SHADER_OPCODE_POW:
1039 case ELK_SHADER_OPCODE_INT_QUOTIENT:
1040 case ELK_SHADER_OPCODE_INT_REMAINDER:
1041 return 2 * exec_size / 8;
1042 case ELK_SHADER_OPCODE_TEX:
1043 case ELK_FS_OPCODE_TXB:
1044 case ELK_SHADER_OPCODE_TXD:
1045 case ELK_SHADER_OPCODE_TXF:
1046 case ELK_SHADER_OPCODE_TXF_CMS:
1047 case ELK_SHADER_OPCODE_TXF_MCS:
1048 case ELK_SHADER_OPCODE_TG4:
1049 case ELK_SHADER_OPCODE_TG4_OFFSET:
1050 case ELK_SHADER_OPCODE_TXL:
1051 case ELK_SHADER_OPCODE_TXS:
1052 case ELK_SHADER_OPCODE_LOD:
1053 case ELK_SHADER_OPCODE_SAMPLEINFO:
1054 return 1;
1055 case ELK_FS_OPCODE_FB_WRITE:
1056 case ELK_FS_OPCODE_REP_FB_WRITE:
1057 return src[0].file == BAD_FILE ? 0 : 2;
1058 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1059 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1060 return 1;
1061 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1062 return mlen;
1063 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1064 return mlen;
1065 default:
1066 unreachable("not reached");
1067 }
1068 }
1069
1070 bool
1071 elk_fs_inst::has_sampler_residency() const
1072 {
1073 switch (opcode) {
1074 case ELK_SHADER_OPCODE_TEX_LOGICAL:
1075 case ELK_FS_OPCODE_TXB_LOGICAL:
1076 case ELK_SHADER_OPCODE_TXL_LOGICAL:
1077 case ELK_SHADER_OPCODE_TXD_LOGICAL:
1078 case ELK_SHADER_OPCODE_TXF_LOGICAL:
1079 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
1080 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
1081 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
1082 case ELK_SHADER_OPCODE_TXS_LOGICAL:
1083 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
1084 case ELK_SHADER_OPCODE_TG4_LOGICAL:
1085 assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
1086 return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
1087 default:
1088 return false;
1089 }
1090 }
1091
1092 elk_fs_reg
1093 elk_fs_visitor::vgrf(const glsl_type *const type)
1094 {
1095 int reg_width = dispatch_width / 8;
1096 return elk_fs_reg(VGRF,
1097 alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1098 elk_type_for_base_type(type));
1099 }
1100
1101 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr)
1102 {
1103 init();
1104 this->file = file;
1105 this->nr = nr;
1106 this->type = ELK_REGISTER_TYPE_F;
1107 this->stride = (file == UNIFORM ? 0 : 1);
1108 }
1109
1110 elk_fs_reg::elk_fs_reg(enum elk_reg_file file, unsigned nr, enum elk_reg_type type)
1111 {
1112 init();
1113 this->file = file;
1114 this->nr = nr;
1115 this->type = type;
1116 this->stride = (file == UNIFORM ? 0 : 1);
1117 }
1118
1119 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1120 * This brings in those uniform definitions.
1121 */
1122 void
1123 elk_fs_visitor::import_uniforms(elk_fs_visitor *v)
1124 {
1125 this->push_constant_loc = v->push_constant_loc;
1126 this->uniforms = v->uniforms;
1127 }
1128
1129 enum elk_barycentric_mode
1130 elk_barycentric_mode(nir_intrinsic_instr *intr)
1131 {
1132 const glsl_interp_mode mode =
1133 (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
1134
1135 /* Barycentric modes don't make sense for flat inputs. */
1136 assert(mode != INTERP_MODE_FLAT);
1137
1138 unsigned bary;
1139 switch (intr->intrinsic) {
1140 case nir_intrinsic_load_barycentric_pixel:
1141 case nir_intrinsic_load_barycentric_at_offset:
1142 bary = ELK_BARYCENTRIC_PERSPECTIVE_PIXEL;
1143 break;
1144 case nir_intrinsic_load_barycentric_centroid:
1145 bary = ELK_BARYCENTRIC_PERSPECTIVE_CENTROID;
1146 break;
1147 case nir_intrinsic_load_barycentric_sample:
1148 case nir_intrinsic_load_barycentric_at_sample:
1149 bary = ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1150 break;
1151 default:
1152 unreachable("invalid intrinsic");
1153 }
1154
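/* The nonperspective modes sit three entries after their perspective
 * counterparts in elk_barycentric_mode, so switching interpolation modes
 * is a fixed offset.
 */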
1155 if (mode == INTERP_MODE_NOPERSPECTIVE)
1156 bary += 3;
1157
1158 return (enum elk_barycentric_mode) bary;
1159 }
1160
1161 /**
1162 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1163 */
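/* Relies on each CENTROID mode immediately following the corresponding
 * PIXEL mode in the enum, hence the "- 1" below.
 */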
1164 static enum elk_barycentric_mode
1165 centroid_to_pixel(enum elk_barycentric_mode bary)
1166 {
1167 assert(bary == ELK_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1168 bary == ELK_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1169 return (enum elk_barycentric_mode) ((unsigned) bary - 1);
1170 }
1171
1172 /**
1173 * Walk backwards from the end of the program looking for a URB write that
1174 * isn't in control flow, and mark it with EOT.
1175 *
1176 * Return true if successful or false if a separate EOT write is needed.
1177 */
1178 bool
1179 elk_fs_visitor::mark_last_urb_write_with_eot()
1180 {
1181 foreach_in_list_reverse(elk_fs_inst, prev, &this->instructions) {
1182 if (prev->opcode == ELK_SHADER_OPCODE_URB_WRITE_LOGICAL) {
1183 prev->eot = true;
1184
1185 /* Delete now dead instructions. */
1186 foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1187 if (dead == prev)
1188 break;
1189 dead->remove();
1190 }
1191 return true;
1192 } else if (prev->is_control_flow() || prev->has_side_effects()) {
1193 break;
1194 }
1195 }
1196
1197 return false;
1198 }
1199
1200 void
1201 elk_fs_visitor::emit_gs_thread_end()
1202 {
1203 assert(stage == MESA_SHADER_GEOMETRY);
1204
1205 struct elk_gs_prog_data *gs_prog_data = elk_gs_prog_data(prog_data);
1206
1207 if (gs_compile->control_data_header_size_bits > 0) {
1208 emit_gs_control_data_bits(this->final_gs_vertex_count);
1209 }
1210
1211 const fs_builder abld = fs_builder(this).at_end().annotate("thread end");
1212 elk_fs_inst *inst;
1213
1214 if (gs_prog_data->static_vertex_count != -1) {
1215 /* Try and tag the last URB write with EOT instead of emitting a whole
1216 * separate write just to finish the thread.
1217 */
1218 if (mark_last_urb_write_with_eot())
1219 return;
1220
1221 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1222 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1223 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(0);
1224 inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1225 srcs, ARRAY_SIZE(srcs));
1226 } else {
1227 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
1228 srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
1229 srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
1230 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
1231 inst = abld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
1232 srcs, ARRAY_SIZE(srcs));
1233 }
1234 inst->eot = true;
1235 inst->offset = 0;
1236 }
1237
1238 void
1239 elk_fs_visitor::assign_curb_setup()
1240 {
1241 unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1242
1243 unsigned ubo_push_length = 0;
1244 unsigned ubo_push_start[4];
1245 for (int i = 0; i < 4; i++) {
1246 ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1247 ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1248 }
1249
1250 prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1251
1252 uint64_t used = 0;
1253
1254 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1255 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1256 for (unsigned int i = 0; i < inst->sources; i++) {
1257 if (inst->src[i].file == UNIFORM) {
1258 int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1259 int constant_nr;
1260 if (inst->src[i].nr >= UBO_START) {
1261 /* constant_nr is in 32-bit units, the rest are in bytes */
1262 constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1263 inst->src[i].offset / 4;
1264 } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1265 constant_nr = push_constant_loc[uniform_nr];
1266 } else {
1267 /* Section 5.11 of the OpenGL 4.1 spec says:
1268 * "Out-of-bounds reads return undefined values, which include
1269 * values from other variables of the active program or zero."
1270 * Just return the first push constant.
1271 */
1272 constant_nr = 0;
1273 }
1274
1275 assert(constant_nr / 8 < 64);
1276 used |= BITFIELD64_BIT(constant_nr / 8);
1277
1278 struct elk_reg elk_reg = elk_vec1_grf(payload().num_regs +
1279 constant_nr / 8,
1280 constant_nr % 8);
1281 elk_reg.abs = inst->src[i].abs;
1282 elk_reg.negate = inst->src[i].negate;
1283
1284 assert(inst->src[i].stride == 0);
1285 inst->src[i] = byte_offset(
1286 retype(elk_reg, inst->src[i].type),
1287 inst->src[i].offset % 4);
1288 }
1289 }
1290 }
1291
1292 uint64_t want_zero = used & stage_prog_data->zero_push_reg;
1293 if (want_zero) {
1294 fs_builder ubld = fs_builder(this, 8).exec_all().at(
1295 cfg->first_block(), cfg->first_block()->start());
1296
1297 /* push_reg_mask_param is in 32-bit units */
1298 unsigned mask_param = stage_prog_data->push_reg_mask_param;
1299 struct elk_reg mask = elk_vec1_grf(payload().num_regs + mask_param / 8,
1300 mask_param % 8);
1301
1302 elk_fs_reg b32;
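/* Sketch of the zeroing trick below: the two SHLs place bit c of the
 * 16-bit push-register mask into the sign bit of 16-bit channel c, the
 * ASR by 15 then smears that sign bit into a full dword of 0 or ~0, and
 * each push register is ANDed with its dword so registers whose mask bit
 * is clear read back as zero.
 */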
1303 for (unsigned i = 0; i < 64; i++) {
1304 if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
1305 elk_fs_reg shifted = ubld.vgrf(ELK_REGISTER_TYPE_W, 2);
1306 ubld.SHL(horiz_offset(shifted, 8),
1307 byte_offset(retype(mask, ELK_REGISTER_TYPE_W), i / 8),
1308 elk_imm_v(0x01234567));
1309 ubld.SHL(shifted, horiz_offset(shifted, 8), elk_imm_w(8));
1310
1311 fs_builder ubld16 = ubld.group(16, 0);
1312 b32 = ubld16.vgrf(ELK_REGISTER_TYPE_D);
1313 ubld16.group(16, 0).ASR(b32, shifted, elk_imm_w(15));
1314 }
1315
1316 if (want_zero & BITFIELD64_BIT(i)) {
1317 assert(i < prog_data->curb_read_length);
1318 struct elk_reg push_reg =
1319 retype(elk_vec8_grf(payload().num_regs + i, 0),
1320 ELK_REGISTER_TYPE_D);
1321
1322 ubld.AND(push_reg, push_reg, component(b32, i % 16));
1323 }
1324 }
1325
1326 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1327 }
1328
1329 /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1330 this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
1331 }
1332
1333 /*
1334 * Build up an array of indices into the urb_setup array that
1335 * references the active entries of the urb_setup array.
1336 * Used to accelerate walking the active entries of the urb_setup array
1337 * on each upload.
1338 */
1339 void
1340 elk_compute_urb_setup_index(struct elk_wm_prog_data *wm_prog_data)
1341 {
1342 /* Make sure uint8_t is sufficient */
1343 STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1344 uint8_t index = 0;
1345 for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1346 if (wm_prog_data->urb_setup[attr] >= 0) {
1347 wm_prog_data->urb_setup_attribs[index++] = attr;
1348 }
1349 }
1350 wm_prog_data->urb_setup_attribs_count = index;
1351 }
1352
1353 static void
1354 calculate_urb_setup(const struct intel_device_info *devinfo,
1355 const struct elk_wm_prog_key *key,
1356 struct elk_wm_prog_data *prog_data,
1357 const nir_shader *nir)
1358 {
1359 memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
1360 memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
1361
1362 int urb_next = 0; /* in vec4s */
1363
1364 const uint64_t inputs_read =
1365 nir->info.inputs_read & ~nir->info.per_primitive_inputs;
1366
1367 /* Figure out where each of the incoming setup attributes lands. */
1368 if (devinfo->ver >= 6) {
1369 assert(!nir->info.per_primitive_inputs);
1370
1371 uint64_t vue_header_bits =
1372 VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
1373
1374 uint64_t unique_fs_attrs = inputs_read & ELK_FS_VARYING_INPUT_MASK;
1375
1376 /* VUE header fields all live in the same URB slot, so we pass them
1377 * as a single FS input attribute. We want to only count them once.
1378 */
1379 if (inputs_read & vue_header_bits) {
1380 unique_fs_attrs &= ~vue_header_bits;
1381 unique_fs_attrs |= VARYING_BIT_PSIZ;
1382 }
1383
1384 if (util_bitcount64(unique_fs_attrs) <= 16) {
1385 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1386 * first 16 varying inputs, so we can put them wherever we want.
1387 * Just put them in order.
1388 *
1389 * This is useful because it means that (a) inputs not used by the
1390 * fragment shader won't take up valuable register space, and (b) we
1391 * won't have to recompile the fragment shader if it gets paired with
1392 * a different vertex (or geometry) shader.
1393 *
1394 * VUE header fields share the same FS input attribute.
1395 */
1396 if (inputs_read & vue_header_bits) {
1397 if (inputs_read & VARYING_BIT_PSIZ)
1398 prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
1399 if (inputs_read & VARYING_BIT_LAYER)
1400 prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
1401 if (inputs_read & VARYING_BIT_VIEWPORT)
1402 prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
1403
1404 urb_next++;
1405 }
1406
1407 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1408 if (inputs_read & ELK_FS_VARYING_INPUT_MASK & ~vue_header_bits &
1409 BITFIELD64_BIT(i)) {
1410 prog_data->urb_setup[i] = urb_next++;
1411 }
1412 }
1413 } else {
1414 /* We have enough input varyings that the SF/SBE pipeline stage can't
1415 * arbitrarily rearrange them to suit our whim; we have to put them
1416 * in an order that matches the output of the previous pipeline stage
1417 * (geometry or vertex shader).
1418 */
1419
1420 /* Re-compute the VUE map here in the case that the one coming from
1421 * geometry has more than one position slot (used for Primitive
1422 * Replication).
1423 */
1424 struct intel_vue_map prev_stage_vue_map;
1425 elk_compute_vue_map(devinfo, &prev_stage_vue_map,
1426 key->input_slots_valid,
1427 nir->info.separate_shader, 1);
1428
1429 int first_slot =
1430 elk_compute_first_urb_slot_required(inputs_read,
1431 &prev_stage_vue_map);
1432
1433 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435 slot++) {
1436 int varying = prev_stage_vue_map.slot_to_varying[slot];
1437 if (varying != ELK_VARYING_SLOT_PAD &&
1438 (inputs_read & ELK_FS_VARYING_INPUT_MASK &
1439 BITFIELD64_BIT(varying))) {
1440 prog_data->urb_setup[varying] = slot - first_slot;
1441 }
1442 }
1443 urb_next = prev_stage_vue_map.num_slots - first_slot;
1444 }
1445 } else {
1446 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1447 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1448 /* Point size is packed into the header, not as a general attribute */
1449 if (i == VARYING_SLOT_PSIZ)
1450 continue;
1451
1452 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1453 /* The back color slot is skipped when the front color is
1454 * also written to. In addition, some slots can be
1455 * written in the vertex shader and not read in the
1456 * fragment shader. So the register number must always be
1457 * incremented, mapped or not.
1458 */
1459 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1460 prog_data->urb_setup[i] = urb_next;
1461 urb_next++;
1462 }
1463 }
1464
1465 /*
1466 * It's an FS-only attribute, and we did interpolation for this attribute
1467 * in the SF thread. So, count it here, too.
1468 *
1469 * See compile_sf_prog() for more info.
1470 */
1471 if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1472 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1473 }
1474
1475 prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
1476 prog_data->inputs = inputs_read;
1477
1478 elk_compute_urb_setup_index(prog_data);
1479 }
1480
1481 void
1482 elk_fs_visitor::assign_urb_setup()
1483 {
1484 assert(stage == MESA_SHADER_FRAGMENT);
1485 struct elk_wm_prog_data *prog_data = elk_wm_prog_data(this->prog_data);
1486
1487 int urb_start = payload().num_regs + prog_data->base.curb_read_length;
1488
1489 /* Offset all the urb_setup[] index by the actual position of the
1490 * setup regs, now that the location of the constants has been chosen.
1491 */
1492 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1493 for (int i = 0; i < inst->sources; i++) {
1494 if (inst->src[i].file == ATTR) {
1495 /* ATTR elk_fs_reg::nr in the FS is in units of logical scalar
1496 * inputs each of which consumes 16B on Gfx4-Gfx12. In
1497 * single polygon mode this leads to the following layout
1498 * of the vertex setup plane parameters in the ATTR
1499 * register file:
1500 *
1501 * elk_fs_reg::nr Input Comp0 Comp1 Comp2 Comp3
1502 * 0 Attr0.x a1-a0 a2-a0 N/A a0
1503 * 1 Attr0.y a1-a0 a2-a0 N/A a0
1504 * 2 Attr0.z a1-a0 a2-a0 N/A a0
1505 * 3 Attr0.w a1-a0 a2-a0 N/A a0
1506 * 4 Attr1.x a1-a0 a2-a0 N/A a0
1507 * ...
1508 */
1509 const unsigned param_width = 1;
1510
1511 /* Size of a single scalar component of a plane parameter
1512 * in bytes.
1513 */
1514 const unsigned chan_sz = 4;
1515 struct elk_reg reg;
1516
1517 /* Calculate the base register on the thread payload of
1518 * either the block of vertex setup data or the block of
1519 * per-primitive constant data depending on whether we're
1520 * accessing a primitive or vertex input. Also calculate
1521 * the index of the input within that block.
1522 */
1523 const bool per_prim = inst->src[i].nr < prog_data->num_per_primitive_inputs;
1524 const unsigned base = urb_start +
1525 (per_prim ? 0 :
1526 ALIGN(prog_data->num_per_primitive_inputs / 2,
1527 reg_unit(devinfo)));
1528 const unsigned idx = per_prim ? inst->src[i].nr :
1529 inst->src[i].nr - prog_data->num_per_primitive_inputs;
1530
1531 /* Translate the offset within the param_width-wide
1532 * representation described above into an offset and a
1533 * grf, which contains the plane parameters for the first
1534 * polygon processed by the thread.
1535 *
1536 * Earlier platforms and per-primitive block pack 2 logical
1537 * input components per 32B register.
1538 */
1539 const unsigned grf = base + idx / 2;
1540 assert(inst->src[i].offset / param_width < REG_SIZE / 2);
1541 const unsigned delta = (idx % 2) * (REG_SIZE / 2) +
1542 inst->src[i].offset / (param_width * chan_sz) * chan_sz +
1543 inst->src[i].offset % chan_sz;
1544 reg = byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1545 delta);
1546
1547 const unsigned width = inst->src[i].stride == 0 ?
1548 1 : MIN2(inst->exec_size, 8);
1549 reg = stride(reg, width * inst->src[i].stride,
1550 width, inst->src[i].stride);
1551
1552 reg.abs = inst->src[i].abs;
1553 reg.negate = inst->src[i].negate;
1554 inst->src[i] = reg;
1555 }
1556 }
1557 }
1558
1559 /* Each attribute is 4 setup channels, each of which is half a reg,
1560 * but they may be replicated multiple times for multipolygon
1561 * dispatch.
1562 */
1563 this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1564
1565 /* Unlike regular attributes, per-primitive attributes have all 4 channels
1566 * in the same slot, so each GRF can store two slots.
1567 */
1568 assert(prog_data->num_per_primitive_inputs % 2 == 0);
1569 this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
1570 }
1571
1572 void
1573 elk_fs_visitor::convert_attr_sources_to_hw_regs(elk_fs_inst *inst)
1574 {
1575 for (int i = 0; i < inst->sources; i++) {
1576 if (inst->src[i].file == ATTR) {
1577 assert(inst->src[i].nr == 0);
1578 int grf = payload().num_regs +
1579 prog_data->curb_read_length +
1580 inst->src[i].offset / REG_SIZE;
1581
1582 /* As explained at elk_reg_from_fs_reg, From the Haswell PRM:
1583 *
1584 * VertStride must be used to cross GRF register boundaries. This
1585 * rule implies that elements within a 'Width' cannot cross GRF
1586 * boundaries.
1587 *
1588 * So, for registers that are large enough, we have to split the exec
1589 * size in two and trust the compression state to sort it out.
1590 */
1591 unsigned total_size = inst->exec_size *
1592 inst->src[i].stride *
1593 type_sz(inst->src[i].type);
1594
1595 assert(total_size <= 2 * REG_SIZE);
1596 const unsigned exec_size =
1597 (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
1598
1599 unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
1600 struct elk_reg reg =
1601 stride(byte_offset(retype(elk_vec8_grf(grf, 0), inst->src[i].type),
1602 inst->src[i].offset % REG_SIZE),
1603 exec_size * inst->src[i].stride,
1604 width, inst->src[i].stride);
1605 reg.abs = inst->src[i].abs;
1606 reg.negate = inst->src[i].negate;
1607
1608 inst->src[i] = reg;
1609 }
1610 }
1611 }
1612
1613 void
1614 elk_fs_visitor::assign_vs_urb_setup()
1615 {
1616 struct elk_vs_prog_data *vs_prog_data = elk_vs_prog_data(prog_data);
1617
1618 assert(stage == MESA_SHADER_VERTEX);
1619
1620 /* Each attribute is 4 regs. */
1621 this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1622
1623 assert(vs_prog_data->base.urb_read_length <= 15);
1624
1625 /* Rewrite all ATTR file references to the hw grf that they land in. */
1626 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1627 convert_attr_sources_to_hw_regs(inst);
1628 }
1629 }
1630
1631 void
1632 elk_fs_visitor::assign_tcs_urb_setup()
1633 {
1634 assert(stage == MESA_SHADER_TESS_CTRL);
1635
1636 /* Rewrite all ATTR file references to HW_REGs. */
1637 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1638 convert_attr_sources_to_hw_regs(inst);
1639 }
1640 }
1641
1642 void
1643 elk_fs_visitor::assign_tes_urb_setup()
1644 {
1645 assert(stage == MESA_SHADER_TESS_EVAL);
1646
1647 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1648
1649 first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1650
1651 /* Rewrite all ATTR file references to HW_REGs. */
1652 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1653 convert_attr_sources_to_hw_regs(inst);
1654 }
1655 }
1656
1657 void
1658 elk_fs_visitor::assign_gs_urb_setup()
1659 {
1660 assert(stage == MESA_SHADER_GEOMETRY);
1661
1662 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
1663
1664 first_non_payload_grf +=
1665 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1666
1667 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1668 /* Rewrite all ATTR file references to GRFs. */
1669 convert_attr_sources_to_hw_regs(inst);
1670 }
1671 }
1672
1673
1674 /**
1675 * Split large virtual GRFs into separate components if we can.
1676 *
1677 * This pass aggressively splits VGRFs into as small a chunks as possible,
1678 * down to single registers if it can. If no VGRFs can be split, we return
1679 * false so this pass can safely be used inside an optimization loop. We
1680 * want to split, because virtual GRFs are what we register allocate and
1681 * spill (due to contiguousness requirements for some instructions), and
1682 * they're what we naturally generate in the codegen process, but most
1683 * virtual GRFs don't actually need to be contiguous sets of GRFs. If we
1684 * split, we'll end up with reduced live intervals and better dead code
1685 * elimination and coalescing.
1686 */
1687 bool
1688 elk_fs_visitor::split_virtual_grfs()
1689 {
1690 /* Compact the register file so we eliminate dead vgrfs. Splitting
1691 * only defines split points for live registers, so an overly large
1692 * dead register would otherwise hit assertions later.
1693 */
1694 compact_virtual_grfs();
1695
1696 unsigned num_vars = this->alloc.count;
1697
1698 /* Count the total number of registers */
1699 unsigned reg_count = 0;
1700 unsigned *vgrf_to_reg = new unsigned[num_vars];
1701 for (unsigned i = 0; i < num_vars; i++) {
1702 vgrf_to_reg[i] = reg_count;
1703 reg_count += alloc.sizes[i];
1704 }
1705
1706 /* An array of "split points". For each register slot, this indicates
1707 * if this slot can be separated from the previous slot. Every time an
1708 * instruction uses multiple elements of a register (as a source or
1709 * destination), we mark the used slots as inseparable. Then we go
1710 * through and split the registers into the smallest pieces we can.
1711 */
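/* A sketch of the idea (illustrative only, not taken from a real shader):
 * for a 4-register VGRF that is only ever accessed whole by SIMD16
 * instructions, every interior slot gets marked inseparable below and
 * nothing is split.  If each register is instead only touched one slot at
 * a time by SIMD8 accesses, all three interior split points survive and
 * the VGRF becomes four single-register VGRFs.
 */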
1712 bool *split_points = new bool[reg_count];
1713 memset(split_points, 0, reg_count * sizeof(*split_points));
1714
1715 /* Mark all used registers as fully splittable */
1716 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1717 if (inst->dst.file == VGRF) {
1718 unsigned reg = vgrf_to_reg[inst->dst.nr];
1719 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1720 split_points[reg + j] = true;
1721 }
1722
1723 for (unsigned i = 0; i < inst->sources; i++) {
1724 if (inst->src[i].file == VGRF) {
1725 unsigned reg = vgrf_to_reg[inst->src[i].nr];
1726 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1727 split_points[reg + j] = true;
1728 }
1729 }
1730 }
1731
1732 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1733 /* We fix up undef instructions later */
1734 if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1735 assert(inst->dst.file == VGRF);
1736 continue;
1737 }
1738
1739 if (inst->dst.file == VGRF) {
1740 unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1741 for (unsigned j = 1; j < regs_written(inst); j++)
1742 split_points[reg + j] = false;
1743 }
1744 for (unsigned i = 0; i < inst->sources; i++) {
1745 if (inst->src[i].file == VGRF) {
1746 unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1747 for (unsigned j = 1; j < regs_read(inst, i); j++)
1748 split_points[reg + j] = false;
1749 }
1750 }
1751 }
1752
1753 /* Bitset of which registers have been split */
1754 bool *vgrf_has_split = new bool[num_vars];
1755 memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
1756
1757 unsigned *new_virtual_grf = new unsigned[reg_count];
1758 unsigned *new_reg_offset = new unsigned[reg_count];
1759
1760 unsigned reg = 0;
1761 bool has_splits = false;
1762 for (unsigned i = 0; i < num_vars; i++) {
1763 /* The first one should always be 0 as a quick sanity check. */
1764 assert(split_points[reg] == false);
1765
1766 /* j = 0 case */
1767 new_reg_offset[reg] = 0;
1768 reg++;
1769 unsigned offset = 1;
1770
1771 /* j > 0 case */
1772 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1773 /* If this is a split point, reset the offset to 0 and allocate a
1774 * new virtual GRF covering the previous "offset" registers.
1775 */
1776 if (split_points[reg]) {
1777 has_splits = true;
1778 vgrf_has_split[i] = true;
1779 assert(offset <= MAX_VGRF_SIZE(devinfo));
1780 unsigned grf = alloc.allocate(offset);
1781 for (unsigned k = reg - offset; k < reg; k++)
1782 new_virtual_grf[k] = grf;
1783 offset = 0;
1784 }
1785 new_reg_offset[reg] = offset;
1786 offset++;
1787 reg++;
1788 }
1789
1790 /* The last one gets the original register number */
1791 assert(offset <= MAX_VGRF_SIZE(devinfo));
1792 alloc.sizes[i] = offset;
1793 for (unsigned k = reg - offset; k < reg; k++)
1794 new_virtual_grf[k] = i;
1795 }
1796 assert(reg == reg_count);
1797
1798 bool progress;
1799 if (!has_splits) {
1800 progress = false;
1801 goto cleanup;
1802 }
1803
1804 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
1805 if (inst->opcode == ELK_SHADER_OPCODE_UNDEF) {
1806 assert(inst->dst.file == VGRF);
1807 if (vgrf_has_split[inst->dst.nr]) {
1808 const fs_builder ibld(this, block, inst);
1809 assert(inst->size_written % REG_SIZE == 0);
1810 unsigned reg_offset = inst->dst.offset / REG_SIZE;
1811 unsigned size_written = 0;
1812 while (size_written < inst->size_written) {
1813 reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
1814 elk_fs_inst *undef =
1815 ibld.UNDEF(
1816 byte_offset(elk_fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
1817 new_reg_offset[reg] * REG_SIZE));
1818 undef->size_written =
1819 MIN2(inst->size_written - size_written, undef->size_written);
1820 assert(undef->size_written % REG_SIZE == 0);
1821 size_written += undef->size_written;
1822 }
1823 inst->remove(block);
1824 } else {
1825 reg = vgrf_to_reg[inst->dst.nr];
1826 assert(new_reg_offset[reg] == 0);
1827 assert(new_virtual_grf[reg] == inst->dst.nr);
1828 }
1829 continue;
1830 }
1831
1832 if (inst->dst.file == VGRF) {
1833 reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1834 if (vgrf_has_split[inst->dst.nr]) {
1835 inst->dst.nr = new_virtual_grf[reg];
1836 inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
1837 inst->dst.offset % REG_SIZE;
1838 assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1839 } else {
1840 assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
1841 assert(new_virtual_grf[reg] == inst->dst.nr);
1842 }
1843 }
1844 for (unsigned i = 0; i < inst->sources; i++) {
1845 if (inst->src[i].file != VGRF)
1846 continue;
1847
1848 reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1849 if (vgrf_has_split[inst->src[i].nr]) {
1850 inst->src[i].nr = new_virtual_grf[reg];
1851 inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
1852 inst->src[i].offset % REG_SIZE;
1853 assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1854 } else {
1855 assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
1856 assert(new_virtual_grf[reg] == inst->src[i].nr);
1857 }
1858 }
1859 }
1860 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1861
1862 progress = true;
1863
1864 cleanup:
1865 delete[] split_points;
1866 delete[] vgrf_has_split;
1867 delete[] new_virtual_grf;
1868 delete[] new_reg_offset;
1869 delete[] vgrf_to_reg;
1870
1871 return progress;
1872 }
1873
1874 /**
1875 * Remove unused virtual GRFs and compact the vgrf_* arrays.
1876 *
1877 * During code generation, we create tons of temporary variables, many of
1878 * which get immediately killed and are never used again. Yet, in later
1879 * optimization and analysis passes, such as compute_live_intervals, we need
1880 * to loop over all the virtual GRFs. Compacting them can save a lot of
1881 * overhead.
1882 */
1883 bool
1884 elk_fs_visitor::compact_virtual_grfs()
1885 {
1886 bool progress = false;
1887 int *remap_table = new int[this->alloc.count];
1888 memset(remap_table, -1, this->alloc.count * sizeof(int));
1889
1890 /* Mark which virtual GRFs are used. */
1891 foreach_block_and_inst(block, const elk_fs_inst, inst, cfg) {
1892 if (inst->dst.file == VGRF)
1893 remap_table[inst->dst.nr] = 0;
1894
1895 for (int i = 0; i < inst->sources; i++) {
1896 if (inst->src[i].file == VGRF)
1897 remap_table[inst->src[i].nr] = 0;
1898 }
1899 }
1900
1901 /* Compact the GRF arrays. */
1902 int new_index = 0;
1903 for (unsigned i = 0; i < this->alloc.count; i++) {
1904 if (remap_table[i] == -1) {
1905 /* We just found an unused register. This means that we are
1906 * actually going to compact something.
1907 */
1908 progress = true;
1909 } else {
1910 remap_table[i] = new_index;
1911 alloc.sizes[new_index] = alloc.sizes[i];
1912 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1913 ++new_index;
1914 }
1915 }
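/* Illustrative example (made-up sizes): with alloc.sizes = {2, 3, 1, 4}
 * and VGRF 1 never referenced, remap_table ends up {0, -1, 1, 2},
 * alloc.sizes becomes {2, 1, 4} and alloc.count drops from 4 to 3.
 */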
1916
1917 this->alloc.count = new_index;
1918
1919 /* Patch all the instructions to use the newly renumbered registers */
1920 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
1921 if (inst->dst.file == VGRF)
1922 inst->dst.nr = remap_table[inst->dst.nr];
1923
1924 for (int i = 0; i < inst->sources; i++) {
1925 if (inst->src[i].file == VGRF)
1926 inst->src[i].nr = remap_table[inst->src[i].nr];
1927 }
1928 }
1929
1930 /* Patch all the references to delta_xy, since they're used in register
1931 * allocation. If they're unused, switch them to BAD_FILE so we don't
1932 * think some random VGRF is delta_xy.
1933 */
1934 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1935 if (delta_xy[i].file == VGRF) {
1936 if (remap_table[delta_xy[i].nr] != -1) {
1937 delta_xy[i].nr = remap_table[delta_xy[i].nr];
1938 } else {
1939 delta_xy[i].file = BAD_FILE;
1940 }
1941 }
1942 }
1943
1944 delete[] remap_table;
1945
1946 return progress;
1947 }
1948
1949 int
1950 elk_get_subgroup_id_param_index(const intel_device_info *devinfo,
1951 const elk_stage_prog_data *prog_data)
1952 {
1953 if (prog_data->nr_params == 0)
1954 return -1;
1955
1956 /* The local thread id is always the last parameter in the list */
1957 uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
1958 if (last_param == ELK_PARAM_BUILTIN_SUBGROUP_ID)
1959 return prog_data->nr_params - 1;
1960
1961 return -1;
1962 }
1963
1964 /**
1965 * Assign UNIFORM file registers to either push constants or pull constants.
1966 *
1967 * We allow a fragment shader to have more than the specified minimum
1968 * maximum number of fragment shader uniform components (64). If there
1969 * are too many of them, they'd fill up all of the register space.
1970 * So, this will push some of them out to the pull constant buffer and
1971 * update the program to load them.
1972 */
1973 void
1974 elk_fs_visitor::assign_constant_locations()
1975 {
1976 /* Only the first compile gets to decide on locations. */
1977 if (push_constant_loc)
1978 return;
1979
1980 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1981 for (unsigned u = 0; u < uniforms; u++)
1982 push_constant_loc[u] = u;
1983
1984 /* Now that we know how many regular uniforms we'll push, reduce the
1985 * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
1986 */
1987 /* For gen4/5:
1988 * Only allow 16 registers (128 uniform components) as push constants.
1989 *
1990 * If changing this value, note the limitation about total_regs in
1991 * elk_curbe.c/crocus_state.c
1992 */
1993 const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
1994 unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1995 for (int i = 0; i < 4; i++) {
1996 struct elk_ubo_range *range = &prog_data->ubo_ranges[i];
1997
1998 if (push_length + range->length > max_push_length)
1999 range->length = max_push_length - push_length;
2000
2001 push_length += range->length;
2002 }
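/* Rough example of the clamping above (hypothetical numbers): with
 * max_push_length = 64, nr_params = 80 (push_length starts at 10) and four
 * UBO ranges of 20 registers each, the first two ranges stay intact, the
 * third is trimmed to 14 registers and the fourth to 0.
 */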
2003 assert(push_length <= max_push_length);
2004 }
2005
2006 bool
2007 elk_fs_visitor::get_pull_locs(const elk_fs_reg &src,
2008 unsigned *out_surf_index,
2009 unsigned *out_pull_index)
2010 {
2011 assert(src.file == UNIFORM);
2012
2013 if (src.nr < UBO_START)
2014 return false;
2015
2016 const struct elk_ubo_range *range =
2017 &prog_data->ubo_ranges[src.nr - UBO_START];
2018
2019 /* If this access is in our (reduced) range, use the push data. */
2020 if (src.offset / 32 < range->length)
2021 return false;
2022
2023 *out_surf_index = range->block;
2024 *out_pull_index = (32 * range->start + src.offset) / 4;
2025
2026 prog_data->has_ubo_pull = true;
2027
2028 return true;
2029 }
2030
2031 /**
2032 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2033 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2034 */
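/* As a rough sketch (assuming the UBO range starts at the beginning of the
 * buffer and only 128 bytes of it are pushed): an instruction sourcing a
 * UNIFORM at byte offset 160 of that range falls outside the pushed window,
 * so it gets a UNIFORM_PULL_CONSTANT_LOAD of the 64-byte block containing
 * byte 160 into a temporary VGRF, and the source is rewritten to read that
 * VGRF at offset 160 % 64 == 32.
 */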
2035 bool
2036 elk_fs_visitor::lower_constant_loads()
2037 {
2038 unsigned index, pull_index;
2039 bool progress = false;
2040
2041 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
2042 /* Set up the annotation tracking for newly generated instructions. */
2043 const fs_builder ibld(this, block, inst);
2044
2045 for (int i = 0; i < inst->sources; i++) {
2046 if (inst->src[i].file != UNIFORM)
2047 continue;
2048
2049 /* We'll handle this case later */
2050 if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT && i == 0)
2051 continue;
2052
2053 if (!get_pull_locs(inst->src[i], &index, &pull_index))
2054 continue;
2055
2056 assert(inst->src[i].stride == 0);
2057
2058 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2059 const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2060 const elk_fs_reg dst = ubld.vgrf(ELK_REGISTER_TYPE_UD);
2061 const unsigned base = pull_index * 4;
2062
2063 elk_fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
2064 srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = elk_imm_ud(index);
2065 srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = elk_imm_ud(base & ~(block_sz - 1));
2066 srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = elk_imm_ud(block_sz);
2067
2068
2069 ubld.emit(ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
2070 srcs, PULL_UNIFORM_CONSTANT_SRCS);
2071
2072 /* Rewrite the instruction to use the temporary VGRF. */
2073 inst->src[i].file = VGRF;
2074 inst->src[i].nr = dst.nr;
2075 inst->src[i].offset = (base & (block_sz - 1)) +
2076 inst->src[i].offset % 4;
2077
2078 progress = true;
2079 }
2080
2081 if (inst->opcode == ELK_SHADER_OPCODE_MOV_INDIRECT &&
2082 inst->src[0].file == UNIFORM) {
2083
2084 if (!get_pull_locs(inst->src[0], &index, &pull_index))
2085 continue;
2086
2087 VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2088 elk_imm_ud(index),
2089 elk_fs_reg() /* surface_handle */,
2090 inst->src[1],
2091 pull_index * 4, 4, 1);
2092 inst->remove(block);
2093
2094 progress = true;
2095 }
2096 }
2097 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2098
2099 return progress;
2100 }
2101
2102 static uint64_t
2103 src_as_uint(const elk_fs_reg &src)
2104 {
2105 assert(src.file == IMM);
2106
2107 switch (src.type) {
2108 case ELK_REGISTER_TYPE_W:
2109 return (uint64_t)(int16_t)(src.ud & 0xffff);
2110
2111 case ELK_REGISTER_TYPE_UW:
2112 return (uint64_t)(uint16_t)(src.ud & 0xffff);
2113
2114 case ELK_REGISTER_TYPE_D:
2115 return (uint64_t)src.d;
2116
2117 case ELK_REGISTER_TYPE_UD:
2118 return (uint64_t)src.ud;
2119
2120 case ELK_REGISTER_TYPE_Q:
2121 return src.d64;
2122
2123 case ELK_REGISTER_TYPE_UQ:
2124 return src.u64;
2125
2126 default:
2127 unreachable("Invalid integer type.");
2128 }
2129 }
2130
2131 static elk_fs_reg
2132 elk_imm_for_type(uint64_t value, enum elk_reg_type type)
2133 {
2134 switch (type) {
2135 case ELK_REGISTER_TYPE_W:
2136 return elk_imm_w(value);
2137
2138 case ELK_REGISTER_TYPE_UW:
2139 return elk_imm_uw(value);
2140
2141 case ELK_REGISTER_TYPE_D:
2142 return elk_imm_d(value);
2143
2144 case ELK_REGISTER_TYPE_UD:
2145 return elk_imm_ud(value);
2146
2147 case ELK_REGISTER_TYPE_Q:
2148 return elk_imm_d(value);
2149
2150 case ELK_REGISTER_TYPE_UQ:
2151 return elk_imm_uq(value);
2152
2153 default:
2154 unreachable("Invalid integer type.");
2155 }
2156 }
2157
2158 bool
2159 elk_fs_visitor::opt_algebraic()
2160 {
2161 bool progress = false;
2162
2163 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2164 switch (inst->opcode) {
2165 case ELK_OPCODE_MOV:
2166 if (!devinfo->has_64bit_float &&
2167 inst->dst.type == ELK_REGISTER_TYPE_DF) {
2168 assert(inst->dst.type == inst->src[0].type);
2169 assert(!inst->saturate);
2170 assert(!inst->src[0].abs);
2171 assert(!inst->src[0].negate);
2172 const elk::fs_builder ibld(this, block, inst);
2173
2174 if (!inst->is_partial_write())
2175 ibld.emit_undef_for_dst(inst);
2176
2177 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 1),
2178 subscript(inst->src[0], ELK_REGISTER_TYPE_F, 1));
2179 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_F, 0),
2180 subscript(inst->src[0], ELK_REGISTER_TYPE_F, 0));
2181
2182 inst->remove(block);
2183 progress = true;
2184 }
2185
2186 if (!devinfo->has_64bit_int &&
2187 (inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2188 inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2189 assert(inst->dst.type == inst->src[0].type);
2190 assert(!inst->saturate);
2191 assert(!inst->src[0].abs);
2192 assert(!inst->src[0].negate);
2193 const elk::fs_builder ibld(this, block, inst);
2194
2195 if (!inst->is_partial_write())
2196 ibld.emit_undef_for_dst(inst);
2197
2198 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2199 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1));
2200 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2201 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0));
2202
2203 inst->remove(block);
2204 progress = true;
2205 }
2206
2207 if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2208 inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2209 inst->dst.is_null() &&
2210 (inst->src[0].abs || inst->src[0].negate)) {
2211 inst->src[0].abs = false;
2212 inst->src[0].negate = false;
2213 progress = true;
2214 break;
2215 }
2216
2217 if (inst->src[0].file != IMM)
2218 break;
2219
2220 if (inst->saturate) {
2221 /* Full mixed-type saturates don't happen. However, we can end up
2222 * with things like:
2223 *
2224 * mov.sat(8) g21<1>DF -1F
2225 *
2226 * Other mixed-size-but-same-base-type cases may also be possible.
2227 */
2228 if (inst->dst.type != inst->src[0].type &&
2229 inst->dst.type != ELK_REGISTER_TYPE_DF &&
2230 inst->src[0].type != ELK_REGISTER_TYPE_F)
2231 assert(!"unimplemented: saturate mixed types");
2232
2233 if (elk_saturate_immediate(inst->src[0].type,
2234 &inst->src[0].as_elk_reg())) {
2235 inst->saturate = false;
2236 progress = true;
2237 }
2238 }
2239 break;
2240
2241 case ELK_OPCODE_MUL:
2242 if (inst->src[1].file != IMM)
2243 continue;
2244
2245 if (elk_reg_type_is_floating_point(inst->src[1].type))
2246 break;
2247
2248 /* From the BDW PRM, Vol 2a, "mul - Multiply":
2249 *
2250 * "When multiplying integer datatypes, if src0 is DW and src1
2251 * is W, irrespective of the destination datatype, the
2252 * accumulator maintains full 48-bit precision."
2253 * ...
2254 * "When multiplying integer data types, if one of the sources
2255 * is a DW, the resulting full precision data is stored in
2256 * the accumulator."
2257 *
2258 * There are also similar notes in earlier PRMs.
2259 *
2260 * The MOV instruction can copy the bits of the source, but it
2261 * does not clear the higher bits of the accumulator. So, because
2262 * we might use the full accumulator in the MUL/MACH macro, we
2263 * shouldn't replace such MULs with MOVs.
2264 */
2265 if ((elk_reg_type_to_size(inst->src[0].type) == 4 ||
2266 elk_reg_type_to_size(inst->src[1].type) == 4) &&
2267 (inst->dst.is_accumulator() ||
2268 inst->writes_accumulator_implicitly(devinfo)))
2269 break;
2270
2271 /* a * 1.0 = a */
2272 if (inst->src[1].is_one()) {
2273 inst->opcode = ELK_OPCODE_MOV;
2274 inst->sources = 1;
2275 inst->src[1] = reg_undef;
2276 progress = true;
2277 break;
2278 }
2279
2280 /* a * -1.0 = -a */
2281 if (inst->src[1].is_negative_one()) {
2282 inst->opcode = ELK_OPCODE_MOV;
2283 inst->sources = 1;
2284 inst->src[0].negate = !inst->src[0].negate;
2285 inst->src[1] = reg_undef;
2286 progress = true;
2287 break;
2288 }
2289
2290 break;
2291 case ELK_OPCODE_ADD:
2292 if (inst->src[1].file != IMM)
2293 continue;
2294
2295 if (elk_reg_type_is_integer(inst->src[1].type) &&
2296 inst->src[1].is_zero()) {
2297 inst->opcode = ELK_OPCODE_MOV;
2298 inst->sources = 1;
2299 inst->src[1] = reg_undef;
2300 progress = true;
2301 break;
2302 }
2303
2304 if (inst->src[0].file == IMM) {
2305 assert(inst->src[0].type == ELK_REGISTER_TYPE_F);
2306 inst->opcode = ELK_OPCODE_MOV;
2307 inst->sources = 1;
2308 inst->src[0].f += inst->src[1].f;
2309 inst->src[1] = reg_undef;
2310 progress = true;
2311 break;
2312 }
2313 break;
2314
2315 case ELK_OPCODE_AND:
2316 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2317 const uint64_t src0 = src_as_uint(inst->src[0]);
2318 const uint64_t src1 = src_as_uint(inst->src[1]);
2319
2320 inst->opcode = ELK_OPCODE_MOV;
2321 inst->sources = 1;
2322 inst->src[0] = elk_imm_for_type(src0 & src1, inst->dst.type);
2323 inst->src[1] = reg_undef;
2324 progress = true;
2325 break;
2326 }
2327
2328 break;
2329
2330 case ELK_OPCODE_OR:
2331 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2332 const uint64_t src0 = src_as_uint(inst->src[0]);
2333 const uint64_t src1 = src_as_uint(inst->src[1]);
2334
2335 inst->opcode = ELK_OPCODE_MOV;
2336 inst->sources = 1;
2337 inst->src[0] = elk_imm_for_type(src0 | src1, inst->dst.type);
2338 inst->src[1] = reg_undef;
2339 progress = true;
2340 break;
2341 }
2342
2343 if (inst->src[0].equals(inst->src[1]) ||
2344 inst->src[1].is_zero()) {
2345 /* On Gfx8+, the OR instruction can have a source modifier that
2346 * performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
2347 * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2348 */
2349 if (inst->src[0].negate) {
2350 inst->opcode = ELK_OPCODE_NOT;
2351 inst->sources = 1;
2352 inst->src[0].negate = false;
2353 } else {
2354 inst->opcode = ELK_OPCODE_MOV;
2355 inst->sources = 1;
2356 }
2357 inst->src[1] = reg_undef;
2358 progress = true;
2359 break;
2360 }
2361 break;
2362 case ELK_OPCODE_CMP:
2363 if ((inst->conditional_mod == ELK_CONDITIONAL_Z ||
2364 inst->conditional_mod == ELK_CONDITIONAL_NZ) &&
2365 inst->src[1].is_zero() &&
2366 (inst->src[0].abs || inst->src[0].negate)) {
2367 inst->src[0].abs = false;
2368 inst->src[0].negate = false;
2369 progress = true;
2370 break;
2371 }
2372 break;
2373 case ELK_OPCODE_SEL:
2374 if (!devinfo->has_64bit_float &&
2375 !devinfo->has_64bit_int &&
2376 (inst->dst.type == ELK_REGISTER_TYPE_DF ||
2377 inst->dst.type == ELK_REGISTER_TYPE_UQ ||
2378 inst->dst.type == ELK_REGISTER_TYPE_Q)) {
2379 assert(inst->dst.type == inst->src[0].type);
2380 assert(!inst->saturate);
2381 assert(!inst->src[0].abs && !inst->src[0].negate);
2382 assert(!inst->src[1].abs && !inst->src[1].negate);
2383 const elk::fs_builder ibld(this, block, inst);
2384
2385 if (!inst->is_partial_write())
2386 ibld.emit_undef_for_dst(inst);
2387
2388 set_predicate(inst->predicate,
2389 ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
2390 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
2391 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0)));
2392 set_predicate(inst->predicate,
2393 ibld.SEL(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
2394 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
2395 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1)));
2396
2397 inst->remove(block);
2398 progress = true;
2399 }
2400 if (inst->src[0].equals(inst->src[1])) {
2401 inst->opcode = ELK_OPCODE_MOV;
2402 inst->sources = 1;
2403 inst->src[1] = reg_undef;
2404 inst->predicate = ELK_PREDICATE_NONE;
2405 inst->predicate_inverse = false;
2406 progress = true;
2407 } else if (inst->saturate && inst->src[1].file == IMM) {
2408 switch (inst->conditional_mod) {
2409 case ELK_CONDITIONAL_LE:
2410 case ELK_CONDITIONAL_L:
2411 switch (inst->src[1].type) {
2412 case ELK_REGISTER_TYPE_F:
2413 if (inst->src[1].f >= 1.0f) {
2414 inst->opcode = ELK_OPCODE_MOV;
2415 inst->sources = 1;
2416 inst->src[1] = reg_undef;
2417 inst->conditional_mod = ELK_CONDITIONAL_NONE;
2418 progress = true;
2419 }
2420 break;
2421 default:
2422 break;
2423 }
2424 break;
2425 case ELK_CONDITIONAL_GE:
2426 case ELK_CONDITIONAL_G:
2427 switch (inst->src[1].type) {
2428 case ELK_REGISTER_TYPE_F:
2429 if (inst->src[1].f <= 0.0f) {
2430 inst->opcode = ELK_OPCODE_MOV;
2431 inst->sources = 1;
2432 inst->src[1] = reg_undef;
2433 inst->conditional_mod = ELK_CONDITIONAL_NONE;
2434 progress = true;
2435 }
2436 break;
2437 default:
2438 break;
2439 }
2440 default:
2441 break;
2442 }
2443 }
2444 break;
2445 case ELK_OPCODE_MAD:
2446 if (inst->src[0].type != ELK_REGISTER_TYPE_F ||
2447 inst->src[1].type != ELK_REGISTER_TYPE_F ||
2448 inst->src[2].type != ELK_REGISTER_TYPE_F)
2449 break;
2450 if (inst->src[1].is_one()) {
2451 inst->opcode = ELK_OPCODE_ADD;
2452 inst->sources = 2;
2453 inst->src[1] = inst->src[2];
2454 inst->src[2] = reg_undef;
2455 progress = true;
2456 } else if (inst->src[2].is_one()) {
2457 inst->opcode = ELK_OPCODE_ADD;
2458 inst->sources = 2;
2459 inst->src[2] = reg_undef;
2460 progress = true;
2461 }
2462 break;
2463 case ELK_OPCODE_SHL:
2464 if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
2465 /* It's not currently possible to generate this, and this constant
2466 * folding does not handle it.
2467 */
2468 assert(!inst->saturate);
2469
2470 elk_fs_reg result;
2471
2472 switch (type_sz(inst->src[0].type)) {
2473 case 2:
2474 result = elk_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
2475 break;
2476 case 4:
2477 result = elk_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
2478 break;
2479 case 8:
2480 result = elk_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
2481 break;
2482 default:
2483 /* Just in case a future platform re-enables B or UB types. */
2484 unreachable("Invalid source size.");
2485 }
2486
2487 inst->opcode = ELK_OPCODE_MOV;
2488 inst->src[0] = retype(result, inst->dst.type);
2489 inst->src[1] = reg_undef;
2490 inst->sources = 1;
2491
2492 progress = true;
2493 }
2494 break;
2495
2496 case ELK_SHADER_OPCODE_BROADCAST:
2497 if (is_uniform(inst->src[0])) {
2498 inst->opcode = ELK_OPCODE_MOV;
2499 inst->sources = 1;
2500 inst->force_writemask_all = true;
2501 progress = true;
2502 } else if (inst->src[1].file == IMM) {
2503 inst->opcode = ELK_OPCODE_MOV;
2504 /* It's possible that the selected component will be too large and
2505 * overflow the register. This can happen if someone does a
2506 * readInvocation() from GLSL or SPIR-V and provides an OOB
2507 * invocationIndex. If this happens and we somehow manage
2508 * to constant fold it in and get here, then component() may cause
2509 * us to start reading outside of the VGRF which will lead to an
2510 * assert later. Instead, just let it wrap around if it goes over
2511 * exec_size.
2512 */
2513 const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
2514 inst->src[0] = component(inst->src[0], comp);
2515 inst->sources = 1;
2516 inst->force_writemask_all = true;
2517 progress = true;
2518 }
2519 break;
2520
2521 case ELK_SHADER_OPCODE_SHUFFLE:
2522 if (is_uniform(inst->src[0])) {
2523 inst->opcode = ELK_OPCODE_MOV;
2524 inst->sources = 1;
2525 progress = true;
2526 } else if (inst->src[1].file == IMM) {
2527 inst->opcode = ELK_OPCODE_MOV;
2528 inst->src[0] = component(inst->src[0],
2529 inst->src[1].ud);
2530 inst->sources = 1;
2531 progress = true;
2532 }
2533 break;
2534
2535 default:
2536 break;
2537 }
2538
2539 /* Ensure that the correct source has the immediate value. 2-source
2540 * instructions must have the immediate in src[1]. On Gfx12 and later,
2541 * some 3-source instructions can have the immediate in src[0] or
2542 * src[2]. It's complicated, so don't mess with 3-source instructions
2543 * here.
2544 */
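/* For illustration only: if one of the folds above left a two-source
 * commutative instruction such as add(8) g10, 7D, g20 with the immediate
 * still in src[0], the swap below turns it into add(8) g10, g20, 7D.
 */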
2545 if (progress && inst->sources == 2 && inst->is_commutative()) {
2546 if (inst->src[0].file == IMM) {
2547 elk_fs_reg tmp = inst->src[1];
2548 inst->src[1] = inst->src[0];
2549 inst->src[0] = tmp;
2550 }
2551 }
2552 }
2553
2554 if (progress)
2555 invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
2556 DEPENDENCY_INSTRUCTION_DETAIL);
2557
2558 return progress;
2559 }
2560
2561 static unsigned
2562 load_payload_sources_read_for_size(elk_fs_inst *lp, unsigned size_read)
2563 {
2564 assert(lp->opcode == ELK_SHADER_OPCODE_LOAD_PAYLOAD);
2565 assert(size_read >= lp->header_size * REG_SIZE);
2566
2567 unsigned i;
2568 unsigned size = lp->header_size * REG_SIZE;
2569 for (i = lp->header_size; size < size_read && i < lp->sources; i++)
2570 size += lp->exec_size * type_sz(lp->src[i].type);
2571
2572 /* The size read must correspond to a whole number of leading sources. */
2573 assert(size == size_read);
2574 return i;
2575 }
2576
2577 /**
2578 * Optimize sample messages that have constant zero values for the trailing
2579 * parameters. We can just reduce the message length for these
2580 * instructions instead of reserving a register for it. Trailing parameters
2581 * that aren't sent default to zero anyway. This will cause the dead code
2582 * eliminator to remove the MOV instruction that would otherwise be emitted to
2583 * set up the zero value.
2584 */
2585 bool
2586 elk_fs_visitor::opt_zero_samples()
2587 {
2588 /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
2589 assert(devinfo->ver >= 7);
2590
2591 bool progress = false;
2592
2593 foreach_block_and_inst(block, elk_fs_inst, send, cfg) {
2594 if (send->opcode != ELK_SHADER_OPCODE_SEND ||
2595 send->sfid != ELK_SFID_SAMPLER)
2596 continue;
2597
2598 /* Wa_14012688258:
2599 *
2600 * Don't trim zeros at the end of payload for sample operations
2601 * in cube and cube arrays.
2602 */
2603 if (send->keep_payload_trailing_zeros)
2604 continue;
2605
2606 elk_fs_inst *lp = (elk_fs_inst *) send->prev;
2607
2608 if (lp->is_head_sentinel() || lp->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
2609 continue;
2610
2611 /* How much of the payload is actually read by this SEND. */
2612 const unsigned params =
2613 load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
2614
2615 /* We don't want to remove the message header or the first parameter.
2616 * Removing the first parameter is not allowed, see the Haswell PRM
2617 * volume 7, page 149:
2618 *
2619 * "Parameter 0 is required except for the sampleinfo message, which
2620 * has no parameter 0"
2621 */
2622 const unsigned first_param_idx = lp->header_size;
2623 unsigned zero_size = 0;
2624 for (unsigned i = params - 1; i > first_param_idx; i--) {
2625 if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
2626 break;
2627 zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
2628 }
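/* Illustrative case (assuming SIMD8, 32-bit parameters, a packed
 * destination and 32-byte GRFs): a payload of header + u, v, r, lod where
 * r and lod are immediate zeros makes the loop above accumulate two
 * registers' worth of zero_size, so the SEND's mlen below shrinks by two.
 */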
2629
2630 const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE);
2631 if (zero_len > 0) {
2632 send->mlen -= zero_len;
2633 progress = true;
2634 }
2635 }
2636
2637 if (progress)
2638 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2639
2640 return progress;
2641 }
2642
2643 /**
2644 * Remove redundant or useless halts.
2645 *
2646 * For example, we can eliminate halts in the following sequence:
2647 *
2648 * halt (redundant with the next halt)
2649 * halt (useless; jumps to the next instruction)
2650 * halt-target
2651 */
2652 bool
2653 elk_fs_visitor::opt_redundant_halt()
2654 {
2655 bool progress = false;
2656
2657 unsigned halt_count = 0;
2658 elk_fs_inst *halt_target = NULL;
2659 elk_bblock_t *halt_target_block = NULL;
2660 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
2661 if (inst->opcode == ELK_OPCODE_HALT)
2662 halt_count++;
2663
2664 if (inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
2665 halt_target = inst;
2666 halt_target_block = block;
2667 break;
2668 }
2669 }
2670
2671 if (!halt_target) {
2672 assert(halt_count == 0);
2673 return false;
2674 }
2675
2676 /* Delete any HALTs immediately before the halt target. */
2677 for (elk_fs_inst *prev = (elk_fs_inst *) halt_target->prev;
2678 !prev->is_head_sentinel() && prev->opcode == ELK_OPCODE_HALT;
2679 prev = (elk_fs_inst *) halt_target->prev) {
2680 prev->remove(halt_target_block);
2681 halt_count--;
2682 progress = true;
2683 }
2684
2685 if (halt_count == 0) {
2686 halt_target->remove(halt_target_block);
2687 progress = true;
2688 }
2689
2690 if (progress)
2691 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2692
2693 return progress;
2694 }
2695
2696 /**
2697 * Compute a bitmask with GRF granularity with a bit set for each GRF starting
2698 * from \p r.offset which overlaps the region starting at \p s.offset and
2699 * spanning \p ds bytes.
2700 */
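/* Worked example (illustrative values, REG_SIZE == 32): if s starts 40
 * bytes past r and ds == 20, then shift == 1 and the overlapped span fits
 * in a single GRF, so the function returns 0b10.
 */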
2701 static inline unsigned
2702 mask_relative_to(const elk_fs_reg &r, const elk_fs_reg &s, unsigned ds)
2703 {
2704 const int rel_offset = reg_offset(s) - reg_offset(r);
2705 const int shift = rel_offset / REG_SIZE;
2706 const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
2707 assert(reg_space(r) == reg_space(s) &&
2708 shift >= 0 && shift < int(8 * sizeof(unsigned)));
2709 return ((1 << n) - 1) << shift;
2710 }
2711
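/* Sketch of the transformation this pass aims for (illustrative registers):
 *
 *    add(8)  g10, g4, g5
 *    mov(8)  m3, g10
 *
 * becomes, when g10 is not read again afterwards,
 *
 *    add(8)  m3, g4, g5
 */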
2712 bool
2713 elk_fs_visitor::compute_to_mrf()
2714 {
2715 bool progress = false;
2716 int next_ip = 0;
2717
2718 /* No MRFs on Gen >= 7. */
2719 if (devinfo->ver >= 7)
2720 return false;
2721
2722 const fs_live_variables &live = live_analysis.require();
2723
2724 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2725 int ip = next_ip;
2726 next_ip++;
2727
2728 if (inst->opcode != ELK_OPCODE_MOV ||
2729 inst->is_partial_write() ||
2730 inst->dst.file != MRF || inst->src[0].file != VGRF ||
2731 inst->dst.type != inst->src[0].type ||
2732 inst->src[0].abs || inst->src[0].negate ||
2733 !inst->src[0].is_contiguous() ||
2734 inst->src[0].offset % REG_SIZE != 0)
2735 continue;
2736
2737 /* Can't compute-to-MRF this GRF if someone else was going to
2738 * read it later.
2739 */
2740 if (live.vgrf_end[inst->src[0].nr] > ip)
2741 continue;
2742
2743 /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
2744 * things that computed the value of all GRFs of the source region. The
2745 * regs_left bitset keeps track of the registers we haven't yet found a
2746 * generating instruction for.
2747 */
2748 unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
2749
2750 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2751 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2752 inst->src[0], inst->size_read(0))) {
2753 /* Found the last thing to write our reg we want to turn
2754 * into a compute-to-MRF.
2755 */
2756
2757 /* If this one instruction didn't populate all the
2758 * channels, bail. We might be able to rewrite everything
2759 * that writes that reg, but it would require smarter
2760 * tracking.
2761 */
2762 if (scan_inst->is_partial_write())
2763 break;
2764
2765 /* Handling things not fully contained in the source of the copy
2766 * would need us to understand coalescing out more than one MOV at
2767 * a time.
2768 */
2769 if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
2770 inst->src[0], inst->size_read(0)))
2771 break;
2772
2773 /* SEND instructions can't have MRF as a destination. */
2774 if (scan_inst->mlen)
2775 break;
2776
2777 if (devinfo->ver == 6) {
2778 /* gfx6 math instructions must have the destination be
2779 * GRF, so no compute-to-MRF for them.
2780 */
2781 if (scan_inst->is_math()) {
2782 break;
2783 }
2784 }
2785
2786 /* Clear the bits for any registers this instruction overwrites. */
2787 regs_left &= ~mask_relative_to(
2788 inst->src[0], scan_inst->dst, scan_inst->size_written);
2789 if (!regs_left)
2790 break;
2791 }
2792
2793 /* We don't handle control flow here. Most computation of
2794 * values that end up in MRFs happens shortly before the MRF
2795 * write anyway.
2796 */
2797 if (block->start() == scan_inst)
2798 break;
2799
2800 /* You can't read from an MRF, so if someone else reads our
2801 * MRF's source GRF that we wanted to rewrite, that stops us.
2802 */
2803 bool interfered = false;
2804 for (int i = 0; i < scan_inst->sources; i++) {
2805 if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
2806 inst->src[0], inst->size_read(0))) {
2807 interfered = true;
2808 }
2809 }
2810 if (interfered)
2811 break;
2812
2813 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2814 inst->dst, inst->size_written)) {
2815 /* If somebody else writes our MRF here, we can't
2816 * compute-to-MRF before that.
2817 */
2818 break;
2819 }
2820
2821 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
2822 regions_overlap(elk_fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
2823 inst->dst, inst->size_written)) {
2824 /* Found a SEND instruction, which means that there are
2825 * live values in MRFs from base_mrf to base_mrf +
2826 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2827 * above it.
2828 */
2829 break;
2830 }
2831 }
2832
2833 if (regs_left)
2834 continue;
2835
2836 /* Found all generating instructions of our MRF's source value, so it
2837 * should be safe to rewrite them to point to the MRF directly.
2838 */
2839 regs_left = (1 << regs_read(inst, 0)) - 1;
2840
2841 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
2842 if (regions_overlap(scan_inst->dst, scan_inst->size_written,
2843 inst->src[0], inst->size_read(0))) {
2844 /* Clear the bits for any registers this instruction overwrites. */
2845 regs_left &= ~mask_relative_to(
2846 inst->src[0], scan_inst->dst, scan_inst->size_written);
2847
2848 const unsigned rel_offset = reg_offset(scan_inst->dst) -
2849 reg_offset(inst->src[0]);
2850
2851 if (inst->dst.nr & ELK_MRF_COMPR4) {
2852 /* Apply the same address transformation done by the hardware
2853 * for COMPR4 MRF writes.
2854 */
2855 assert(rel_offset < 2 * REG_SIZE);
2856 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
2857
2858 /* Clear the COMPR4 bit if the generating instruction is not
2859 * compressed.
2860 */
2861 if (scan_inst->size_written < 2 * REG_SIZE)
2862 scan_inst->dst.nr &= ~ELK_MRF_COMPR4;
2863
2864 } else {
2865 /* Calculate the MRF number the result of this instruction is
2866 * ultimately written to.
2867 */
2868 scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
2869 }
2870
2871 scan_inst->dst.file = MRF;
2872 scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
2873 scan_inst->saturate |= inst->saturate;
2874 if (!regs_left)
2875 break;
2876 }
2877 }
2878
2879 assert(!regs_left);
2880 inst->remove(block);
2881 progress = true;
2882 }
2883
2884 if (progress)
2885 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2886
2887 return progress;
2888 }
2889
2890 /**
2891 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2892 * flow. We could probably do better here with some form of divergence
2893 * analysis.
2894 */
2895 bool
2896 elk_fs_visitor::eliminate_find_live_channel()
2897 {
2898 bool progress = false;
2899 unsigned depth = 0;
2900
2901 if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
2902 /* The optimization below assumes that channel zero is live on thread
2903 * dispatch, which may not be the case if the fixed function dispatches
2904 * threads sparsely.
2905 */
2906 return false;
2907 }
2908
2909 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
2910 switch (inst->opcode) {
2911 case ELK_OPCODE_IF:
2912 case ELK_OPCODE_DO:
2913 depth++;
2914 break;
2915
2916 case ELK_OPCODE_ENDIF:
2917 case ELK_OPCODE_WHILE:
2918 depth--;
2919 break;
2920
2921 case ELK_OPCODE_HALT:
2922 /* This can potentially make control flow non-uniform until the end
2923 * of the program.
2924 */
2925 goto out;
2926
2927 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
2928 if (depth == 0) {
2929 inst->opcode = ELK_OPCODE_MOV;
2930 inst->src[0] = elk_imm_ud(0u);
2931 inst->sources = 1;
2932 inst->force_writemask_all = true;
2933 progress = true;
2934 }
2935 break;
2936
2937 default:
2938 break;
2939 }
2940 }
2941
2942 out:
2943 if (progress)
2944 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
2945
2946 return progress;
2947 }
2948
2949 /**
2950 * Once we've generated code, try to convert normal ELK_FS_OPCODE_FB_WRITE
2951 * instructions to ELK_FS_OPCODE_REP_FB_WRITE.
2952 */
2953 void
2954 elk_fs_visitor::emit_repclear_shader()
2955 {
2956 elk_wm_prog_key *key = (elk_wm_prog_key*) this->key;
2957 elk_fs_inst *write = NULL;
2958
2959 assert(uniforms == 0);
2960 assume(key->nr_color_regions > 0);
2961
2962 elk_fs_reg color_output, header;
2963 if (devinfo->ver >= 7) {
2964 color_output = retype(elk_vec4_grf(127, 0), ELK_REGISTER_TYPE_UD);
2965 header = retype(elk_vec8_grf(125, 0), ELK_REGISTER_TYPE_UD);
2966 } else {
2967 color_output = retype(elk_vec4_reg(MRF, 2, 0), ELK_REGISTER_TYPE_UD);
2968 header = retype(elk_vec8_reg(MRF, 0, 0), ELK_REGISTER_TYPE_UD);
2969 }
2970
2971 /* We pass the clear color as a flat input. Copy it to the output. */
2972 elk_fs_reg color_input =
2973 elk_reg(ELK_GENERAL_REGISTER_FILE, 2, 3, 0, 0, ELK_REGISTER_TYPE_UD,
2974 ELK_VERTICAL_STRIDE_8, ELK_WIDTH_2, ELK_HORIZONTAL_STRIDE_4,
2975 ELK_SWIZZLE_XYZW, WRITEMASK_XYZW);
2976
2977 const fs_builder bld = fs_builder(this).at_end();
2978 bld.exec_all().group(4, 0).MOV(color_output, color_input);
2979
2980 if (key->nr_color_regions > 1) {
2981 /* Copy g0..g1 as the message header */
2982 bld.exec_all().group(16, 0)
2983 .MOV(header, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2984 }
2985
2986 for (int i = 0; i < key->nr_color_regions; ++i) {
2987 if (i > 0)
2988 bld.exec_all().group(1, 0).MOV(component(header, 2), elk_imm_ud(i));
2989
2990 if (devinfo->ver >= 7) {
2991 write = bld.emit(ELK_SHADER_OPCODE_SEND);
2992 write->resize_sources(2);
2993 write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
2994 write->src[0] = elk_imm_ud(0);
2995 write->src[1] = i == 0 ? color_output : header;
2996 write->check_tdr = true;
2997 write->send_has_side_effects = true;
2998 write->desc = elk_fb_write_desc(devinfo, i,
2999 ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
3000 i == key->nr_color_regions - 1, false);
3001 } else {
3002 write = bld.emit(ELK_FS_OPCODE_REP_FB_WRITE);
3003 write->target = i;
3004 write->base_mrf = i == 0 ? color_output.nr : header.nr;
3005 }
3006
3007 /* We can use a headerless message for the first render target */
3008 write->header_size = i == 0 ? 0 : 2;
3009 write->mlen = 1 + write->header_size;
3010 }
3011 write->eot = true;
3012 write->last_rt = true;
3013
3014 calculate_cfg();
3015
3016 this->first_non_payload_grf = payload().num_regs;
3017 }
3018
3019 /**
3020 * Walks through basic blocks, looking for repeated MRF writes and
3021 * removing the later ones.
3022 */
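/* Illustrative case: two identical "mov(8) m2, g4" writes in the same
 * block, with neither m2 nor g4 written in between and matching
 * predicate/saturate/exec size, cause the second MOV to be removed.
 */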
3023 bool
3024 elk_fs_visitor::remove_duplicate_mrf_writes()
3025 {
3026 elk_fs_inst *last_mrf_move[ELK_MAX_MRF_ALL];
3027 bool progress = false;
3028
3029 /* We would need to update the MRF tracking to handle compressed instructions, so skip SIMD16 and wider dispatch. */
3030 if (dispatch_width >= 16)
3031 return false;
3032
3033 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3034
3035 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3036 if (inst->is_control_flow()) {
3037 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3038 }
3039
3040 if (inst->opcode == ELK_OPCODE_MOV &&
3041 inst->dst.file == MRF) {
3042 elk_fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3043 if (prev_inst && prev_inst->opcode == ELK_OPCODE_MOV &&
3044 inst->dst.equals(prev_inst->dst) &&
3045 inst->src[0].equals(prev_inst->src[0]) &&
3046 inst->saturate == prev_inst->saturate &&
3047 inst->predicate == prev_inst->predicate &&
3048 inst->conditional_mod == prev_inst->conditional_mod &&
3049 inst->exec_size == prev_inst->exec_size) {
3050 inst->remove(block);
3051 progress = true;
3052 continue;
3053 }
3054 }
3055
3056 /* Clear out the last-write records for MRFs that were overwritten. */
3057 if (inst->dst.file == MRF) {
3058 last_mrf_move[inst->dst.nr] = NULL;
3059 }
3060
3061 if (inst->mlen > 0 && inst->base_mrf != -1) {
3062 /* Found a SEND instruction, which will include two or fewer
3063 * implied MRF writes. We could do better here.
3064 */
3065 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3066 last_mrf_move[inst->base_mrf + i] = NULL;
3067 }
3068 }
3069
3070 /* Clear out any MRF move records whose sources got overwritten. */
3071 for (unsigned i = 0; i < ELK_MAX_MRF(devinfo->ver); i++) {
3072 if (last_mrf_move[i] &&
3073 regions_overlap(inst->dst, inst->size_written,
3074 last_mrf_move[i]->src[0],
3075 last_mrf_move[i]->size_read(0))) {
3076 last_mrf_move[i] = NULL;
3077 }
3078 }
3079
3080 if (inst->opcode == ELK_OPCODE_MOV &&
3081 inst->dst.file == MRF &&
3082 inst->src[0].file != ARF &&
3083 !inst->is_partial_write()) {
3084 last_mrf_move[inst->dst.nr] = inst;
3085 }
3086 }
3087
3088 if (progress)
3089 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3090
3091 return progress;
3092 }
3093
3094 /**
3095 * Rounding modes are specified per conversion instruction, but the
3096 * hardware rounding mode is really a piece of state: once it has been
3097 * set, it doesn't need to be set again for subsequent conversions that
3098 * use the same mode.
3099 *
3100 * This is useful for vector/matrix conversions, as setting the mode once is enough for the whole vector or matrix.
3101 */
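/* For example (a sketch, assuming the float controls don't already imply a
 * default mode): four scalar conversions lowered from a dvec4 -> vec4
 * conversion, each preceded by an identical RND_MODE(RTNE), keep only the
 * first RND_MODE in the block; the other three are removed below.
 */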
3102 bool
3103 elk_fs_visitor::remove_extra_rounding_modes()
3104 {
3105 bool progress = false;
3106 unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3107
3108 elk_rnd_mode base_mode = ELK_RND_MODE_UNSPECIFIED;
3109 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3110 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3111 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3112 execution_mode)
3113 base_mode = ELK_RND_MODE_RTNE;
3114 if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3115 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3116 FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3117 execution_mode)
3118 base_mode = ELK_RND_MODE_RTZ;
3119
3120 foreach_block (block, cfg) {
3121 elk_rnd_mode prev_mode = base_mode;
3122
3123 foreach_inst_in_block_safe (elk_fs_inst, inst, block) {
3124 if (inst->opcode == ELK_SHADER_OPCODE_RND_MODE) {
3125 assert(inst->src[0].file == ELK_IMMEDIATE_VALUE);
3126 const elk_rnd_mode mode = (elk_rnd_mode) inst->src[0].d;
3127 if (mode == prev_mode) {
3128 inst->remove(block);
3129 progress = true;
3130 } else {
3131 prev_mode = mode;
3132 }
3133 }
3134 }
3135 }
3136
3137 if (progress)
3138 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3139
3140 return progress;
3141 }
3142
3143 static void
3144 clear_deps_for_inst_src(elk_fs_inst *inst, bool *deps, int first_grf, int grf_len)
3145 {
3146 /* Clear the flag for registers that actually got read (as expected). */
3147 for (int i = 0; i < inst->sources; i++) {
3148 int grf;
3149 if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3150 grf = inst->src[i].nr;
3151 } else {
3152 continue;
3153 }
3154
3155 if (grf >= first_grf &&
3156 grf < first_grf + grf_len) {
3157 deps[grf - first_grf] = false;
3158 if (inst->exec_size == 16)
3159 deps[grf - first_grf + 1] = false;
3160 }
3161 }
3162 }
3163
3164 /**
3165 * Implements this workaround for the original 965:
3166 *
3167 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3168 * check for post destination dependencies on this instruction, software
3169 * must ensure that there is no destination hazard for the case of ‘write
3170 * followed by a posted write’ shown in the following example.
3171 *
3172 * 1. mov r3 0
3173 * 2. send r3.xy <rest of send instruction>
3174 * 3. mov r2 r3
3175 *
3176 * Due to no post-destination dependency check on the ‘send’, the above
3177 * code sequence could have two instructions (1 and 2) in flight at the
3178 * same time that both consider ‘r3’ as the target of their final writes.
3179 */
3180 void
3181 elk_fs_visitor::insert_gfx4_pre_send_dependency_workarounds(elk_bblock_t *block,
3182 elk_fs_inst *inst)
3183 {
3184 int write_len = regs_written(inst);
3185 int first_write_grf = inst->dst.nr;
3186 bool needs_dep[ELK_MAX_MRF_ALL];
3187 assert(write_len < ELK_MAX_MRF(devinfo->ver) - 1);
3188
3189 memset(needs_dep, false, sizeof(needs_dep));
3190 memset(needs_dep, true, write_len);
3191
3192 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3193
3194 /* Walk backwards looking for writes to registers we're writing which
3195 * aren't read since being written. If we hit the start of the program,
3196 * we assume that there are no outstanding dependencies on entry to the
3197 * program.
3198 */
3199 foreach_inst_in_block_reverse_starting_from(elk_fs_inst, scan_inst, inst) {
3200 /* If we hit control flow, assume that there *are* outstanding
3201 * dependencies, and force their cleanup before our instruction.
3202 */
3203 if (block->start() == scan_inst && block->num != 0) {
3204 for (int i = 0; i < write_len; i++) {
3205 if (needs_dep[i])
3206 DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3207 first_write_grf + i);
3208 }
3209 return;
3210 }
3211
3212 /* We insert our reads as late as possible on the assumption that any
3213 * instruction other than a MOV that might have left us an outstanding
3214 * dependency has more latency than a MOV.
3215 */
3216 if (scan_inst->dst.file == VGRF) {
3217 for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3218 int reg = scan_inst->dst.nr + i;
3219
3220 if (reg >= first_write_grf &&
3221 reg < first_write_grf + write_len &&
3222 needs_dep[reg - first_write_grf]) {
3223 DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3224 needs_dep[reg - first_write_grf] = false;
3225 if (scan_inst->exec_size == 16)
3226 needs_dep[reg - first_write_grf + 1] = false;
3227 }
3228 }
3229 }
3230
3231 /* Clear the flag for registers that actually got read (as expected). */
3232 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3233
3234 /* Continue the loop only if we haven't resolved all the dependencies */
3235 int i;
3236 for (i = 0; i < write_len; i++) {
3237 if (needs_dep[i])
3238 break;
3239 }
3240 if (i == write_len)
3241 return;
3242 }
3243 }
3244
3245 /**
3246 * Implements this workaround for the original 965:
3247 *
3248 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3249 * used as a destination register until after it has been sourced by an
3250 * instruction with a different destination register.
3251 */
3252 void
3253 elk_fs_visitor::insert_gfx4_post_send_dependency_workarounds(elk_bblock_t *block, elk_fs_inst *inst)
3254 {
3255 int write_len = regs_written(inst);
3256 unsigned first_write_grf = inst->dst.nr;
3257 bool needs_dep[ELK_MAX_MRF_ALL];
3258 assert(write_len < ELK_MAX_MRF(devinfo->ver) - 1);
3259
3260 memset(needs_dep, false, sizeof(needs_dep));
3261 memset(needs_dep, true, write_len);
3262 /* Walk forwards looking for writes to registers we're writing which aren't
3263 * read before being written.
3264 */
3265 foreach_inst_in_block_starting_from(elk_fs_inst, scan_inst, inst) {
3266 /* If we hit control flow, force resolve all remaining dependencies. */
3267 if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3268 for (int i = 0; i < write_len; i++) {
3269 if (needs_dep[i])
3270 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3271 first_write_grf + i);
3272 }
3273 return;
3274 }
3275
3276 /* Clear the flag for registers that actually got read (as expected). */
3277 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3278
3279 /* We insert our reads as late as possible since they're reading the
3280 * result of a SEND, which has massive latency.
3281 */
3282 if (scan_inst->dst.file == VGRF &&
3283 scan_inst->dst.nr >= first_write_grf &&
3284 scan_inst->dst.nr < first_write_grf + write_len &&
3285 needs_dep[scan_inst->dst.nr - first_write_grf]) {
3286 DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3287 scan_inst->dst.nr);
3288 needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3289 }
3290
3291 /* Continue the loop only if we haven't resolved all the dependencies */
3292 int i;
3293 for (i = 0; i < write_len; i++) {
3294 if (needs_dep[i])
3295 break;
3296 }
3297 if (i == write_len)
3298 return;
3299 }
3300 }
3301
3302 void
3303 elk_fs_visitor::insert_gfx4_send_dependency_workarounds()
3304 {
3305 if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
3306 return;
3307
3308 bool progress = false;
3309
3310 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
3311 if (inst->mlen != 0 && inst->dst.file == VGRF) {
3312 insert_gfx4_pre_send_dependency_workarounds(block, inst);
3313 insert_gfx4_post_send_dependency_workarounds(block, inst);
3314 progress = true;
3315 }
3316 }
3317
3318 if (progress)
3319 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3320 }
3321
3322 /**
3323 * flags_read() and flags_written() report flag access at byte granularity,
3324 * but the PRM lists "Access Granularity: Word" for the Flag Register, so we
3325 * can assume accessing any part of a word clears that word's dependency.
3326 */
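/* e.g. bytes_bitmask_to_words(0x5) == 0xf: touching bytes 0 and 2 marks
 * both 16-bit words (bytes 0-1 and 2-3) as accessed.
 */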
3327 static unsigned
3328 bytes_bitmask_to_words(unsigned b)
3329 {
3330 unsigned first_byte_mask = b & 0x55555555;
3331 unsigned second_byte_mask = b & 0xaaaaaaaa;
3332 return first_byte_mask |
3333 (first_byte_mask << 1) |
3334 second_byte_mask |
3335 (second_byte_mask >> 1);
3336 }
3337
3338 /**
3339 * WaClearArfDependenciesBeforeEot
3340 *
3341 * Flag register dependency not cleared after EOT, so we have to source them
3342 * before EOT. We can do this with simple `mov(1) nullUD, f{0,1}UD`
3343 *
3344 * To avoid emitting MOVs when they aren't needed, check whether each block
3345 * reads all the flags it sets. We might falsely classify a flag as unread if
3346 * it is only accessed in a later block, but this should still be good enough.
3347 */
3348 bool
3349 elk_fs_visitor::workaround_source_arf_before_eot()
3350 {
3351 bool progress = false;
3352
3353 if (devinfo->platform != INTEL_PLATFORM_CHV)
3354 return false;
3355
3356 unsigned flags_unread = 0;
3357
3358 foreach_block(block, cfg) {
3359 unsigned flags_unread_in_block = 0;
3360
3361 foreach_inst_in_block(elk_fs_inst, inst, block) {
3362 /* An instruction can both read and write the same flag, so the order here is important. */
3363 flags_unread_in_block &= ~bytes_bitmask_to_words(inst->flags_read(devinfo));
3364 flags_unread_in_block |= bytes_bitmask_to_words(inst->flags_written(devinfo));
3365
3366 /* HALT does not start its block even though it can leave a dependency */
3367 if (inst->opcode == ELK_OPCODE_HALT ||
3368 inst->opcode == ELK_SHADER_OPCODE_HALT_TARGET) {
3369 flags_unread |= flags_unread_in_block;
3370 flags_unread_in_block = 0;
3371 }
3372 }
3373
3374 flags_unread |= flags_unread_in_block;
3375
3376 if ((flags_unread & 0x0f) && (flags_unread & 0xf0))
3377 break;
3378 }
3379
3380 if (flags_unread) {
3381 int eot_count = 0;
3382
3383 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg)
3384 {
3385 if (!inst->eot)
3386 continue;
3387
3388 /* Currently we only ever emit one EOT per program;
3389 * this WA should be updated if that ever changes.
3390 */
3391 ++eot_count;
3392 assert(eot_count == 1);
3393
3394 const fs_builder ibld(this, block, inst);
3395 const fs_builder ubld = ibld.exec_all().group(1, 0);
3396
3397 if (flags_unread & 0x0f)
3398 ubld.MOV(ubld.null_reg_ud(), retype(elk_flag_reg(0, 0), ELK_REGISTER_TYPE_UD));
3399
3400 if (flags_unread & 0xf0)
3401 ubld.MOV(ubld.null_reg_ud(), retype(elk_flag_reg(1, 0), ELK_REGISTER_TYPE_UD));
3402 }
3403
3404 progress = true;
3405 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3406 }
3407
3408 return progress;
3409 }
3410
3411 bool
3412 elk_fs_visitor::lower_load_payload()
3413 {
3414 bool progress = false;
3415
3416 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
3417 if (inst->opcode != ELK_SHADER_OPCODE_LOAD_PAYLOAD)
3418 continue;
3419
3420 assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3421 assert(inst->saturate == false);
3422 elk_fs_reg dst = inst->dst;
3423
3424 /* Get rid of COMPR4. We'll add it back in if we need it */
3425 if (dst.file == MRF)
3426 dst.nr = dst.nr & ~ELK_MRF_COMPR4;
3427
3428 const fs_builder ibld(this, block, inst);
3429 const fs_builder ubld = ibld.exec_all();
3430
3431 for (uint8_t i = 0; i < inst->header_size;) {
3432 /* Number of header GRFs to initialize at once with a single MOV
3433 * instruction.
3434 */
3435 const unsigned n =
3436 (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3437 inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3438 2 : 1;
3439
3440 if (inst->src[i].file != BAD_FILE)
3441 ubld.group(8 * n, 0).MOV(retype(dst, ELK_REGISTER_TYPE_UD),
3442 retype(inst->src[i], ELK_REGISTER_TYPE_UD));
3443
3444 dst = byte_offset(dst, n * REG_SIZE);
3445 i += n;
3446 }
3447
3448 if (inst->dst.file == MRF && (inst->dst.nr & ELK_MRF_COMPR4) &&
3449 inst->exec_size > 8) {
3450 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3451 * a straightforward copy. Instead, the result of the
3452 * LOAD_PAYLOAD is treated as interleaved and the first four
3453 * non-header sources are unpacked as:
3454 *
3455 * m + 0: r0
3456 * m + 1: g0
3457 * m + 2: b0
3458 * m + 3: a0
3459 * m + 4: r1
3460 * m + 5: g1
3461 * m + 6: b1
3462 * m + 7: a1
3463 *
3464 * This is used for gen <= 5 fb writes.
3465 */
3466 assert(inst->exec_size == 16);
3467 assert(inst->header_size + 4 <= inst->sources);
3468 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3469 if (inst->src[i].file != BAD_FILE) {
3470 if (devinfo->has_compr4) {
3471 elk_fs_reg compr4_dst = retype(dst, inst->src[i].type);
3472 compr4_dst.nr |= ELK_MRF_COMPR4;
3473 ibld.MOV(compr4_dst, inst->src[i]);
3474 } else {
3475 /* Platform doesn't have COMPR4. We have to fake it */
3476 elk_fs_reg mov_dst = retype(dst, inst->src[i].type);
3477 ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
3478 mov_dst.nr += 4;
3479 ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
3480 }
3481 }
3482
3483 dst.nr++;
3484 }
3485
3486 /* The loop above only ever incremented us through the first set
3487 * of 4 registers. However, thanks to the magic of COMPR4, we
3488 * actually wrote to the first 8 registers, so we need to take
3489 * that into account now.
3490 */
3491 dst.nr += 4;
3492
3493 /* The COMPR4 code took care of the first 4 sources. We'll let
3494 * the regular path handle any remaining sources. Yes, we are
3495 * modifying the instruction but we're about to delete it so
3496 * this really doesn't hurt anything.
3497 */
3498 inst->header_size += 4;
3499 }
3500
3501 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3502 dst.type = inst->src[i].type;
3503 if (inst->src[i].file != BAD_FILE) {
3504 ibld.MOV(dst, inst->src[i]);
3505 }
3506 dst = offset(dst, ibld, 1);
3507 }
3508
3509 inst->remove(block);
3510 progress = true;
3511 }
3512
3513 if (progress)
3514 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3515
3516 return progress;
3517 }
3518
3519 /**
3520 * Factor an unsigned 32-bit integer.
3521 *
3522 * Attempts to factor \c x into two values that are at most 0xFFFF. If no
3523 * such factorization is possible, either because the value is too large or is
3524 * prime, both \c result_a and \c result_b will be zero.
3525 */
3526 static void
factor_uint32(uint32_t x,unsigned * result_a,unsigned * result_b)3527 factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
3528 {
3529 /* This is necessary to prevent various opportunities for division by zero
3530 * below.
3531 */
3532 assert(x > 0xffff);
3533
3534 /* This represents the actual expected constraints on the input. Namely,
3535 * both the upper and lower words should be > 1.
3536 */
3537 assert(x >= 0x00020002);
3538
3539 *result_a = 0;
3540 *result_b = 0;
3541
3542 /* The value is too large to factor with the constraints. */
3543 if (x > (0xffffu * 0xffffu))
3544 return;
3545
3546 /* A non-prime number will have the form p*q*d where p is some prime
3547 * number, q > 1, and 1 <= d <= q. To meet the constraints of this
3548 * function, (p*d) < 0x10000. This implies d <= floor(0xffff / p).
3549 * Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally,
3550 * floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
3551 *
3552 * The observation is that finding the largest possible value of p reduces the
3553 * possible range of d. After selecting p, all values of d in this range
3554 * are tested until a factorization is found. The size of the range of
3555 * possible values of d sets an upper bound on the run time of the
3556 * function.
3557 */
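   /* As a worked example, consider 0x063b0c83 (= 47 * 1367 * 1627, the value
    * mentioned below): the largest prime in the table that divides it is
    * p = 1367, so x_div_p = 47 * 1627 = 76469.  That does not fit in 16 bits,
    * so the loop below scans d from DIV_ROUND_UP(76469, 0xffff) = 2 up to
    * max_d = 0xffff / 1367 = 47, and at d = 47 finds q = 1627, giving
    * *result_a = 1627 and *result_b = 1367 * 47 = 64249.
    */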
3558 static const uint16_t primes[256] = {
3559 2, 3, 5, 7, 11, 13, 17, 19,
3560 23, 29, 31, 37, 41, 43, 47, 53,
3561 59, 61, 67, 71, 73, 79, 83, 89,
3562 97, 101, 103, 107, 109, 113, 127, 131, /* 32 */
3563 137, 139, 149, 151, 157, 163, 167, 173,
3564 179, 181, 191, 193, 197, 199, 211, 223,
3565 227, 229, 233, 239, 241, 251, 257, 263,
3566 269, 271, 277, 281, 283, 293, 307, 311, /* 64 */
3567 313, 317, 331, 337, 347, 349, 353, 359,
3568 367, 373, 379, 383, 389, 397, 401, 409,
3569 419, 421, 431, 433, 439, 443, 449, 457,
3570 461, 463, 467, 479, 487, 491, 499, 503, /* 96 */
3571 509, 521, 523, 541, 547, 557, 563, 569,
3572 571, 577, 587, 593, 599, 601, 607, 613,
3573 617, 619, 631, 641, 643, 647, 653, 659,
3574 661, 673, 677, 683, 691, 701, 709, 719, /* 128 */
3575 727, 733, 739, 743, 751, 757, 761, 769,
3576 773, 787, 797, 809, 811, 821, 823, 827,
3577 829, 839, 853, 857, 859, 863, 877, 881,
3578 883, 887, 907, 911, 919, 929, 937, 941, /* 160 */
3579 947, 953, 967, 971, 977, 983, 991, 997,
3580 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
3581 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
3582 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */
3583 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
3584 1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
3585 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
3586 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */
3587 1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
3588 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
3589 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
3590 1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */
3591 };
3592
3593 unsigned p;
3594 unsigned x_div_p;
3595
3596 for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
3597 p = primes[i];
3598 x_div_p = x / p;
3599
3600 if ((x_div_p * p) == x)
3601 break;
3602 }
3603
3604 /* A prime factor was not found. */
3605 if (x_div_p * p != x)
3606 return;
3607
3608 /* Terminate early if d=1 is a solution. */
3609 if (x_div_p < 0x10000) {
3610 *result_a = x_div_p;
3611 *result_b = p;
3612 return;
3613 }
3614
3615 /* Pick the maximum possible value for 'd'. It's important that the loop
3616 * below execute while d <= max_d because max_d is a valid value. Having
3617 * the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
3618 * incorrectly reported as not being factorable. The problem would occur
3619 * with any value that is the product of two primes in the table and one prime
3620 * not in the table.
3621 */
3622 const unsigned max_d = 0xffff / p;
3623
3624 /* Pick an initial value of 'd' that (combined with rejecting too large
3625 * values above) guarantees that 'q' will always be small enough.
3626 * DIV_ROUND_UP is used to prevent 'd' from being zero.
3627 */
3628 for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
3629 unsigned q = x_div_p / d;
3630
3631 if ((q * d) == x_div_p) {
3632 assert(p * d * q == x);
3633 assert((p * d) < 0x10000);
3634
3635 *result_a = q;
3636 *result_b = p * d;
3637 break;
3638 }
3639
3640 /* Since every value of 'd' is tried, as soon as 'd' is larger
3641 * than 'q', we're just re-testing combinations that have
3642 * already been tested.
3643 */
3644 if (d > q)
3645 break;
3646 }
3647 }
3648
3649 void
lower_mul_dword_inst(elk_fs_inst * inst,elk_bblock_t * block)3650 elk_fs_visitor::lower_mul_dword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3651 {
3652 const fs_builder ibld(this, block, inst);
3653
3654 /* It is correct to use inst->src[1].d at both ends of the comparison.
3655 * Using .ud in the UINT16_MAX comparison would cause any negative value to
3656 * fail the check.
3657 */
3658 if (inst->src[1].file == IMM &&
3659 (inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
3660 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3661 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3662 * src1 are used.
3663 *
3664 * If multiplying by an immediate value that fits in 16-bits, do a
3665 * single MUL instruction with that value in the proper location.
3666 */
3667 const bool ud = (inst->src[1].d >= 0);
3668 if (devinfo->ver < 7) {
3669 elk_fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
3670 ibld.MOV(imm, inst->src[1]);
3671 ibld.MUL(inst->dst, imm, inst->src[0]);
3672 } else {
3673 ibld.MUL(inst->dst, inst->src[0],
3674 ud ? elk_imm_uw(inst->src[1].ud)
3675 : elk_imm_w(inst->src[1].d));
3676 }
3677 } else {
3678 /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
3679 * do 32-bit integer multiplication in one instruction, but instead
3680 * must do a sequence (which actually calculates a 64-bit result):
3681 *
3682 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3683 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3684 * mov(8) g2<1>D acc0<8,8,1>D
3685 *
3686 * But on Gen > 6, the ability to use second accumulator register
3687 * (acc1) for non-float data types was removed, preventing a simple
3688 * implementation in SIMD16. A 16-channel result can be calculated by
3689 * executing the three instructions twice in SIMD8, once with quarter
3690 * control of 1Q for the first eight channels and again with 2Q for
3691 * the second eight channels.
3692 *
3693 * Which accumulator register is implicitly accessed (by AccWrEnable
3694 * for instance) is determined by the quarter control. Unfortunately
3695 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3696 * implicit accumulator access by an instruction with 2Q will access
3697 * acc1 regardless of whether the data type is usable in acc1.
3698 *
3699 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3700 * integer data types.
3701 *
3702 * Since we only want the low 32-bits of the result, we can do two
3703 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3704 * adjust the high result and add them (like the mach is doing):
3705 *
3706 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3707 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3708 * shl(8) g9<1>D g8<8,8,1>D 16D
3709 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3710 *
3711 * We avoid the shl instruction by realizing that we only want to add
3712 * the low 16-bits of the "high" result to the high 16-bits of the
3713 * "low" result and using proper regioning on the add:
3714 *
3715 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3716 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3717 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3718 *
3719 * Since it does not use the (single) accumulator register, we can
3720 * schedule multi-component multiplications much better.
3721 */
3722
3723 bool needs_mov = false;
3724 elk_fs_reg orig_dst = inst->dst;
3725
3726 /* Get a new VGRF for the "low" 32x16-bit multiplication result if
3727 * reusing the original destination is impossible due to hardware
3728 * restrictions, source/destination overlap, or it being the null
3729 * register.
3730 */
3731 elk_fs_reg low = inst->dst;
3732 if (orig_dst.is_null() || orig_dst.file == MRF ||
3733 regions_overlap(inst->dst, inst->size_written,
3734 inst->src[0], inst->size_read(0)) ||
3735 regions_overlap(inst->dst, inst->size_written,
3736 inst->src[1], inst->size_read(1)) ||
3737 inst->dst.stride >= 4) {
3738 needs_mov = true;
3739 low = elk_fs_reg(VGRF, alloc.allocate(regs_written(inst)),
3740 inst->dst.type);
3741 }
3742
3743 /* Get a new VGRF but keep the same stride as inst->dst */
3744 elk_fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
3745 high.stride = inst->dst.stride;
3746 high.offset = inst->dst.offset % REG_SIZE;
3747
3748 bool do_addition = true;
3749 if (devinfo->ver >= 7) {
3750 if (inst->src[1].abs)
3751 lower_src_modifiers(this, block, inst, 1);
3752
3753 if (inst->src[1].file == IMM) {
3754 unsigned a;
3755 unsigned b;
3756
3757 /* If the immediate value can be factored into two values, A and
3758 * B, that each fit in 16-bits, the multiplication result can
3759 * instead be calculated as (src0 * (A * B)) = ((src0 * A) * B).
3760 * This saves an operation (the addition) and a temporary register
3761 * (high).
3762 *
3763 * Skip the optimization if either the high word or the low word
3764 * is 0 or 1. In these conditions, at least one of the
3765 * multiplications generated by the straightforward method will be
3766 * eliminated anyway.
3767 */
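            /* For example, an immediate of 300000 (0x000493e0) factors as
             * 60000 * 5, so src0 * 300000 becomes (src0 * 60000) * 5 and no
             * high partial product or final ADD is needed.
             */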
3768 if (inst->src[1].ud > 0x0001ffff &&
3769 (inst->src[1].ud & 0xffff) > 1) {
3770 factor_uint32(inst->src[1].ud, &a, &b);
3771
3772 if (a != 0) {
3773 ibld.MUL(low, inst->src[0], elk_imm_uw(a));
3774 ibld.MUL(low, low, elk_imm_uw(b));
3775 do_addition = false;
3776 }
3777 }
3778
3779 if (do_addition) {
3780 ibld.MUL(low, inst->src[0],
3781 elk_imm_uw(inst->src[1].ud & 0xffff));
3782 ibld.MUL(high, inst->src[0],
3783 elk_imm_uw(inst->src[1].ud >> 16));
3784 }
3785 } else {
3786 ibld.MUL(low, inst->src[0],
3787 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3788 ibld.MUL(high, inst->src[0],
3789 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 1));
3790 }
3791 } else {
3792 if (inst->src[0].abs)
3793 lower_src_modifiers(this, block, inst, 0);
3794
3795 ibld.MUL(low, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 0),
3796 inst->src[1]);
3797 ibld.MUL(high, subscript(inst->src[0], ELK_REGISTER_TYPE_UW, 1),
3798 inst->src[1]);
3799 }
3800
3801 if (do_addition) {
3802 ibld.ADD(subscript(low, ELK_REGISTER_TYPE_UW, 1),
3803 subscript(low, ELK_REGISTER_TYPE_UW, 1),
3804 subscript(high, ELK_REGISTER_TYPE_UW, 0));
3805 }
3806
3807 if (needs_mov || inst->conditional_mod)
3808 set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
3809 }
3810 }
3811
3812 void
lower_mul_qword_inst(elk_fs_inst * inst,elk_bblock_t * block)3813 elk_fs_visitor::lower_mul_qword_inst(elk_fs_inst *inst, elk_bblock_t *block)
3814 {
3815 const fs_builder ibld(this, block, inst);
3816
3817    /* Considering two 64-bit integers ab and cd where each letter       ab
3818     * corresponds to 32 bits, we get a 128-bit result WXYZ.  We       * cd
3819     * only need to provide the YZ part of the result.              -------
3820     *                                                                   BD
3821     * Only BD needs to be 64 bits.  For AD and BC we only care        + AD
3822     * about the lower 32 bits (since they are part of the upper       + BC
3823     * 32 bits of our result).  AC is not needed since it starts       + AC
3824     * on the 65th bit of the result.                                -------
3825     *                                                                 WXYZ
3826     */
3827 unsigned int q_regs = regs_written(inst);
3828 unsigned int d_regs = (q_regs + 1) / 2;
3829
3830 elk_fs_reg bd(VGRF, alloc.allocate(q_regs), ELK_REGISTER_TYPE_UQ);
3831 elk_fs_reg ad(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3832 elk_fs_reg bc(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3833
3834 /* Here we need the full 64 bit result for 32b * 32b. */
3835 if (devinfo->has_integer_dword_mul) {
3836 ibld.MUL(bd, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3837 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3838 } else {
3839 elk_fs_reg bd_high(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3840 elk_fs_reg bd_low(VGRF, alloc.allocate(d_regs), ELK_REGISTER_TYPE_UD);
3841 const unsigned acc_width = reg_unit(devinfo) * 8;
3842 elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), ELK_REGISTER_TYPE_UD),
3843 inst->group % acc_width);
3844
3845 elk_fs_inst *mul = ibld.MUL(acc,
3846 subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3847 subscript(inst->src[1], ELK_REGISTER_TYPE_UW, 0));
3848 mul->writes_accumulator = true;
3849
3850 ibld.MACH(bd_high, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3851 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3852 ibld.MOV(bd_low, acc);
3853
3854 ibld.UNDEF(bd);
3855 ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 0), bd_low);
3856 ibld.MOV(subscript(bd, ELK_REGISTER_TYPE_UD, 1), bd_high);
3857 }
3858
3859 ibld.MUL(ad, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 1),
3860 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 0));
3861 ibld.MUL(bc, subscript(inst->src[0], ELK_REGISTER_TYPE_UD, 0),
3862 subscript(inst->src[1], ELK_REGISTER_TYPE_UD, 1));
3863
3864 ibld.ADD(ad, ad, bc);
3865 ibld.ADD(subscript(bd, ELK_REGISTER_TYPE_UD, 1),
3866 subscript(bd, ELK_REGISTER_TYPE_UD, 1), ad);
3867
3868 if (devinfo->has_64bit_int) {
3869 ibld.MOV(inst->dst, bd);
3870 } else {
3871 if (!inst->is_partial_write())
3872 ibld.emit_undef_for_dst(inst);
3873 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 0),
3874 subscript(bd, ELK_REGISTER_TYPE_UD, 0));
3875 ibld.MOV(subscript(inst->dst, ELK_REGISTER_TYPE_UD, 1),
3876 subscript(bd, ELK_REGISTER_TYPE_UD, 1));
3877 }
3878 }
3879
3880 void
lower_mulh_inst(elk_fs_inst * inst,elk_bblock_t * block)3881 elk_fs_visitor::lower_mulh_inst(elk_fs_inst *inst, elk_bblock_t *block)
3882 {
3883 const fs_builder ibld(this, block, inst);
3884
3885 /* According to the BDW+ BSpec page for the "Multiply Accumulate
3886 * High" instruction:
3887 *
3888 * "An added preliminary mov is required for source modification on
3889 * src1:
3890 * mov (8) r3.0<1>:d -r3<8;8,1>:d
3891 * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
3892 * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
3893 */
3894 if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
3895 lower_src_modifiers(this, block, inst, 1);
3896
3897 /* Should have been lowered to 8-wide. */
3898 assert(inst->exec_size <= get_lowered_simd_width(this, inst));
3899 const unsigned acc_width = reg_unit(devinfo) * 8;
3900 const elk_fs_reg acc = suboffset(retype(elk_acc_reg(inst->exec_size), inst->dst.type),
3901 inst->group % acc_width);
3902 elk_fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3903 elk_fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3904
3905 if (devinfo->ver >= 8) {
3906 /* Until Gfx8, integer multiplies read 32-bits from one source
3907 * and 16-bits from the other, relying on the MACH instruction
3908 * to generate the high bits of the result.
3909 *
3910 * On Gfx8, the multiply instruction does a full 32x32-bit
3911 * multiply, but in order to do a 64-bit multiply we can simulate
3912 * the previous behavior and then use a MACH instruction.
3913 */
3914 assert(mul->src[1].type == ELK_REGISTER_TYPE_D ||
3915 mul->src[1].type == ELK_REGISTER_TYPE_UD);
3916 mul->src[1].type = ELK_REGISTER_TYPE_UW;
3917 mul->src[1].stride *= 2;
3918
3919 if (mul->src[1].file == IMM) {
3920 mul->src[1] = elk_imm_uw(mul->src[1].ud);
3921 }
3922 } else if (devinfo->verx10 == 70 &&
3923 inst->group > 0) {
3924 /* Among other things the quarter control bits influence which
3925 * accumulator register is used by the hardware for instructions
3926 * that access the accumulator implicitly (e.g. MACH). A
3927 * second-half instruction would normally map to acc1, which
3928 * doesn't exist on Gfx7 and up (the hardware does emulate it for
3929 * floating-point instructions *only* by taking advantage of the
3930 * extra precision of acc0 not normally used for floating point
3931 * arithmetic).
3932 *
3933 * HSW and up are careful enough not to try to access an
3934 * accumulator register that doesn't exist, but on earlier Gfx7
3935 * hardware we need to make sure that the quarter control bits are
3936 * zero to avoid non-deterministic behaviour and emit an extra MOV
3937 * to get the result masked correctly according to the current
3938 * channel enables.
3939 */
3940 mach->group = 0;
3941 mach->force_writemask_all = true;
3942 mach->dst = ibld.vgrf(inst->dst.type);
3943 ibld.MOV(inst->dst, mach->dst);
3944 }
3945 }
3946
3947 bool
lower_integer_multiplication()3948 elk_fs_visitor::lower_integer_multiplication()
3949 {
3950 bool progress = false;
3951
3952 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
3953 if (inst->opcode == ELK_OPCODE_MUL) {
3954 /* If the instruction is already in a form that does not need lowering,
3955 * skip it.
3956 */
3957 if (devinfo->ver >= 7) {
3958 if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
3959 continue;
3960 } else {
3961 if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
3962 continue;
3963 }
3964
3965 if ((inst->dst.type == ELK_REGISTER_TYPE_Q ||
3966 inst->dst.type == ELK_REGISTER_TYPE_UQ) &&
3967 (inst->src[0].type == ELK_REGISTER_TYPE_Q ||
3968 inst->src[0].type == ELK_REGISTER_TYPE_UQ) &&
3969 (inst->src[1].type == ELK_REGISTER_TYPE_Q ||
3970 inst->src[1].type == ELK_REGISTER_TYPE_UQ)) {
3971 lower_mul_qword_inst(inst, block);
3972 inst->remove(block);
3973 progress = true;
3974 } else if (!inst->dst.is_accumulator() &&
3975 (inst->dst.type == ELK_REGISTER_TYPE_D ||
3976 inst->dst.type == ELK_REGISTER_TYPE_UD) &&
3977 !devinfo->has_integer_dword_mul) {
3978 lower_mul_dword_inst(inst, block);
3979 inst->remove(block);
3980 progress = true;
3981 }
3982 } else if (inst->opcode == ELK_SHADER_OPCODE_MULH) {
3983 lower_mulh_inst(inst, block);
3984 inst->remove(block);
3985 progress = true;
3986 }
3987
3988 }
3989
3990 if (progress)
3991 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3992
3993 return progress;
3994 }
3995
3996 bool
lower_minmax()3997 elk_fs_visitor::lower_minmax()
3998 {
3999 assert(devinfo->ver < 6);
4000
4001 bool progress = false;
4002
4003 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4004 const fs_builder ibld(this, block, inst);
4005
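      /* MIN and MAX are represented as a SEL with a conditional modifier,
       * which Gfx6+ can execute directly.  On earlier hardware we have to
       * lower it into an explicit CMP or CMPN followed by a predicated SEL.
       */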
4006 if (inst->opcode == ELK_OPCODE_SEL &&
4007 inst->predicate == ELK_PREDICATE_NONE) {
4008 /* If src1 is an immediate value that is not NaN, then it can't be
4009 * NaN. In that case, emit CMP because it is much better for cmod
4010 * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
4011 * support HF or DF, so it is not necessary to check for those.
4012 */
4013 if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
4014 (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
4015 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4016 inst->conditional_mod);
4017 } else {
4018 ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
4019 inst->conditional_mod);
4020 }
4021 inst->predicate = ELK_PREDICATE_NORMAL;
4022 inst->conditional_mod = ELK_CONDITIONAL_NONE;
4023
4024 progress = true;
4025 }
4026 }
4027
4028 if (progress)
4029 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4030
4031 return progress;
4032 }
4033
4034 bool
lower_sub_sat()4035 elk_fs_visitor::lower_sub_sat()
4036 {
4037 bool progress = false;
4038
4039 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
4040 const fs_builder ibld(this, block, inst);
4041
4042 if (inst->opcode == ELK_SHADER_OPCODE_USUB_SAT ||
4043 inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4044 /* The fundamental problem is the hardware performs source negation
4045 * at the bit width of the source. If the source is 0x80000000D, the
4046 * negation is 0x80000000D. As a result, subtractSaturate(0,
4047 * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
4048 * are at least three ways to resolve this:
4049 *
4050 * 1. Use the accumulator for the negated source. The accumulator is
4051 * 33 bits, so our source 0x80000000 is sign-extended to
4052 * 0x1800000000. The negation of which is 0x080000000. This
4053 * doesn't help for 64-bit integers (which are already bigger than
4054 * 33 bits). There are also only 8 accumulators, so SIMD16 or
4055 * SIMD32 instructions would have to be split into multiple SIMD8
4056 * instructions.
4057 *
4058 * 2. Use slightly different math. For any n-bit value x, the negation
4059 * of (x >> 1) is always representable, so we can restrict ourselves to
4060 * subtractions involving (x >> 1). subtractSaturate(a, b) ==
4061 * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
4062 *
4063 * 3. For unsigned sources, it is sufficient to replace the
4064 * subtractSaturate with (a > b) ? a - b : 0.
4065 *
4066 * It may also be possible to use the SUBB instruction. This
4067 * implicitly writes the accumulator, so it could only be used in the
4068 * same situations as #1 above. It is further limited by only
4069 * allowing UD sources.
4070 */
4071 if (inst->exec_size == 8 && inst->src[0].type != ELK_REGISTER_TYPE_Q &&
4072 inst->src[0].type != ELK_REGISTER_TYPE_UQ) {
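            /* Approach #1 from the comment above: move src1 through the
             * (wider) accumulator so that its negation in the saturating ADD
             * below cannot overflow.
             */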
4073 elk_fs_reg acc(ARF, ELK_ARF_ACCUMULATOR, inst->src[1].type);
4074
4075 ibld.MOV(acc, inst->src[1]);
4076 elk_fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
4077 add->saturate = true;
4078 add->src[0].negate = true;
4079 } else if (inst->opcode == ELK_SHADER_OPCODE_ISUB_SAT) {
4080 /* tmp = src1 >> 1;
4081 * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
4082 */
4083 elk_fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
4084 elk_fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
4085 elk_fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
4086 elk_fs_inst *add;
4087
4088 ibld.SHR(tmp1, inst->src[1], elk_imm_d(1));
4089
4090 add = ibld.ADD(tmp2, inst->src[1], tmp1);
4091 add->src[1].negate = true;
4092
4093 add = ibld.ADD(tmp3, inst->src[0], tmp1);
4094 add->src[1].negate = true;
4095 add->saturate = true;
4096
4097 add = ibld.ADD(inst->dst, tmp3, tmp2);
4098 add->src[1].negate = true;
4099 add->saturate = true;
4100 } else {
4101 /* a > b ? a - b : 0 */
4102 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4103 ELK_CONDITIONAL_G);
4104
4105 elk_fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4106 add->src[1].negate = !add->src[1].negate;
4107
4108 ibld.SEL(inst->dst, inst->dst, elk_imm_ud(0))
4109 ->predicate = ELK_PREDICATE_NORMAL;
4110 }
4111
4112 inst->remove(block);
4113 progress = true;
4114 }
4115 }
4116
4117 if (progress)
4118 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4119
4120 return progress;
4121 }
4122
4123 /**
4124 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
4125 * by discard. Due to the layout of the sample mask in the fragment shader
4126 * thread payload, \p bld is required to have a dispatch_width() not greater
4127 * than 16 for fragment shaders.
4128 */
4129 elk_fs_reg
elk_sample_mask_reg(const fs_builder & bld)4130 elk_sample_mask_reg(const fs_builder &bld)
4131 {
4132 const elk_fs_visitor &s = *bld.shader;
4133
4134 if (s.stage != MESA_SHADER_FRAGMENT) {
4135 return elk_imm_ud(0xffffffff);
4136 } else if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4137 assert(bld.dispatch_width() <= 16);
4138 return elk_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
4139 } else {
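      /* Without discard, use the mask of channels enabled at dispatch, which
       * lives in the fragment shader thread payload (g1.7, or g2.7 for the
       * second half of a SIMD32 shader).
       */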
4140 assert(s.devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4141 return retype(elk_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4142 ELK_REGISTER_TYPE_UW);
4143 }
4144 }
4145
4146 uint32_t
elk_fb_write_msg_control(const elk_fs_inst * inst,const struct elk_wm_prog_data * prog_data)4147 elk_fb_write_msg_control(const elk_fs_inst *inst,
4148 const struct elk_wm_prog_data *prog_data)
4149 {
4150 uint32_t mctl;
4151
4152 if (inst->opcode == ELK_FS_OPCODE_REP_FB_WRITE) {
4153 assert(inst->group == 0 && inst->exec_size == 16);
4154 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4155 } else if (prog_data->dual_src_blend) {
4156 assert(inst->exec_size == 8);
4157
4158 if (inst->group % 16 == 0)
4159 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4160 else if (inst->group % 16 == 8)
4161 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4162 else
4163 unreachable("Invalid dual-source FB write instruction group");
4164 } else {
4165 assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4166
4167 if (inst->exec_size == 16)
4168 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4169 else if (inst->exec_size == 8)
4170 mctl = ELK_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4171 else
4172 unreachable("Invalid FB write execution size");
4173 }
4174
4175 return mctl;
4176 }
4177
4178 /**
4179 * Predicate the specified instruction on the sample mask.
4180 */
4181 void
elk_emit_predicate_on_sample_mask(const fs_builder & bld,elk_fs_inst * inst)4182 elk_emit_predicate_on_sample_mask(const fs_builder &bld, elk_fs_inst *inst)
4183 {
4184 assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
4185 bld.group() == inst->group &&
4186 bld.dispatch_width() == inst->exec_size);
4187
4188 const elk_fs_visitor &s = *bld.shader;
4189 const elk_fs_reg sample_mask = elk_sample_mask_reg(bld);
4190 const unsigned subreg = sample_mask_flag_subreg(s);
4191
4192 if (elk_wm_prog_data(s.stage_prog_data)->uses_kill) {
4193 assert(sample_mask.file == ARF &&
4194 sample_mask.nr == elk_flag_subreg(subreg).nr &&
4195 sample_mask.subnr == elk_flag_subreg(
4196 subreg + inst->group / 16).subnr);
4197 } else {
4198 bld.group(1, 0).exec_all()
4199 .MOV(elk_flag_subreg(subreg + inst->group / 16), sample_mask);
4200 }
4201
4202 if (inst->predicate) {
4203 assert(inst->predicate == ELK_PREDICATE_NORMAL);
4204 assert(!inst->predicate_inverse);
4205 assert(inst->flag_subreg == 0);
4206 /* Combine the sample mask with the existing predicate by using a
4207 * vertical predication mode.
4208 */
4209 inst->predicate = ELK_PREDICATE_ALIGN1_ALLV;
4210 } else {
4211 inst->flag_subreg = subreg;
4212 inst->predicate = ELK_PREDICATE_NORMAL;
4213 inst->predicate_inverse = false;
4214 }
4215 }
4216
4217 static bool
is_mixed_float_with_fp32_dst(const elk_fs_inst * inst)4218 is_mixed_float_with_fp32_dst(const elk_fs_inst *inst)
4219 {
4220 /* This opcode sometimes uses :W type on the source even if the operand is
4221 * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
4222 */
4223 if (inst->opcode == ELK_OPCODE_F16TO32)
4224 return true;
4225
4226 if (inst->dst.type != ELK_REGISTER_TYPE_F)
4227 return false;
4228
4229 for (int i = 0; i < inst->sources; i++) {
4230 if (inst->src[i].type == ELK_REGISTER_TYPE_HF)
4231 return true;
4232 }
4233
4234 return false;
4235 }
4236
4237 static bool
is_mixed_float_with_packed_fp16_dst(const elk_fs_inst * inst)4238 is_mixed_float_with_packed_fp16_dst(const elk_fs_inst *inst)
4239 {
4240 /* This opcode sometimes uses :W type on the destination even if the
4241 * destination is a :HF, because in gfx7 there is no support for :HF, and
4242 * thus it uses :W.
4243 */
4244 if (inst->opcode == ELK_OPCODE_F32TO16 &&
4245 inst->dst.stride == 1)
4246 return true;
4247
4248 if (inst->dst.type != ELK_REGISTER_TYPE_HF ||
4249 inst->dst.stride != 1)
4250 return false;
4251
4252 for (int i = 0; i < inst->sources; i++) {
4253 if (inst->src[i].type == ELK_REGISTER_TYPE_F)
4254 return true;
4255 }
4256
4257 return false;
4258 }
4259
4260 /**
4261 * Get the closest allowed SIMD width for instruction \p inst accounting for
4262 * some common regioning and execution control restrictions that apply to FPU
4263 * instructions. These restrictions don't necessarily have any relevance to
4264 * instructions not executed by the FPU pipeline like extended math, control
4265 * flow or send message instructions.
4266 *
4267 * For virtual opcodes it's really up to the instruction -- In some cases
4268 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
4269 * instructions) it may simplify virtual instruction lowering if we can
4270 * enforce FPU-like regioning restrictions already on the virtual instruction,
4271 * in other cases (e.g. virtual send-like instructions) this may be
4272 * excessively restrictive.
4273 */
4274 static unsigned
get_fpu_lowered_simd_width(const elk_fs_visitor * shader,const elk_fs_inst * inst)4275 get_fpu_lowered_simd_width(const elk_fs_visitor *shader,
4276 const elk_fs_inst *inst)
4277 {
4278 const struct elk_compiler *compiler = shader->compiler;
4279 const struct intel_device_info *devinfo = compiler->devinfo;
4280
4281 /* Maximum execution size representable in the instruction controls. */
4282 unsigned max_width = MIN2(32, inst->exec_size);
4283
4284 /* According to the PRMs:
4285 * "A. In Direct Addressing mode, a source cannot span more than 2
4286 * adjacent GRF registers.
4287 * B. A destination cannot span more than 2 adjacent GRF registers."
4288 *
4289 * Look for the source or destination with the largest register region
4290 * which is the one that is going to limit the overall execution size of
4291 * the instruction due to this rule.
4292 */
4293 unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4294
4295 for (unsigned i = 0; i < inst->sources; i++)
4296 reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
4297
4298 /* Calculate the maximum execution size of the instruction based on the
4299 * factor by which it goes over the hardware limit of 2 GRFs.
4300 */
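   /* For example, a SIMD16 instruction operating on 64-bit data writes 128
    * bytes, i.e. four GRFs on platforms with 32-byte registers, which is
    * twice the 2-GRF limit, so it gets lowered to SIMD8 below.
    */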
4301 const unsigned max_reg_count = 2 * reg_unit(devinfo);
4302 if (reg_count > max_reg_count)
4303 max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
4304
4305 /* According to the IVB PRMs:
4306 * "When destination spans two registers, the source MUST span two
4307 * registers. The exception to the above rule:
4308 *
4309 * - When source is scalar, the source registers are not incremented.
4310 * - When source is packed integer Word and destination is packed
4311 * integer DWord, the source register is not incremented but the
4312 * source sub register is incremented."
4313 *
4314 * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
4315 * restrictions. The code below intentionally doesn't check whether the
4316 * destination type is integer because empirically the hardware doesn't
4317 * seem to care what the actual type is as long as it's dword-aligned.
4318 *
4319 * HSW PRMs also add a note to the second exception:
4320 * "When lower 8 channels are disabled, the sub register of source1
4321 * operand is not incremented. If the lower 8 channels are expected
4322 * to be disabled, say by predication, the instruction must be split
4323 * into pair of simd8 operations."
4324 *
4325 * We can't reliably know if the channels won't be disabled due to,
4326 * for example, IMASK. So, play it safe and disallow packed-word exception
4327 * for src1.
4328 */
4329 if (devinfo->ver < 8) {
4330 for (unsigned i = 0; i < inst->sources; i++) {
4331 /* IVB implements DF scalars as <0;2,1> regions. */
4332 const bool is_scalar_exception = is_uniform(inst->src[i]) &&
4333 (devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
4334 const bool is_packed_word_exception = i != 1 &&
4335 type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
4336 type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
4337
4338 /* We check size_read(i) against size_written instead of REG_SIZE
4339 * because we want to properly handle SIMD32. In SIMD32, you can end
4340 * up with writes to 4 registers and a source that reads 2 registers
4341 * and we may still need to lower all the way to SIMD8 in that case.
4342 */
4343 if (inst->size_written > REG_SIZE &&
4344 inst->size_read(i) != 0 &&
4345 inst->size_read(i) < inst->size_written &&
4346 !is_scalar_exception && !is_packed_word_exception) {
4347 const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
4348 max_width = MIN2(max_width, inst->exec_size / reg_count);
4349 }
4350 }
4351 }
4352
4353 if (devinfo->ver < 6) {
4354 /* From the G45 PRM, Volume 4 Page 361:
4355 *
4356 * "Operand Alignment Rule: With the exceptions listed below, a
4357 * source/destination operand in general should be aligned to even
4358 * 256-bit physical register with a region size equal to two 256-bit
4359 * physical registers."
4360 *
4361 * Normally we enforce this by allocating virtual registers to the
4362 * even-aligned class. But we need to handle payload registers.
4363 */
4364 for (unsigned i = 0; i < inst->sources; i++) {
4365 if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
4366 inst->size_read(i) > REG_SIZE) {
4367 max_width = MIN2(max_width, 8);
4368 }
4369 }
4370 }
4371
4372 /* From the IVB PRMs:
4373 * "When an instruction is SIMD32, the low 16 bits of the execution mask
4374 * are applied for both halves of the SIMD32 instruction. If different
4375 * execution mask channels are required, split the instruction into two
4376 * SIMD16 instructions."
4377 *
4378 * There is similar text in the HSW PRMs. Gfx4-6 don't even implement
4379 * 32-wide control flow support in hardware and will behave similarly.
4380 */
4381 if (devinfo->ver < 8 && !inst->force_writemask_all)
4382 max_width = MIN2(max_width, 16);
4383
4384 /* From the IVB PRMs (applies to HSW too):
4385 * "Instructions with condition modifiers must not use SIMD32."
4386 *
4387 * From the BDW PRMs (applies to later hardware too):
4388 * "Ternary instruction with condition modifiers must not use SIMD32."
4389 */
4390 if (inst->conditional_mod && (devinfo->ver < 8 ||
4391 inst->elk_is_3src(compiler)))
4392 max_width = MIN2(max_width, 16);
4393
4394 /* From the IVB PRMs (applies to other devices that don't have the
4395 * intel_device_info::supports_simd16_3src flag set):
4396 * "In Align16 access mode, SIMD16 is not allowed for DW operations and
4397 * SIMD8 is not allowed for DF operations."
4398 */
4399 if (inst->elk_is_3src(compiler) && !devinfo->supports_simd16_3src)
4400 max_width = MIN2(max_width, inst->exec_size / reg_count);
4401
4402 /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
4403 * the 8-bit quarter of the execution mask signals specified in the
4404 * instruction control fields) for the second compressed half of any
4405 * single-precision instruction (for double-precision instructions
4406 * it's hardwired to use NibCtrl+1, at least on HSW), which means that
4407 * the EU will apply the wrong execution controls for the second
4408 * sequential GRF write if the number of channels per GRF is not exactly
4409 * eight in single-precision mode (or four in double-float mode).
4410 *
4411 * In this situation we calculate the maximum size of the split
4412 * instructions so they only ever write to a single register.
4413 */
4414 if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
4415 !inst->force_writemask_all) {
4416 const unsigned channels_per_grf = inst->exec_size /
4417 DIV_ROUND_UP(inst->size_written, REG_SIZE);
4418 const unsigned exec_type_size = get_exec_type_size(inst);
4419 assert(exec_type_size);
4420
4421 /* The hardware shifts exactly 8 channels per compressed half of the
4422 * instruction in single-precision mode and exactly 4 in double-precision.
4423 */
4424 if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
4425 max_width = MIN2(max_width, channels_per_grf);
4426
4427 /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
4428 * because HW applies the same channel enable signals to both halves of
4429 * the compressed instruction which will be just wrong under
4430 * non-uniform control flow.
4431 */
4432 if (devinfo->verx10 == 70 &&
4433 (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
4434 max_width = MIN2(max_width, 4);
4435 }
4436
4437 /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4438 * Float Operations:
4439 *
4440 * "No SIMD16 in mixed mode when destination is f32. Instruction
4441 * execution size must be no more than 8."
4442 *
4443 * FIXME: the simulator doesn't seem to complain if we don't do this, and
4444 * empirical testing with existing CTS tests shows that they pass just
4445 * fine without implementing it. However, since our interpretation of the
4446 * PRM is that conversion MOVs between HF and F are still mixed-float
4447 * instructions (and therefore subject to this restriction), we decided to
4448 * split them to be safe. It might be useful to investigate further and
4449 * lift the restriction if we can ensure that it is safe, since these
4450 * conversions are common when half-float types are involved: many
4451 * instructions do not support HF types, so conversions from/to F are
4452 * required.
4453 */
4454 if (is_mixed_float_with_fp32_dst(inst))
4455 max_width = MIN2(max_width, 8);
4456
4457 /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
4458 * Float Operations:
4459 *
4460 * "No SIMD16 in mixed mode when destination is packed f16 for both
4461 * Align1 and Align16."
4462 */
4463 if (is_mixed_float_with_packed_fp16_dst(inst))
4464 max_width = MIN2(max_width, 8);
4465
4466 /* Only power-of-two execution sizes are representable in the instruction
4467 * control fields.
4468 */
4469 return 1 << util_logbase2(max_width);
4470 }
4471
4472 /**
4473 * Get the maximum allowed SIMD width for instruction \p inst accounting for
4474 * various payload size restrictions that apply to sampler message
4475 * instructions.
4476 *
4477 * This is only intended to provide a maximum theoretical bound for the
4478 * execution size of the message based on the number of argument components
4479 * alone, which in most cases will determine whether the SIMD8 or SIMD16
4480 * variant of the message can be used, though some messages may have
4481 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
4482 * the message length to determine the exact SIMD width and argument count,
4483 * which makes a number of sampler message combinations impossible to
4484 * represent).
4485 *
4486 * Note: On platforms with monolithic SIMD16 the possible SIMD widths
4487 * double, changing from (SIMD8, SIMD16) to (SIMD16, SIMD32).
4488 */
4489 static unsigned
get_sampler_lowered_simd_width(const struct intel_device_info * devinfo,const elk_fs_inst * inst)4490 get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
4491 const elk_fs_inst *inst)
4492 {
4493 /* If we have a min_lod parameter on anything other than a simple sample
4494 * message, it will push it over 5 arguments and we have to fall back to
4495 * SIMD8.
4496 */
4497 if (inst->opcode != ELK_SHADER_OPCODE_TEX &&
4498 inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
4499 return 8;
4500
4501 /* Calculate the number of coordinate components that have to be present
4502 * assuming that additional arguments follow the texel coordinates in the
4503 * message payload. On IVB+ there is no need for padding, on ILK-SNB we
4504 * need to pad to four or three components depending on the message,
4505 * pre-ILK we need to pad to at most three components.
4506 */
4507 const unsigned req_coord_components =
4508 (devinfo->ver >= 7 ||
4509 !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
4510 (devinfo->ver >= 5 && inst->opcode != ELK_SHADER_OPCODE_TXF_LOGICAL &&
4511 inst->opcode != ELK_SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
4512 3;
4513
4514 /* Calculate the total number of argument components that need to be passed
4515 * to the sampler unit.
4516 */
4517 const unsigned num_payload_components =
4518 MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
4519 req_coord_components) +
4520 inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
4521 inst->components_read(TEX_LOGICAL_SRC_LOD) +
4522 inst->components_read(TEX_LOGICAL_SRC_LOD2) +
4523 inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
4524 (inst->opcode == ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
4525 inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
4526 inst->components_read(TEX_LOGICAL_SRC_MCS);
4527
4528 const unsigned simd_limit = reg_unit(devinfo) *
4529 (num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
4530
4531 /* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
4532 * maximum message size supported by the sampler, regardless of whether a
4533 * header is provided or not.
4534 */
4535 return MIN2(inst->exec_size, simd_limit);
4536 }
4537
4538 /**
4539 * Get the closest native SIMD width supported by the hardware for instruction
4540 * \p inst. The instruction will be left untouched by
4541 * elk_fs_visitor::lower_simd_width() if the returned value is equal to the
4542 * original execution size.
4543 */
4544 static unsigned
get_lowered_simd_width(const elk_fs_visitor * shader,const elk_fs_inst * inst)4545 get_lowered_simd_width(const elk_fs_visitor *shader, const elk_fs_inst *inst)
4546 {
4547 const struct elk_compiler *compiler = shader->compiler;
4548 const struct intel_device_info *devinfo = compiler->devinfo;
4549
4550 switch (inst->opcode) {
4551 case ELK_OPCODE_MOV:
4552 case ELK_OPCODE_SEL:
4553 case ELK_OPCODE_NOT:
4554 case ELK_OPCODE_AND:
4555 case ELK_OPCODE_OR:
4556 case ELK_OPCODE_XOR:
4557 case ELK_OPCODE_SHR:
4558 case ELK_OPCODE_SHL:
4559 case ELK_OPCODE_ASR:
4560 case ELK_OPCODE_CMPN:
4561 case ELK_OPCODE_CSEL:
4562 case ELK_OPCODE_F32TO16:
4563 case ELK_OPCODE_F16TO32:
4564 case ELK_OPCODE_BFREV:
4565 case ELK_OPCODE_BFE:
4566 case ELK_OPCODE_ADD:
4567 case ELK_OPCODE_MUL:
4568 case ELK_OPCODE_AVG:
4569 case ELK_OPCODE_FRC:
4570 case ELK_OPCODE_RNDU:
4571 case ELK_OPCODE_RNDD:
4572 case ELK_OPCODE_RNDE:
4573 case ELK_OPCODE_RNDZ:
4574 case ELK_OPCODE_LZD:
4575 case ELK_OPCODE_FBH:
4576 case ELK_OPCODE_FBL:
4577 case ELK_OPCODE_CBIT:
4578 case ELK_OPCODE_SAD2:
4579 case ELK_OPCODE_MAD:
4580 case ELK_OPCODE_LRP:
4581 case ELK_FS_OPCODE_PACK:
4582 case ELK_SHADER_OPCODE_SEL_EXEC:
4583 case ELK_SHADER_OPCODE_CLUSTER_BROADCAST:
4584 case ELK_SHADER_OPCODE_MOV_RELOC_IMM:
4585 return get_fpu_lowered_simd_width(shader, inst);
4586
4587 case ELK_OPCODE_CMP: {
4588 /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
4589 * when the destination is a GRF the dependency-clear bit on the flag
4590 * register is cleared early.
4591 *
4592 * Suggested workarounds are to disable coissuing CMP instructions
4593 * or to split CMP(16) instructions into two CMP(8) instructions.
4594 *
4595 * We choose to split into CMP(8) instructions since disabling
4596 * coissuing would affect CMP instructions not otherwise affected by
4597 * the errata.
4598 */
4599 const unsigned max_width = (devinfo->verx10 == 70 &&
4600 !inst->dst.is_null() ? 8 : ~0);
4601 return MIN2(max_width, get_fpu_lowered_simd_width(shader, inst));
4602 }
4603 case ELK_OPCODE_BFI1:
4604 case ELK_OPCODE_BFI2:
4605 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
4606 * should
4607 * "Force BFI instructions to be executed always in SIMD8."
4608 */
4609 return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
4610 get_fpu_lowered_simd_width(shader, inst));
4611
4612 case ELK_OPCODE_IF:
4613 assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
4614 return inst->exec_size;
4615
4616 case ELK_SHADER_OPCODE_RCP:
4617 case ELK_SHADER_OPCODE_RSQ:
4618 case ELK_SHADER_OPCODE_SQRT:
4619 case ELK_SHADER_OPCODE_EXP2:
4620 case ELK_SHADER_OPCODE_LOG2:
4621 case ELK_SHADER_OPCODE_SIN:
4622 case ELK_SHADER_OPCODE_COS: {
4623 /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
4624 * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
4625 */
4626 if (devinfo->ver == 6 || devinfo->verx10 == 40)
4627 return MIN2(8, inst->exec_size);
4628 if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4629 return MIN2(8, inst->exec_size);
4630 return MIN2(16, inst->exec_size);
4631 }
4632
4633 case ELK_SHADER_OPCODE_POW: {
4634 /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
4635 * to SIMD8 with half-float
4636 */
4637 if (devinfo->ver < 7)
4638 return MIN2(8, inst->exec_size);
4639 if (inst->dst.type == ELK_REGISTER_TYPE_HF)
4640 return MIN2(8, inst->exec_size);
4641 return MIN2(16, inst->exec_size);
4642 }
4643
4644 case ELK_SHADER_OPCODE_USUB_SAT:
4645 case ELK_SHADER_OPCODE_ISUB_SAT:
4646 return get_fpu_lowered_simd_width(shader, inst);
4647
4648 case ELK_SHADER_OPCODE_INT_QUOTIENT:
4649 case ELK_SHADER_OPCODE_INT_REMAINDER:
4650 /* Integer division is limited to SIMD8 on all generations. */
4651 return MIN2(8, inst->exec_size);
4652
4653 case ELK_FS_OPCODE_LINTERP:
4654 case ELK_FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
4655 case ELK_FS_OPCODE_PACK_HALF_2x16_SPLIT:
4656 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
4657 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
4658 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
4659 return MIN2(16, inst->exec_size);
4660
4661 case ELK_FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
4662 /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
4663 * message used to implement varying pull constant loads, so expand it
4664 * to SIMD16. An alternative with longer message payload length but
4665 * shorter return payload would be to use the SIMD8 sampler message that
4666 * takes (header, u, v, r) as parameters instead of (header, u).
4667 */
4668 return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
4669
4670 case ELK_FS_OPCODE_DDX_COARSE:
4671 case ELK_FS_OPCODE_DDX_FINE:
4672 case ELK_FS_OPCODE_DDY_COARSE:
4673 case ELK_FS_OPCODE_DDY_FINE:
4674 /* The implementation of this virtual opcode may require emitting
4675 * compressed Align16 instructions, which are severely limited on some
4676 * generations.
4677 *
4678 * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
4679 * Region Restrictions):
4680 *
4681 * "In Align16 access mode, SIMD16 is not allowed for DW operations
4682 * and SIMD8 is not allowed for DF operations."
4683 *
4684 * In this context, "DW operations" means "operations acting on 32-bit
4685 * values", so it includes operations on floats.
4686 *
4687 * Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3
4688 * (Instruction Compression -> Rules and Restrictions):
4689 *
4690 * "A compressed instruction must be in Align1 access mode. Align16
4691 * mode instructions cannot be compressed."
4692 *
4693 * Similar text exists in the g45 PRM.
4694 *
4695 * Empirically, compressed align16 instructions using odd register
4696 * numbers don't appear to work on Sandybridge either.
4697 */
4698 return (devinfo->ver == 4 || devinfo->ver == 6 ||
4699 (devinfo->verx10 == 70) ?
4700 MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
4701
4702 case ELK_SHADER_OPCODE_MULH:
4703 /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4704 * is 8-wide on Gfx7+.
4705 */
4706 return (devinfo->ver >= 7 ? 8 :
4707 get_fpu_lowered_simd_width(shader, inst));
4708
4709 case ELK_FS_OPCODE_FB_WRITE_LOGICAL:
4710 /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
4711 * here.
4712 */
4713 assert(devinfo->ver != 6 ||
4714 inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
4715 inst->exec_size == 8);
4716 /* Dual-source FB writes are unsupported in SIMD16 mode. */
4717 return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
4718 8 : MIN2(16, inst->exec_size));
4719
4720 case ELK_SHADER_OPCODE_TEX_LOGICAL:
4721 case ELK_SHADER_OPCODE_TXF_CMS_LOGICAL:
4722 case ELK_SHADER_OPCODE_TXF_UMS_LOGICAL:
4723 case ELK_SHADER_OPCODE_TXF_MCS_LOGICAL:
4724 case ELK_SHADER_OPCODE_LOD_LOGICAL:
4725 case ELK_SHADER_OPCODE_TG4_LOGICAL:
4726 case ELK_SHADER_OPCODE_SAMPLEINFO_LOGICAL:
4727 case ELK_SHADER_OPCODE_TXF_CMS_W_LOGICAL:
4728 case ELK_SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4729 return get_sampler_lowered_simd_width(devinfo, inst);
4730
4731 /* On gfx12 parameters are fixed to 16-bit values and therefore they all
4732 * always fit regardless of the execution size.
4733 */
4734 case ELK_SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
4735 return MIN2(16, inst->exec_size);
4736
4737 case ELK_SHADER_OPCODE_TXD_LOGICAL:
4738 /* TXD is unsupported in SIMD16 mode prior to Xe2. SIMD32 is still
4739 * unsupported on Xe2.
4740 */
4741 return 8;
4742
4743 case ELK_SHADER_OPCODE_TXL_LOGICAL:
4744 case ELK_FS_OPCODE_TXB_LOGICAL:
4745 /* Only one execution size is representable pre-ILK depending on whether
4746 * the shadow reference argument is present.
4747 */
4748 if (devinfo->ver == 4)
4749 return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
4750 else
4751 return get_sampler_lowered_simd_width(devinfo, inst);
4752
4753 case ELK_SHADER_OPCODE_TXF_LOGICAL:
4754 case ELK_SHADER_OPCODE_TXS_LOGICAL:
4755 /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4756 * messages. Use SIMD16 instead.
4757 */
4758 if (devinfo->ver == 4)
4759 return 16;
4760 else
4761 return get_sampler_lowered_simd_width(devinfo, inst);
4762
4763 case ELK_SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4764 case ELK_SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4765 case ELK_SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4766 return 8;
4767
4768 case ELK_SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4769 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4770 case ELK_SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4771 case ELK_SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
4772 case ELK_SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
4773 case ELK_SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
4774 case ELK_SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
4775 return MIN2(16, inst->exec_size);
4776
4777 case ELK_SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
4778 case ELK_SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
4779 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
4780 case ELK_SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
4781 return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
4782
4783 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
4784 case ELK_SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
4785 case ELK_SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
4786 assert(inst->exec_size <= 16);
4787 return inst->exec_size;
4788
4789 case ELK_SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
4790 return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
4791
4792 case ELK_SHADER_OPCODE_URB_READ_LOGICAL:
4793 case ELK_SHADER_OPCODE_URB_WRITE_LOGICAL:
4794 return MIN2(8, inst->exec_size);
4795
4796 case ELK_SHADER_OPCODE_QUAD_SWIZZLE: {
4797 const unsigned swiz = inst->src[1].ud;
4798 return (is_uniform(inst->src[0]) ?
4799 get_fpu_lowered_simd_width(shader, inst) :
4800 type_sz(inst->src[0].type) == 4 ? 8 :
4801 swiz == ELK_SWIZZLE_XYXY || swiz == ELK_SWIZZLE_ZWZW ? 4 :
4802 get_fpu_lowered_simd_width(shader, inst));
4803 }
4804 case ELK_SHADER_OPCODE_MOV_INDIRECT: {
4805 /* From IVB and HSW PRMs:
4806 *
4807 * "2.When the destination requires two registers and the sources are
4808 * indirect, the sources must use 1x1 regioning mode.
4809 *
4810 * In case of DF instructions in HSW/IVB, the exec_size is limited by
4811 * the EU decompression logic not handling VxH indirect addressing
4812 * correctly.
4813 */
4814 const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
4815 /* Prior to Broadwell, we only have 8 address subregisters. */
4816 return MIN3(devinfo->ver >= 8 ? 16 : 8,
4817 max_size / (inst->dst.stride * type_sz(inst->dst.type)),
4818 inst->exec_size);
4819 }
4820
4821 case ELK_SHADER_OPCODE_LOAD_PAYLOAD: {
4822 const unsigned reg_count =
4823 DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
4824
4825 if (reg_count > 2) {
4826 /* Only LOAD_PAYLOAD instructions with per-channel destination region
4827 * can be easily lowered (which excludes headers and heterogeneous
4828 * types).
4829 */
4830 assert(!inst->header_size);
4831 for (unsigned i = 0; i < inst->sources; i++)
4832 assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
4833 inst->src[i].file == BAD_FILE);
4834
4835 return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4836 } else {
4837 return inst->exec_size;
4838 }
4839 }
4840 default:
4841 return inst->exec_size;
4842 }
4843 }
4844
4845 /**
4846 * Return true if splitting out the group of channels of instruction \p inst
4847 * given by lbld.group() requires allocating a temporary for the i-th source
4848 * of the lowered instruction.
4849 */
4850 static inline bool
needs_src_copy(const fs_builder & lbld,const elk_fs_inst * inst,unsigned i)4851 needs_src_copy(const fs_builder &lbld, const elk_fs_inst *inst, unsigned i)
4852 {
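   /* Note that the flags_written() term forces a copy when source i overlaps
    * a flag register also written by the instruction, since the chunk lowered
    * first would otherwise clobber flag values still to be read as a source
    * by later chunks.
    */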
4853 return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
4854 (inst->components_read(i) == 1 &&
4855 lbld.dispatch_width() <= inst->exec_size)) ||
4856 (inst->flags_written(lbld.shader->devinfo) &
4857 flag_mask(inst->src[i], type_sz(inst->src[i].type)));
4858 }
4859
4860 /**
4861 * Extract the data that would be consumed by the channel group given by
4862 * lbld.group() from the i-th source region of instruction \p inst and return
4863 * it as result in packed form.
4864 */
4865 static elk_fs_reg
emit_unzip(const fs_builder & lbld,elk_fs_inst * inst,unsigned i)4866 emit_unzip(const fs_builder &lbld, elk_fs_inst *inst, unsigned i)
4867 {
4868 assert(lbld.group() >= inst->group);
4869
4870 /* Specified channel group from the source region. */
4871 const elk_fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
4872
4873 if (needs_src_copy(lbld, inst, i)) {
4874 /* Builder of the right width to perform the copy avoiding uninitialized
4875 * data if the lowered execution size is greater than the original
4876 * execution size of the instruction.
4877 */
4878 const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
4879 inst->exec_size), 0);
4880 const elk_fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
4881
4882 for (unsigned k = 0; k < inst->components_read(i); ++k)
4883 cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
4884
4885 return tmp;
4886
4887 } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
4888 /* The source is invariant for all dispatch_width-wide groups of the
4889 * original region.
4890 */
4891 return inst->src[i];
4892
4893 } else {
4894 /* We can just point the lowered instruction at the right channel group
4895 * from the original region.
4896 */
4897 return src;
4898 }
4899 }
4900
4901 /**
4902 * Return true if splitting out the group of channels of instruction \p inst
4903 * given by lbld.group() requires allocating a temporary for the destination
4904 * of the lowered instruction and copying the data back to the original
4905 * destination region.
4906 */
4907 static inline bool
needs_dst_copy(const fs_builder & lbld,const elk_fs_inst * inst)4908 needs_dst_copy(const fs_builder &lbld, const elk_fs_inst *inst)
4909 {
4910 if (inst->dst.is_null())
4911 return false;
4912
4913 /* If the instruction writes more than one component we'll have to shuffle
4914 * the results of multiple lowered instructions in order to make sure that
4915 * they end up arranged correctly in the original destination region.
4916 */
4917 if (inst->size_written > inst->dst.component_size(inst->exec_size))
4918 return true;
4919
4920 /* If the lowered execution size is larger than the original the result of
4921 * the instruction won't fit in the original destination, so we'll have to
4922 * allocate a temporary in any case.
4923 */
4924 if (lbld.dispatch_width() > inst->exec_size)
4925 return true;
4926
4927 for (unsigned i = 0; i < inst->sources; i++) {
4928 /* If we already made a copy of the source for other reasons there won't
4929 * be any overlap with the destination.
4930 */
4931 if (needs_src_copy(lbld, inst, i))
4932 continue;
4933
/* In order to keep the logic simple we emit a copy whenever the
* destination region doesn't exactly match an overlapping source.  Such a
* mismatch may mean the source and destination aren't aligned group by
* group, which could cause one of the lowered instructions to overwrite
* data read from the same source by other lowered instructions.
4939 */
4940 if (regions_overlap(inst->dst, inst->size_written,
4941 inst->src[i], inst->size_read(i)) &&
4942 !inst->dst.equals(inst->src[i]))
4943 return true;
4944 }
4945
4946 return false;
4947 }
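
/* For instance (numbers purely illustrative): a SIMD16 texture fetch
* writing four 32-bit components writes 4 * 16 * 4 = 256 bytes, more than
* the 64 bytes of a single dst.component_size(16) component, so the first
* check above fires.  Each lowered SIMD8 half then writes its components
* back to back into a packed temporary and emit_zip() below shuffles them
* into the component-major layout of the original destination.
*/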
4948
4949 /**
4950 * Insert data from a packed temporary into the channel group given by
4951 * lbld.group() of the destination region of instruction \p inst and return
4952 * the temporary as result. Any copy instructions that are required for
4953 * unzipping the previous value (in the case of partial writes) will be
4954 * inserted using \p lbld_before and any copy instructions required for
4955 * zipping up the destination of \p inst will be inserted using \p lbld_after.
4956 */
4957 static elk_fs_reg
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
elk_fs_inst *inst)
4960 {
4961 assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
4962 assert(lbld_before.group() == lbld_after.group());
4963 assert(lbld_after.group() >= inst->group);
4964
4965 const struct intel_device_info *devinfo = lbld_before.shader->devinfo;
4966
4967 /* Specified channel group from the destination region. */
4968 const elk_fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
4969
4970 if (!needs_dst_copy(lbld_after, inst)) {
4971 /* No need to allocate a temporary for the lowered instruction, just
4972 * take the right group of channels from the original region.
4973 */
4974 return dst;
4975 }
4976
4977 /* Deal with the residency data part later */
4978 const unsigned residency_size = inst->has_sampler_residency() ?
4979 (reg_unit(devinfo) * REG_SIZE) : 0;
4980 const unsigned dst_size = (inst->size_written - residency_size) /
4981 inst->dst.component_size(inst->exec_size);
4982
4983 const elk_fs_reg tmp = lbld_after.vgrf(inst->dst.type,
4984 dst_size + inst->has_sampler_residency());
4985
4986 if (inst->predicate) {
4987 /* Handle predication by copying the original contents of the
4988 * destination into the temporary before emitting the lowered
4989 * instruction.
4990 */
4991 const fs_builder gbld_before =
4992 lbld_before.group(MIN2(lbld_before.dispatch_width(),
4993 inst->exec_size), 0);
4994 for (unsigned k = 0; k < dst_size; ++k) {
4995 gbld_before.MOV(offset(tmp, lbld_before, k),
4996 offset(dst, inst->exec_size, k));
4997 }
4998 }
4999
5000 const fs_builder gbld_after =
5001 lbld_after.group(MIN2(lbld_after.dispatch_width(),
5002 inst->exec_size), 0);
5003 for (unsigned k = 0; k < dst_size; ++k) {
5004 /* Use a builder of the right width to perform the copy avoiding
5005 * uninitialized data if the lowered execution size is greater than the
5006 * original execution size of the instruction.
5007 */
5008 gbld_after.MOV(offset(dst, inst->exec_size, k),
5009 offset(tmp, lbld_after, k));
5010 }
5011
5012 if (inst->has_sampler_residency()) {
/* Sampler messages with residency need special attention.  The first
* lane of the last component holds the Pixel Null Mask (bits 0:15) and
* some upper bits we need to discard (bits 16:31).  We have to build a
* single 32-bit value for the SIMD32 message out of two SIMD16 16-bit
* values.
*/
5019 const fs_builder rbld = gbld_after.exec_all().group(1, 0);
5020 elk_fs_reg local_res_reg = component(
5021 retype(offset(tmp, lbld_before, dst_size),
5022 ELK_REGISTER_TYPE_UW), 0);
5023 elk_fs_reg final_res_reg =
5024 retype(byte_offset(inst->dst,
5025 inst->size_written - residency_size +
5026 gbld_after.group() / 8),
5027 ELK_REGISTER_TYPE_UW);
5028 rbld.MOV(final_res_reg, local_res_reg);
5029 }
5030
5031 return tmp;
5032 }
5033
5034 bool
elk_fs_visitor::lower_simd_width()
5036 {
5037 bool progress = false;
5038
5039 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5040 const unsigned lower_width = get_lowered_simd_width(this, inst);
5041
5042 if (lower_width != inst->exec_size) {
/* Builder matching the original instruction.  We may also need to
* emit an instruction of width larger than the original, so set the
* execution size of the builder to the larger of the two for now to
* make sure that both cases can be handled.
*/
5048 const unsigned max_width = MAX2(inst->exec_size, lower_width);
5049
5050 const fs_builder bld =
5051 fs_builder(this, MAX2(max_width, dispatch_width)).at_end();
5052 const fs_builder ibld = bld.at(block, inst)
5053 .exec_all(inst->force_writemask_all)
5054 .group(max_width, inst->group / max_width);
5055
5056 /* Split the copies in chunks of the execution width of either the
5057 * original or the lowered instruction, whichever is lower.
5058 */
5059 const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
5060 const unsigned residency_size = inst->has_sampler_residency() ?
5061 (reg_unit(devinfo) * REG_SIZE) : 0;
5062 const unsigned dst_size =
5063 (inst->size_written - residency_size) /
5064 inst->dst.component_size(inst->exec_size);
5065
5066 assert(!inst->writes_accumulator && !inst->mlen);
5067
5068 /* Inserting the zip, unzip, and duplicated instructions in all of
5069 * the right spots is somewhat tricky. All of the unzip and any
5070 * instructions from the zip which unzip the destination prior to
5071 * writing need to happen before all of the per-group instructions
5072 * and the zip instructions need to happen after. In order to sort
5073 * this all out, we insert the unzip instructions before \p inst,
5074 * insert the per-group instructions after \p inst (i.e. before
5075 * inst->next), and insert the zip instructions before the
5076 * instruction after \p inst. Since we are inserting instructions
5077 * after \p inst, inst->next is a moving target and we need to save
5078 * it off here so that we insert the zip instructions in the right
5079 * place.
5080 *
5081 * Since we're inserting split instructions after after_inst, the
5082 * instructions will end up in the reverse order that we insert them.
5083 * However, certain render target writes require that the low group
5084 * instructions come before the high group. From the Ivy Bridge PRM
5085 * Vol. 4, Pt. 1, Section 3.9.11:
5086 *
5087 * "If multiple SIMD8 Dual Source messages are delivered by the
5088 * pixel shader thread, each SIMD8_DUALSRC_LO message must be
5089 * issued before the SIMD8_DUALSRC_HI message with the same Slot
5090 * Group Select setting."
5091 *
5092 * And, from Section 3.9.11.1 of the same PRM:
5093 *
5094 * "When SIMD32 or SIMD16 PS threads send render target writes
5095 * with multiple SIMD8 and SIMD16 messages, the following must
5096 * hold:
5097 *
5098 * All the slots (as described above) must have a corresponding
5099 * render target write irrespective of the slot's validity. A slot
5100 * is considered valid when at least one sample is enabled. For
5101 * example, a SIMD16 PS thread must send two SIMD8 render target
5102 * writes to cover all the slots.
5103 *
5104 * PS thread must send SIMD render target write messages with
5105 * increasing slot numbers. For example, SIMD16 thread has
5106 * Slot[15:0] and if two SIMD8 render target writes are used, the
5107 * first SIMD8 render target write must send Slot[7:0] and the
5108 * next one must send Slot[15:8]."
5109 *
5110 * In order to make low group instructions come before high group
5111 * instructions (this is required for some render target writes), we
5112 * split from the highest group to lowest.
5113 */
5114 exec_node *const after_inst = inst->next;
5115 for (int i = n - 1; i >= 0; i--) {
5116 /* Emit a copy of the original instruction with the lowered width.
* If the EOT flag was set, throw it away except for the last
5118 * instruction to avoid killing the thread prematurely.
5119 */
5120 elk_fs_inst split_inst = *inst;
5121 split_inst.exec_size = lower_width;
5122 split_inst.eot = inst->eot && i == int(n - 1);
5123
5124 /* Select the correct channel enables for the i-th group, then
5125 * transform the sources and destination and emit the lowered
5126 * instruction.
5127 */
5128 const fs_builder lbld = ibld.group(lower_width, i);
5129
5130 for (unsigned j = 0; j < inst->sources; j++)
5131 split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
5132
5133 split_inst.dst = emit_zip(lbld.at(block, inst),
5134 lbld.at(block, after_inst), inst);
5135 split_inst.size_written =
5136 split_inst.dst.component_size(lower_width) * dst_size +
5137 residency_size;
5138
5139 lbld.at(block, inst->next).emit(split_inst);
5140 }
5141
5142 inst->remove(block);
5143 progress = true;
5144 }
5145 }
5146
5147 if (progress)
5148 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5149
5150 return progress;
5151 }
5152
5153 /**
5154 * Transform barycentric vectors into the interleaved form expected by the PLN
5155 * instruction and returned by the Gfx7+ PI shared function.
5156 *
5157 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
5158 * follows in the register file:
5159 *
5160 * rN+0: X[0-7]
5161 * rN+1: Y[0-7]
5162 * rN+2: X[8-15]
5163 * rN+3: Y[8-15]
5164 *
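* For comparison, the standard (non-interleaved) vector layout used by the
* rest of the backend keeps whole components together, roughly:
*
* rN+0: X[0-7]
* rN+1: X[8-15]
* rN+2: Y[0-7]
* rN+3: Y[8-15]
*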
5165 * There is no need to handle SIMD32 here -- This is expected to be run after
5166 * SIMD lowering, since SIMD lowering relies on vectors having the standard
5167 * component layout.
5168 */
5169 bool
elk_fs_visitor::lower_barycentrics()
5171 {
5172 const bool has_interleaved_layout = devinfo->has_pln ||
5173 devinfo->ver >= 7;
5174 bool progress = false;
5175
5176 if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
5177 return false;
5178
5179 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5180 if (inst->exec_size < 16)
5181 continue;
5182
5183 const fs_builder ibld(this, block, inst);
5184 const fs_builder ubld = ibld.exec_all().group(8, 0);
5185
5186 switch (inst->opcode) {
5187 case ELK_FS_OPCODE_LINTERP : {
5188 assert(inst->exec_size == 16);
5189 const elk_fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
5190 elk_fs_reg srcs[4];
5191
5192 for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
5193 srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
5194 8 * (i / 2));
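
/* With a SIMD16 barycentric source this gathers, in order, X[0-7], Y[0-7],
* X[8-15] and Y[8-15], i.e. exactly the interleaved layout described in the
* comment above (channel indices given for illustration).
*/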
5195
5196 ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
5197
5198 inst->src[0] = tmp;
5199 progress = true;
5200 break;
5201 }
5202 case ELK_FS_OPCODE_INTERPOLATE_AT_SAMPLE:
5203 case ELK_FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
5204 case ELK_FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
5205 assert(inst->exec_size == 16);
5206 const elk_fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
5207
5208 for (unsigned i = 0; i < 2; i++) {
5209 for (unsigned g = 0; g < inst->exec_size / 8; g++) {
5210 elk_fs_inst *mov = ibld.at(block, inst->next).group(8, g)
5211 .MOV(horiz_offset(offset(inst->dst, ibld, i),
5212 8 * g),
5213 offset(tmp, ubld, 2 * g + i));
5214 mov->predicate = inst->predicate;
5215 mov->predicate_inverse = inst->predicate_inverse;
5216 mov->flag_subreg = inst->flag_subreg;
5217 }
5218 }
5219
5220 inst->dst = tmp;
5221 progress = true;
5222 break;
5223 }
5224 default:
5225 break;
5226 }
5227 }
5228
5229 if (progress)
5230 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5231
5232 return progress;
5233 }
5234
5235 bool
elk_fs_visitor::lower_find_live_channel()
5237 {
5238 bool progress = false;
5239
5240 if (devinfo->ver < 8)
5241 return false;
5242
5243 bool packed_dispatch =
5244 elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data);
5245 bool vmask =
5246 stage == MESA_SHADER_FRAGMENT &&
5247 elk_wm_prog_data(stage_prog_data)->uses_vmask;
5248
5249 foreach_block_and_inst_safe(block, elk_fs_inst, inst, cfg) {
5250 if (inst->opcode != ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL &&
5251 inst->opcode != ELK_SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
5252 continue;
5253
5254 bool first = inst->opcode == ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL;
5255
5256 /* Getting the first active channel index is easy on Gfx8: Just find
5257 * the first bit set in the execution mask. The register exists on
5258 * HSW already but it reads back as all ones when the current
5259 * instruction has execution masking disabled, so it's kind of
5260 * useless there.
5261 */
5262 elk_fs_reg exec_mask(retype(elk_mask_reg(0), ELK_REGISTER_TYPE_UD));
5263
5264 const fs_builder ibld(this, block, inst);
5265 if (!inst->is_partial_write())
5266 ibld.emit_undef_for_dst(inst);
5267
5268 const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);
5269
5270 /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
5271 * so combine the execution and dispatch masks to obtain the true mask.
5272 *
5273 * If we're looking for the first live channel, and we have packed
5274 * dispatch, we can skip this step, as we know all dispatched channels
5275 * will appear at the front of the mask.
5276 */
5277 if (!(first && packed_dispatch)) {
5278 elk_fs_reg mask = ubld.vgrf(ELK_REGISTER_TYPE_UD);
5279 ubld.UNDEF(mask);
5280 ubld.emit(ELK_SHADER_OPCODE_READ_SR_REG, mask, elk_imm_ud(vmask ? 3 : 2));
5281
5282 /* Quarter control has the effect of magically shifting the value of
5283 * ce0 so you'll get the first/last active channel relative to the
5284 * specified quarter control as result.
5285 */
5286 if (inst->group > 0)
5287 ubld.SHR(mask, mask, elk_imm_ud(ALIGN(inst->group, 8)));
5288
5289 ubld.AND(mask, exec_mask, mask);
5290 exec_mask = mask;
5291 }
5292
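/* FBL returns the index of the least significant set bit, which is the
* first live channel directly.  For the last live channel we count leading
* zeros with LZD and compute 31 - LZD(mask).  As a worked example (mask
* value invented for illustration): mask = 0x0000ff00 gives FBL = 8 and
* LZD = 16, so the last live channel is 31 - 16 = 15.
*/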
5293 if (first) {
5294 ubld.FBL(inst->dst, exec_mask);
5295 } else {
5296 elk_fs_reg tmp = ubld.vgrf(ELK_REGISTER_TYPE_UD, 1);
5297 ubld.UNDEF(tmp);
5298 ubld.LZD(tmp, exec_mask);
5299 ubld.ADD(inst->dst, negate(tmp), elk_imm_uw(31));
5300 }
5301
5302 inst->remove(block);
5303 progress = true;
5304 }
5305
5306 if (progress)
5307 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
5308
5309 return progress;
5310 }
5311
5312 void
elk_fs_visitor::dump_instructions_to_file(FILE *file) const
5314 {
5315 if (cfg) {
5316 const register_pressure &rp = regpressure_analysis.require();
5317 unsigned ip = 0, max_pressure = 0;
5318 unsigned cf_count = 0;
5319 foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5320 if (inst->is_control_flow_end())
5321 cf_count -= 1;
5322
5323 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5324 fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
5325 for (unsigned i = 0; i < cf_count; i++)
5326 fprintf(file, " ");
5327 dump_instruction(inst, file);
5328 ip++;
5329
5330 if (inst->is_control_flow_begin())
5331 cf_count += 1;
5332 }
5333 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
5334 } else {
5335 int ip = 0;
5336 foreach_in_list(elk_backend_instruction, inst, &instructions) {
5337 fprintf(file, "%4d: ", ip++);
5338 dump_instruction(inst, file);
5339 }
5340 }
5341 }
5342
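/**
* Print a single instruction in a human-readable form.  The output looks
* roughly like the following (register numbers made up, format approximate):
*
*    (+f0.0) add(16) vgrf5:F, vgrf3:F, vgrf4:F
*/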
5343 void
elk_fs_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
5345 {
5346 const elk_fs_inst *inst = (const elk_fs_inst *)be_inst;
5347
5348 if (inst->predicate) {
5349 fprintf(file, "(%cf%d.%d) ",
5350 inst->predicate_inverse ? '-' : '+',
5351 inst->flag_subreg / 2,
5352 inst->flag_subreg % 2);
5353 }
5354
5355 fprintf(file, "%s", elk_instruction_name(&compiler->isa, inst->opcode));
5356 if (inst->saturate)
5357 fprintf(file, ".sat");
5358 if (inst->conditional_mod) {
5359 fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
5360 if (!inst->predicate &&
5361 (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
5362 inst->opcode != ELK_OPCODE_CSEL &&
5363 inst->opcode != ELK_OPCODE_IF &&
5364 inst->opcode != ELK_OPCODE_WHILE))) {
5365 fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
5366 inst->flag_subreg % 2);
5367 }
5368 }
5369 fprintf(file, "(%d) ", inst->exec_size);
5370
5371 if (inst->mlen) {
5372 fprintf(file, "(mlen: %d) ", inst->mlen);
5373 }
5374
5375 if (inst->eot) {
5376 fprintf(file, "(EOT) ");
5377 }
5378
5379 switch (inst->dst.file) {
5380 case VGRF:
5381 fprintf(file, "vgrf%d", inst->dst.nr);
5382 break;
5383 case FIXED_GRF:
5384 fprintf(file, "g%d", inst->dst.nr);
5385 break;
5386 case MRF:
5387 fprintf(file, "m%d", inst->dst.nr);
5388 break;
5389 case BAD_FILE:
5390 fprintf(file, "(null)");
5391 break;
5392 case UNIFORM:
5393 fprintf(file, "***u%d***", inst->dst.nr);
5394 break;
5395 case ATTR:
5396 fprintf(file, "***attr%d***", inst->dst.nr);
5397 break;
5398 case ARF:
5399 switch (inst->dst.nr) {
5400 case ELK_ARF_NULL:
5401 fprintf(file, "null");
5402 break;
5403 case ELK_ARF_ADDRESS:
5404 fprintf(file, "a0.%d", inst->dst.subnr);
5405 break;
5406 case ELK_ARF_ACCUMULATOR:
5407 fprintf(file, "acc%d", inst->dst.subnr);
5408 break;
5409 case ELK_ARF_FLAG:
5410 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5411 break;
5412 default:
5413 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
5414 break;
5415 }
5416 break;
5417 case IMM:
5418 unreachable("not reached");
5419 }
5420
5421 if (inst->dst.offset ||
5422 (inst->dst.file == VGRF &&
5423 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
5424 const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
5425 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
5426 inst->dst.offset % reg_size);
5427 }
5428
5429 if (inst->dst.stride != 1)
5430 fprintf(file, "<%u>", inst->dst.stride);
5431 fprintf(file, ":%s, ", elk_reg_type_to_letters(inst->dst.type));
5432
5433 for (int i = 0; i < inst->sources; i++) {
5434 if (inst->src[i].negate)
5435 fprintf(file, "-");
5436 if (inst->src[i].abs)
5437 fprintf(file, "|");
5438 switch (inst->src[i].file) {
5439 case VGRF:
5440 fprintf(file, "vgrf%d", inst->src[i].nr);
5441 break;
5442 case FIXED_GRF:
5443 fprintf(file, "g%d", inst->src[i].nr);
5444 break;
5445 case MRF:
5446 fprintf(file, "***m%d***", inst->src[i].nr);
5447 break;
5448 case ATTR:
5449 fprintf(file, "attr%d", inst->src[i].nr);
5450 break;
5451 case UNIFORM:
5452 fprintf(file, "u%d", inst->src[i].nr);
5453 break;
5454 case BAD_FILE:
5455 fprintf(file, "(null)");
5456 break;
5457 case IMM:
5458 switch (inst->src[i].type) {
5459 case ELK_REGISTER_TYPE_HF:
5460 fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
5461 break;
5462 case ELK_REGISTER_TYPE_F:
5463 fprintf(file, "%-gf", inst->src[i].f);
5464 break;
5465 case ELK_REGISTER_TYPE_DF:
5466 fprintf(file, "%fdf", inst->src[i].df);
5467 break;
5468 case ELK_REGISTER_TYPE_W:
5469 case ELK_REGISTER_TYPE_D:
5470 fprintf(file, "%dd", inst->src[i].d);
5471 break;
5472 case ELK_REGISTER_TYPE_UW:
5473 case ELK_REGISTER_TYPE_UD:
5474 fprintf(file, "%uu", inst->src[i].ud);
5475 break;
5476 case ELK_REGISTER_TYPE_Q:
5477 fprintf(file, "%" PRId64 "q", inst->src[i].d64);
5478 break;
5479 case ELK_REGISTER_TYPE_UQ:
5480 fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
5481 break;
5482 case ELK_REGISTER_TYPE_VF:
5483 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
5484 elk_vf_to_float((inst->src[i].ud >> 0) & 0xff),
5485 elk_vf_to_float((inst->src[i].ud >> 8) & 0xff),
5486 elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
5487 elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
5488 break;
5489 case ELK_REGISTER_TYPE_V:
5490 case ELK_REGISTER_TYPE_UV:
5491 fprintf(file, "%08x%s", inst->src[i].ud,
5492 inst->src[i].type == ELK_REGISTER_TYPE_V ? "V" : "UV");
5493 break;
5494 default:
5495 fprintf(file, "???");
5496 break;
5497 }
5498 break;
5499 case ARF:
5500 switch (inst->src[i].nr) {
5501 case ELK_ARF_NULL:
5502 fprintf(file, "null");
5503 break;
5504 case ELK_ARF_ADDRESS:
5505 fprintf(file, "a0.%d", inst->src[i].subnr);
5506 break;
5507 case ELK_ARF_ACCUMULATOR:
5508 fprintf(file, "acc%d", inst->src[i].subnr);
5509 break;
5510 case ELK_ARF_FLAG:
5511 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5512 break;
5513 default:
5514 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
5515 break;
5516 }
5517 break;
5518 }
5519
5520 if (inst->src[i].offset ||
5521 (inst->src[i].file == VGRF &&
5522 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
5523 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
5524 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
5525 inst->src[i].offset % reg_size);
5526 }
5527
5528 if (inst->src[i].abs)
5529 fprintf(file, "|");
5530
5531 if (inst->src[i].file != IMM) {
5532 unsigned stride;
5533 if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
5534 unsigned hstride = inst->src[i].hstride;
5535 stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
5536 } else {
5537 stride = inst->src[i].stride;
5538 }
5539 if (stride != 1)
5540 fprintf(file, "<%u>", stride);
5541
5542 fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
5543 }
5544
5545 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
5546 fprintf(file, ", ");
5547 }
5548
5549 fprintf(file, " ");
5550
5551 if (inst->force_writemask_all)
5552 fprintf(file, "NoMask ");
5553
5554 if (inst->exec_size != dispatch_width)
5555 fprintf(file, "group%d ", inst->group);
5556
5557 fprintf(file, "\n");
5558 }
5559
elk::register_pressure::register_pressure(const elk_fs_visitor *v)
5561 {
5562 const fs_live_variables &live = v->live_analysis.require();
5563 const unsigned num_instructions = v->cfg->num_blocks ?
5564 v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
5565
5566 regs_live_at_ip = new unsigned[num_instructions]();
5567
5568 for (unsigned reg = 0; reg < v->alloc.count; reg++) {
5569 for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
5570 regs_live_at_ip[ip] += v->alloc.sizes[reg];
5571 }
5572
5573 const unsigned payload_count = v->first_non_payload_grf;
5574
5575 int *payload_last_use_ip = new int[payload_count];
5576 v->calculate_payload_ranges(payload_count, payload_last_use_ip);
5577
5578 for (unsigned reg = 0; reg < payload_count; reg++) {
5579 for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
5580 ++regs_live_at_ip[ip];
5581 }
5582
5583 delete[] payload_last_use_ip;
5584 }
5585
elk::register_pressure::~register_pressure()
5587 {
5588 delete[] regs_live_at_ip;
5589 }
5590
5591 void
elk_fs_visitor::invalidate_analysis(elk::analysis_dependency_class c)
5593 {
5594 elk_backend_shader::invalidate_analysis(c);
5595 live_analysis.invalidate(c);
5596 regpressure_analysis.invalidate(c);
5597 }
5598
5599 void
elk_fs_visitor::debug_optimizer(const nir_shader *nir,
const char *pass_name,
int iteration, int pass_num) const
5603 {
5604 if (!elk_should_print_shader(nir, DEBUG_OPTIMIZER))
5605 return;
5606
5607 char *filename;
5608 int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
5609 debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
5610 _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
5611 iteration, pass_num, pass_name);
5612 if (ret == -1)
5613 return;
5614 dump_instructions(filename);
5615 free(filename);
5616 }
5617
5618 void
elk_fs_visitor::optimize()
5620 {
5621 debug_optimizer(nir, "start", 0, 0);
5622
5623 /* Start by validating the shader we currently have. */
5624 validate();
5625
5626 bool progress = false;
5627 int iteration = 0;
5628 int pass_num = 0;
5629
5630 #define OPT(pass, args...) ({ \
5631 pass_num++; \
5632 bool this_progress = pass(args); \
5633 \
5634 if (this_progress) \
5635 debug_optimizer(nir, #pass, iteration, pass_num); \
5636 \
5637 validate(); \
5638 \
5639 progress = progress || this_progress; \
5640 this_progress; \
5641 })
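
/* Typical usage, as seen throughout this function: OPT() both runs a pass
* and reports whether it made progress, so passes can be chained
* conditionally, e.g.
*
*    if (OPT(opt_copy_propagation))
*       OPT(opt_algebraic);
*/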
5642
5643 assign_constant_locations();
5644 OPT(lower_constant_loads);
5645
5646 validate();
5647
5648 OPT(split_virtual_grfs);
5649
5650 /* Before anything else, eliminate dead code. The results of some NIR
5651 * instructions may effectively be calculated twice. Once when the
5652 * instruction is encountered, and again when the user of that result is
5653 * encountered. Wipe those away before algebraic optimizations and
5654 * especially copy propagation can mix things up.
5655 */
5656 OPT(dead_code_eliminate);
5657
5658 OPT(remove_extra_rounding_modes);
5659
5660 do {
5661 progress = false;
5662 pass_num = 0;
5663 iteration++;
5664
5665 OPT(remove_duplicate_mrf_writes);
5666
5667 OPT(opt_algebraic);
5668 OPT(opt_cse);
5669 OPT(opt_copy_propagation);
5670 OPT(elk_opt_predicated_break, this);
5671 OPT(opt_cmod_propagation);
5672 OPT(dead_code_eliminate);
5673 OPT(opt_peephole_sel);
5674 OPT(elk_dead_control_flow_eliminate, this);
5675 OPT(opt_saturate_propagation);
5676 OPT(register_coalesce);
5677 OPT(compute_to_mrf);
5678 OPT(eliminate_find_live_channel);
5679
5680 OPT(compact_virtual_grfs);
5681 } while (progress);
5682
5683 progress = false;
5684 pass_num = 0;
5685
5686 if (OPT(lower_pack)) {
5687 OPT(register_coalesce);
5688 OPT(dead_code_eliminate);
5689 }
5690
5691 OPT(lower_simd_width);
5692 OPT(lower_barycentrics);
5693 OPT(lower_logical_sends);
5694
5695 /* After logical SEND lowering. */
5696
5697 if (OPT(opt_copy_propagation))
5698 OPT(opt_algebraic);
5699
/* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
* Do this before splitting SENDs.
*/
5703 if (devinfo->ver >= 7) {
5704 if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
5705 OPT(opt_algebraic);
5706 }
5707
5708 if (progress) {
5709 if (OPT(opt_copy_propagation))
5710 OPT(opt_algebraic);
5711
5712 /* Run after logical send lowering to give it a chance to CSE the
5713 * LOAD_PAYLOAD instructions created to construct the payloads of
5714 * e.g. texturing messages in cases where it wasn't possible to CSE the
5715 * whole logical instruction.
5716 */
5717 OPT(opt_cse);
5718 OPT(register_coalesce);
5719 OPT(compute_to_mrf);
5720 OPT(dead_code_eliminate);
5721 OPT(remove_duplicate_mrf_writes);
5722 OPT(opt_peephole_sel);
5723 }
5724
5725 OPT(opt_redundant_halt);
5726
5727 if (OPT(lower_load_payload)) {
5728 OPT(split_virtual_grfs);
5729
5730 /* Lower 64 bit MOVs generated by payload lowering. */
5731 if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
5732 OPT(opt_algebraic);
5733
5734 OPT(register_coalesce);
5735 OPT(lower_simd_width);
5736 OPT(compute_to_mrf);
5737 OPT(dead_code_eliminate);
5738 }
5739
5740 OPT(opt_combine_constants);
5741 if (OPT(lower_integer_multiplication)) {
5742 /* If lower_integer_multiplication made progress, it may have produced
5743 * some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
5744 * one more time to clean those up if they exist.
5745 */
5746 OPT(lower_integer_multiplication);
5747 }
5748 OPT(lower_sub_sat);
5749
5750 if (devinfo->ver <= 5 && OPT(lower_minmax)) {
5751 OPT(opt_cmod_propagation);
5752 OPT(opt_cse);
5753 if (OPT(opt_copy_propagation))
5754 OPT(opt_algebraic);
5755 OPT(dead_code_eliminate);
5756 }
5757
5758 progress = false;
5759 OPT(lower_regioning);
5760 if (progress) {
5761 if (OPT(opt_copy_propagation))
5762 OPT(opt_algebraic);
5763 OPT(dead_code_eliminate);
5764 OPT(lower_simd_width);
5765 }
5766
5767 OPT(lower_uniform_pull_constant_loads);
5768
5769 OPT(lower_find_live_channel);
5770
5771 validate();
5772 }
5773
5774 /**
* Three-source instructions must have a GRF/MRF destination register.
5776 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
5777 */
5778 void
elk_fs_visitor::fixup_3src_null_dest()
5780 {
5781 bool progress = false;
5782
5783 foreach_block_and_inst_safe (block, elk_fs_inst, inst, cfg) {
5784 if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
5785 inst->dst = elk_fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
5786 inst->dst.type);
5787 progress = true;
5788 }
5789 }
5790
5791 if (progress)
5792 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
5793 DEPENDENCY_VARIABLES);
5794 }
5795
5796 uint32_t
elk_fs_visitor::compute_max_register_pressure()
5798 {
5799 const register_pressure &rp = regpressure_analysis.require();
5800 uint32_t ip = 0, max_pressure = 0;
5801 foreach_block_and_inst(block, elk_backend_instruction, inst, cfg) {
5802 max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
5803 ip++;
5804 }
5805 return max_pressure;
5806 }
5807
5808 static elk_fs_inst **
save_instruction_order(const struct elk_cfg_t *cfg)
5810 {
5811 /* Before we schedule anything, stash off the instruction order as an array
5812 * of elk_fs_inst *. This way, we can reset it between scheduling passes to
5813 * prevent dependencies between the different scheduling modes.
5814 */
5815 int num_insts = cfg->last_block()->end_ip + 1;
5816 elk_fs_inst **inst_arr = new elk_fs_inst * [num_insts];
5817
5818 int ip = 0;
5819 foreach_block_and_inst(block, elk_fs_inst, inst, cfg) {
5820 assert(ip >= block->start_ip && ip <= block->end_ip);
5821 inst_arr[ip++] = inst;
5822 }
5823 assert(ip == num_insts);
5824
5825 return inst_arr;
5826 }
5827
5828 static void
restore_instruction_order(struct elk_cfg_t *cfg, elk_fs_inst **inst_arr)
5830 {
5831 ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
5832
5833 int ip = 0;
5834 foreach_block (block, cfg) {
5835 block->instructions.make_empty();
5836
5837 assert(ip == block->start_ip);
5838 for (; ip <= block->end_ip; ip++)
5839 block->instructions.push_tail(inst_arr[ip]);
5840 }
5841 assert(ip == num_insts);
5842 }
5843
5844 void
elk_fs_visitor::allocate_registers(bool allow_spilling)
5846 {
5847 bool allocated;
5848
5849 static const enum instruction_scheduler_mode pre_modes[] = {
5850 SCHEDULE_PRE,
5851 SCHEDULE_PRE_NON_LIFO,
5852 SCHEDULE_NONE,
5853 SCHEDULE_PRE_LIFO,
5854 };
5855
5856 static const char *scheduler_mode_name[] = {
5857 [SCHEDULE_PRE] = "top-down",
5858 [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
5859 [SCHEDULE_PRE_LIFO] = "lifo",
5860 [SCHEDULE_POST] = "post",
5861 [SCHEDULE_NONE] = "none",
5862 };
5863
5864 uint32_t best_register_pressure = UINT32_MAX;
5865 enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
5866
5867 compact_virtual_grfs();
5868
5869 if (needs_register_pressure)
5870 shader_stats.max_register_pressure = compute_max_register_pressure();
5871
5872 debug_optimizer(nir, "pre_register_allocate", 90, 90);
5873
5874 bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
5875
5876 /* Before we schedule anything, stash off the instruction order as an array
5877 * of elk_fs_inst *. This way, we can reset it between scheduling passes to
5878 * prevent dependencies between the different scheduling modes.
5879 */
5880 elk_fs_inst **orig_order = save_instruction_order(cfg);
5881 elk_fs_inst **best_pressure_order = NULL;
5882
5883 void *scheduler_ctx = ralloc_context(NULL);
5884 elk_fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
5885
5886 /* Try each scheduling heuristic to see if it can successfully register
5887 * allocate without spilling. They should be ordered by decreasing
5888 * performance but increasing likelihood of allocating.
5889 */
5890 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
5891 enum instruction_scheduler_mode sched_mode = pre_modes[i];
5892
5893 schedule_instructions_pre_ra(sched, sched_mode);
5894 this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
5895
5896 debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
5897
5898 if (0) {
5899 assign_regs_trivial();
5900 allocated = true;
5901 break;
5902 }
5903
5904 /* We should only spill registers on the last scheduling. */
5905 assert(!spilled_any_registers);
5906
5907 allocated = assign_regs(false, spill_all);
5908 if (allocated)
5909 break;
5910
5911 /* Save the maximum register pressure */
5912 uint32_t this_pressure = compute_max_register_pressure();
5913
5914 if (0) {
5915 fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
5916 scheduler_mode_name[sched_mode], this_pressure);
5917 }
5918
5919 if (this_pressure < best_register_pressure) {
5920 best_register_pressure = this_pressure;
5921 best_sched = sched_mode;
5922 delete[] best_pressure_order;
5923 best_pressure_order = save_instruction_order(cfg);
5924 }
5925
5926 /* Reset back to the original order before trying the next mode */
5927 restore_instruction_order(cfg, orig_order);
5928 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
5929 }
5930
5931 ralloc_free(scheduler_ctx);
5932
5933 if (!allocated) {
5934 if (0) {
5935 fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
5936 scheduler_mode_name[best_sched]);
5937 }
5938 restore_instruction_order(cfg, best_pressure_order);
5939 shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
5940
5941 allocated = assign_regs(allow_spilling, spill_all);
5942 }
5943
5944 delete[] orig_order;
5945 delete[] best_pressure_order;
5946
5947 if (!allocated) {
5948 fail("Failure to register allocate. Reduce number of "
5949 "live scalar values to avoid this.");
5950 } else if (spilled_any_registers) {
5951 elk_shader_perf_log(compiler, log_data,
5952 "%s shader triggered register spilling. "
5953 "Try reducing the number of live scalar "
5954 "values to improve performance.\n",
5955 _mesa_shader_stage_to_string(stage));
5956 }
5957
5958 /* This must come after all optimization and register allocation, since
5959 * it inserts dead code that happens to have side effects, and it does
5960 * so based on the actual physical registers in use.
5961 */
5962 insert_gfx4_send_dependency_workarounds();
5963
5964 if (failed)
5965 return;
5966
5967 opt_bank_conflicts();
5968
5969 schedule_instructions_post_ra();
5970
5971 if (last_scratch > 0) {
5972 ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
5973
5974 /* Take the max of any previously compiled variant of the shader. In the
5975 * case of bindless shaders with return parts, this will also take the
5976 * max of all parts.
5977 */
5978 prog_data->total_scratch = MAX2(elk_get_scratch_size(last_scratch),
5979 prog_data->total_scratch);
5980
5981 if (gl_shader_stage_is_compute(stage)) {
5982 if (devinfo->platform == INTEL_PLATFORM_HSW) {
5983 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5984 * field documentation, Haswell supports a minimum of 2kB of
5985 * scratch space for compute shaders, unlike every other stage
5986 * and platform.
5987 */
5988 prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
5989 } else if (devinfo->ver <= 7) {
5990 /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
5991 * field documentation, platforms prior to Haswell measure scratch
5992 * size linearly with a range of [1kB, 12kB] and 1kB granularity.
5993 */
5994 prog_data->total_scratch = ALIGN(last_scratch, 1024);
5995 max_scratch_size = 12 * 1024;
5996 }
5997 }
5998
5999 /* We currently only support up to 2MB of scratch space. If we
6000 * need to support more eventually, the documentation suggests
6001 * that we could allocate a larger buffer, and partition it out
6002 * ourselves. We'd just have to undo the hardware's address
6003 * calculation by subtracting (FFTID * Per Thread Scratch Space)
* and then adding FFTID * (Larger Per Thread Scratch Space).
6005 *
6006 * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
6007 * Thread Group Tracking > Local Memory/Scratch Space.
6008 */
6009 assert(prog_data->total_scratch < max_scratch_size);
6010 }
6011 }
6012
6013 bool
elk_fs_visitor::run_vs()
6015 {
6016 assert(stage == MESA_SHADER_VERTEX);
6017
6018 payload_ = new elk_vs_thread_payload(*this);
6019
6020 nir_to_elk(this);
6021
6022 if (failed)
6023 return false;
6024
6025 emit_urb_writes();
6026
6027 calculate_cfg();
6028
6029 optimize();
6030
6031 assign_curb_setup();
6032 assign_vs_urb_setup();
6033
6034 fixup_3src_null_dest();
6035
6036 allocate_registers(true /* allow_spilling */);
6037
6038 workaround_source_arf_before_eot();
6039
6040 return !failed;
6041 }
6042
6043 void
elk_fs_visitor::set_tcs_invocation_id()
6045 {
6046 struct elk_tcs_prog_data *tcs_prog_data = elk_tcs_prog_data(prog_data);
6047 struct elk_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
6048 const fs_builder bld = fs_builder(this).at_end();
6049
6050 const unsigned instance_id_mask = INTEL_MASK(23, 17);
6051 const unsigned instance_id_shift = 17;
6052
6053 elk_fs_reg t = bld.vgrf(ELK_REGISTER_TYPE_UD);
6054 bld.AND(t, elk_fs_reg(retype(elk_vec1_grf(0, 2), ELK_REGISTER_TYPE_UD)),
6055 elk_imm_ud(instance_id_mask));
6056
6057 invocation_id = bld.vgrf(ELK_REGISTER_TYPE_UD);
6058
6059 if (vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH) {
6060 /* gl_InvocationID is just the thread number */
6061 bld.SHR(invocation_id, t, elk_imm_ud(instance_id_shift));
6062 return;
6063 }
6064
6065 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH);
6066
6067 elk_fs_reg channels_uw = bld.vgrf(ELK_REGISTER_TYPE_UW);
6068 elk_fs_reg channels_ud = bld.vgrf(ELK_REGISTER_TYPE_UD);
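/* 0x76543210 is a packed 4-bit unsigned vector immediate: each of the 8
* channels gets its own index 0-7, i.e. the invocation number within this
* TCS instance.
*/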
6069 bld.MOV(channels_uw, elk_fs_reg(elk_imm_uv(0x76543210)));
6070 bld.MOV(channels_ud, channels_uw);
6071
6072 if (tcs_prog_data->instances == 1) {
6073 invocation_id = channels_ud;
6074 } else {
6075 elk_fs_reg instance_times_8 = bld.vgrf(ELK_REGISTER_TYPE_UD);
6076 bld.SHR(instance_times_8, t, elk_imm_ud(instance_id_shift - 3));
6077 bld.ADD(invocation_id, instance_times_8, channels_ud);
6078 }
6079 }
6080
6081 void
elk_fs_visitor::emit_tcs_thread_end()
6083 {
6084 /* Try and tag the last URB write with EOT instead of emitting a whole
6085 * separate write just to finish the thread. There isn't guaranteed to
6086 * be one, so this may not succeed.
6087 */
6088 if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
6089 return;
6090
6091 const fs_builder bld = fs_builder(this).at_end();
6092
6093 /* Emit a URB write to end the thread. On Broadwell, we use this to write
6094 * zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
6095 * algorithm to set it optimally). On other platforms, we simply write
6096 * zero to a reserved/MBZ patch header DWord which has no consequence.
6097 */
6098 elk_fs_reg srcs[URB_LOGICAL_NUM_SRCS];
6099 srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
6100 srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = elk_imm_ud(WRITEMASK_X << 16);
6101 srcs[URB_LOGICAL_SRC_DATA] = elk_imm_ud(0);
6102 srcs[URB_LOGICAL_SRC_COMPONENTS] = elk_imm_ud(1);
6103 elk_fs_inst *inst = bld.emit(ELK_SHADER_OPCODE_URB_WRITE_LOGICAL,
6104 reg_undef, srcs, ARRAY_SIZE(srcs));
6105 inst->eot = true;
6106 }
6107
6108 bool
elk_fs_visitor::run_tcs()
6110 {
6111 assert(stage == MESA_SHADER_TESS_CTRL);
6112
6113 struct elk_vue_prog_data *vue_prog_data = elk_vue_prog_data(prog_data);
6114 const fs_builder bld = fs_builder(this).at_end();
6115
6116 assert(vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH ||
6117 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_MULTI_PATCH);
6118
6119 payload_ = new elk_tcs_thread_payload(*this);
6120
6121 /* Initialize gl_InvocationID */
6122 set_tcs_invocation_id();
6123
6124 const bool fix_dispatch_mask =
6125 vue_prog_data->dispatch_mode == INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH &&
6126 (nir->info.tess.tcs_vertices_out % 8) != 0;
6127
/* Fix the dispatch mask */
6129 if (fix_dispatch_mask) {
6130 bld.CMP(bld.null_reg_ud(), invocation_id,
6131 elk_imm_ud(nir->info.tess.tcs_vertices_out), ELK_CONDITIONAL_L);
6132 bld.IF(ELK_PREDICATE_NORMAL);
6133 }
6134
6135 nir_to_elk(this);
6136
6137 if (fix_dispatch_mask) {
6138 bld.emit(ELK_OPCODE_ENDIF);
6139 }
6140
6141 emit_tcs_thread_end();
6142
6143 if (failed)
6144 return false;
6145
6146 calculate_cfg();
6147
6148 optimize();
6149
6150 assign_curb_setup();
6151 assign_tcs_urb_setup();
6152
6153 fixup_3src_null_dest();
6154
6155 allocate_registers(true /* allow_spilling */);
6156
6157 workaround_source_arf_before_eot();
6158
6159 return !failed;
6160 }
6161
6162 bool
elk_fs_visitor::run_tes()
6164 {
6165 assert(stage == MESA_SHADER_TESS_EVAL);
6166
6167 payload_ = new elk_tes_thread_payload(*this);
6168
6169 nir_to_elk(this);
6170
6171 if (failed)
6172 return false;
6173
6174 emit_urb_writes();
6175
6176 calculate_cfg();
6177
6178 optimize();
6179
6180 assign_curb_setup();
6181 assign_tes_urb_setup();
6182
6183 fixup_3src_null_dest();
6184
6185 allocate_registers(true /* allow_spilling */);
6186
6187 workaround_source_arf_before_eot();
6188
6189 return !failed;
6190 }
6191
6192 bool
elk_fs_visitor::run_gs()
6194 {
6195 assert(stage == MESA_SHADER_GEOMETRY);
6196
6197 payload_ = new elk_gs_thread_payload(*this);
6198
6199 this->final_gs_vertex_count = vgrf(glsl_uint_type());
6200
6201 if (gs_compile->control_data_header_size_bits > 0) {
6202 /* Create a VGRF to store accumulated control data bits. */
6203 this->control_data_bits = vgrf(glsl_uint_type());
6204
6205 /* If we're outputting more than 32 control data bits, then EmitVertex()
6206 * will set control_data_bits to 0 after emitting the first vertex.
6207 * Otherwise, we need to initialize it to 0 here.
6208 */
6209 if (gs_compile->control_data_header_size_bits <= 32) {
6210 const fs_builder bld = fs_builder(this).at_end();
6211 const fs_builder abld = bld.annotate("initialize control data bits");
6212 abld.MOV(this->control_data_bits, elk_imm_ud(0u));
6213 }
6214 }
6215
6216 nir_to_elk(this);
6217
6218 emit_gs_thread_end();
6219
6220 if (failed)
6221 return false;
6222
6223 calculate_cfg();
6224
6225 optimize();
6226
6227 assign_curb_setup();
6228 assign_gs_urb_setup();
6229
6230 fixup_3src_null_dest();
6231
6232 allocate_registers(true /* allow_spilling */);
6233
6234 workaround_source_arf_before_eot();
6235
6236 return !failed;
6237 }
6238
6239 bool
elk_fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
6241 {
6242 struct elk_wm_prog_data *wm_prog_data = elk_wm_prog_data(this->prog_data);
6243 elk_wm_prog_key *wm_key = (elk_wm_prog_key *) this->key;
6244 const fs_builder bld = fs_builder(this).at_end();
6245
6246 assert(stage == MESA_SHADER_FRAGMENT);
6247
6248 payload_ = new elk_fs_thread_payload(*this, source_depth_to_render_target,
6249 runtime_check_aads_emit);
6250
6251 if (do_rep_send) {
6252 assert(dispatch_width == 16);
6253 emit_repclear_shader();
6254 } else {
6255 if (nir->info.inputs_read > 0 ||
6256 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
6257 (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
6258 if (devinfo->ver < 6)
6259 emit_interpolation_setup_gfx4();
6260 else
6261 emit_interpolation_setup_gfx6();
6262 }
6263
6264 /* We handle discards by keeping track of the still-live pixels in f0.1.
6265 * Initialize it with the dispatched pixels.
6266 */
6267 if (wm_prog_data->uses_kill) {
6268 const unsigned lower_width = MIN2(dispatch_width, 16);
6269 for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
6270 /* According to the "PS Thread Payload for Normal
6271 * Dispatch" pages on the BSpec, the dispatch mask is
6272 * stored in R1.7/R2.7 on gfx6+.
6273 */
6274 const elk_fs_reg dispatch_mask =
6275 devinfo->ver >= 6 ? elk_vec1_grf(i + 1, 7) :
6276 elk_vec1_grf(0, 0);
6277 bld.exec_all().group(1, 0)
6278 .MOV(elk_sample_mask_reg(bld.group(lower_width, i)),
6279 retype(dispatch_mask, ELK_REGISTER_TYPE_UW));
6280 }
6281 }
6282
6283 if (nir->info.writes_memory)
6284 wm_prog_data->has_side_effects = true;
6285
6286 nir_to_elk(this);
6287
6288 if (failed)
6289 return false;
6290
6291 if (wm_key->emit_alpha_test)
6292 emit_alpha_test();
6293
6294 emit_fb_writes();
6295
6296 calculate_cfg();
6297
6298 optimize();
6299
6300 assign_curb_setup();
6301
6302 assign_urb_setup();
6303
6304 fixup_3src_null_dest();
6305
6306 allocate_registers(allow_spilling);
6307
6308 workaround_source_arf_before_eot();
6309 }
6310
6311 return !failed;
6312 }
6313
6314 bool
elk_fs_visitor::run_cs(bool allow_spilling)
6316 {
6317 assert(gl_shader_stage_is_compute(stage));
6318 assert(devinfo->ver >= 7);
6319 const fs_builder bld = fs_builder(this).at_end();
6320
6321 payload_ = new elk_cs_thread_payload(*this);
6322
6323 if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
6324 /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
6325 const fs_builder abld = bld.exec_all().group(1, 0);
6326 abld.MOV(retype(elk_sr0_reg(1), ELK_REGISTER_TYPE_UW),
6327 suboffset(retype(elk_vec1_grf(0, 0), ELK_REGISTER_TYPE_UW), 1));
6328 }
6329
6330 nir_to_elk(this);
6331
6332 if (failed)
6333 return false;
6334
6335 emit_cs_terminate();
6336
6337 calculate_cfg();
6338
6339 optimize();
6340
6341 assign_curb_setup();
6342
6343 fixup_3src_null_dest();
6344
6345 allocate_registers(allow_spilling);
6346
6347 workaround_source_arf_before_eot();
6348
6349 return !failed;
6350 }
6351
6352 static bool
is_used_in_not_interp_frag_coord(nir_def *def)
6354 {
6355 nir_foreach_use_including_if(src, def) {
6356 if (nir_src_is_if(src))
6357 return true;
6358
6359 if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
6360 return true;
6361
6362 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
6363 if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
6364 return true;
6365 }
6366
6367 return false;
6368 }
6369
6370 /**
6371 * Return a bitfield where bit n is set if barycentric interpolation mode n
6372 * (see enum elk_barycentric_mode) is needed by the fragment shader.
6373 *
6374 * We examine the load_barycentric intrinsics rather than looking at input
6375 * variables so that we catch interpolateAtCentroid() messages too, which
6376 * also need the ELK_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
6377 */
6378 static unsigned
elk_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
const nir_shader *shader)
6381 {
6382 unsigned barycentric_interp_modes = 0;
6383
6384 nir_foreach_function_impl(impl, shader) {
6385 nir_foreach_block(block, impl) {
6386 nir_foreach_instr(instr, block) {
6387 if (instr->type != nir_instr_type_intrinsic)
6388 continue;
6389
6390 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6391 switch (intrin->intrinsic) {
6392 case nir_intrinsic_load_barycentric_pixel:
6393 case nir_intrinsic_load_barycentric_centroid:
6394 case nir_intrinsic_load_barycentric_sample:
6395 case nir_intrinsic_load_barycentric_at_sample:
6396 case nir_intrinsic_load_barycentric_at_offset:
6397 break;
6398 default:
6399 continue;
6400 }
6401
6402 /* Ignore WPOS; it doesn't require interpolation. */
6403 if (!is_used_in_not_interp_frag_coord(&intrin->def))
6404 continue;
6405
6406 nir_intrinsic_op bary_op = intrin->intrinsic;
6407 enum elk_barycentric_mode bary =
6408 elk_barycentric_mode(intrin);
6409
6410 barycentric_interp_modes |= 1 << bary;
6411
6412 if (devinfo->needs_unlit_centroid_workaround &&
6413 bary_op == nir_intrinsic_load_barycentric_centroid)
6414 barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
6415 }
6416 }
6417 }
6418
6419 return barycentric_interp_modes;
6420 }
6421
6422 static void
elk_compute_flat_inputs(struct elk_wm_prog_data *prog_data,
const nir_shader *shader)
6425 {
6426 prog_data->flat_inputs = 0;
6427
6428 nir_foreach_shader_in_variable(var, shader) {
6429 /* flat shading */
6430 if (var->data.interpolation != INTERP_MODE_FLAT)
6431 continue;
6432
6433 if (var->data.per_primitive)
6434 continue;
6435
6436 unsigned slots = glsl_count_attribute_slots(var->type, false);
6437 for (unsigned s = 0; s < slots; s++) {
6438 int input_index = prog_data->urb_setup[var->data.location + s];
6439
6440 if (input_index >= 0)
6441 prog_data->flat_inputs |= 1 << input_index;
6442 }
6443 }
6444 }
6445
6446 static uint8_t
computed_depth_mode(const nir_shader *shader)
6448 {
6449 if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
6450 switch (shader->info.fs.depth_layout) {
6451 case FRAG_DEPTH_LAYOUT_NONE:
6452 case FRAG_DEPTH_LAYOUT_ANY:
6453 return ELK_PSCDEPTH_ON;
6454 case FRAG_DEPTH_LAYOUT_GREATER:
6455 return ELK_PSCDEPTH_ON_GE;
6456 case FRAG_DEPTH_LAYOUT_LESS:
6457 return ELK_PSCDEPTH_ON_LE;
6458 case FRAG_DEPTH_LAYOUT_UNCHANGED:
6459 /* We initially set this to OFF, but having the shader write the
6460 * depth means we allocate register space in the SEND message. The
6461 * difference between the SEND register count and the OFF state
6462 * programming makes the HW hang.
6463 *
6464 * Removing the depth writes also leads to test failures. So use
6465 * LesserThanOrEqual, which fits writing the same value
6466 * (unchanged/equal).
6467 *
6468 */
6469 return ELK_PSCDEPTH_ON_LE;
6470 }
6471 }
6472 return ELK_PSCDEPTH_OFF;
6473 }
6474
6475 /**
6476 * Move load_interpolated_input with simple (payload-based) barycentric modes
6477 * to the top of the program so we don't emit multiple PLNs for the same input.
6478 *
6479 * This works around CSE not being able to handle non-dominating cases
6480 * such as:
6481 *
6482 * if (...) {
6483 * interpolate input
6484 * } else {
6485 * interpolate the same exact input
6486 * }
6487 *
6488 * This should be replaced by global value numbering someday.
6489 */
6490 bool
elk_nir_move_interpolation_to_top(nir_shader *nir)
6492 {
6493 bool progress = false;
6494
6495 nir_foreach_function_impl(impl, nir) {
6496 nir_block *top = nir_start_block(impl);
6497 nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
6498 bool impl_progress = false;
6499
6500 for (nir_block *block = nir_block_cf_tree_next(top);
6501 block != NULL;
6502 block = nir_block_cf_tree_next(block)) {
6503
6504 nir_foreach_instr_safe(instr, block) {
6505 if (instr->type != nir_instr_type_intrinsic)
6506 continue;
6507
6508 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
6509 if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
6510 continue;
6511 nir_intrinsic_instr *bary_intrinsic =
6512 nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
6513 nir_intrinsic_op op = bary_intrinsic->intrinsic;
6514
6515 /* Leave interpolateAtSample/Offset() where they are. */
6516 if (op == nir_intrinsic_load_barycentric_at_sample ||
6517 op == nir_intrinsic_load_barycentric_at_offset)
6518 continue;
6519
6520 nir_instr *move[3] = {
6521 &bary_intrinsic->instr,
6522 intrin->src[1].ssa->parent_instr,
6523 instr
6524 };
6525
6526 for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
6527 if (move[i]->block != top) {
6528 nir_instr_move(cursor, move[i]);
6529 impl_progress = true;
6530 }
6531 }
6532 }
6533 }
6534
6535 progress = progress || impl_progress;
6536
6537 nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
6538 : nir_metadata_all);
6539 }
6540
6541 return progress;
6542 }
6543
6544 static void
elk_nir_populate_wm_prog_data(nir_shader *shader,
const struct intel_device_info *devinfo,
const struct elk_wm_prog_key *key,
struct elk_wm_prog_data *prog_data)
6549 {
/* key->emit_alpha_test means simulating alpha testing via discards,
* so the shader definitely kills pixels.
6552 */
6553 prog_data->uses_kill = shader->info.fs.uses_discard ||
6554 key->emit_alpha_test;
6555 prog_data->uses_omask = !key->ignore_sample_mask_out &&
6556 (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
6557 prog_data->color_outputs_written = key->color_outputs_valid;
6558 prog_data->computed_depth_mode = computed_depth_mode(shader);
6559 prog_data->computed_stencil =
6560 shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
6561
6562 prog_data->sample_shading =
6563 shader->info.fs.uses_sample_shading ||
6564 shader->info.outputs_read;
6565
6566 assert(key->multisample_fbo != ELK_NEVER ||
6567 key->persample_interp == ELK_NEVER);
6568
6569 prog_data->persample_dispatch = key->persample_interp;
6570 if (prog_data->sample_shading)
6571 prog_data->persample_dispatch = ELK_ALWAYS;
6572
6573 /* We can only persample dispatch if we have a multisample FBO */
6574 prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
6575 key->multisample_fbo);
6576
6577 /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
6578 * persample_dispatch & multisample_fbo are not dynamic, Anv should be able
6579 * to definitively tell whether alpha_to_coverage is on or off.
6580 */
6581 prog_data->alpha_to_coverage = key->alpha_to_coverage;
6582 assert(prog_data->alpha_to_coverage != ELK_SOMETIMES ||
6583 prog_data->persample_dispatch == ELK_SOMETIMES);
6584
6585 if (devinfo->ver >= 6) {
6586 prog_data->uses_sample_mask =
6587 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
6588
6589 /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
6590 *
6591 * "MSDISPMODE_PERSAMPLE is required in order to select
6592 * POSOFFSET_SAMPLE"
6593 *
6594 * So we can only really get sample positions if we are doing real
6595 * per-sample dispatch. If we need gl_SamplePosition and we don't have
6596 * persample dispatch, we hard-code it to 0.5.
6597 */
6598 prog_data->uses_pos_offset =
6599 prog_data->persample_dispatch != ELK_NEVER &&
6600 (BITSET_TEST(shader->info.system_values_read,
6601 SYSTEM_VALUE_SAMPLE_POS) ||
6602 BITSET_TEST(shader->info.system_values_read,
6603 SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
6604 }
6605
6606 prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
6607 prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
6608 prog_data->inner_coverage = shader->info.fs.inner_coverage;
6609
6610 prog_data->barycentric_interp_modes =
6611 elk_compute_barycentric_interp_modes(devinfo, shader);
6612
6613 /* From the BDW PRM documentation for 3DSTATE_WM:
6614 *
6615 * "MSDISPMODE_PERSAMPLE is required in order to select Perspective
6616 * Sample or Non- perspective Sample barycentric coordinates."
6617 *
* So clean up any potentially set sample barycentric mode when not doing
* per-sample dispatch.
6620 */
6621 if (prog_data->persample_dispatch == ELK_NEVER) {
6622 prog_data->barycentric_interp_modes &=
6623 ~BITFIELD_BIT(ELK_BARYCENTRIC_PERSPECTIVE_SAMPLE);
6624 }
6625
6626 prog_data->uses_nonperspective_interp_modes |=
6627 (prog_data->barycentric_interp_modes &
6628 ELK_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
6629
6630 /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
6631 * Message Descriptor :
6632 *
6633 * "Message Type. Specifies the type of message being sent when
6634 * pixel-rate evaluation is requested :
6635 *
6636 * Format = U2
6637 * 0: Per Message Offset (eval_snapped with immediate offset)
6638 * 1: Sample Position Offset (eval_sindex)
6639 * 2: Centroid Position Offset (eval_centroid)
6640 * 3: Per Slot Offset (eval_snapped with register offset)
6641 *
6642 * Message Type. Specifies the type of message being sent when
6643 * coarse-rate evaluation is requested :
6644 *
6645 * Format = U2
6646 * 0: Coarse to Pixel Mapping Message (internal message)
6647 * 1: Reserved
6648 * 2: Coarse Centroid Position (eval_centroid)
6649 * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
6650 *
6651 * The Sample Position Offset is marked as reserved for coarse rate
6652 * evaluation and leads to hangs if we try to use it, so coarse pixel
6653 * shading must be disabled whenever an intrinsic results in a pixel
6654 * interpolater message at sample; the result of this query is unused here.
6655 */
6656 (void) intel_nir_pulls_at_sample(shader);
6657
6658 /* We choose to always enable VMask prior to XeHP; doing otherwise would
6659 * cause us to lose out on the eliminate_find_live_channel() optimization.
6660 */
6661 prog_data->uses_vmask = true;
6662
6663 prog_data->uses_src_w =
6664 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6665 prog_data->uses_src_depth =
6666 BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
6667
6668 calculate_urb_setup(devinfo, key, prog_data, shader);
6669 elk_compute_flat_inputs(prog_data, shader);
6670 }
6671
6672 /**
6673 * Pre-gfx6, the register file of the EUs was shared between threads,
6674 * and each thread used some subset allocated on a 16-register block
6675 * granularity. The unit states wanted these block counts.
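*
* For example, a shader using 33 GRFs is rounded up to 48 registers and
* encoded as 48/16 - 1 = 2, i.e. the value returned here is the number of
* 16-register blocks minus one.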
6676 */
6677 static inline int
6678 elk_register_blocks(int reg_count)
6679 {
6680 return ALIGN(reg_count, 16) / 16 - 1;
6681 }
6682
6683 const unsigned *
6684 elk_compile_fs(const struct elk_compiler *compiler,
6685 struct elk_compile_fs_params *params)
6686 {
6687 struct nir_shader *nir = params->base.nir;
6688 const struct elk_wm_prog_key *key = params->key;
6689 struct elk_wm_prog_data *prog_data = params->prog_data;
6690 bool allow_spilling = params->allow_spilling;
6691 const bool debug_enabled =
6692 elk_should_print_shader(nir, params->base.debug_flag ?
6693 params->base.debug_flag : DEBUG_WM);
6694
6695 prog_data->base.stage = MESA_SHADER_FRAGMENT;
6696 prog_data->base.total_scratch = 0;
6697
6698 const struct intel_device_info *devinfo = compiler->devinfo;
6699 const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
6700
6701 elk_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
6702 elk_nir_lower_fs_inputs(nir, devinfo, key);
6703 elk_nir_lower_fs_outputs(nir);
6704
6705 if (devinfo->ver < 6)
6706 elk_setup_vue_interpolation(params->vue_map, nir, prog_data);
6707
6708 /* From the SKL PRM, Volume 7, "Alpha Coverage":
6709 * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
6710 * hardware, regardless of the state setting for this feature."
6711 */
6712 if (devinfo->ver > 6 && key->alpha_to_coverage != ELK_NEVER) {
6713 /* Run constant fold optimization in order to get the correct source
6714 * offset to determine render target 0 store instruction in
6715 * emit_alpha_to_coverage pass.
6716 */
6717 NIR_PASS(_, nir, nir_opt_constant_folding);
6718 NIR_PASS(_, nir, elk_nir_lower_alpha_to_coverage, key, prog_data);
6719 }
6720
6721 NIR_PASS(_, nir, elk_nir_move_interpolation_to_top);
6722 elk_postprocess_nir(nir, compiler, debug_enabled,
6723 key->base.robust_flags);
6724
6725 elk_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
6726
6727 std::unique_ptr<elk_fs_visitor> v8, v16, v32, vmulti;
6728 elk_cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
6729 float throughput = 0;
6730 bool has_spilled = false;
6731
6732 v8 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6733 prog_data, nir, 8,
6734 params->base.stats != NULL,
6735 debug_enabled);
6736 if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
6737 params->base.error_str = ralloc_strdup(params->base.mem_ctx,
6738 v8->fail_msg);
6739 return NULL;
6740 } else if (INTEL_SIMD(FS, 8)) {
6741 simd8_cfg = v8->cfg;
6742
6743 assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
6744 prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
6745
6746 prog_data->reg_blocks_8 = elk_register_blocks(v8->grf_used);
6747 const performance &perf = v8->performance_analysis.require();
6748 throughput = MAX2(throughput, perf.throughput);
6749 has_spilled = v8->spilled_any_registers;
6750 allow_spilling = false;
6751 }
6752
6753 /* Limit dispatch width to simd8 with dual source blending on gfx8.
6754 * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
6755 */
6756 if (devinfo->ver == 8 && prog_data->dual_src_blend &&
6757 INTEL_SIMD(FS, 8)) {
6758 assert(!params->use_rep_send);
6759 v8->limit_dispatch_width(8, "gfx8 workaround: "
6760 "using SIMD8 when dual src blending.\n");
6761 }
6762
6763 if (!has_spilled &&
6764 (!v8 || v8->max_dispatch_width >= 16) &&
6765 (INTEL_SIMD(FS, 16) || params->use_rep_send)) {
6766 /* Try a SIMD16 compile */
6767 v16 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6768 prog_data, nir, 16,
6769 params->base.stats != NULL,
6770 debug_enabled);
6771 if (v8)
6772 v16->import_uniforms(v8.get());
6773 if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
6774 elk_shader_perf_log(compiler, params->base.log_data,
6775 "SIMD16 shader failed to compile: %s\n",
6776 v16->fail_msg);
6777 } else {
6778 simd16_cfg = v16->cfg;
6779
6780 assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
6781 prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
6782
6783 prog_data->reg_blocks_16 = elk_register_blocks(v16->grf_used);
6784 const performance &perf = v16->performance_analysis.require();
6785 throughput = MAX2(throughput, perf.throughput);
6786 has_spilled = v16->spilled_any_registers;
6787 allow_spilling = false;
6788 }
6789 }
6790
6791 const bool simd16_failed = v16 && !simd16_cfg;
6792
6793 /* Currently, the compiler only supports SIMD32 on SNB+ */
6794 if (!has_spilled &&
6795 (!v8 || v8->max_dispatch_width >= 32) &&
6796 (!v16 || v16->max_dispatch_width >= 32) && !params->use_rep_send &&
6797 devinfo->ver >= 6 && !simd16_failed &&
6798 INTEL_SIMD(FS, 32)) {
6799 /* Try a SIMD32 compile */
6800 v32 = std::make_unique<elk_fs_visitor>(compiler, &params->base, key,
6801 prog_data, nir, 32,
6802 params->base.stats != NULL,
6803 debug_enabled);
6804 if (v8)
6805 v32->import_uniforms(v8.get());
6806 else if (v16)
6807 v32->import_uniforms(v16.get());
6808
6809 if (!v32->run_fs(allow_spilling, false)) {
6810 elk_shader_perf_log(compiler, params->base.log_data,
6811 "SIMD32 shader failed to compile: %s\n",
6812 v32->fail_msg);
6813 } else {
6814 const performance &perf = v32->performance_analysis.require();
6815
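/* Only keep the SIMD32 variant if its estimated throughput beats the
 * narrower variants, unless INTEL_DEBUG=do32 forces it on.
 */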
6816 if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
6817 elk_shader_perf_log(compiler, params->base.log_data,
6818 "SIMD32 shader inefficient\n");
6819 } else {
6820 simd32_cfg = v32->cfg;
6821
6822 assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
6823 prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
6824
6825 prog_data->reg_blocks_32 = elk_register_blocks(v32->grf_used);
6826 throughput = MAX2(throughput, perf.throughput);
6827 }
6828 }
6829 }
6830
6831 /* When the caller requests a repclear shader, they want SIMD16-only */
6832 if (params->use_rep_send)
6833 simd8_cfg = NULL;
6834
6835 /* Prior to Iron Lake, the PS had a single shader offset with a jump table
6836 * at the top to select the shader. We've never implemented that.
6837 * Instead, we just give them exactly one shader and we pick the widest one
6838 * available.
6839 */
6840 if (compiler->devinfo->ver < 5) {
6841 if (simd32_cfg || simd16_cfg)
6842 simd8_cfg = NULL;
6843 if (simd32_cfg)
6844 simd16_cfg = NULL;
6845 }
6846
6847 /* If computed depth is enabled, SNB only allows SIMD8. */
6848 if (compiler->devinfo->ver == 6 &&
6849 prog_data->computed_depth_mode != ELK_PSCDEPTH_OFF)
6850 assert(simd16_cfg == NULL && simd32_cfg == NULL);
6851
6852 if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
6853 /* Iron Lake and earlier only have one Dispatch GRF start field. Make
6854 * the data available in the base prog data struct for convenience.
6855 */
6856 if (simd16_cfg) {
6857 prog_data->base.dispatch_grf_start_reg =
6858 prog_data->dispatch_grf_start_reg_16;
6859 } else if (simd32_cfg) {
6860 prog_data->base.dispatch_grf_start_reg =
6861 prog_data->dispatch_grf_start_reg_32;
6862 }
6863 }
6864
6865 elk_fs_generator g(compiler, &params->base, &prog_data->base,
6866 v8 && v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
6867
6868 if (unlikely(debug_enabled)) {
6869 g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
6870 "%s fragment shader %s",
6871 nir->info.label ?
6872 nir->info.label : "unnamed",
6873 nir->info.name));
6874 }
6875
6876 struct elk_compile_stats *stats = params->base.stats;
6877 uint32_t max_dispatch_width = 0;
6878
6879 if (simd8_cfg) {
6880 prog_data->dispatch_8 = true;
6881 g.generate_code(simd8_cfg, 8, v8->shader_stats,
6882 v8->performance_analysis.require(), stats);
6883 stats = stats ? stats + 1 : NULL;
6884 max_dispatch_width = 8;
6885 }
6886
6887 if (simd16_cfg) {
6888 prog_data->dispatch_16 = true;
6889 prog_data->prog_offset_16 = g.generate_code(
6890 simd16_cfg, 16, v16->shader_stats,
6891 v16->performance_analysis.require(), stats);
6892 stats = stats ? stats + 1 : NULL;
6893 max_dispatch_width = 16;
6894 }
6895
6896 if (simd32_cfg) {
6897 prog_data->dispatch_32 = true;
6898 prog_data->prog_offset_32 = g.generate_code(
6899 simd32_cfg, 32, v32->shader_stats,
6900 v32->performance_analysis.require(), stats);
6901 stats = stats ? stats + 1 : NULL;
6902 max_dispatch_width = 32;
6903 }
6904
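/* Record the widest generated program in every stats entry written above
 * (the stats pointer was advanced once per generated variant).
 */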
6905 for (struct elk_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
6906 s->max_dispatch_width = max_dispatch_width;
6907
6908 g.add_const_data(nir->constant_data, nir->constant_data_size);
6909 return g.get_assembly();
6910 }
6911
6912 unsigned
6913 elk_cs_push_const_total_size(const struct elk_cs_prog_data *cs_prog_data,
6914 unsigned threads)
6915 {
6916 assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
6917 assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
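/* The cross-thread block is uploaded once for the whole workgroup, while
 * the per-thread block is replicated for each HW thread: e.g. 64 bytes per
 * thread across 8 threads plus a 128-byte cross-thread block comes to
 * 640 bytes of push constants.
 */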
6918 return cs_prog_data->push.per_thread.size * threads +
6919 cs_prog_data->push.cross_thread.size;
6920 }
6921
6922 static void
6923 fill_push_const_block_info(struct elk_push_const_block *block, unsigned dwords)
6924 {
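/* Round the block up to whole 8-dword (32-byte) registers, e.g. 20 dwords
 * occupy DIV_ROUND_UP(20, 8) = 3 registers, i.e. 96 bytes.
 */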
6925 block->dwords = dwords;
6926 block->regs = DIV_ROUND_UP(dwords, 8);
6927 block->size = block->regs * 32;
6928 }
6929
6930 static void
6931 cs_fill_push_const_info(const struct intel_device_info *devinfo,
6932 struct elk_cs_prog_data *cs_prog_data)
6933 {
6934 const struct elk_stage_prog_data *prog_data = &cs_prog_data->base;
6935 int subgroup_id_index = elk_get_subgroup_id_param_index(devinfo, prog_data);
6936 bool cross_thread_supported = devinfo->verx10 >= 75;
6937
6938 /* The thread ID should be stored in the last param dword */
6939 assert(subgroup_id_index == -1 ||
6940 subgroup_id_index == (int)prog_data->nr_params - 1);
6941
6942 unsigned cross_thread_dwords, per_thread_dwords;
6943 if (!cross_thread_supported) {
6944 cross_thread_dwords = 0u;
6945 per_thread_dwords = prog_data->nr_params;
6946 } else if (subgroup_id_index >= 0) {
6947 /* Fill all but the last register with cross-thread payload */
6948 cross_thread_dwords = 8 * (subgroup_id_index / 8);
6949 per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
6950 assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
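/* E.g. with nr_params == 18 and the subgroup ID in slot 17, the first
 * 16 dwords are pushed cross-thread and the remaining 2 per-thread.
 */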
6951 } else {
6952 /* Fill all data using cross-thread payload */
6953 cross_thread_dwords = prog_data->nr_params;
6954 per_thread_dwords = 0u;
6955 }
6956
6957 fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
6958 fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
6959
6960 assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
6961 cs_prog_data->push.per_thread.size == 0);
6962 assert(cs_prog_data->push.cross_thread.dwords +
6963 cs_prog_data->push.per_thread.dwords ==
6964 prog_data->nr_params);
6965 }
6966
6967 static bool
6968 filter_simd(const nir_instr *instr, const void * /* options */)
6969 {
6970 if (instr->type != nir_instr_type_intrinsic)
6971 return false;
6972
6973 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
6974 case nir_intrinsic_load_simd_width_intel:
6975 case nir_intrinsic_load_subgroup_id:
6976 return true;
6977
6978 default:
6979 return false;
6980 }
6981 }
6982
6983 static nir_def *
6984 lower_simd(nir_builder *b, nir_instr *instr, void *options)
6985 {
6986 uintptr_t simd_width = (uintptr_t)options;
6987
6988 switch (nir_instr_as_intrinsic(instr)->intrinsic) {
6989 case nir_intrinsic_load_simd_width_intel:
6990 return nir_imm_int(b, simd_width);
6991
6992 case nir_intrinsic_load_subgroup_id:
6993 /* If the whole workgroup fits in one thread, we can lower subgroup_id
6994 * to a constant zero.
6995 */
6996 if (!b->shader->info.workgroup_size_variable) {
6997 unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
6998 b->shader->info.workgroup_size[1] *
6999 b->shader->info.workgroup_size[2];
7000 if (local_workgroup_size <= simd_width)
7001 return nir_imm_int(b, 0);
7002 }
7003 return NULL;
7004
7005 default:
7006 return NULL;
7007 }
7008 }
7009
7010 bool
7011 elk_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
7012 {
7013 return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
7014 (void *)(uintptr_t)dispatch_width);
7015 }
7016
7017 const unsigned *
7018 elk_compile_cs(const struct elk_compiler *compiler,
7019 struct elk_compile_cs_params *params)
7020 {
7021 const nir_shader *nir = params->base.nir;
7022 const struct elk_cs_prog_key *key = params->key;
7023 struct elk_cs_prog_data *prog_data = params->prog_data;
7024
7025 const bool debug_enabled =
7026 elk_should_print_shader(nir, params->base.debug_flag ?
7027 params->base.debug_flag : DEBUG_CS);
7028
7029 prog_data->base.stage = MESA_SHADER_COMPUTE;
7030 prog_data->base.total_shared = nir->info.shared_size;
7031 prog_data->base.total_scratch = 0;
7032
7033 if (!nir->info.workgroup_size_variable) {
7034 prog_data->local_size[0] = nir->info.workgroup_size[0];
7035 prog_data->local_size[1] = nir->info.workgroup_size[1];
7036 prog_data->local_size[2] = nir->info.workgroup_size[2];
7037 }
7038
7039 elk_simd_selection_state simd_state{
7040 .devinfo = compiler->devinfo,
7041 .prog_data = prog_data,
7042 .required_width = elk_required_dispatch_width(&nir->info),
7043 };
7044
7045 std::unique_ptr<elk_fs_visitor> v[3];
7046
7047 for (unsigned simd = 0; simd < 3; simd++) {
7048 if (!elk_simd_should_compile(simd_state, simd))
7049 continue;
7050
7051 const unsigned dispatch_width = 8u << simd;
7052
7053 nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
7054 elk_nir_apply_key(shader, compiler, &key->base,
7055 dispatch_width);
7056
7057 NIR_PASS(_, shader, elk_nir_lower_simd, dispatch_width);
7058
7059 /* Clean up after the local index and ID calculations. */
7060 NIR_PASS(_, shader, nir_opt_constant_folding);
7061 NIR_PASS(_, shader, nir_opt_dce);
7062
7063 elk_postprocess_nir(shader, compiler, debug_enabled,
7064 key->base.robust_flags);
7065
7066 v[simd] = std::make_unique<elk_fs_visitor>(compiler, &params->base,
7067 &key->base,
7068 &prog_data->base,
7069 shader, dispatch_width,
7070 params->base.stats != NULL,
7071 debug_enabled);
7072
7073 const int first = elk_simd_first_compiled(simd_state);
7074 if (first >= 0)
7075 v[simd]->import_uniforms(v[first].get());
7076
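/* Only allow spilling when no narrower variant has compiled yet, or when
 * the workgroup size is variable and every SIMD width may be needed at
 * dispatch time.
 */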
7077 const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;
7078
7079 if (v[simd]->run_cs(allow_spilling)) {
7080 cs_fill_push_const_info(compiler->devinfo, prog_data);
7081
7082 elk_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
7083 } else {
7084 simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
7085 if (simd > 0) {
7086 elk_shader_perf_log(compiler, params->base.log_data,
7087 "SIMD%u shader failed to compile: %s\n",
7088 dispatch_width, v[simd]->fail_msg);
7089 }
7090 }
7091 }
7092
7093 const int selected_simd = elk_simd_select(simd_state);
7094 if (selected_simd < 0) {
7095 params->base.error_str =
7096 ralloc_asprintf(params->base.mem_ctx,
7097 "Can't compile shader: "
7098 "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
7099 simd_state.error[0], simd_state.error[1],
7100 simd_state.error[2]);
7101 return NULL;
7102 }
7103
7104 assert(selected_simd < 3);
7105 elk_fs_visitor *selected = v[selected_simd].get();
7106
7107 if (!nir->info.workgroup_size_variable)
7108 prog_data->prog_mask = 1 << selected_simd;
7109
7110 elk_fs_generator g(compiler, &params->base, &prog_data->base,
7111 selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
7112 if (unlikely(debug_enabled)) {
7113 char *name = ralloc_asprintf(params->base.mem_ctx,
7114 "%s compute shader %s",
7115 nir->info.label ?
7116 nir->info.label : "unnamed",
7117 nir->info.name);
7118 g.enable_debug(name);
7119 }
7120
7121 uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);
7122
7123 struct elk_compile_stats *stats = params->base.stats;
7124 for (unsigned simd = 0; simd < 3; simd++) {
7125 if (prog_data->prog_mask & (1u << simd)) {
7126 assert(v[simd]);
7127 prog_data->prog_offset[simd] =
7128 g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
7129 v[simd]->performance_analysis.require(), stats);
7130 if (stats)
7131 stats->max_dispatch_width = max_dispatch_width;
7132 stats = stats ? stats + 1 : NULL;
7133 max_dispatch_width = 8u << simd;
7134 }
7135 }
7136
7137 g.add_const_data(nir->constant_data, nir->constant_data_size);
7138
7139 return g.get_assembly();
7140 }
7141
7142 struct intel_cs_dispatch_info
7143 elk_cs_get_dispatch_info(const struct intel_device_info *devinfo,
7144 const struct elk_cs_prog_data *prog_data,
7145 const unsigned *override_local_size)
7146 {
7147 struct intel_cs_dispatch_info info = {};
7148
7149 const unsigned *sizes =
7150 override_local_size ? override_local_size :
7151 prog_data->local_size;
7152
7153 const int simd = elk_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
7154 assert(simd >= 0 && simd < 3);
7155
7156 info.group_size = sizes[0] * sizes[1] * sizes[2];
7157 info.simd_size = 8u << simd;
7158 info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
7159
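/* The right execution mask covers only the invocations present in the
 * last (partial) thread, e.g. a group size of 20 at SIMD16 leaves a
 * remainder of 4 and a mask of 0xf; an evenly divisible group size uses
 * all simd_size bits.
 */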
7160 const uint32_t remainder = info.group_size & (info.simd_size - 1);
7161 if (remainder > 0)
7162 info.right_mask = ~0u >> (32 - remainder);
7163 else
7164 info.right_mask = ~0u >> (32 - info.simd_size);
7165
7166 return info;
7167 }
7168
7169 uint64_t
7170 elk_bsr(const struct intel_device_info *devinfo,
7171 uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
7172 {
7173 assert(offset % 64 == 0);
7174 assert(simd_size == 8 || simd_size == 16);
7175 assert(local_arg_offset % 8 == 0);
7176
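/* Pack the SIMD size selector into bit 4 and the local argument offset
 * (in 8-byte units) into bits 2:0, e.g. elk_bsr(devinfo, 0x1000, 8, 16)
 * yields 0x1012.
 */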
7177 return offset |
7178 SET_BITS(simd_size == 8, 4, 4) |
7179 SET_BITS(local_arg_offset / 8, 2, 0);
7180 }
7181
7182 /**
7183 * Test the dispatch mask packing assumptions of
7184 * elk_stage_has_packed_dispatch(). Call this from e.g. the top of
7185 * elk_fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
7186 * executed with an unexpected dispatch mask.
7187 */
7188 static UNUSED void
7189 elk_fs_test_dispatch_packing(const fs_builder &bld)
7190 {
7191 const elk_fs_visitor *shader = static_cast<const elk_fs_visitor *>(bld.shader);
7192 const gl_shader_stage stage = shader->stage;
7193 const bool uses_vmask =
7194 stage == MESA_SHADER_FRAGMENT &&
7195 elk_wm_prog_data(shader->stage_prog_data)->uses_vmask;
7196
7197 if (elk_stage_has_packed_dispatch(shader->devinfo, stage,
7198 shader->stage_prog_data)) {
7199 const fs_builder ubld = bld.exec_all().group(1, 0);
7200 const elk_fs_reg tmp = component(bld.vgrf(ELK_REGISTER_TYPE_UD), 0);
7201 const elk_fs_reg mask = uses_vmask ? elk_vmask_reg() : elk_dmask_reg();
7202
7203 ubld.ADD(tmp, mask, elk_imm_ud(1));
7204 ubld.AND(tmp, mask, tmp);
7205
7206 /* This will loop forever if the dispatch mask doesn't have the expected
7207 * form '2^n-1', in which case tmp will be non-zero.
7208 */
7209 bld.emit(ELK_OPCODE_DO);
7210 bld.CMP(bld.null_reg_ud(), tmp, elk_imm_ud(0), ELK_CONDITIONAL_NZ);
7211 set_predicate(ELK_PREDICATE_NORMAL, bld.emit(ELK_OPCODE_WHILE));
7212 }
7213 }
7214
7215 unsigned
7216 elk_fs_visitor::workgroup_size() const
7217 {
7218 assert(gl_shader_stage_uses_workgroup(stage));
7219 const struct elk_cs_prog_data *cs = elk_cs_prog_data(prog_data);
7220 return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
7221 }
7222
7223 bool elk_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
7224 {
7225 return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
7226 }
7227
7228 namespace elk {
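/* Gather a payload register pair into a single contiguous VGRF.  In
 * SIMD8/SIMD16 dispatch the payload can be referenced in place starting at
 * regs[0]; in SIMD32 the two 16-wide halves held in regs[0] and regs[1]
 * are stitched together per component with LOAD_PAYLOAD.
 */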
7229 elk_fs_reg
7230 fetch_payload_reg(const elk::fs_builder &bld, uint8_t regs[2],
7231 elk_reg_type type, unsigned n)
7232 {
7233 if (!regs[0])
7234 return elk_fs_reg();
7235
7236 if (bld.dispatch_width() > 16) {
7237 const elk_fs_reg tmp = bld.vgrf(type, n);
7238 const elk::fs_builder hbld = bld.exec_all().group(16, 0);
7239 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
7240 elk_fs_reg *const components = new elk_fs_reg[m * n];
7241
7242 for (unsigned c = 0; c < n; c++) {
7243 for (unsigned g = 0; g < m; g++)
7244 components[c * m + g] =
7245 offset(retype(elk_vec8_grf(regs[g], 0), type), hbld, c);
7246 }
7247
7248 hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
7249
7250 delete[] components;
7251 return tmp;
7252
7253 } else {
7254 return elk_fs_reg(retype(elk_vec8_grf(regs[0], 0), type));
7255 }
7256 }
7257
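/* Regroup the barycentric payload, which the hardware delivers as
 * interleaved per-SIMD8 coordinate pairs, into the two contiguous
 * components of a single VGRF expected by the rest of the compiler.
 */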
7258 elk_fs_reg
7259 fetch_barycentric_reg(const elk::fs_builder &bld, uint8_t regs[2])
7260 {
7261 if (!regs[0])
7262 return elk_fs_reg();
7263
7264 const elk_fs_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
7265 const elk::fs_builder hbld = bld.exec_all().group(8, 0);
7266 const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
7267 elk_fs_reg *const components = new elk_fs_reg[2 * m];
7268
7269 for (unsigned c = 0; c < 2; c++) {
7270 for (unsigned g = 0; g < m; g++)
7271 components[c * m + g] = offset(elk_vec8_grf(regs[g / 2], 0),
7272 hbld, c + 2 * (g % 2));
7273 }
7274
7275 hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
7276
7277 delete[] components;
7278 return tmp;
7279 }
7280
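/* AND the dynamic MSAA flags with the requested flag, setting the flag
 * register via the NZ conditional mod so that a following predicated
 * instruction only executes when the flag is enabled at dispatch time.
 */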
7281 void
7282 check_dynamic_msaa_flag(const fs_builder &bld,
7283 const struct elk_wm_prog_data *wm_prog_data,
7284 enum intel_msaa_flags flag)
7285 {
7286 elk_fs_inst *inst = bld.AND(bld.null_reg_ud(),
7287 dynamic_msaa_flags(wm_prog_data),
7288 elk_imm_ud(flag));
7289 inst->conditional_mod = ELK_CONDITIONAL_NZ;
7290 }
7291 }
7292