1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_combine_constants.cpp
25 *
26 * This file contains the opt_combine_constants() pass that runs after the
27 * regular optimization loop. It passes over the instruction list and
28 * selectively promotes immediate values to registers by emitting a mov(1)
29 * instruction.
30 *
31 * This is useful on Gen 7 particularly, because a few instructions can be
32 * coissued (i.e., issued in the same cycle as another thread on the same EU
33 * issues an instruction) under some circumstances, one of which is that they
34 * cannot use immediate values.
35 */
36
37 #include "brw_fs.h"
38 #include "brw_cfg.h"
39 #include "util/half_float.h"
40
41 using namespace brw;
42
/* Compile-time switch: when set to true, opt_combine_constants() prints one
 * summary line per promoted immediate after rewriting the sources.
 */
43 static const bool debug = false;
44
45 /* Returns whether an instruction could co-issue if its immediate source were
46 * replaced with a GRF source.
47 */
48 static bool
could_coissue(const struct gen_device_info * devinfo,const fs_inst * inst)49 could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst)
50 {
51 if (devinfo->gen != 7)
52 return false;
53
54 switch (inst->opcode) {
55 case BRW_OPCODE_MOV:
56 case BRW_OPCODE_CMP:
57 case BRW_OPCODE_ADD:
58 case BRW_OPCODE_MUL:
59 /* Only float instructions can coissue. We don't have a great
60 * understanding of whether or not something like float(int(a) + int(b))
61 * would be considered float (based on the destination type) or integer
62 * (based on the source types), so we take the conservative choice of
63 * only promoting when both destination and source are float.
64 */
65 return inst->dst.type == BRW_REGISTER_TYPE_F &&
66 inst->src[0].type == BRW_REGISTER_TYPE_F;
67 default:
68 return false;
69 }
70 }
71
72 /**
73 * Returns true for instructions that don't support immediate sources.
74 */
75 static bool
must_promote_imm(const struct gen_device_info * devinfo,const fs_inst * inst)76 must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
77 {
78 switch (inst->opcode) {
79 case SHADER_OPCODE_POW:
80 return devinfo->gen < 8;
81 case BRW_OPCODE_MAD:
82 case BRW_OPCODE_LRP:
83 return true;
84 default:
85 return false;
86 }
87 }
88
89 /** A box for putting fs_regs in a linked list. */
90 struct reg_link {
91 DECLARE_RALLOC_CXX_OPERATORS(reg_link)
92
reg_linkreg_link93 reg_link(fs_reg *reg) : reg(reg) {}
94
95 struct exec_node link;
96 fs_reg *reg;
97 };
98
99 static struct exec_node *
link(void * mem_ctx,fs_reg * reg)100 link(void *mem_ctx, fs_reg *reg)
101 {
102 reg_link *l = new(mem_ctx) reg_link(reg);
103 return &l->link;
104 }
105
106 /**
107 * Information about an immediate value.
108 */
109 struct imm {
110 /** The common ancestor of all blocks using this immediate value. */
111 bblock_t *block;
112
113 /**
114 * The instruction generating the immediate value, if all uses are contained
115 * within a single basic block. Otherwise, NULL.
116 */
117 fs_inst *inst;
118
119 /**
120 * A list of fs_regs that refer to this immediate. If we promote it, we'll
121 * have to patch these up to refer to the new GRF.
122 */
123 exec_list *uses;
124
125 /** The immediate value */
126 union {
127 char bytes[8];
128 double df;
129 int64_t d64;
130 float f;
131 int32_t d;
132 int16_t w;
133 };
134 uint8_t size;
135
136 /** When promoting half-float we need to account for certain restrictions */
137 bool is_half_float;
138
139 /**
140 * The GRF register and subregister number where we've decided to store the
141 * constant value.
142 */
143 uint8_t subreg_offset;
144 uint16_t nr;
145
146 /** The number of coissuable instructions using this immediate. */
147 uint16_t uses_by_coissue;
148
149 /**
150 * Whether this constant is used by an instruction that can't handle an
151 * immediate source (and already has to be promoted to a GRF).
152 */
153 bool must_promote;
154
155 uint16_t first_use_ip;
156 uint16_t last_use_ip;
157 };
158
/** Growable array holding the working set of immediates under analysis. */
struct table {
   /** Backing storage for the entries. */
   struct imm *imm;

   /** Number of entries the storage can hold. */
   int size;

   /** Number of entries currently in use. */
   int len;
};
165
166 static struct imm *
find_imm(struct table * table,void * data,uint8_t size)167 find_imm(struct table *table, void *data, uint8_t size)
168 {
169 for (int i = 0; i < table->len; i++) {
170 if (table->imm[i].size == size &&
171 !memcmp(table->imm[i].bytes, data, size)) {
172 return &table->imm[i];
173 }
174 }
175 return NULL;
176 }
177
178 static struct imm *
new_imm(struct table * table,void * mem_ctx)179 new_imm(struct table *table, void *mem_ctx)
180 {
181 if (table->len == table->size) {
182 table->size *= 2;
183 table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
184 }
185 return &table->imm[table->len++];
186 }
187
188 /**
189 * Comparator used for sorting an array of imm structures.
190 *
191 * We sort by basic block number, then last use IP, then first use IP (least
192 * to greatest). This sorting causes immediates live in the same area to be
193 * allocated to the same register in the hopes that all values will be dead
194 * about the same time and the register can be reused.
195 */
196 static int
compare(const void * _a,const void * _b)197 compare(const void *_a, const void *_b)
198 {
199 const struct imm *a = (const struct imm *)_a,
200 *b = (const struct imm *)_b;
201
202 int block_diff = a->block->num - b->block->num;
203 if (block_diff)
204 return block_diff;
205
206 int end_diff = a->last_use_ip - b->last_use_ip;
207 if (end_diff)
208 return end_diff;
209
210 return a->first_use_ip - b->first_use_ip;
211 }
212
213 static bool
get_constant_value(const struct gen_device_info * devinfo,const fs_inst * inst,uint32_t src_idx,void * out,brw_reg_type * out_type)214 get_constant_value(const struct gen_device_info *devinfo,
215 const fs_inst *inst, uint32_t src_idx,
216 void *out, brw_reg_type *out_type)
217 {
218 const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
219 const fs_reg *src = &inst->src[src_idx];
220
221 *out_type = src->type;
222
223 switch (*out_type) {
224 case BRW_REGISTER_TYPE_DF: {
225 double val = !can_do_source_mods ? src->df : fabs(src->df);
226 memcpy(out, &val, 8);
227 break;
228 }
229 case BRW_REGISTER_TYPE_F: {
230 float val = !can_do_source_mods ? src->f : fabsf(src->f);
231 memcpy(out, &val, 4);
232 break;
233 }
234 case BRW_REGISTER_TYPE_HF: {
235 uint16_t val = src->d & 0xffffu;
236 if (can_do_source_mods)
237 val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
238 memcpy(out, &val, 2);
239 break;
240 }
241 case BRW_REGISTER_TYPE_Q: {
242 int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64);
243 memcpy(out, &val, 8);
244 break;
245 }
246 case BRW_REGISTER_TYPE_UQ:
247 memcpy(out, &src->u64, 8);
248 break;
249 case BRW_REGISTER_TYPE_D: {
250 int32_t val = !can_do_source_mods ? src->d : abs(src->d);
251 memcpy(out, &val, 4);
252 break;
253 }
254 case BRW_REGISTER_TYPE_UD:
255 memcpy(out, &src->ud, 4);
256 break;
257 case BRW_REGISTER_TYPE_W: {
258 int16_t val = src->d & 0xffffu;
259 if (can_do_source_mods)
260 val = abs(val);
261 memcpy(out, &val, 2);
262 break;
263 }
264 case BRW_REGISTER_TYPE_UW:
265 memcpy(out, &src->ud, 2);
266 break;
267 default:
268 return false;
269 };
270
271 return true;
272 }
273
274 static struct brw_reg
build_imm_reg_for_copy(struct imm * imm)275 build_imm_reg_for_copy(struct imm *imm)
276 {
277 switch (imm->size) {
278 case 8:
279 return brw_imm_d(imm->d64);
280 case 4:
281 return brw_imm_d(imm->d);
282 case 2:
283 return brw_imm_w(imm->w);
284 default:
285 unreachable("not implemented");
286 }
287 }
288
289 static inline uint32_t
get_alignment_for_imm(const struct imm * imm)290 get_alignment_for_imm(const struct imm *imm)
291 {
292 if (imm->is_half_float)
293 return 4; /* At least MAD seems to require this */
294 else
295 return imm->size;
296 }
297
298 static bool
needs_negate(const fs_reg * reg,const struct imm * imm)299 needs_negate(const fs_reg *reg, const struct imm *imm)
300 {
301 switch (reg->type) {
302 case BRW_REGISTER_TYPE_DF:
303 return signbit(reg->df) != signbit(imm->df);
304 case BRW_REGISTER_TYPE_F:
305 return signbit(reg->f) != signbit(imm->f);
306 case BRW_REGISTER_TYPE_Q:
307 return (reg->d64 < 0) != (imm->d64 < 0);
308 case BRW_REGISTER_TYPE_D:
309 return (reg->d < 0) != (imm->d < 0);
310 case BRW_REGISTER_TYPE_HF:
311 return (reg->d & 0x8000u) != (imm->w & 0x8000u);
312 case BRW_REGISTER_TYPE_W:
313 return ((int16_t)reg->d < 0) != (imm->w < 0);
314 case BRW_REGISTER_TYPE_UQ:
315 case BRW_REGISTER_TYPE_UD:
316 case BRW_REGISTER_TYPE_UW:
317 return false;
318 default:
319 unreachable("not implemented");
320 };
321 }
322
323 static bool
representable_as_hf(float f,uint16_t * hf)324 representable_as_hf(float f, uint16_t *hf)
325 {
326 union fi u;
327 uint16_t h = _mesa_float_to_half(f);
328 u.f = _mesa_half_to_float(h);
329
330 if (u.f == f) {
331 *hf = h;
332 return true;
333 }
334
335 return false;
336 }
337
338 static bool
represent_src_as_imm(const struct gen_device_info * devinfo,fs_reg * src)339 represent_src_as_imm(const struct gen_device_info *devinfo,
340 fs_reg *src)
341 {
342 /* TODO : consider specific platforms also */
343 if (devinfo->gen == 12) {
344 uint16_t hf;
345 if (representable_as_hf(src->f, &hf)) {
346 *src = retype(brw_imm_uw(hf), BRW_REGISTER_TYPE_HF);
347 return true;
348 }
349 }
350 return false;
351 }
352
353 bool
opt_combine_constants()354 fs_visitor::opt_combine_constants()
355 {
356 void *const_ctx = ralloc_context(NULL);
357
358 struct table table;
359 table.size = 8;
360 table.len = 0;
361 table.imm = ralloc_array(const_ctx, struct imm, table.size);
362
363 const brw::idom_tree &idom = idom_analysis.require();
364 unsigned ip = -1;
365
366 /* Make a pass through all instructions and count the number of times each
367 * constant is used by coissueable instructions or instructions that cannot
368 * take immediate arguments.
369 */
370 foreach_block_and_inst(block, fs_inst, inst, cfg) {
371 ip++;
372
373 if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
374 continue;
375
376 bool represented_as_imm = false;
377 for (int i = 0; i < inst->sources; i++) {
378 if (inst->src[i].file != IMM)
379 continue;
380
381 if (!represented_as_imm && i == 0 &&
382 inst->opcode == BRW_OPCODE_MAD &&
383 represent_src_as_imm(devinfo, &inst->src[i])) {
384 represented_as_imm = true;
385 continue;
386 }
387
388 char data[8];
389 brw_reg_type type;
390 if (!get_constant_value(devinfo, inst, i, data, &type))
391 continue;
392
393 uint8_t size = type_sz(type);
394
395 struct imm *imm = find_imm(&table, data, size);
396
397 if (imm) {
398 bblock_t *intersection = idom.intersect(block, imm->block);
399 if (intersection != imm->block)
400 imm->inst = NULL;
401 imm->block = intersection;
402 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
403 imm->uses_by_coissue += could_coissue(devinfo, inst);
404 imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);
405 imm->last_use_ip = ip;
406 if (type == BRW_REGISTER_TYPE_HF)
407 imm->is_half_float = true;
408 } else {
409 imm = new_imm(&table, const_ctx);
410 imm->block = block;
411 imm->inst = inst;
412 imm->uses = new(const_ctx) exec_list();
413 imm->uses->push_tail(link(const_ctx, &inst->src[i]));
414 memcpy(imm->bytes, data, size);
415 imm->size = size;
416 imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
417 imm->uses_by_coissue = could_coissue(devinfo, inst);
418 imm->must_promote = must_promote_imm(devinfo, inst);
419 imm->first_use_ip = ip;
420 imm->last_use_ip = ip;
421 }
422 }
423 }
424
425 /* Remove constants from the table that don't have enough uses to make them
426 * profitable to store in a register.
427 */
428 for (int i = 0; i < table.len;) {
429 struct imm *imm = &table.imm[i];
430
431 if (!imm->must_promote && imm->uses_by_coissue < 4) {
432 table.imm[i] = table.imm[table.len - 1];
433 table.len--;
434 continue;
435 }
436 i++;
437 }
438 if (table.len == 0) {
439 ralloc_free(const_ctx);
440 return false;
441 }
442 if (cfg->num_blocks != 1)
443 qsort(table.imm, table.len, sizeof(struct imm), compare);
444
445 /* Insert MOVs to load the constant values into GRFs. */
446 fs_reg reg(VGRF, alloc.allocate(1));
447 reg.stride = 0;
448 for (int i = 0; i < table.len; i++) {
449 struct imm *imm = &table.imm[i];
450 /* Insert it either before the instruction that generated the immediate
451 * or after the last non-control flow instruction of the common ancestor.
452 */
453 exec_node *n = (imm->inst ? imm->inst :
454 imm->block->last_non_control_flow_inst()->next);
455
456 /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
457 *
458 * "In Align16 mode, the channel selects and channel enables apply to a
459 * pair of half-floats, because these parameters are defined for DWord
460 * elements ONLY. This is applicable when both source and destination
461 * are half-floats."
462 *
463 * This means that Align16 instructions that use promoted HF immediates
464 * and use a <0,1,0>:HF region would read 2 HF slots instead of
465 * replicating the single one we want. To avoid this, we always populate
466 * both HF slots within a DWord with the constant.
467 */
468 const uint32_t width = devinfo->gen == 8 && imm->is_half_float ? 2 : 1;
469 const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);
470
471 /* Put the immediate in an offset aligned to its size. Some instructions
472 * seem to have additional alignment requirements, so account for that
473 * too.
474 */
475 reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
476
477 /* Ensure we have enough space in the register to copy the immediate */
478 struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
479 if (reg.offset + type_sz(imm_reg.type) * width > REG_SIZE) {
480 reg.nr = alloc.allocate(1);
481 reg.offset = 0;
482 }
483
484 ibld.MOV(retype(reg, imm_reg.type), imm_reg);
485 imm->nr = reg.nr;
486 imm->subreg_offset = reg.offset;
487
488 reg.offset += imm->size * width;
489 }
490 shader_stats.promoted_constants = table.len;
491
492 /* Rewrite the immediate sources to refer to the new GRFs. */
493 for (int i = 0; i < table.len; i++) {
494 foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
495 fs_reg *reg = link->reg;
496 #ifdef DEBUG
497 switch (reg->type) {
498 case BRW_REGISTER_TYPE_DF:
499 assert((isnan(reg->df) && isnan(table.imm[i].df)) ||
500 (fabs(reg->df) == fabs(table.imm[i].df)));
501 break;
502 case BRW_REGISTER_TYPE_F:
503 assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
504 (fabsf(reg->f) == fabsf(table.imm[i].f)));
505 break;
506 case BRW_REGISTER_TYPE_HF:
507 assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
508 isnan(_mesa_half_to_float(table.imm[i].w))) ||
509 (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
510 fabsf(_mesa_half_to_float(table.imm[i].w))));
511 break;
512 case BRW_REGISTER_TYPE_Q:
513 assert(abs(reg->d64) == abs(table.imm[i].d64));
514 break;
515 case BRW_REGISTER_TYPE_UQ:
516 assert(reg->d64 == table.imm[i].d64);
517 break;
518 case BRW_REGISTER_TYPE_D:
519 assert(abs(reg->d) == abs(table.imm[i].d));
520 break;
521 case BRW_REGISTER_TYPE_UD:
522 assert(reg->d == table.imm[i].d);
523 break;
524 case BRW_REGISTER_TYPE_W:
525 assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
526 break;
527 case BRW_REGISTER_TYPE_UW:
528 assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
529 break;
530 default:
531 break;
532 }
533 #endif
534
535 reg->file = VGRF;
536 reg->offset = table.imm[i].subreg_offset;
537 reg->stride = 0;
538 reg->negate = needs_negate(reg, &table.imm[i]);
539 reg->nr = table.imm[i].nr;
540 }
541 }
542
543 if (debug) {
544 for (int i = 0; i < table.len; i++) {
545 struct imm *imm = &table.imm[i];
546
547 printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
548 "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
549 (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
550 imm->block->num,
551 imm->nr,
552 imm->subreg_offset,
553 imm->must_promote,
554 imm->uses_by_coissue,
555 imm->first_use_ip,
556 imm->last_use_ip,
557 imm->last_use_ip - imm->first_use_ip);
558 }
559 }
560
561 ralloc_free(const_ctx);
562 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
563
564 return true;
565 }
566