/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move.  A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
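    *
    * For example (a sketch, register numbers illustrative), a packed byte
    * copy like
    *
    *    mov(16)  g10<1>UB  g20<8,8,1>UB
    *
    * qualifies as a raw move, while a packed byte write with a source
    * modifier, saturation, or a different source type does not and must
    * have its destination region lowered.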
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
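          *
          * For example (a sketch, register numbers illustrative), the MUL
          * of an integer MUL+MACH pair such as
          *
          *    mul(8)  acc0<1>D  g10<8,8,1>D  g20<16,8,2>UW
          *
          * could not be split into a MUL writing a temporary followed by a
          * MOV into acc0 without losing the accumulator's extra precision.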
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
                 !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
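          *
          * E.g. for a raw byte move with a strided source (a sketch,
          * register numbers illustrative),
          *
          *    mov(8)  g10<1>UB  g20<16,8,2>UB
          *
          * max_stride is 2 (from the source) and min_size is 1 (UB), so we
          * return a destination byte stride of MIN2(2, 4 * 1) = 2.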
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
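    *
    * E.g. (a sketch) if a non-uniform source starts at sub-register offset
    * 8B while the destination starts at offset 0B, the offsets disagree and
    * we return 0; if every non-uniform source starts at the same
    * sub-register offset as the destination, that common offset is kept.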
    */
   unsigned
   required_dst_byte_offset(const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % REG_SIZE !=
                reg_offset(inst->dst) % REG_SIZE)
               return 0;
      }

      return reg_offset(inst->dst) % REG_SIZE;
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      if (is_unordered(inst) || inst->is_control_source(i))
         return false;

      /* Empirical testing shows that Broadwell has a bug affecting half-float
       * MAD instructions when any of its sources has a non-zero offset, such
       * as:
       *
       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
       *
       * We used to generate code like this for SIMD8 executions, where
       * components Y and W of a vector would be packed at offset 16B of a
       * SIMD register.  The problem doesn't occur if the stride of the
       * source is 0.
       */
      if (devinfo->gen == 8 &&
          inst->opcode == BRW_OPCODE_MAD &&
          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
          inst->src[i].stride != 0) {
         return true;
      }

      const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
      const unsigned src_byte_stride = inst->src[i].stride *
         type_sz(inst->src[i].type);
      const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (src_byte_stride != dst_byte_stride ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const gen_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_unordered(inst)) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != dst_byte_stride ||
                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != dst_byte_stride);
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
                             unsigned i)
   {
      return !inst->can_do_source_mods(devinfo) &&
             (inst->src[i].negate || inst->src[i].abs);
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      case SHADER_OPCODE_BROADCAST:
      case SHADER_OPCODE_MOV_INDIRECT:
         /* The source and destination types of these may be hard-coded to
          * integer at codegen time due to hardware limitations of 64-bit
          * types.
          */
         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
                 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
                type_sz(inst->src[0].type) > 4 &&
                inst->dst.type != inst->src[0].type;
      default:
         /* FIXME: We assume that opcodes not explicitly mentioned above just
          * work fine with arbitrary conversions.
          */
         return false;
      }
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
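    *
    * For example (a sketch, register numbers illustrative), if some
    * instruction "op" cannot take a negated source,
    *
    *    op(8)   g10<1>F  -g20<8,8,1>F
    *
    * would be rewritten as
    *
    *    mov(8)  g40<1>F  -g20<8,8,1>F
    *    op(8)   g10<1>F  g40<8,8,1>F
    *
    * with the temporary taking the execution type of the instruction.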
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
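    *
    * For example (a sketch, register numbers illustrative), a SEL whose
    * destination type differs from its execution type, such as
    *
    *    sel.ge(8)  g10<1>HF  g20<8,8,1>F  g30<8,8,1>F
    *
    * would be rewritten as
    *
    *    sel.ge(8)  g40<1>F   g20<8,8,1>F  g30<8,8,1>F
    *    mov(8)     g10<1>HF  g40<8,8,1>F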
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written() || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
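    *
    * For example (a sketch, register numbers illustrative), a widening
    * D-to-Q conversion such as
    *
    *    mov(8)  g10<1>Q  g20<8,8,1>D
    *
    * would be rewritten as
    *
    *    mov(8)  g40<2>UD  g20<8,8,1>UD
    *    mov(8)  g10<1>Q   g40<16,8,2>D
    *
    * so the source data ends up with the same 8B channel stride as the
    * destination.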
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
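       *
       * For a 64-bit type this takes two UD copies per element (n == 2
       * below), with subscript() selecting the low and high dwords of each
       * channel in turn.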
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
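    *
    * For example (a sketch, register numbers illustrative), a narrowing
    * conversion such as
    *
    *    add(8)  g10<1>B  g20<8,8,1>D  g30<8,8,1>D
    *
    * would be rewritten as
    *
    *    add(8)  g40<4>B   g20<8,8,1>D  g30<8,8,1>D
    *    mov(8)  g10<1>UB  g40<32,8,4>UB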
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
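          *
          * The resulting sequence is: unpredicated copies of the old
          * destination into the temporary, the predicated instruction
          * writing the temporary, then unpredicated copies back into the
          * destination, which preserves the channels the predicate masked
          * off.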
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const gen_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_conversion(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      return progress;
   }
}

bool
fs_visitor::lower_regioning()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}