1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 /** @file brw_vec4_cmod_propagation.cpp
26 *
27 * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
28 * brw_fs_cmod_propagation for further details on the rationale behind this
29 * optimization.
30 */
31
32 #include "brw_vec4.h"
33 #include "brw_cfg.h"
34 #include "brw_eu.h"
35
36 namespace brw {
37
38 static bool
writemasks_incompatible(const vec4_instruction * earlier,const vec4_instruction * later)39 writemasks_incompatible(const vec4_instruction *earlier,
40 const vec4_instruction *later)
41 {
42 return (earlier->dst.writemask != WRITEMASK_X &&
43 earlier->dst.writemask != WRITEMASK_XYZW) ||
44 (earlier->dst.writemask == WRITEMASK_XYZW &&
45 later->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
46 (later->dst.writemask & ~earlier->dst.writemask) != 0;
47 }
48
49 static bool
opt_cmod_propagation_local(bblock_t * block,vec4_visitor * v)50 opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v)
51 {
52 bool progress = false;
53 int ip = block->end_ip + 1;
54
55 foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
56 ip--;
57
58 if ((inst->opcode != BRW_OPCODE_AND &&
59 inst->opcode != BRW_OPCODE_CMP &&
60 inst->opcode != BRW_OPCODE_MOV) ||
61 inst->predicate != BRW_PREDICATE_NONE ||
62 !inst->dst.is_null() ||
63 (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
64 inst->src[0].file != UNIFORM))
65 continue;
66
67 /* An ABS source modifier can only be handled when processing a compare
68 * with a value other than zero.
69 */
70 if (inst->src[0].abs &&
71 (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
72 continue;
73
74 if (inst->opcode == BRW_OPCODE_AND &&
75 !(inst->src[1].is_one() &&
76 inst->conditional_mod == BRW_CONDITIONAL_NZ &&
77 !inst->src[0].negate))
78 continue;
79
80 if (inst->opcode == BRW_OPCODE_MOV &&
81 inst->conditional_mod != BRW_CONDITIONAL_NZ)
82 continue;
83
84 bool read_flag = false;
85 foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
86 /* A CMP with a second source of zero can match with anything. A CMP
87 * with a second source that is not zero can only match with an ADD
88 * instruction.
89 */
90 if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
91 bool negate;
92
93 if (scan_inst->opcode != BRW_OPCODE_ADD)
94 goto not_match;
95
96 if (writemasks_incompatible(scan_inst, inst))
97 goto not_match;
98
99 /* A CMP is basically a subtraction. The result of the
100 * subtraction must be the same as the result of the addition.
101 * This means that one of the operands must be negated. So (a +
102 * b) vs (a == -b) or (a + -b) vs (a == b).
103 */
104 if ((inst->src[0].equals(scan_inst->src[0]) &&
105 inst->src[1].negative_equals(scan_inst->src[1])) ||
106 (inst->src[0].equals(scan_inst->src[1]) &&
107 inst->src[1].negative_equals(scan_inst->src[0]))) {
108 negate = false;
109 } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
110 inst->src[1].equals(scan_inst->src[1])) ||
111 (inst->src[0].negative_equals(scan_inst->src[1]) &&
112 inst->src[1].equals(scan_inst->src[0]))) {
113 negate = true;
114 } else {
115 goto not_match;
116 }
117
118 if (scan_inst->exec_size != inst->exec_size ||
119 scan_inst->group != inst->group)
120 goto not_match;
121
122 /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
123 *
124 * * Note that the [post condition signal] bits generated at
125 * the output of a compute are before the .sat.
126 *
127 * So we don't have to bail if scan_inst has saturate.
128 */
129
130 /* Otherwise, try propagating the conditional. */
131 const enum brw_conditional_mod cond =
132 negate ? brw_swap_cmod(inst->conditional_mod)
133 : inst->conditional_mod;
134
135 if (scan_inst->can_do_cmod() &&
136 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
137 scan_inst->conditional_mod == cond)) {
138 scan_inst->conditional_mod = cond;
139 inst->remove(block);
140 progress = true;
141 }
142 break;
143 }
144
145 if (regions_overlap(inst->src[0], inst->size_read(0),
146 scan_inst->dst, scan_inst->size_written)) {
147 if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
148 scan_inst->dst.offset != inst->src[0].offset ||
149 scan_inst->exec_size != inst->exec_size ||
150 scan_inst->group != inst->group) {
151 break;
152 }
153
154 /* If scan_inst is a CMP that produces a single value and inst is
155 * a CMP.NZ that consumes only that value, remove inst.
156 */
157 if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
158 (inst->src[0].type == BRW_REGISTER_TYPE_D ||
159 inst->src[0].type == BRW_REGISTER_TYPE_UD) &&
160 (inst->opcode == BRW_OPCODE_CMP ||
161 inst->opcode == BRW_OPCODE_MOV) &&
162 scan_inst->opcode == BRW_OPCODE_CMP &&
163 ((inst->src[0].swizzle == BRW_SWIZZLE_XXXX &&
164 scan_inst->dst.writemask == WRITEMASK_X) ||
165 (inst->src[0].swizzle == BRW_SWIZZLE_YYYY &&
166 scan_inst->dst.writemask == WRITEMASK_Y) ||
167 (inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ &&
168 scan_inst->dst.writemask == WRITEMASK_Z) ||
169 (inst->src[0].swizzle == BRW_SWIZZLE_WWWW &&
170 scan_inst->dst.writemask == WRITEMASK_W))) {
171 if (inst->dst.writemask != scan_inst->dst.writemask) {
172 src_reg temp(v, glsl_type::vec4_type, 1);
173
174 /* Given a sequence like:
175 *
176 * cmp.ge.f0(8) g21<1>.zF g20<4>.xF g18<4>.xF
177 * ...
178 * cmp.nz.f0(8) null<1>D g21<4>.zD 0D
179 *
180 * Replace it with something like:
181 *
182 * cmp.ge.f0(8) g22<1>.zF g20<4>.xF g18<4>.xF
183 * mov(8) g21<1>.xF g22<1>.zzzzF
184 *
185 * The added MOV will most likely be removed later. In the
186 * worst case, it should be cheaper to schedule.
187 */
188 temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask);
189 temp.type = scan_inst->src[0].type;
190
191 vec4_instruction *mov = v->MOV(scan_inst->dst, temp);
192
193 /* Modify the source swizzles on scan_inst. If scan_inst
194 * was
195 *
196 * cmp.ge.f0(8) g21<1>.zF g20<4>.wzyxF g18<4>.yxwzF
197 *
198 * replace it with
199 *
200 * cmp.ge.f0(8) g21<1>.zF g20<4>.yyyyF g18<4>.wwwwF
201 */
202 unsigned src0_chan;
203 unsigned src1_chan;
204 switch (scan_inst->dst.writemask) {
205 case WRITEMASK_X:
206 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0);
207 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0);
208 break;
209 case WRITEMASK_Y:
210 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1);
211 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1);
212 break;
213 case WRITEMASK_Z:
214 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2);
215 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2);
216 break;
217 case WRITEMASK_W:
218 src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3);
219 src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3);
220 break;
221 default:
222 unreachable("Impossible writemask");
223 }
224
225 scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan,
226 src0_chan,
227 src0_chan,
228 src0_chan);
229
230 /* There's no swizzle on immediate value sources. */
231 if (scan_inst->src[1].file != IMM) {
232 scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan,
233 src1_chan,
234 src1_chan,
235 src1_chan);
236 }
237
238 scan_inst->dst = dst_reg(temp);
239 scan_inst->dst.writemask = inst->dst.writemask;
240
241 scan_inst->insert_after(block, mov);
242 }
243
244 inst->remove(block);
245 progress = true;
246 break;
247 }
248
249 if (writemasks_incompatible(scan_inst, inst))
250 break;
251
252 /* CMP's result is the same regardless of dest type. */
253 if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
254 scan_inst->opcode == BRW_OPCODE_CMP &&
255 (inst->dst.type == BRW_REGISTER_TYPE_D ||
256 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
257 inst->remove(block);
258 progress = true;
259 break;
260 }
261
262 /* If the AND wasn't handled by the previous case, it isn't safe
263 * to remove it.
264 */
265 if (inst->opcode == BRW_OPCODE_AND)
266 break;
267
268 /* Comparisons operate differently for ints and floats */
269 if (scan_inst->dst.type != inst->dst.type &&
270 (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
271 inst->dst.type == BRW_REGISTER_TYPE_F))
272 break;
273
274 /* If the instruction generating inst's source also wrote the
275 * flag, and inst is doing a simple .nz comparison, then inst
276 * is redundant - the appropriate value is already in the flag
277 * register. Delete inst.
278 */
279 if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
280 !inst->src[0].negate &&
281 scan_inst->writes_flag(v->devinfo)) {
282 inst->remove(block);
283 progress = true;
284 break;
285 }
286
287 /* The conditional mod of the CMP/CMPN instructions behaves
288 * specially because the flag output is not calculated from the
289 * result of the instruction, but the other way around, which
290 * means that even if the condmod to propagate and the condmod
291 * from the CMP instruction are the same they will in general give
292 * different results because they are evaluated based on different
293 * inputs.
294 */
295 if (scan_inst->opcode == BRW_OPCODE_CMP ||
296 scan_inst->opcode == BRW_OPCODE_CMPN)
297 break;
298
299 /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
300 *
301 * * Note that the [post condition signal] bits generated at
302 * the output of a compute are before the .sat.
303 */
304 if (scan_inst->saturate)
305 break;
306
307 /* From the Sky Lake PRM, Vol 2a, "Multiply":
308 *
309 * "When multiplying integer data types, if one of the sources
310 * is a DW, the resulting full precision data is stored in
311 * the accumulator. However, if the destination data type is
312 * either W or DW, the low bits of the result are written to
313 * the destination register and the remaining high bits are
314 * discarded. This results in undefined Overflow and Sign
315 * flags. Therefore, conditional modifiers and saturation
316 * (.sat) cannot be used in this case.
317 *
318 * We just disallow cmod propagation on all integer multiplies.
319 */
320 if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
321 scan_inst->opcode == BRW_OPCODE_MUL)
322 break;
323
324 /* Otherwise, try propagating the conditional. */
325 enum brw_conditional_mod cond =
326 inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
327 : inst->conditional_mod;
328
329 if (scan_inst->can_do_cmod() &&
330 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
331 scan_inst->conditional_mod == cond)) {
332 scan_inst->conditional_mod = cond;
333 inst->remove(block);
334 progress = true;
335 }
336 break;
337 }
338
339 not_match:
340 if (scan_inst->writes_flag(v->devinfo))
341 break;
342
343 read_flag = read_flag || scan_inst->reads_flag();
344 }
345 }
346
347 return progress;
348 }
349
350 bool
opt_cmod_propagation()351 vec4_visitor::opt_cmod_propagation()
352 {
353 bool progress = false;
354
355 foreach_block_reverse(block, cfg) {
356 progress = opt_cmod_propagation_local(block, this) || progress;
357 }
358
359 if (progress)
360 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
361
362 return progress;
363 }
364
365 } /* namespace brw */
366