/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      assert(devinfo->gen < 12);
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
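
/* A minimal usage sketch (illustrative; the payload register below is
 * hypothetical): a generator about to emit a SEND from a GRF source
 * resolves the implied move first, so the encoded source is always an MRF.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &payload, 1);
 *    // payload now refers to m1; the MOV into it was emitted above.
 */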

static void
gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This
    *  is to enable loading of a new thread into the same slot while the
    *  message with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
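
/* For example, assuming GEN7_MRF_HACK_START is 112 (the start of the
 * R112-R127 range quoted above), a reference to m4 on Gen7+ is rewritten
 * in place to g116.
 */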

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gen7_convert_mrf_to_grf(p, &dest);

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(devinfo->gen < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW
             *    needs this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or
    * SIMD8) or 16 (SIMD16), as that's normally correct.  However, when
    * dealing with small registers, it can be useful for us to automatically
    * reduce it to match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
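
/* A sketch of the automatic exec-size reduction above: with
 * p->automatic_exec_sizes set, emitting
 *
 *    brw_MOV(p, brw_vec1_grf(10, 0), brw_imm_ud(0));
 *
 * narrows the instruction's exec size to 1 to match the 1-wide destination,
 * even if the generator's default exec size is 8 or 16.  (brw_vec1_grf and
 * brw_imm_ud are the usual region helpers; register g10 is arbitrary.)
 */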

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->gen >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->gen >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be an immediate in two-argument instructions. */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert(reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
449
450 /**
451 * Specify the descriptor and extended descriptor immediate for a SEND(C)
452 * message instruction.
453 */
454 void
brw_set_desc_ex(struct brw_codegen * p,brw_inst * inst,unsigned desc,unsigned ex_desc)455 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
456 unsigned desc, unsigned ex_desc)
457 {
458 const struct gen_device_info *devinfo = p->devinfo;
459 assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
460 brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
461 if (devinfo->gen < 12)
462 brw_inst_set_src1_file_type(devinfo, inst,
463 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
464 brw_inst_set_send_desc(devinfo, inst, desc);
465 if (devinfo->gen >= 9)
466 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
467 }
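
/* Typical call pattern, as used by the message helpers below: build the
 * descriptor with brw_message_desc() and install it through the
 * brw_set_desc() wrapper, which supplies an extended descriptor of 0.
 *
 *    brw_set_desc(p, inst, brw_message_desc(devinfo, mlen, rlen, true));
 */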

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct gen_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}
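
/* Worked example of the inference above: a two-operand function such as
 * BRW_MATH_FUNCTION_POW sends two payload registers and returns one
 * (msg_length = 2, response_length = 1), while
 * BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER both sends and returns
 * two (msg_length = 2, response_length = 2).
 */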

static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->gen >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
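
/* The block-size encoding above differs by generation: Gen8+ uses
 * log2(num_regs), so 1/2/4/8 registers encode as 0/1/2/3, while Gen7 uses
 * num_regs - 1, so 1/2/4 registers encode as 0/1/3.
 */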

static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   if (devinfo->gen >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

static brw_inst *
brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
{
   assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
   assert(util_is_power_of_two_or_zero(align));
   const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
   const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
   const unsigned new_nr_insn = start_insn + nr_insn;

   if (p->store_size < new_nr_insn) {
      p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Memset any padding due to alignment to 0.  We don't want to be hashing
    * or caching a bunch of random bits we got from a memory allocation.
    */
   if (p->nr_insn < start_insn) {
      memset(&p->store[p->nr_insn], 0,
             (start_insn - p->nr_insn) * sizeof(brw_inst));
   }

   assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
   p->nr_insn = new_nr_insn;
   p->next_insn_offset = new_nr_insn * sizeof(brw_inst);

   return &p->store[start_insn];
}
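
/* For instance, with 16-byte instructions, brw_append_insns(p, 2, 64)
 * rounds p->nr_insn up to a multiple of four instructions (64 / 16),
 * zero-fills any slots skipped by the rounding, and returns a pointer to
 * the two newly reserved slots.
 */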

void
brw_realign(struct brw_codegen *p, unsigned align)
{
   brw_append_insns(p, 0, align);
}

int
brw_append_data(struct brw_codegen *p, void *data,
                unsigned size, unsigned align)
{
   unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
   void *dst = brw_append_insns(p, nr_insn, align);
   memcpy(dst, data, size);

   /* If it's not a whole number of instructions, memset the end */
   if (size < nr_insn * sizeof(brw_inst))
      memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);

   return dst - (void *)p->store;
}
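
/* A usage sketch (the table below is hypothetical): embed a data blob in
 * the instruction stream and remember its byte offset.
 *
 *    uint32_t table[4] = { 1, 2, 3, 4 };
 *    int offset = brw_append_data(p, table, sizeof(table), 32);
 *
 * The returned offset is relative to p->store, and the tail of the last
 * instruction-sized slot is zero-filled.
 */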

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
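
/* E.g. a 32-bit source at byte subregister offset 8 is encoded as
 * component 2 (8 / 4) in the 3-src instruction formats.
 */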

static enum gen10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct gen_device_info *devinfo,
                       enum brw_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_1:
      assert(devinfo->gen >= 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case BRW_VERTICAL_STRIDE_2:
      assert(devinfo->gen < 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}


static enum gen10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_HORIZONTAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case BRW_HORIZONTAL_STRIDE_1:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case BRW_HORIZONTAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case BRW_HORIZONTAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   if (devinfo->gen >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->gen >= 12) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst,
                                       BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->gen >= 12) {
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
      if (devinfo->gen == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->gen >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters
          * ensure that all four types are float.  The BFE and BFI2 emitters,
          * however, may send us mixed D and UD types and want us to ignore
          * that and use the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          * "Three source instructions can use operands with mixed-mode
          *  precision. When SrcType field is set to :f or :hf it defines
          *  precision for source 0 only, and fields Src1Type and Src2Type
          *  define precision for other source operands:
          *
          *  0b = :f. Single precision Float (32-bit).
          *  1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
                                                                  \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
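
/* Each invocation below stamps out a public emitter; for example,
 * ALU2(AND) expands to
 *
 *    brw_inst *brw_AND(struct brw_codegen *p, struct brw_reg dest,
 *                      struct brw_reg src0, struct brw_reg src1)
 *    {
 *       return brw_alu2(p, BRW_OPCODE_AND, dest, src0, src1);
 *    }
 */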

ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->gen == 7 && !devinfo->is_haswell &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}
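
/* Concretely, for the IVB/BYT workaround above: a g2<8;8,1>:F source being
 * moved to a :DF destination is rewritten as g2<1;2,0>:F, so each float is
 * read twice and the ignored odd channels still see valid data.
 */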

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode
    * as an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->gen < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   "Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float)."
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}

void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}

void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
   brw_inst_set_cond_modifier(p->devinfo, insn, func);
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->gen < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
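
/* A sketch of the intended pairing (brw_ELSE and brw_ENDIF are the
 * matching helpers in this API): the IF is emitted with zeroed jump
 * targets, pushed on the if-stack, and patched via patch_IF_ELSE() once
 * the ENDIF is seen.
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *    ...   // "then" instructions
 *    brw_ELSE(p);
 *    ...   // "else" instructions
 *    brw_ENDIF(p);
 */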

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1467
1468 /**
1469 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1470 */
1471 static void
convert_IF_ELSE_to_ADD(struct brw_codegen * p,brw_inst * if_inst,brw_inst * else_inst)1472 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1473 brw_inst *if_inst, brw_inst *else_inst)
1474 {
1475 const struct gen_device_info *devinfo = p->devinfo;
1476
1477 /* The next instruction (where the ENDIF would be, if it existed) */
1478 brw_inst *next_inst = &p->store[p->nr_insn];
1479
1480 assert(p->single_program_flow);
1481 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1482 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1483 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1484
1485 /* Convert IF to an ADD instruction that moves the instruction pointer
1486 * to the first instruction of the ELSE block. If there is no ELSE
1487 * block, point to where ENDIF would be. Reverse the predicate.
1488 *
1489 * There's no need to execute an ENDIF since we don't need to do any
1490 * stack operations, and if we're currently executing, we just want to
1491 * continue normally.
1492 */
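/* Worked example (illustrative, assuming the IP-relative ADD takes effect
 * from the ADD's own slot): with the IF at instruction index 0, the ELSE
 * at index 3 and the would-be ENDIF at index 6, the inverted-predicate
 * ADD below advances IP by (3 - 0 + 1) * 16 == 64 bytes into the else
 * block, and the ELSE's ADD advances IP by (6 - 3) * 16 == 48 bytes to
 * where the ENDIF would be; each native instruction is 16 bytes, hence
 * the factor of 16.
 */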
1493 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1494 brw_inst_set_pred_inv(devinfo, if_inst, true);
1495
1496 if (else_inst != NULL) {
1497 /* Convert ELSE to an ADD instruction that points where the ENDIF
1498 * would be.
1499 */
1500 brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1501
1502 brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1503 brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1504 } else {
1505 brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1506 }
1507 }
1508
1509 /**
1510 * Patch IF and ELSE instructions with appropriate jump targets.
1511 */
1512 static void
1513 patch_IF_ELSE(struct brw_codegen *p,
1514 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1515 {
1516 const struct gen_device_info *devinfo = p->devinfo;
1517
1518 /* We shouldn't be patching IF and ELSE instructions in single program flow
1519 * mode when gen < 6, because in single program flow mode on those
1520 * platforms, we convert flow control instructions to conditional ADDs that
1521 * operate on IP (see brw_ENDIF).
1522 *
1523 * However, on Gen6, writing to IP doesn't work in single program flow mode
1524 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1525 * not be updated by non-flow control instructions."). And on later
1526 * platforms, there is no significant benefit to converting control flow
1527 * instructions to conditional ADDs. So we do patch IF and ELSE
1528 * instructions in single program flow mode on those platforms.
1529 */
1530 if (devinfo->gen < 6)
1531 assert(!p->single_program_flow);
1532
1533 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1534 assert(endif_inst != NULL);
1535 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1536
1537 unsigned br = brw_jump_scale(devinfo);
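/* Editorial note: as brw_jump_scale() is understood here, jump targets are
 * counted in whole 128-bit instructions on gen4, in 64-bit compaction
 * chunks (two per instruction) on gen5 through gen7, and in bytes on
 * gen8+, so br * (b - a) below converts an instruction-index delta into
 * whatever unit the hardware expects.
 */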
1538
1539 assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1540 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1541
1542 if (else_inst == NULL) {
1543 /* Patch IF -> ENDIF */
1544 if (devinfo->gen < 6) {
1545 /* Turn it into an IFF, which means no mask stack operations: when
1546 * the condition is all-false it simply jumps past the ENDIF.
1547 */
1548 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1549 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1550 br * (endif_inst - if_inst + 1));
1551 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1552 } else if (devinfo->gen == 6) {
1553 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1554 brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1555 } else {
1556 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1557 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1558 }
1559 } else {
1560 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1561
1562 /* Patch IF -> ELSE */
1563 if (devinfo->gen < 6) {
1564 brw_inst_set_gen4_jump_count(devinfo, if_inst,
1565 br * (else_inst - if_inst));
1566 brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1567 } else if (devinfo->gen == 6) {
1568 brw_inst_set_gen6_jump_count(devinfo, if_inst,
1569 br * (else_inst - if_inst + 1));
1570 }
1571
1572 /* Patch ELSE -> ENDIF */
1573 if (devinfo->gen < 6) {
1574 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1575 * matching ENDIF.
1576 */
1577 brw_inst_set_gen4_jump_count(devinfo, else_inst,
1578 br * (endif_inst - else_inst + 1));
1579 brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1580 } else if (devinfo->gen == 6) {
1581 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1582 brw_inst_set_gen6_jump_count(devinfo, else_inst,
1583 br * (endif_inst - else_inst));
1584 } else {
1585 /* The IF instruction's JIP should point just past the ELSE */
1586 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1587 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1588 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1589 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1590 if (devinfo->gen >= 8) {
1591 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1592 * should point to ENDIF.
1593 */
1594 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1595 }
1596 }
1597 }
1598 }
1599
1600 void
1601 brw_ELSE(struct brw_codegen *p)
1602 {
1603 const struct gen_device_info *devinfo = p->devinfo;
1604 brw_inst *insn;
1605
1606 insn = next_insn(p, BRW_OPCODE_ELSE);
1607
1608 if (devinfo->gen < 6) {
1609 brw_set_dest(p, insn, brw_ip_reg());
1610 brw_set_src0(p, insn, brw_ip_reg());
1611 brw_set_src1(p, insn, brw_imm_d(0x0));
1612 } else if (devinfo->gen == 6) {
1613 brw_set_dest(p, insn, brw_imm_w(0));
1614 brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1615 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1616 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1617 } else if (devinfo->gen == 7) {
1618 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1619 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1620 brw_set_src1(p, insn, brw_imm_w(0));
1621 brw_inst_set_jip(devinfo, insn, 0);
1622 brw_inst_set_uip(devinfo, insn, 0);
1623 } else {
1624 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1625 if (devinfo->gen < 12)
1626 brw_set_src0(p, insn, brw_imm_d(0));
1627 brw_inst_set_jip(devinfo, insn, 0);
1628 brw_inst_set_uip(devinfo, insn, 0);
1629 }
1630
1631 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1632 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1633 if (!p->single_program_flow && devinfo->gen < 6)
1634 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1635
1636 push_if_stack(p, insn);
1637 }
1638
1639 void
1640 brw_ENDIF(struct brw_codegen *p)
1641 {
1642 const struct gen_device_info *devinfo = p->devinfo;
1643 brw_inst *insn = NULL;
1644 brw_inst *else_inst = NULL;
1645 brw_inst *if_inst = NULL;
1646 brw_inst *tmp;
1647 bool emit_endif = true;
1648
1649 /* In single program flow mode, we can express IF and ELSE instructions
1650 * equivalently as ADD instructions that operate on IP. On platforms prior
1651 * to Gen6, flow control instructions cause an implied thread switch, so
1652 * this is a significant savings.
1653 *
1654 * However, on Gen6, writing to IP doesn't work in single program flow mode
1655 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1656 * not be updated by non-flow control instructions."). And on later
1657 * platforms, there is no significant benefit to converting control flow
1658 * instructions to conditional ADDs. So we only do this trick on Gen4 and
1659 * Gen5.
1660 */
1661 if (devinfo->gen < 6 && p->single_program_flow)
1662 emit_endif = false;
1663
1664 /*
1665 * A single next_insn() may change the base address of the instruction
1666 * store (p->store), so call it first, before any pointer into the
1667 * store is computed from an index.
1668 */
1669 if (emit_endif)
1670 insn = next_insn(p, BRW_OPCODE_ENDIF);
1671
1672 /* Pop the IF and (optional) ELSE instructions from the stack */
1673 p->if_depth_in_loop[p->loop_stack_depth]--;
1674 tmp = pop_if_stack(p);
1675 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1676 else_inst = tmp;
1677 tmp = pop_if_stack(p);
1678 }
1679 if_inst = tmp;
1680
1681 if (!emit_endif) {
1682 /* ENDIF is useless; don't bother emitting it. */
1683 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1684 return;
1685 }
1686
1687 if (devinfo->gen < 6) {
1688 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1689 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1690 brw_set_src1(p, insn, brw_imm_d(0x0));
1691 } else if (devinfo->gen == 6) {
1692 brw_set_dest(p, insn, brw_imm_w(0));
1693 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1694 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1695 } else if (devinfo->gen == 7) {
1696 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1697 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1698 brw_set_src1(p, insn, brw_imm_w(0));
1699 } else {
1700 brw_set_src0(p, insn, brw_imm_d(0));
1701 }
1702
1703 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1704 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1705 if (devinfo->gen < 6)
1706 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1707
1708 /* Also pop item off the stack in the endif instruction: */
1709 if (devinfo->gen < 6) {
1710 brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1711 brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1712 } else if (devinfo->gen == 6) {
1713 brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1714 } else {
1715 brw_inst_set_jip(devinfo, insn, 2);
1716 }
1717 patch_IF_ELSE(p, if_inst, else_inst, insn);
1718 }
1719
1720 brw_inst *
1721 brw_BREAK(struct brw_codegen *p)
1722 {
1723 const struct gen_device_info *devinfo = p->devinfo;
1724 brw_inst *insn;
1725
1726 insn = next_insn(p, BRW_OPCODE_BREAK);
1727 if (devinfo->gen >= 8) {
1728 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1729 brw_set_src0(p, insn, brw_imm_d(0x0));
1730 } else if (devinfo->gen >= 6) {
1731 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1732 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1733 brw_set_src1(p, insn, brw_imm_d(0x0));
1734 } else {
1735 brw_set_dest(p, insn, brw_ip_reg());
1736 brw_set_src0(p, insn, brw_ip_reg());
1737 brw_set_src1(p, insn, brw_imm_d(0x0));
1738 brw_inst_set_gen4_pop_count(devinfo, insn,
1739 p->if_depth_in_loop[p->loop_stack_depth]);
1740 }
1741 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1742 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1743
1744 return insn;
1745 }
1746
1747 brw_inst *
1748 brw_CONT(struct brw_codegen *p)
1749 {
1750 const struct gen_device_info *devinfo = p->devinfo;
1751 brw_inst *insn;
1752
1753 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1754 brw_set_dest(p, insn, brw_ip_reg());
1755 if (devinfo->gen >= 8) {
1756 brw_set_src0(p, insn, brw_imm_d(0x0));
1757 } else {
1758 brw_set_src0(p, insn, brw_ip_reg());
1759 brw_set_src1(p, insn, brw_imm_d(0x0));
1760 }
1761
1762 if (devinfo->gen < 6) {
1763 brw_inst_set_gen4_pop_count(devinfo, insn,
1764 p->if_depth_in_loop[p->loop_stack_depth]);
1765 }
1766 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1767 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1768 return insn;
1769 }
1770
1771 brw_inst *
1772 brw_HALT(struct brw_codegen *p)
1773 {
1774 const struct gen_device_info *devinfo = p->devinfo;
1775 brw_inst *insn;
1776
1777 insn = next_insn(p, BRW_OPCODE_HALT);
1778 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1779 if (devinfo->gen < 6) {
1780 /* From the Gen4 PRM:
1781 *
1782 * "IP register must be put (for example, by the assembler) at <dst>
1783 * and <src0> locations."
1784 */
1785 brw_set_dest(p, insn, brw_ip_reg());
1786 brw_set_src0(p, insn, brw_ip_reg());
1787 brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
1788 } else if (devinfo->gen < 8) {
1789 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1790 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1791 } else if (devinfo->gen < 12) {
1792 brw_set_src0(p, insn, brw_imm_d(0x0));
1793 }
1794
1795 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1796 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1797 return insn;
1798 }
1799
1800 /* DO/WHILE loop:
1801 *
1802 * The DO/WHILE is just an unterminated loop -- break or continue are
1803 * used for control within the loop. We have a few ways they can be
1804 * done.
1805 *
1806 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1807 * jip and no DO instruction.
1808 *
1809 * For non-uniform control flow pre-gen6, there's a DO instruction to
1810 * push the mask, and a WHILE to jump back, and BREAK to get out and
1811 * pop the mask.
1812 *
1813 * For gen6, there's no more mask stack, so no need for DO. WHILE
1814 * just points back to the first instruction of the loop.
1815 */
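/* Illustrative emission sequence (a sketch, not lifted from a caller):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *    ...loop body...
 *    brw_BREAK(p);      (patched later: by brw_patch_break_cont on
 *                        pre-gen6, by brw_set_uip_jip on gen6+)
 *    ...more body...
 *    brw_WHILE(p);
 */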
1816 brw_inst *
1817 brw_DO(struct brw_codegen *p, unsigned execute_size)
1818 {
1819 const struct gen_device_info *devinfo = p->devinfo;
1820
1821 if (devinfo->gen >= 6 || p->single_program_flow) {
1822 push_loop_stack(p, &p->store[p->nr_insn]);
1823 return &p->store[p->nr_insn];
1824 } else {
1825 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1826
1827 push_loop_stack(p, insn);
1828
1829 /* Override the defaults for this instruction:
1830 */
1831 brw_set_dest(p, insn, brw_null_reg());
1832 brw_set_src0(p, insn, brw_null_reg());
1833 brw_set_src1(p, insn, brw_null_reg());
1834
1835 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1836 brw_inst_set_exec_size(devinfo, insn, execute_size);
1837 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1838
1839 return insn;
1840 }
1841 }
1842
1843 /**
1844 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1845 * instruction here.
1846 *
1847 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1848 * nesting, since it can always just point to the end of the block/current loop.
1849 */
1850 static void
1851 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1852 {
1853 const struct gen_device_info *devinfo = p->devinfo;
1854 brw_inst *do_inst = get_inner_do_insn(p);
1855 brw_inst *inst;
1856 unsigned br = brw_jump_scale(devinfo);
1857
1858 assert(devinfo->gen < 6);
1859
1860 for (inst = while_inst - 1; inst != do_inst; inst--) {
1861 /* If the jump count is != 0, that means that this instruction has already
1862 * been patched because it's part of a loop inside of the one we're
1863 * patching.
1864 */
1865 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1866 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1867 brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1868 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1869 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1870 brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1871 }
1872 }
1873 }
1874
1875 brw_inst *
1876 brw_WHILE(struct brw_codegen *p)
1877 {
1878 const struct gen_device_info *devinfo = p->devinfo;
1879 brw_inst *insn, *do_insn;
1880 unsigned br = brw_jump_scale(devinfo);
1881
1882 if (devinfo->gen >= 6) {
1883 insn = next_insn(p, BRW_OPCODE_WHILE);
1884 do_insn = get_inner_do_insn(p);
1885
1886 if (devinfo->gen >= 8) {
1887 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1888 if (devinfo->gen < 12)
1889 brw_set_src0(p, insn, brw_imm_d(0));
1890 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1891 } else if (devinfo->gen == 7) {
1892 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1893 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1894 brw_set_src1(p, insn, brw_imm_w(0));
1895 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1896 } else {
1897 brw_set_dest(p, insn, brw_imm_w(0));
1898 brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1899 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1900 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1901 }
1902
1903 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1904
1905 } else {
1906 if (p->single_program_flow) {
1907 insn = next_insn(p, BRW_OPCODE_ADD);
1908 do_insn = get_inner_do_insn(p);
1909
1910 brw_set_dest(p, insn, brw_ip_reg());
1911 brw_set_src0(p, insn, brw_ip_reg());
1912 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1913 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1914 } else {
1915 insn = next_insn(p, BRW_OPCODE_WHILE);
1916 do_insn = get_inner_do_insn(p);
1917
1918 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1919
1920 brw_set_dest(p, insn, brw_ip_reg());
1921 brw_set_src0(p, insn, brw_ip_reg());
1922 brw_set_src1(p, insn, brw_imm_d(0));
1923
1924 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1925 brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1926 brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1927
1928 brw_patch_break_cont(p, insn);
1929 }
1930 }
1931 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1932
1933 p->loop_stack_depth--;
1934
1935 return insn;
1936 }
1937
1938 /* FORWARD JUMPS:
1939 */
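/* Illustrative usage (a sketch; brw_JMPI is declared in brw_eu.h):
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ...instructions to be skipped when the jump is taken...
 *    brw_land_fwd_jump(p, jmp);
 *
 * which back-patches the JMPI to land on the next instruction emitted.
 */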
1940 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1941 {
1942 const struct gen_device_info *devinfo = p->devinfo;
1943 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1944 unsigned jmpi = 1;
1945
1946 if (devinfo->gen >= 5)
1947 jmpi = 2;
1948
1949 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1950 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1951
1952 brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1953 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1954 }
1955
1956 /* To integrate with the above, it makes sense that the comparison
1957 * instruction should populate the flag register. It might be simpler
1958 * just to use the flag reg for most WM tasks?
1959 */
1960 void brw_CMP(struct brw_codegen *p,
1961 struct brw_reg dest,
1962 unsigned conditional,
1963 struct brw_reg src0,
1964 struct brw_reg src1)
1965 {
1966 const struct gen_device_info *devinfo = p->devinfo;
1967 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1968
1969 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1970 brw_set_dest(p, insn, dest);
1971 brw_set_src0(p, insn, src0);
1972 brw_set_src1(p, insn, src1);
1973
1974 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1975 * page says:
1976 * "Any CMP instruction with a null destination must use a {switch}."
1977 *
1978 * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1979 * mentioned on their work-arounds pages.
1980 */
1981 if (devinfo->gen == 7) {
1982 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1983 dest.nr == BRW_ARF_NULL) {
1984 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1985 }
1986 }
1987 }
1988
1989 /***********************************************************************
1990 * Helpers for the various SEND message types:
1991 */
1992
1993 /** Extended math function, float[8].
1994 */
1995 void gen4_math(struct brw_codegen *p,
1996 struct brw_reg dest,
1997 unsigned function,
1998 unsigned msg_reg_nr,
1999 struct brw_reg src,
2000 unsigned precision )
2001 {
2002 const struct gen_device_info *devinfo = p->devinfo;
2003 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2004 unsigned data_type;
2005 if (has_scalar_region(src)) {
2006 data_type = BRW_MATH_DATA_SCALAR;
2007 } else {
2008 data_type = BRW_MATH_DATA_VECTOR;
2009 }
2010
2011 assert(devinfo->gen < 6);
2012
2013 /* Example code doesn't set predicate_control for send
2014 * instructions.
2015 */
2016 brw_inst_set_pred_control(devinfo, insn, 0);
2017 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2018
2019 brw_set_dest(p, insn, dest);
2020 brw_set_src0(p, insn, src);
2021 brw_set_math_message(p,
2022 insn,
2023 function,
2024 src.type == BRW_REGISTER_TYPE_D,
2025 precision,
2026 data_type);
2027 }
2028
2029 void gen6_math(struct brw_codegen *p,
2030 struct brw_reg dest,
2031 unsigned function,
2032 struct brw_reg src0,
2033 struct brw_reg src1)
2034 {
2035 const struct gen_device_info *devinfo = p->devinfo;
2036 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2037
2038 assert(devinfo->gen >= 6);
2039
2040 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2041 (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2042
2043 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2044 if (devinfo->gen == 6) {
2045 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2046 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2047 }
2048
2049 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2050 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2051 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2052 assert(src0.type != BRW_REGISTER_TYPE_F);
2053 assert(src1.type != BRW_REGISTER_TYPE_F);
2054 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2055 (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2056 } else {
2057 assert(src0.type == BRW_REGISTER_TYPE_F ||
2058 (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2059 assert(src1.type == BRW_REGISTER_TYPE_F ||
2060 (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2061 }
2062
2063 /* Source modifiers are ignored for extended math instructions on Gen6. */
2064 if (devinfo->gen == 6) {
2065 assert(!src0.negate);
2066 assert(!src0.abs);
2067 assert(!src1.negate);
2068 assert(!src1.abs);
2069 }
2070
2071 brw_inst_set_math_function(devinfo, insn, function);
2072
2073 brw_set_dest(p, insn, dest);
2074 brw_set_src0(p, insn, src0);
2075 brw_set_src1(p, insn, src1);
2076 }
2077
2078 /**
2079 * Return the right surface index to access the thread scratch space using
2080 * stateless dataport messages.
2081 */
2082 unsigned
2083 brw_scratch_surface_idx(const struct brw_codegen *p)
2084 {
2085 /* The scratch space is thread-local so IA coherency is unnecessary. */
2086 if (p->devinfo->gen >= 8)
2087 return GEN8_BTI_STATELESS_NON_COHERENT;
2088 else
2089 return BRW_BTI_STATELESS;
2090 }
2091
2092 /**
2093 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2094 * using a constant offset per channel.
2095 *
2096 * The offset must be aligned to oword size (16 bytes). Used for
2097 * register spilling.
2098 */
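/* Illustrative call (a sketch with made-up values): spilling two GRFs'
 * worth of data staged at m1 to byte offset 64 of the scratch buffer:
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);
 *
 * The resulting message length works out to 1 + num_regs (header plus
 * payload).
 */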
2099 void brw_oword_block_write_scratch(struct brw_codegen *p,
2100 struct brw_reg mrf,
2101 int num_regs,
2102 unsigned offset)
2103 {
2104 const struct gen_device_info *devinfo = p->devinfo;
2105 const unsigned target_cache =
2106 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2107 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2108 BRW_SFID_DATAPORT_WRITE);
2109 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2110 uint32_t msg_type;
2111
2112 if (devinfo->gen >= 6)
2113 offset /= 16;
2114
2115 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2116
2117 const unsigned mlen = 1 + num_regs;
2118
2119 /* Set up the message header. This is g0, with g0.2 filled with
2120 * the offset. We don't want to leave our offset around in g0 or
2121 * it'll screw up texture samples, so set it up inside the message
2122 * reg.
2123 */
2124 {
2125 brw_push_insn_state(p);
2126 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2127 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2128 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2129 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2130
2131 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2132
2133 /* set message header global offset field (reg 0, element 2) */
2134 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2135 brw_set_default_swsb(p, tgl_swsb_null());
2136 brw_MOV(p,
2137 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2138 mrf.nr,
2139 2), BRW_REGISTER_TYPE_UD),
2140 brw_imm_ud(offset));
2141
2142 brw_pop_insn_state(p);
2143 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2144 }
2145
2146 {
2147 struct brw_reg dest;
2148 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2149 int send_commit_msg;
2150 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2151 BRW_REGISTER_TYPE_UW);
2152
2153 brw_inst_set_sfid(devinfo, insn, target_cache);
2154 brw_inst_set_compression(devinfo, insn, false);
2155
2156 if (brw_inst_exec_size(devinfo, insn) >= 16)
2157 src_header = vec16(src_header);
2158
2159 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2160 if (devinfo->gen < 6)
2161 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2162
2163 /* Until gen6, writes followed by reads from the same location
2164 * are not guaranteed to be ordered unless write_commit is set.
2165 * If set, then a no-op write is issued to the destination
2166 * register to set a dependency, and a read from the destination
2167 * can be used to ensure the ordering.
2168 *
2169 * For gen6, only writes between different threads need ordering
2170 * protection. Our use of DP writes is all about register
2171 * spilling within a thread.
2172 */
2173 if (devinfo->gen >= 6) {
2174 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2175 send_commit_msg = 0;
2176 } else {
2177 dest = src_header;
2178 send_commit_msg = 1;
2179 }
2180
2181 brw_set_dest(p, insn, dest);
2182 if (devinfo->gen >= 6) {
2183 brw_set_src0(p, insn, mrf);
2184 } else {
2185 brw_set_src0(p, insn, brw_null_reg());
2186 }
2187
2188 if (devinfo->gen >= 6)
2189 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2190 else
2191 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2192
2193 brw_set_desc(p, insn,
2194 brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2195 brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2196 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2197 msg_type, 0, /* not a render target */
2198 send_commit_msg));
2199 }
2200 }
2201
2202
2203 /**
2204 * Read a block of owords (half a GRF each) from the scratch buffer
2205 * using a constant index per channel.
2206 *
2207 * Offset must be aligned to oword size (16 bytes). Used for register
2208 * spilling.
2209 */
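/* Illustrative counterpart to the write above (values made up): filling
 * two GRFs starting at g64 from byte offset 64 of the scratch buffer,
 * with the header staged in m1 on pre-gen7 parts:
 *
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(64, 0),
 *                                 brw_message_reg(1), 2, 64);
 */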
2210 void
2211 brw_oword_block_read_scratch(struct brw_codegen *p,
2212 struct brw_reg dest,
2213 struct brw_reg mrf,
2214 int num_regs,
2215 unsigned offset)
2216 {
2217 const struct gen_device_info *devinfo = p->devinfo;
2218 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2219
2220 if (devinfo->gen >= 6)
2221 offset /= 16;
2222
2223 if (p->devinfo->gen >= 7) {
2224 /* On gen 7 and above, we no longer have message registers and we can
2225 * send from any register we want. By using the destination register
2226 * for the message, we guarantee that the implied message write won't
2227 * accidentally overwrite anything. This has been a problem because
2228 * the MRF registers and source for the final FB write are both fixed
2229 * and may overlap.
2230 */
2231 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2232 } else {
2233 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2234 }
2235 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2236
2237 const unsigned rlen = num_regs;
2238 const unsigned target_cache =
2239 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2240 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2241 BRW_SFID_DATAPORT_READ);
2242
2243 {
2244 brw_push_insn_state(p);
2245 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2246 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2247 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2248 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2249
2250 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2251
2252 /* set message header global offset field (reg 0, element 2) */
2253 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2254 brw_set_default_swsb(p, tgl_swsb_null());
2255 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2256
2257 brw_pop_insn_state(p);
2258 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2259 }
2260
2261 {
2262 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2263
2264 brw_inst_set_sfid(devinfo, insn, target_cache);
2265 assert(brw_inst_pred_control(devinfo, insn) == 0);
2266 brw_inst_set_compression(devinfo, insn, false);
2267
2268 brw_set_dest(p, insn, dest); /* UW? */
2269 if (devinfo->gen >= 6) {
2270 brw_set_src0(p, insn, mrf);
2271 } else {
2272 brw_set_src0(p, insn, brw_null_reg());
2273 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2274 }
2275
2276 brw_set_desc(p, insn,
2277 brw_message_desc(devinfo, 1, rlen, true) |
2278 brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2279 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2280 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2281 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2282 }
2283 }
2284
2285 void
2286 gen7_block_read_scratch(struct brw_codegen *p,
2287 struct brw_reg dest,
2288 int num_regs,
2289 unsigned offset)
2290 {
2291 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2292 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2293
2294 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2295
2296 /* The HW requires that the header is present; this is to get the g0.5
2297 * scratch offset.
2298 */
2299 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2300
2301 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2302 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2303 * is 32 bytes, which happens to be the size of a register.
2304 */
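/* For example, a byte offset of 256 becomes HWord offset 256 / 32 == 8,
 * comfortably inside the 12-bit field checked below.
 */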
2305 offset /= REG_SIZE;
2306 assert(offset < (1 << 12));
2307
2308 gen7_set_dp_scratch_message(p, insn,
2309 false, /* scratch read */
2310 false, /* OWords */
2311 false, /* invalidate after read */
2312 num_regs,
2313 offset,
2314 1, /* mlen: just g0 */
2315 num_regs, /* rlen */
2316 true); /* header present */
2317 }
2318
2319 /**
2320 * Read float[4] vectors from the data port constant cache.
2321 * Location (in buffer) should be a multiple of 16.
2322 * Used for fetching shader constants.
2323 */
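/* Illustrative call (a sketch; the binding-table index is made up):
 * fetching one block of constants from the start of the buffer bound at
 * binding-table entry 3 into g10:
 *
 *    brw_oword_block_read(p, brw_vec8_grf(10, 0), brw_message_reg(1),
 *                         0, 3);
 *
 * where 0 is the byte offset and 3 the binding-table entry.
 */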
2324 void brw_oword_block_read(struct brw_codegen *p,
2325 struct brw_reg dest,
2326 struct brw_reg mrf,
2327 uint32_t offset,
2328 uint32_t bind_table_index)
2329 {
2330 const struct gen_device_info *devinfo = p->devinfo;
2331 const unsigned target_cache =
2332 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2333 BRW_SFID_DATAPORT_READ);
2334 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2335 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2336
2337 /* On newer hardware, offset is in units of owords. */
2338 if (devinfo->gen >= 6)
2339 offset /= 16;
2340
2341 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2342
2343 brw_push_insn_state(p);
2344 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2345 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2346 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2347
2348 brw_push_insn_state(p);
2349 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2350 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2351 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2352
2353 /* set message header global offset field (reg 0, element 2) */
2354 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2355 brw_set_default_swsb(p, tgl_swsb_null());
2356 brw_MOV(p,
2357 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2358 mrf.nr,
2359 2), BRW_REGISTER_TYPE_UD),
2360 brw_imm_ud(offset));
2361 brw_pop_insn_state(p);
2362
2363 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2364
2365 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2366
2367 brw_inst_set_sfid(devinfo, insn, target_cache);
2368
2369 /* cast dest to a uword[8] vector */
2370 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2371
2372 brw_set_dest(p, insn, dest);
2373 if (devinfo->gen >= 6) {
2374 brw_set_src0(p, insn, mrf);
2375 } else {
2376 brw_set_src0(p, insn, brw_null_reg());
2377 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2378 }
2379
2380 brw_set_desc(p, insn,
2381 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2382 brw_dp_read_desc(devinfo, bind_table_index,
2383 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2384 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2385 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2386
2387 brw_pop_insn_state(p);
2388 }
2389
2390 brw_inst *
2391 brw_fb_WRITE(struct brw_codegen *p,
2392 struct brw_reg payload,
2393 struct brw_reg implied_header,
2394 unsigned msg_control,
2395 unsigned binding_table_index,
2396 unsigned msg_length,
2397 unsigned response_length,
2398 bool eot,
2399 bool last_render_target,
2400 bool header_present)
2401 {
2402 const struct gen_device_info *devinfo = p->devinfo;
2403 const unsigned target_cache =
2404 (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2405 BRW_SFID_DATAPORT_WRITE);
2406 brw_inst *insn;
2407 unsigned msg_type;
2408 struct brw_reg dest, src0;
2409
2410 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2411 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2412 else
2413 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2414
2415 if (devinfo->gen >= 6) {
2416 insn = next_insn(p, BRW_OPCODE_SENDC);
2417 } else {
2418 insn = next_insn(p, BRW_OPCODE_SEND);
2419 }
2420 brw_inst_set_sfid(devinfo, insn, target_cache);
2421 brw_inst_set_compression(devinfo, insn, false);
2422
2423 if (devinfo->gen >= 6) {
2424 /* headerless version, just submit color payload */
2425 src0 = payload;
2426
2427 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2428 } else {
2429 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2430 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2431 src0 = implied_header;
2432
2433 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2434 }
2435
2436 brw_set_dest(p, insn, dest);
2437 brw_set_src0(p, insn, src0);
2438 brw_set_desc(p, insn,
2439 brw_message_desc(devinfo, msg_length, response_length,
2440 header_present) |
2441 brw_dp_write_desc(devinfo, binding_table_index, msg_control,
2442 msg_type, last_render_target,
2443 0 /* send_commit_msg */));
2444 brw_inst_set_eot(devinfo, insn, eot);
2445
2446 return insn;
2447 }
2448
2449 brw_inst *
2450 gen9_fb_READ(struct brw_codegen *p,
2451 struct brw_reg dst,
2452 struct brw_reg payload,
2453 unsigned binding_table_index,
2454 unsigned msg_length,
2455 unsigned response_length,
2456 bool per_sample)
2457 {
2458 const struct gen_device_info *devinfo = p->devinfo;
2459 assert(devinfo->gen >= 9);
2460 const unsigned msg_subtype =
2461 brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2462 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2463
2464 brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2465 brw_set_dest(p, insn, dst);
2466 brw_set_src0(p, insn, payload);
2467 brw_set_desc(
2468 p, insn,
2469 brw_message_desc(devinfo, msg_length, response_length, true) |
2470 brw_dp_read_desc(devinfo, binding_table_index,
2471 per_sample << 5 | msg_subtype,
2472 GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2473 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2474 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2475
2476 return insn;
2477 }
2478
2479 /**
2480 * Texture sample instruction.
2481 * Note: the msg_type plus msg_length values determine exactly what kind
2482 * of sampling operation is performed. See volume 4, page 161 of docs.
2483 */
2484 void brw_SAMPLE(struct brw_codegen *p,
2485 struct brw_reg dest,
2486 unsigned msg_reg_nr,
2487 struct brw_reg src0,
2488 unsigned binding_table_index,
2489 unsigned sampler,
2490 unsigned msg_type,
2491 unsigned response_length,
2492 unsigned msg_length,
2493 unsigned header_present,
2494 unsigned simd_mode,
2495 unsigned return_format)
2496 {
2497 const struct gen_device_info *devinfo = p->devinfo;
2498 brw_inst *insn;
2499
2500 if (msg_reg_nr != -1)
2501 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2502
2503 insn = next_insn(p, BRW_OPCODE_SEND);
2504 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2505 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2506
2507 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2508 *
2509 * "Instruction compression is not allowed for this instruction (that
2510 * is, send). The hardware behavior is undefined if this instruction is
2511 * set as compressed. However, compress control can be set to "SecHalf"
2512 * to affect the EMask generation."
2513 *
2514 * No similar wording is found in later PRMs, but there are examples
2515 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2516 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2517 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2518 */
2519 brw_inst_set_compression(devinfo, insn, false);
2520
2521 if (devinfo->gen < 6)
2522 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2523
2524 brw_set_dest(p, insn, dest);
2525 brw_set_src0(p, insn, src0);
2526 brw_set_desc(p, insn,
2527 brw_message_desc(devinfo, msg_length, response_length,
2528 header_present) |
2529 brw_sampler_desc(devinfo, binding_table_index, sampler,
2530 msg_type, simd_mode, return_format));
2531 }
2532
2533 /* Adjust the message header's sampler state pointer to
2534 * select the correct group of 16 samplers.
2535 */
2536 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2537 struct brw_reg header,
2538 struct brw_reg sampler_index)
2539 {
2540 /* The "Sampler Index" field can only store values between 0 and 15.
2541 * However, we can add an offset to the "Sampler State Pointer"
2542 * field, effectively selecting a different set of 16 samplers.
2543 *
2544 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2545 * offset, and each sampler state is only 16 bytes, so we can't
2546 * exclusively use the offset - we have to use both.
2547 */
2548
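/* Worked example for the immediate path below (values made up): with
 * sampler_index.ud == 20 we add 16 * (20 / 16) * 16 == 256 bytes to the
 * sampler state pointer in g0.3, leaving 20 % 16 == 4 to be carried in
 * the 4-bit "Sampler Index" field by the caller's message descriptor.
 */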
2549 const struct gen_device_info *devinfo = p->devinfo;
2550
2551 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2552 const int sampler_state_size = 16; /* 16 bytes */
2553 uint32_t sampler = sampler_index.ud;
2554
2555 if (sampler >= 16) {
2556 assert(devinfo->is_haswell || devinfo->gen >= 8);
2557 brw_ADD(p,
2558 get_element_ud(header, 3),
2559 get_element_ud(brw_vec8_grf(0, 0), 3),
2560 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2561 }
2562 } else {
2563 /* Non-const sampler array indexing case */
2564 if (devinfo->gen < 8 && !devinfo->is_haswell) {
2565 return;
2566 }
2567
2568 struct brw_reg temp = get_element_ud(header, 3);
2569
2570 brw_push_insn_state(p);
2571 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2572 brw_set_default_swsb(p, tgl_swsb_regdist(1));
2573 brw_SHL(p, temp, temp, brw_imm_ud(4));
2574 brw_ADD(p,
2575 get_element_ud(header, 3),
2576 get_element_ud(brw_vec8_grf(0, 0), 3),
2577 temp);
2578 brw_pop_insn_state(p);
2579 }
2580 }
2581
2582 /* All these variables are pretty confusing - we might be better off
2583 * using bitmasks and macros for this, in the old style. Or perhaps
2584 * just having the caller instantiate the fields in dword3 itself.
2585 */
2586 void brw_urb_WRITE(struct brw_codegen *p,
2587 struct brw_reg dest,
2588 unsigned msg_reg_nr,
2589 struct brw_reg src0,
2590 enum brw_urb_write_flags flags,
2591 unsigned msg_length,
2592 unsigned response_length,
2593 unsigned offset,
2594 unsigned swizzle)
2595 {
2596 const struct gen_device_info *devinfo = p->devinfo;
2597 brw_inst *insn;
2598
2599 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2600
2601 if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2602 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2603 brw_push_insn_state(p);
2604 brw_set_default_access_mode(p, BRW_ALIGN_1);
2605 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2606 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2607 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2608 BRW_REGISTER_TYPE_UD),
2609 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2610 brw_imm_ud(0xff00));
2611 brw_pop_insn_state(p);
2612 }
2613
2614 insn = next_insn(p, BRW_OPCODE_SEND);
2615
2616 assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2617
2618 brw_set_dest(p, insn, dest);
2619 brw_set_src0(p, insn, src0);
2620 brw_set_src1(p, insn, brw_imm_d(0));
2621
2622 if (devinfo->gen < 6)
2623 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2624
2625 brw_set_urb_message(p,
2626 insn,
2627 flags,
2628 msg_length,
2629 response_length,
2630 offset,
2631 swizzle);
2632 }
2633
2634 void
2635 brw_send_indirect_message(struct brw_codegen *p,
2636 unsigned sfid,
2637 struct brw_reg dst,
2638 struct brw_reg payload,
2639 struct brw_reg desc,
2640 unsigned desc_imm,
2641 bool eot)
2642 {
2643 const struct gen_device_info *devinfo = p->devinfo;
2644 struct brw_inst *send;
2645
2646 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2647
2648 assert(desc.type == BRW_REGISTER_TYPE_UD);
2649
2650 if (desc.file == BRW_IMMEDIATE_VALUE) {
2651 send = next_insn(p, BRW_OPCODE_SEND);
2652 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2653 brw_set_desc(p, send, desc.ud | desc_imm);
2654 } else {
2655 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2656 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2657
2658 brw_push_insn_state(p);
2659 brw_set_default_access_mode(p, BRW_ALIGN_1);
2660 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2661 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2662 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2663 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2664
2665 /* Load the indirect descriptor to an address register using OR so the
2666 * caller can specify additional descriptor bits with the desc_imm
2667 * immediate.
2668 */
2669 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2670
2671 brw_pop_insn_state(p);
2672
2673 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2674 send = next_insn(p, BRW_OPCODE_SEND);
2675 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2676
2677 if (devinfo->gen >= 12)
2678 brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
2679 else
2680 brw_set_src1(p, send, addr);
2681 }
2682
2683 brw_set_dest(p, send, dst);
2684 brw_inst_set_sfid(devinfo, send, sfid);
2685 brw_inst_set_eot(devinfo, send, eot);
2686 }
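/* Illustrative use (a sketch; the operands, including "payload", are
 * placeholders): issuing a send whose descriptor lives in a GRF, with an
 * immediate part OR'ed in:
 *
 *    struct brw_reg desc = retype(brw_vec1_grf(3, 0), BRW_REGISTER_TYPE_UD);
 *    brw_send_indirect_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
 *                              brw_null_reg(), payload, desc,
 *                              brw_message_desc(devinfo, 2, 0, true),
 *                              false);
 */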
2687
2688 void
2689 brw_send_indirect_split_message(struct brw_codegen *p,
2690 unsigned sfid,
2691 struct brw_reg dst,
2692 struct brw_reg payload0,
2693 struct brw_reg payload1,
2694 struct brw_reg desc,
2695 unsigned desc_imm,
2696 struct brw_reg ex_desc,
2697 unsigned ex_desc_imm,
2698 bool eot)
2699 {
2700 const struct gen_device_info *devinfo = p->devinfo;
2701 struct brw_inst *send;
2702
2703 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2704
2705 assert(desc.type == BRW_REGISTER_TYPE_UD);
2706
2707 if (desc.file == BRW_IMMEDIATE_VALUE) {
2708 desc.ud |= desc_imm;
2709 } else {
2710 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2711 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2712
2713 brw_push_insn_state(p);
2714 brw_set_default_access_mode(p, BRW_ALIGN_1);
2715 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2716 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2717 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2718 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2719
2720 /* Load the indirect descriptor to an address register using OR so the
2721 * caller can specify additional descriptor bits with the desc_imm
2722 * immediate.
2723 */
2724 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2725
2726 brw_pop_insn_state(p);
2727 desc = addr;
2728
2729 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2730 }
2731
2732 if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
2733 (devinfo->gen >= 12 || (ex_desc.ud & INTEL_MASK(15, 12)) == 0)) {
2734 ex_desc.ud |= ex_desc_imm;
2735 } else {
2736 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2737 struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);
2738
2739 brw_push_insn_state(p);
2740 brw_set_default_access_mode(p, BRW_ALIGN_1);
2741 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2742 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2743 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2744 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2745
2746 /* Load the indirect extended descriptor to an address register using OR
2747 * so the caller can specify additional descriptor bits with the
2748 * desc_imm immediate.
2749 *
2750 * Even though the instruction dispatcher always pulls the SFID and EOT
2751 * fields from the instruction itself, the actual external unit that
2752 * processes the message gets the SFID and EOT from the extended
2753 * descriptor which comes from the address register. If we don't OR
2754 * those two bits in, the external unit may get confused and hang.
2755 */
2756 unsigned imm_part = ex_desc_imm | sfid | eot << 5;
2757
2758 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2759 /* ex_desc bits 15:12 don't exist in the instruction encoding prior
2760 * to Gen12, so we may have fallen back to an indirect extended
2761 * descriptor.
2762 */
2763 brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
2764 } else {
2765 brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
2766 }
2767
2768 brw_pop_insn_state(p);
2769 ex_desc = addr;
2770
2771 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2772 }
2773
2774 send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
2775 brw_set_dest(p, send, dst);
2776 brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
2777 brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));
2778
2779 if (desc.file == BRW_IMMEDIATE_VALUE) {
2780 brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2781 brw_inst_set_send_desc(devinfo, send, desc.ud);
2782 } else {
2783 assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2784 assert(desc.nr == BRW_ARF_ADDRESS);
2785 assert(desc.subnr == 0);
2786 brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2787 }
2788
2789 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2790 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2791 brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2792 } else {
2793 assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2794 assert(ex_desc.nr == BRW_ARF_ADDRESS);
2795 assert((ex_desc.subnr & 0x3) == 0);
2796 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2797 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
2798 }
2799
2800 brw_inst_set_sfid(devinfo, send, sfid);
2801 brw_inst_set_eot(devinfo, send, eot);
2802 }
2803
2804 static void
2805 brw_send_indirect_surface_message(struct brw_codegen *p,
2806 unsigned sfid,
2807 struct brw_reg dst,
2808 struct brw_reg payload,
2809 struct brw_reg surface,
2810 unsigned desc_imm)
2811 {
2812 if (surface.file != BRW_IMMEDIATE_VALUE) {
2813 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2814 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2815
2816 brw_push_insn_state(p);
2817 brw_set_default_access_mode(p, BRW_ALIGN_1);
2818 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2819 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2820 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2821 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2822
2823 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2824 * some surface array is accessed out of bounds.
2825 */
2826 brw_AND(p, addr,
2827 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2828 BRW_GET_SWZ(surface.swizzle, 0)),
2829 brw_imm_ud(0xff));
2830
2831 brw_pop_insn_state(p);
2832
2833 surface = addr;
2834 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2835 }
2836
2837 brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2838 }
2839
2840 static bool
2841 while_jumps_before_offset(const struct gen_device_info *devinfo,
2842 brw_inst *insn, int while_offset, int start_offset)
2843 {
2844 int scale = 16 / brw_jump_scale(devinfo);
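/* Editorial note: a WHILE always jumps backwards, so jip below is
 * negative; multiplying by scale (8 on gen6/7, where jumps count 64-bit
 * chunks, 1 on gen8+, where jip is already in bytes) converts it back
 * into a byte offset comparable with start_offset.
 */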
2845 int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2846 : brw_inst_jip(devinfo, insn);
2847 assert(jip < 0);
2848 return while_offset + jip * scale <= start_offset;
2849 }
2850
2851
2852 static int
2853 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2854 {
2855 int offset;
2856 void *store = p->store;
2857 const struct gen_device_info *devinfo = p->devinfo;
2858
2859 int depth = 0;
2860
2861 for (offset = next_offset(devinfo, store, start_offset);
2862 offset < p->next_insn_offset;
2863 offset = next_offset(devinfo, store, offset)) {
2864 brw_inst *insn = store + offset;
2865
2866 switch (brw_inst_opcode(devinfo, insn)) {
2867 case BRW_OPCODE_IF:
2868 depth++;
2869 break;
2870 case BRW_OPCODE_ENDIF:
2871 if (depth == 0)
2872 return offset;
2873 depth--;
2874 break;
2875 case BRW_OPCODE_WHILE:
2876 /* If the while doesn't jump before our instruction, it's the end
2877 * of a sibling do...while loop. Ignore it.
2878 */
2879 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2880 continue;
2881 /* fallthrough */
2882 case BRW_OPCODE_ELSE:
2883 case BRW_OPCODE_HALT:
2884 if (depth == 0)
2885 return offset;
2886 default:
2887 break;
2888 }
2889 }
2890
2891 return 0;
2892 }
2893
2894 /* There is no DO instruction on gen6, so to find the end of the loop
2895 * we have to see if the loop is jumping back before our start
2896 * instruction.
2897 */
2898 static int
2899 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2900 {
2901 const struct gen_device_info *devinfo = p->devinfo;
2902 int offset;
2903 void *store = p->store;
2904
2905 assert(devinfo->gen >= 6);
2906
2907 /* Always start after the instruction (such as a WHILE) we're trying to fix
2908 * up.
2909 */
2910 for (offset = next_offset(devinfo, store, start_offset);
2911 offset < p->next_insn_offset;
2912 offset = next_offset(devinfo, store, offset)) {
2913 brw_inst *insn = store + offset;
2914
2915 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2916 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2917 return offset;
2918 }
2919 }
2920 assert(!"not reached");
2921 return start_offset;
2922 }
2923
2924 /* After program generation, go back and update the UIP and JIP of
2925 * BREAK, CONT, and HALT instructions to their correct locations.
2926 */
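/* Illustrative use (a sketch): code generators typically run this once
 * over the finished program, e.g.
 *
 *    brw_set_uip_jip(p, 0);
 *
 * after everything has been emitted and before instruction compaction
 * (note the cmpt_control assertion below).
 */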
2927 void
2928 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2929 {
2930 const struct gen_device_info *devinfo = p->devinfo;
2931 int offset;
2932 int br = brw_jump_scale(devinfo);
2933 int scale = 16 / br;
2934 void *store = p->store;
2935
2936 if (devinfo->gen < 6)
2937 return;
2938
2939 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2940 brw_inst *insn = store + offset;
2941 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2942
2943 int block_end_offset = brw_find_next_block_end(p, offset);
2944 switch (brw_inst_opcode(devinfo, insn)) {
2945 case BRW_OPCODE_BREAK:
2946 assert(block_end_offset != 0);
2947 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2948 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2949 brw_inst_set_uip(devinfo, insn,
2950 (brw_find_loop_end(p, offset) - offset +
2951 (devinfo->gen == 6 ? 16 : 0)) / scale);
2952 break;
2953 case BRW_OPCODE_CONTINUE:
2954 assert(block_end_offset != 0);
2955 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2956 brw_inst_set_uip(devinfo, insn,
2957 (brw_find_loop_end(p, offset) - offset) / scale);
2958
2959 assert(brw_inst_uip(devinfo, insn) != 0);
2960 assert(brw_inst_jip(devinfo, insn) != 0);
2961 break;
2962
2963 case BRW_OPCODE_ENDIF: {
2964 int32_t jump = (block_end_offset == 0) ?
2965 1 * br : (block_end_offset - offset) / scale;
2966 if (devinfo->gen >= 7)
2967 brw_inst_set_jip(devinfo, insn, jump);
2968 else
2969 brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2970 break;
2971 }
2972
2973 case BRW_OPCODE_HALT:
2974 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2975 *
2976 * "In case of the halt instruction not inside any conditional
2977 * code block, the value of <JIP> and <UIP> should be the
2978 * same. In case of the halt instruction inside conditional code
2979 * block, the <UIP> should be the end of the program, and the
2980 * <JIP> should be end of the most inner conditional code block."
2981 *
2982 * The uip will have already been set by whoever set up the
2983 * instruction.
2984 */
2985 if (block_end_offset == 0) {
2986 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2987 } else {
2988 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2989 }
2990 assert(brw_inst_uip(devinfo, insn) != 0);
2991 assert(brw_inst_jip(devinfo, insn) != 0);
2992 break;
2993
2994 default:
2995 break;
2996 }
2997 }
2998 }
2999
3000 void brw_ff_sync(struct brw_codegen *p,
3001 struct brw_reg dest,
3002 unsigned msg_reg_nr,
3003 struct brw_reg src0,
3004 bool allocate,
3005 unsigned response_length,
3006 bool eot)
3007 {
3008 const struct gen_device_info *devinfo = p->devinfo;
3009 brw_inst *insn;
3010
3011 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
3012
3013 insn = next_insn(p, BRW_OPCODE_SEND);
3014 brw_set_dest(p, insn, dest);
3015 brw_set_src0(p, insn, src0);
3016 brw_set_src1(p, insn, brw_imm_d(0));
3017
3018 if (devinfo->gen < 6)
3019 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
3020
3021 brw_set_ff_sync_message(p,
3022 insn,
3023 allocate,
3024 response_length,
3025 eot);
3026 }
3027
3028 /**
3029 * Emit the SEND instruction necessary to generate stream output data on Gen6
3030 * (for transform feedback).
3031 *
3032 * If send_commit_msg is true, this is the last piece of stream output data
3033 * from this thread, so send the data as a committed write. According to the
3034 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
3035 *
3036 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3037 * writes are complete by sending the final write as a committed write."
3038 */
3039 void
3040 brw_svb_write(struct brw_codegen *p,
3041 struct brw_reg dest,
3042 unsigned msg_reg_nr,
3043 struct brw_reg src0,
3044 unsigned binding_table_index,
3045 bool send_commit_msg)
3046 {
3047 const struct gen_device_info *devinfo = p->devinfo;
3048 const unsigned target_cache =
3049 (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
3050 devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
3051 BRW_SFID_DATAPORT_WRITE);
3052 brw_inst *insn;
3053
3054 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
3055
3056 insn = next_insn(p, BRW_OPCODE_SEND);
3057 brw_inst_set_sfid(devinfo, insn, target_cache);
3058 brw_set_dest(p, insn, dest);
3059 brw_set_src0(p, insn, src0);
3060 brw_set_desc(p, insn,
3061 brw_message_desc(devinfo, 1, send_commit_msg, true) |
3062 brw_dp_write_desc(devinfo, binding_table_index,
3063 0, /* msg_control: ignored */
3064 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3065 0, /* last_render_target: ignored */
3066 send_commit_msg)); /* send_commit_msg */
3067 }
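
/* Illustrative only: a generator emitting several stream-output writes per
 * thread would pass send_commit_msg = false for all but the last one, e.g.
 * ("i" and "num_writes" are hypothetical loop state, not part of this API):
 *
 *    brw_svb_write(p, dest, msg_reg_nr, src0, binding_table_index,
 *                  i == num_writes - 1);
 *
 * so only the final write is committed, as the PRM quote above requires.
 */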

static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}
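
/* For example (following the arithmetic of the helper above): a SIMD16
 * untyped read of an RGBA value has num_channels = 4 and exec_size = 16,
 * so the payload spans 2 * 4 = 8 registers -- two GRFs per channel -- while
 * the same read in SIMD8 needs only 4, and a SIMD4x2 message packs
 * everything into a single register.
 */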

void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
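
/* A minimal usage sketch, assuming the default state is SIMD8 Align1 and
 * that "payload" already holds one address and one source operand per
 * channel; "surf_index" and the message length of 3 (header + address +
 * src0) are illustrative only:
 *
 *    brw_untyped_atomic(p, dst, payload, brw_imm_ud(surf_index),
 *                       BRW_AOP_ADD, 3, true, true);
 *
 * which emits a single SEND performing an atomic add on each enabled
 * channel's address and returning the old values in dst.
 */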

void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
   const unsigned response_length =
      brw_surface_payload_size(num_channels, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);

   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
}

void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}

static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   assert(devinfo->gen >= 11 || bti == 0);
   brw_inst_set_binding_table_index(devinfo, insn, bti);
}

void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 enum brw_message_target sfid,
                 bool commit_enable,
                 unsigned bti)
{
   const struct gen_device_info *devinfo = p->devinfo;

   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
   /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
    * message itself doesn't write anything back.
    */
   struct brw_inst *insn = next_insn(p, send_op);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);
   brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}
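
/* A hedged usage sketch ("tmp" is an illustrative scratch GRF): to
 * serialize data-cache traffic, a caller could emit
 *
 *    brw_memory_fence(p, tmp, tmp, BRW_OPCODE_SEND,
 *                     GEN7_SFID_DATAPORT_DATA_CACHE, true, 0);
 *
 * where commit_enable = true makes the message return a commit write-back
 * into tmp, so a subsequent read of tmp stalls until the fence completes.
 */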

void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const uint16_t exec_size = brw_get_default_exec_size(p);
   const unsigned slot_group = brw_get_default_group(p) / 16;
   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
                            slot_group);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   brw_send_indirect_message(p,
                             GEN7_SFID_PIXEL_INTERPOLATOR,
                             dest,
                             mrf,
                             vec1(data),
                             desc,
                             false);
}

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->gen >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8) {
         /* Getting the first active channel index is easy on Gen8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't for the hardware bug that causes channel enables to be
          * applied incorrectly to the second half of 32-wide instructions on
          * Gen7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->gen >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gen7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination first without and then with execution
          * masking in order to find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
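
/* On Gen7 the Align1 path above ends up emitting a sequence along these
 * lines for a SIMD16 program (assembly rendering illustrative only):
 *
 *    mov   (1)   f1.0<1>:UD   0x0:UD           { WE_all }
 *    mov.z (16)  null<1>:UW   0x0:UW           { WE_on, f1.0 }
 *    fbl   (1)   dst<1>:UD    f1.0<0,1,0>:UW   { WE_all }
 *
 * Every enabled channel writes a zero that trivially satisfies the .z
 * conditional modifier and so sets its flag bit; f1.0 then reproduces the
 * execution mask and FBL returns the index of its lowest set bit.
 */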

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
            stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                 subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                 subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *     change the register address. The lower 5 bits of Address
       *     Immediate when added to lower 5 bits of address register gives
       *     the sub-register offset. The upper bits of Address Immediate
       *     when added to upper bits of address register gives the register
       *     address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, indirect addressing must not be
             *     used."
             *
             * To work around this restriction, we do two integer MOVs
             * instead of one 64-bit MOV. Because no double value should
             * ever cross a register boundary, it's safe to use the
             * immediate offset in the indirect here to handle adding 4
             * bytes to the offset and avoid the extra ADD to the register
             * file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
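
/* Worked example for the Align1 indirect path above (values illustrative):
 * broadcasting channel idx of a float SIMD8 source (type_sz = 4,
 * hstride = 1) living in g2 gives
 *
 *    shift  = log2(4) + 1 - 1 = 2, so a0.0 = idx << 2, the byte offset of
 *             the selected component within the region, and
 *    offset = 2 * REG_SIZE = 64 bytes, well under the 512-byte immediate
 *             limit, so no extra ADD is needed and the final MOV reads
 *             g[a0.0 + 64]:F into dst.
 */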

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we only want to
 * write one u32. So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->gen >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}
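
/* The two halves are used together: a compute-shader barrier signals the
 * gateway and then stalls on the notification register (sketch, assuming
 * "src" already holds the barrier message header):
 *
 *    brw_barrier(p, src);   ... signal arrival at the barrier ...
 *    brw_WAIT(p);           ... sleep until all threads have signalled ...
 */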

void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   /* From the Skylake PRM, Volume 7, page 760:
    *
    *    "Implementation Restriction on Register Access: When the control
    *     register is used as an explicit source and/or destination, hardware
    *     does not ensure execution pipeline coherency. Software must set the
    *     thread control field to ‘switch’ for an instruction that uses
    *     control register as an explicit operand."
    *
    * On Gen12+ this is implemented in terms of SWSB annotations instead.
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));

   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
   if (p->devinfo->gen < 12)
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      if (p->devinfo->gen < 12)
         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }

   if (p->devinfo->gen >= 12)
      brw_SYNC(p, TGL_SYNC_NOP);
}
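
/* Sketch of a typical use (the CR0 field names below are assumptions, not
 * verified against brw_eu_defines.h): switching the default rounding mode
 * to round-towards-zero might look like
 *
 *    brw_float_controls_mode(p, BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT,
 *                            BRW_CR0_RND_MODE_MASK);
 *
 * i.e. "mask" selects the cr0 bits to clear and "mode" holds the new bit
 * values to OR back in, so mode must be a subset of mask.
 */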

void
brw_update_reloc_imm(const struct gen_device_info *devinfo,
                     brw_inst *inst,
                     uint32_t value)
{
   /* Sanity check that the instruction is a MOV of an immediate */
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV);
   assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);

   /* If it was compacted, we can't safely rewrite */
   assert(brw_inst_cmpt_control(devinfo, inst) == 0);

   brw_inst_set_imm_ud(devinfo, inst, value);
}

/* A default value for constants that will be patched at run-time.
 * We pick an arbitrary value that prevents instruction compaction.
 */
#define DEFAULT_PATCH_IMM 0x4a7cc037

void
brw_MOV_reloc_imm(struct brw_codegen *p,
                  struct brw_reg dst,
                  enum brw_reg_type src_type,
                  uint32_t id)
{
   assert(type_sz(src_type) == 4);
   assert(type_sz(dst.type) == 4);

   if (p->num_relocs + 1 > p->reloc_array_size) {
      p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
      p->relocs = reralloc(p->mem_ctx, p->relocs,
                           struct brw_shader_reloc, p->reloc_array_size);
   }

   p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
      .id = id,
      .offset = p->next_insn_offset,
   };

   brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
}
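
/* Putting the two halves together, a hedged sketch of the relocation flow
 * (MY_CONSTANT_ID, "code" and lookup_value() are illustrative, not part of
 * this file's API): at compile time the generator emits a placeholder
 *
 *    brw_MOV_reloc_imm(p, dst, BRW_REGISTER_TYPE_UD, MY_CONSTANT_ID);
 *
 * and at upload time the driver walks p->relocs and patches each recorded
 * instruction with the value that is only known then:
 *
 *    for (unsigned i = 0; i < p->num_relocs; i++) {
 *       brw_inst *inst = (brw_inst *)(code + p->relocs[i].offset);
 *       brw_update_reloc_imm(devinfo, inst, lookup_value(p->relocs[i].id));
 *    }
 */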