/*
 Copyright (C) Intel Corp. 2006. All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case. This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gfx6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      assert(devinfo->ver < 12);
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}

static void
gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GFX7_MRF_HACK_START;
   }
}

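/**
 * Set the destination register, type, and region of an instruction.
 *
 * SEND-family opcodes use a compact destination encoding without type or
 * region fields, so they are handled separately from ordinary ALU
 * destinations below.
 */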
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(devinfo->ver < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * Although Dst.HorzStride is a don't care for Align16, HW needs
             * this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct. However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}

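/**
 * Set the source 0 register, type, and region of an instruction, including
 * immediate payloads and the compact SEND-family payload-register encoding.
 * Note that a sub-64-bit immediate in src0 also mirrors its type into the
 * src1 slot, as the encoding below requires.
 */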
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}


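/**
 * Set the source 1 register, type, and region of an instruction. Only src1
 * may hold an immediate in a two-source instruction, and for split sends
 * this selects the second payload register instead.
 */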
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
                                          BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}

/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
void
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
   if (devinfo->ver < 12)
      brw_inst_set_src1_file_type(devinfo, inst,
                                  BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
   brw_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->ver >= 9)
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}

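/* Fill out the message descriptor and control fields for a message to the
 * extended math shared function. Message and response lengths are implied
 * by the math function being requested.
 */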
static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct intel_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct intel_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

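/* Fill out the message descriptor and URB-specific control fields for a
 * URB write, translating brw_urb_write_flags into the per-generation
 * descriptor encodings.
 */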
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct intel_device_info *devinfo = p->devinfo;

   assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->ver < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->ver < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

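/* Fill out the descriptor for a Gfx7+ data-port scratch block read/write.
 * The block size field encodes the number of registers transferred:
 * log2(num_regs) on Gfx8+ (so 1, 2, 4 or 8 registers encode as 0, 1, 2 or
 * 3), and num_regs - 1 on Gfx7.
 */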
static void
gfx7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->ver >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

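/* Apply the default instruction state tracked in brw_insn_state (exec size,
 * predication, flag register, SWSB, etc.) to a freshly allocated
 * instruction.
 */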
static void
brw_inst_set_state(const struct intel_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   if (devinfo->ver >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

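/* Grow the instruction store by nr_insn instructions, aligned to the given
 * byte alignment, and return a pointer to the first newly allocated slot.
 * The store is reallocated in power-of-two sizes as needed.
 */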
static brw_inst *
brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
{
   assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
   assert(util_is_power_of_two_or_zero(align));
   const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
   const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
   const unsigned new_nr_insn = start_insn + nr_insn;

   if (p->store_size < new_nr_insn) {
      p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Memset any padding due to alignment to 0. We don't want to be hashing
    * or caching a bunch of random bits we got from a memory allocation.
    */
   if (p->nr_insn < start_insn) {
      memset(&p->store[p->nr_insn], 0,
             (start_insn - p->nr_insn) * sizeof(brw_inst));
   }

   assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
   p->nr_insn = new_nr_insn;
   p->next_insn_offset = new_nr_insn * sizeof(brw_inst);

   return &p->store[start_insn];
}

void
brw_realign(struct brw_codegen *p, unsigned align)
{
   brw_append_insns(p, 0, align);
}

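/* Copy an arbitrary blob of data into the instruction store, padded out to
 * a whole number of instruction slots, and return its offset in bytes from
 * the start of the store.
 */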
int
brw_append_data(struct brw_codegen *p, void *data,
                unsigned size, unsigned align)
{
   unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
   void *dst = brw_append_insns(p, nr_insn, align);
   memcpy(dst, data, size);

   /* If it's not a whole number of instructions, memset the end */
   if (size < nr_insn * sizeof(brw_inst))
      memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);

   return dst - (void *)p->store;
}

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}

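/* Record a shader relocation for later patching, growing the relocation
 * array geometrically as needed.
 */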
void
brw_add_reloc(struct brw_codegen *p, uint32_t id,
              enum brw_shader_reloc_type type,
              uint32_t offset, uint32_t delta)
{
   if (p->num_relocs + 1 > p->reloc_array_size) {
      p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
      p->relocs = reralloc(p->mem_ctx, p->relocs,
                           struct brw_shader_reloc, p->reloc_array_size);
   }

   p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
      .id = id,
      .type = type,
      .offset = offset,
      .delta = delta,
   };
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
    * use 32-bit units (components 0..7). Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}

static enum gfx10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct intel_device_info *devinfo,
                       enum brw_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_1:
      assert(devinfo->ver >= 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case BRW_VERTICAL_STRIDE_2:
      assert(devinfo->ver < 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}


static enum gfx10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_HORIZONTAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case BRW_HORIZONTAL_STRIDE_1:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case BRW_HORIZONTAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case BRW_HORIZONTAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}

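/* Emit a three-source instruction. 3-src instructions use their own compact
 * operand encodings, with separate layouts for align1 ("a1") and align16
 * ("a16") access modes, so the operands are encoded here rather than
 * through brw_set_src0/brw_set_src1.
 */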
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->ver >= 12) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types. The MAD and LRP emitters ensure
          * that all four types are float. The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1)                           \
{                                                                 \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1,                           \
                   struct brw_reg src2)                           \
{                                                                 \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                     \
          dest.type == BRW_REGISTER_TYPE_DF);                     \
   if (dest.type == BRW_REGISTER_TYPE_F) {                        \
      assert(src0.type == BRW_REGISTER_TYPE_F);                   \
      assert(src1.type == BRW_REGISTER_TYPE_F);                   \
      assert(src2.type == BRW_REGISTER_TYPE_F);                   \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                  \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                  \
   }                                                              \
                                                                  \
   if (p->current->access_mode == BRW_ALIGN_16) {                 \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src0.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src1.swizzle = BRW_SWIZZLE_XXXX;                         \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                  \
         src2.swizzle = BRW_SWIZZLE_XXXX;                         \
   }                                                              \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);   \
}
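
/* For example, ALU2(AND) expands to:
 *
 *    brw_inst *brw_AND(struct brw_codegen *p, struct brw_reg dest,
 *                      struct brw_reg src0, struct brw_reg src1)
 *    {
 *       return brw_alu2(p, BRW_OPCODE_AND, dest, src0, src1);
 *    }
 */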

ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->verx10 == 70 &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gfx8 implementation in terms of a
    * converting MOV. Gfx7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->ver >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->ver >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->ver == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->ver >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->ver == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
}

void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
   brw_inst_set_cond_modifier(p->devinfo, insn, func);
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

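/* Emit a scalar jump: IP is advanced by the index argument. JMPI is forced
 * to SIMD1 with masking disabled since it manipulates the instruction
 * pointer itself; predicate_control allows making the jump conditional.
 */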
brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack). Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off. If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gfx6-style IF instructions with an
 * embedded comparison (conditional modifier). It is not used on gfx7.
 */
brw_inst *
gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1488
1489 /**
1490 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1491 */
1492 static void
convert_IF_ELSE_to_ADD(struct brw_codegen * p,brw_inst * if_inst,brw_inst * else_inst)1493 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1494 brw_inst *if_inst, brw_inst *else_inst)
1495 {
1496 const struct intel_device_info *devinfo = p->devinfo;
1497
1498 /* The next instruction (where the ENDIF would be, if it existed) */
1499 brw_inst *next_inst = &p->store[p->nr_insn];
1500
1501 assert(p->single_program_flow);
1502 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1503 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1504 assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1505
1506 /* Convert IF to an ADD instruction that moves the instruction pointer
1507 * to the first instruction of the ELSE block. If there is no ELSE
1508 * block, point to where ENDIF would be. Reverse the predicate.
1509 *
1510 * There's no need to execute an ENDIF since we don't need to do any
1511 * stack operations, and if we're currently executing, we just want to
1512 * continue normally.
1513 */
1514 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1515 brw_inst_set_pred_inv(devinfo, if_inst, true);
1516
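   /* The immediates below are byte offsets added to IP; each full-size Gen
    * instruction is 16 bytes, hence the factor of 16.
    */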
1517 if (else_inst != NULL) {
1518 /* Convert ELSE to an ADD instruction that points where the ENDIF
1519 * would be.
1520 */
1521 brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1522
1523 brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1524 brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1525 } else {
1526 brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1527 }
1528 }
1529
1530 /**
1531 * Patch IF and ELSE instructions with appropriate jump targets.
1532 */
1533 static void
1534 patch_IF_ELSE(struct brw_codegen *p,
1535 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1536 {
1537 const struct intel_device_info *devinfo = p->devinfo;
1538
1539 /* We shouldn't be patching IF and ELSE instructions in single program flow
1540 * mode when gen < 6, because in single program flow mode on those
1541 * platforms, we convert flow control instructions to conditional ADDs that
1542 * operate on IP (see brw_ENDIF).
1543 *
1544 * However, on Gfx6, writing to IP doesn't work in single program flow mode
1545 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1546 * not be updated by non-flow control instructions."). And on later
1547 * platforms, there is no significant benefit to converting control flow
1548 * instructions to conditional ADDs. So we do patch IF and ELSE
1549 * instructions in single program flow mode on those platforms.
1550 */
1551 if (devinfo->ver < 6)
1552 assert(!p->single_program_flow);
1553
1554 assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1555 assert(endif_inst != NULL);
1556 assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1557
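   /* brw_jump_scale() gives the hardware's jump units per instruction:
    * whole instructions on Gfx4, 64-bit chunks (2 per instruction) on
    * Gfx5-7, and bytes (16 per instruction) on Gfx8+.
    */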
1558 unsigned br = brw_jump_scale(devinfo);
1559
1560 assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1561 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1562
1563 if (else_inst == NULL) {
1564 /* Patch IF -> ENDIF */
1565 if (devinfo->ver < 6) {
1566 /* Turn it into an IFF, which means no mask stack operations for
1567 * all-false and jumping past the ENDIF.
1568 */
1569 brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1570 brw_inst_set_gfx4_jump_count(devinfo, if_inst,
1571 br * (endif_inst - if_inst + 1));
1572 brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1573 } else if (devinfo->ver == 6) {
1574 /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
1575 brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1576 } else {
1577 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1578 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1579 }
1580 } else {
1581 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1582
1583 /* Patch IF -> ELSE */
1584 if (devinfo->ver < 6) {
1585 brw_inst_set_gfx4_jump_count(devinfo, if_inst,
1586 br * (else_inst - if_inst));
1587 brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1588 } else if (devinfo->ver == 6) {
1589 brw_inst_set_gfx6_jump_count(devinfo, if_inst,
1590 br * (else_inst - if_inst + 1));
1591 }
1592
1593 /* Patch ELSE -> ENDIF */
1594 if (devinfo->ver < 6) {
1595 /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
1596 * matching ENDIF.
1597 */
1598 brw_inst_set_gfx4_jump_count(devinfo, else_inst,
1599 br * (endif_inst - else_inst + 1));
1600 brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
1601 } else if (devinfo->ver == 6) {
1602 /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
1603 brw_inst_set_gfx6_jump_count(devinfo, else_inst,
1604 br * (endif_inst - else_inst));
1605 } else {
1606 /* The IF instruction's JIP should point just past the ELSE */
1607 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1608 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1609 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1610 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1611 if (devinfo->ver >= 8) {
1612 /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1613 * should point to ENDIF.
1614 */
1615 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1616 }
1617 }
1618 }
1619 }
1620
1621 void
1622 brw_ELSE(struct brw_codegen *p)
1623 {
1624 const struct intel_device_info *devinfo = p->devinfo;
1625 brw_inst *insn;
1626
1627 insn = next_insn(p, BRW_OPCODE_ELSE);
1628
1629 if (devinfo->ver < 6) {
1630 brw_set_dest(p, insn, brw_ip_reg());
1631 brw_set_src0(p, insn, brw_ip_reg());
1632 brw_set_src1(p, insn, brw_imm_d(0x0));
1633 } else if (devinfo->ver == 6) {
1634 brw_set_dest(p, insn, brw_imm_w(0));
1635 brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
1636 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1637 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1638 } else if (devinfo->ver == 7) {
1639 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1640 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1641 brw_set_src1(p, insn, brw_imm_w(0));
1642 brw_inst_set_jip(devinfo, insn, 0);
1643 brw_inst_set_uip(devinfo, insn, 0);
1644 } else {
1645 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1646 if (devinfo->ver < 12)
1647 brw_set_src0(p, insn, brw_imm_d(0));
1648 brw_inst_set_jip(devinfo, insn, 0);
1649 brw_inst_set_uip(devinfo, insn, 0);
1650 }
1651
1652 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1653 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1654 if (!p->single_program_flow && devinfo->ver < 6)
1655 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1656
1657 push_if_stack(p, insn);
1658 }
1659
1660 void
1661 brw_ENDIF(struct brw_codegen *p)
1662 {
1663 const struct intel_device_info *devinfo = p->devinfo;
1664 brw_inst *insn = NULL;
1665 brw_inst *else_inst = NULL;
1666 brw_inst *if_inst = NULL;
1667 brw_inst *tmp;
1668 bool emit_endif = true;
1669
1670 /* In single program flow mode, we can express IF and ELSE instructions
1671 * equivalently as ADD instructions that operate on IP. On platforms prior
1672 * to Gfx6, flow control instructions cause an implied thread switch, so
1673 * this is a significant savings.
1674 *
1675 * However, on Gfx6, writing to IP doesn't work in single program flow mode
1676 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1677 * not be updated by non-flow control instructions."). And on later
1678 * platforms, there is no significant benefit to converting control flow
1679 * instructions to conditional ADDs. So we only do this trick on Gfx4 and
1680 * Gfx5.
1681 */
1682 if (devinfo->ver < 6 && p->single_program_flow)
1683 emit_endif = false;
1684
1685 /*
1686     * A single next_insn() may change the base address of the instruction
1687     * store memory (p->store), so call it first, before computing any
1688     * pointer into the instruction store from an index.
1689 */
1690 if (emit_endif)
1691 insn = next_insn(p, BRW_OPCODE_ENDIF);
1692
1693 /* Pop the IF and (optional) ELSE instructions from the stack */
1694 p->if_depth_in_loop[p->loop_stack_depth]--;
1695 tmp = pop_if_stack(p);
1696 if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1697 else_inst = tmp;
1698 tmp = pop_if_stack(p);
1699 }
1700 if_inst = tmp;
1701
1702 if (!emit_endif) {
1703 /* ENDIF is useless; don't bother emitting it. */
1704 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1705 return;
1706 }
1707
1708 if (devinfo->ver < 6) {
1709 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1710 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1711 brw_set_src1(p, insn, brw_imm_d(0x0));
1712 } else if (devinfo->ver == 6) {
1713 brw_set_dest(p, insn, brw_imm_w(0));
1714 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1715 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1716 } else if (devinfo->ver == 7) {
1717 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1718 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1719 brw_set_src1(p, insn, brw_imm_w(0));
1720 } else {
1721 brw_set_src0(p, insn, brw_imm_d(0));
1722 }
1723
1724 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1725 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1726 if (devinfo->ver < 6)
1727 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1728
1729 /* Also pop item off the stack in the endif instruction: */
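   /* On Gfx6+ the jump distance set here (one instruction forward) is
    * provisional; brw_set_uip_jip() re-patches ENDIF targets once the whole
    * program has been generated.
    */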
1730 if (devinfo->ver < 6) {
1731 brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
1732 brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
1733 } else if (devinfo->ver == 6) {
1734 brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
1735 } else {
1736 brw_inst_set_jip(devinfo, insn, 2);
1737 }
1738 patch_IF_ELSE(p, if_inst, else_inst, insn);
1739 }
1740
1741 brw_inst *
1742 brw_BREAK(struct brw_codegen *p)
1743 {
1744 const struct intel_device_info *devinfo = p->devinfo;
1745 brw_inst *insn;
1746
1747 insn = next_insn(p, BRW_OPCODE_BREAK);
1748 if (devinfo->ver >= 8) {
1749 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1750 brw_set_src0(p, insn, brw_imm_d(0x0));
1751 } else if (devinfo->ver >= 6) {
1752 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1753 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1754 brw_set_src1(p, insn, brw_imm_d(0x0));
1755 } else {
1756 brw_set_dest(p, insn, brw_ip_reg());
1757 brw_set_src0(p, insn, brw_ip_reg());
1758 brw_set_src1(p, insn, brw_imm_d(0x0));
1759 brw_inst_set_gfx4_pop_count(devinfo, insn,
1760 p->if_depth_in_loop[p->loop_stack_depth]);
1761 }
1762 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1763 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1764
1765 return insn;
1766 }
1767
1768 brw_inst *
1769 brw_CONT(struct brw_codegen *p)
1770 {
1771 const struct intel_device_info *devinfo = p->devinfo;
1772 brw_inst *insn;
1773
1774 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1775 brw_set_dest(p, insn, brw_ip_reg());
1776 if (devinfo->ver >= 8) {
1777 brw_set_src0(p, insn, brw_imm_d(0x0));
1778 } else {
1779 brw_set_src0(p, insn, brw_ip_reg());
1780 brw_set_src1(p, insn, brw_imm_d(0x0));
1781 }
1782
1783 if (devinfo->ver < 6) {
1784 brw_inst_set_gfx4_pop_count(devinfo, insn,
1785 p->if_depth_in_loop[p->loop_stack_depth]);
1786 }
1787 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1788 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1789 return insn;
1790 }
1791
1792 brw_inst *
1793 brw_HALT(struct brw_codegen *p)
1794 {
1795 const struct intel_device_info *devinfo = p->devinfo;
1796 brw_inst *insn;
1797
1798 insn = next_insn(p, BRW_OPCODE_HALT);
1799 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1800 if (devinfo->ver < 6) {
1801 /* From the Gfx4 PRM:
1802 *
1803 * "IP register must be put (for example, by the assembler) at <dst>
1804       *    and <src0> locations."
1805 */
1806 brw_set_dest(p, insn, brw_ip_reg());
1807 brw_set_src0(p, insn, brw_ip_reg());
1808 brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
1809 } else if (devinfo->ver < 8) {
1810 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1811 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1812 } else if (devinfo->ver < 12) {
1813 brw_set_src0(p, insn, brw_imm_d(0x0));
1814 }
1815
1816 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1817 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1818 return insn;
1819 }
1820
1821 /* DO/WHILE loop:
1822 *
1823 * The DO/WHILE is just an unterminated loop -- break or continue are
1824 * used for control within the loop. We have a few ways they can be
1825 * done.
1826 *
1827 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1828 * jip and no DO instruction.
1829 *
1830 * For non-uniform control flow pre-gfx6, there's a DO instruction to
1831 * push the mask, and a WHILE to jump back, and BREAK to get out and
1832 * pop the mask.
1833 *
1834 * For gfx6, there's no more mask stack, so no need for DO. WHILE
1835 * just points back to the first instruction of the loop.
1836 */
1837 brw_inst *
1838 brw_DO(struct brw_codegen *p, unsigned execute_size)
1839 {
1840 const struct intel_device_info *devinfo = p->devinfo;
1841
1842 if (devinfo->ver >= 6 || p->single_program_flow) {
1843 push_loop_stack(p, &p->store[p->nr_insn]);
1844 return &p->store[p->nr_insn];
1845 } else {
1846 brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1847
1848 push_loop_stack(p, insn);
1849
1850 /* Override the defaults for this instruction:
1851 */
1852 brw_set_dest(p, insn, brw_null_reg());
1853 brw_set_src0(p, insn, brw_null_reg());
1854 brw_set_src1(p, insn, brw_null_reg());
1855
1856 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1857 brw_inst_set_exec_size(devinfo, insn, execute_size);
1858 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1859
1860 return insn;
1861 }
1862 }
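
/* A sketch of typical loop emission (illustrative only):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *    ...loop body, possibly containing brw_BREAK(p) / brw_CONT(p)...
 *    brw_WHILE(p);
 *
 * brw_WHILE() pops the loop stack and, pre-gfx6, patches any BREAK/CONT
 * emitted inside the loop (see brw_patch_break_cont()); on gfx6+ the
 * BREAK/CONT offsets are fixed up afterwards by brw_set_uip_jip().
 */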
1863
1864 /**
1865 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
1866 * instruction here.
1867 *
1868 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1869 * nesting, since it can always just point to the end of the block/current loop.
1870 */
1871 static void
1872 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1873 {
1874 const struct intel_device_info *devinfo = p->devinfo;
1875 brw_inst *do_inst = get_inner_do_insn(p);
1876 brw_inst *inst;
1877 unsigned br = brw_jump_scale(devinfo);
1878
1879 assert(devinfo->ver < 6);
1880
1881 for (inst = while_inst - 1; inst != do_inst; inst--) {
1882 /* If the jump count is != 0, that means that this instruction has already
1883        * been patched because it's part of a loop inside the one we're
1884 * patching.
1885 */
1886 if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1887 brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
1888 brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1889 } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1890 brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
1891 brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
1892 }
1893 }
1894 }
1895
1896 brw_inst *
1897 brw_WHILE(struct brw_codegen *p)
1898 {
1899 const struct intel_device_info *devinfo = p->devinfo;
1900 brw_inst *insn, *do_insn;
1901 unsigned br = brw_jump_scale(devinfo);
1902
1903 if (devinfo->ver >= 6) {
1904 insn = next_insn(p, BRW_OPCODE_WHILE);
1905 do_insn = get_inner_do_insn(p);
1906
1907 if (devinfo->ver >= 8) {
1908 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1909 if (devinfo->ver < 12)
1910 brw_set_src0(p, insn, brw_imm_d(0));
1911 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1912 } else if (devinfo->ver == 7) {
1913 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1914 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1915 brw_set_src1(p, insn, brw_imm_w(0));
1916 brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1917 } else {
1918 brw_set_dest(p, insn, brw_imm_w(0));
1919 brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
1920 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1921 brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1922 }
1923
1924 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1925
1926 } else {
1927 if (p->single_program_flow) {
1928 insn = next_insn(p, BRW_OPCODE_ADD);
1929 do_insn = get_inner_do_insn(p);
1930
1931 brw_set_dest(p, insn, brw_ip_reg());
1932 brw_set_src0(p, insn, brw_ip_reg());
1933 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1934 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1935 } else {
1936 insn = next_insn(p, BRW_OPCODE_WHILE);
1937 do_insn = get_inner_do_insn(p);
1938
1939 assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1940
1941 brw_set_dest(p, insn, brw_ip_reg());
1942 brw_set_src0(p, insn, brw_ip_reg());
1943 brw_set_src1(p, insn, brw_imm_d(0));
1944
1945 brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1946 brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1947 brw_inst_set_gfx4_pop_count(devinfo, insn, 0);
1948
1949 brw_patch_break_cont(p, insn);
1950 }
1951 }
1952 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1953
1954 p->loop_stack_depth--;
1955
1956 return insn;
1957 }
1958
1959 /* FORWARD JUMPS:
1960 */
1961 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1962 {
1963 const struct intel_device_info *devinfo = p->devinfo;
1964 brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1965 unsigned jmpi = 1;
1966
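   /* Gfx5+ measures JMPI distances in 64-bit chunks, two per 128-bit
    * instruction, hence jmpi = 2.
    */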
1967 if (devinfo->ver >= 5)
1968 jmpi = 2;
1969
1970 assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1971 assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1972
1973 brw_inst_set_gfx4_jump_count(devinfo, jmp_insn,
1974 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1975 }
1976
1977 /* To integrate with the above, it makes sense that the comparison
1978 * instruction should populate the flag register. It might be simpler
1979 * just to use the flag reg for most WM tasks?
1980 */
1981 void brw_CMP(struct brw_codegen *p,
1982 struct brw_reg dest,
1983 unsigned conditional,
1984 struct brw_reg src0,
1985 struct brw_reg src1)
1986 {
1987 const struct intel_device_info *devinfo = p->devinfo;
1988 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1989
1990 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1991 brw_set_dest(p, insn, dest);
1992 brw_set_src0(p, insn, src0);
1993 brw_set_src1(p, insn, src1);
1994
1995 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1996 * page says:
1997 * "Any CMP instruction with a null destination must use a {switch}."
1998 *
1999 * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
2000 * mentioned on their work-arounds pages.
2001 */
2002 if (devinfo->ver == 7) {
2003 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2004 dest.nr == BRW_ARF_NULL) {
2005 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2006 }
2007 }
2008 }
2009
2010 void brw_CMPN(struct brw_codegen *p,
2011 struct brw_reg dest,
2012 unsigned conditional,
2013 struct brw_reg src0,
2014 struct brw_reg src1)
2015 {
2016 const struct intel_device_info *devinfo = p->devinfo;
2017 brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);
2018
2019 brw_inst_set_cond_modifier(devinfo, insn, conditional);
2020 brw_set_dest(p, insn, dest);
2021 brw_set_src0(p, insn, src0);
2022 brw_set_src1(p, insn, src1);
2023
2024 /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
2025 * says:
2026 *
2027 * If the destination is the null register, the {Switch} instruction
2028 * option must be used.
2029 *
2030 * Page 77 of the Haswell PRM Volume 2b contains the same text.
2031 */
2032 if (devinfo->ver == 7) {
2033 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2034 dest.nr == BRW_ARF_NULL) {
2035 brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2036 }
2037 }
2038 }
2039
2040 /***********************************************************************
2041 * Helpers for the various SEND message types:
2042 */
2043
2044 /** Extended math function, float[8].
2045 */
2046 void gfx4_math(struct brw_codegen *p,
2047 struct brw_reg dest,
2048 unsigned function,
2049 unsigned msg_reg_nr,
2050 struct brw_reg src,
2051 unsigned precision )
2052 {
2053 const struct intel_device_info *devinfo = p->devinfo;
2054 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2055 unsigned data_type;
2056 if (has_scalar_region(src)) {
2057 data_type = BRW_MATH_DATA_SCALAR;
2058 } else {
2059 data_type = BRW_MATH_DATA_VECTOR;
2060 }
2061
2062 assert(devinfo->ver < 6);
2063
2064 /* Example code doesn't set predicate_control for send
2065 * instructions.
2066 */
2067 brw_inst_set_pred_control(devinfo, insn, 0);
2068 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2069
2070 brw_set_dest(p, insn, dest);
2071 brw_set_src0(p, insn, src);
2072 brw_set_math_message(p,
2073 insn,
2074 function,
2075 src.type == BRW_REGISTER_TYPE_D,
2076 precision,
2077 data_type);
2078 }
2079
2080 void gfx6_math(struct brw_codegen *p,
2081 struct brw_reg dest,
2082 unsigned function,
2083 struct brw_reg src0,
2084 struct brw_reg src1)
2085 {
2086 const struct intel_device_info *devinfo = p->devinfo;
2087 brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2088
2089 assert(devinfo->ver >= 6);
2090
2091 assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2092 (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2093
2094 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2095 if (devinfo->ver == 6) {
2096 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2097 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2098 }
2099
2100 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2101 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2102 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2103 assert(src0.type != BRW_REGISTER_TYPE_F);
2104 assert(src1.type != BRW_REGISTER_TYPE_F);
2105 assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2106 (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2107 /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
2108 * INT DIV function does not support source modifiers.
2109 */
2110 assert(!src0.negate);
2111 assert(!src0.abs);
2112 assert(!src1.negate);
2113 assert(!src1.abs);
2114 } else {
2115 assert(src0.type == BRW_REGISTER_TYPE_F ||
2116 (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
2117 assert(src1.type == BRW_REGISTER_TYPE_F ||
2118 (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
2119 }
2120
2121 /* Source modifiers are ignored for extended math instructions on Gfx6. */
2122 if (devinfo->ver == 6) {
2123 assert(!src0.negate);
2124 assert(!src0.abs);
2125 assert(!src1.negate);
2126 assert(!src1.abs);
2127 }
2128
2129 brw_inst_set_math_function(devinfo, insn, function);
2130
2131 brw_set_dest(p, insn, dest);
2132 brw_set_src0(p, insn, src0);
2133 brw_set_src1(p, insn, src1);
2134 }
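
/* An illustrative single-source use (not from a specific caller): a
 * reciprocal at the current default execution size, with the unused second
 * source tied off to the float-typed null register as the asserts above
 * require:
 *
 *    gfx6_math(p, dst, BRW_MATH_FUNCTION_INV, src, brw_null_reg());
 */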
2135
2136 /**
2137 * Return the right surface index to access the thread scratch space using
2138 * stateless dataport messages.
2139 */
2140 unsigned
2141 brw_scratch_surface_idx(const struct brw_codegen *p)
2142 {
2143 /* The scratch space is thread-local so IA coherency is unnecessary. */
2144 if (p->devinfo->ver >= 8)
2145 return GFX8_BTI_STATELESS_NON_COHERENT;
2146 else
2147 return BRW_BTI_STATELESS;
2148 }
2149
2150 /**
2151  * Write a block of OWORDs (half a GRF each) to the scratch buffer,
2152 * using a constant offset per channel.
2153 *
2154 * The offset must be aligned to oword size (16 bytes). Used for
2155 * register spilling.
2156 */
2157 void brw_oword_block_write_scratch(struct brw_codegen *p,
2158 struct brw_reg mrf,
2159 int num_regs,
2160 unsigned offset)
2161 {
2162 const struct intel_device_info *devinfo = p->devinfo;
2163 const unsigned target_cache =
2164 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2165 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2166 BRW_SFID_DATAPORT_WRITE);
2167 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2168 uint32_t msg_type;
2169
2170 if (devinfo->ver >= 6)
2171 offset /= 16;
2172
2173 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2174
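   /* One message register for the header plus one per payload register. */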
2175 const unsigned mlen = 1 + num_regs;
2176
2177 /* Set up the message header. This is g0, with g0.2 filled with
2178 * the offset. We don't want to leave our offset around in g0 or
2179 * it'll screw up texture samples, so set it up inside the message
2180 * reg.
2181 */
2182 {
2183 brw_push_insn_state(p);
2184 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2185 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2186 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2187 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2188
2189 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2190
2191 /* set message header global offset field (reg 0, element 2) */
2192 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2193 brw_set_default_swsb(p, tgl_swsb_null());
2194 brw_MOV(p,
2195 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2196 mrf.nr,
2197 2), BRW_REGISTER_TYPE_UD),
2198 brw_imm_ud(offset));
2199
2200 brw_pop_insn_state(p);
2201 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2202 }
2203
2204 {
2205 struct brw_reg dest;
2206 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2207 int send_commit_msg;
2208 struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2209 BRW_REGISTER_TYPE_UW);
2210
2211 brw_inst_set_sfid(devinfo, insn, target_cache);
2212 brw_inst_set_compression(devinfo, insn, false);
2213
2214 if (brw_inst_exec_size(devinfo, insn) >= 16)
2215 src_header = vec16(src_header);
2216
2217 assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2218 if (devinfo->ver < 6)
2219 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2220
2221 /* Until gfx6, writes followed by reads from the same location
2222 * are not guaranteed to be ordered unless write_commit is set.
2223 * If set, then a no-op write is issued to the destination
2224 * register to set a dependency, and a read from the destination
2225 * can be used to ensure the ordering.
2226 *
2227 * For gfx6, only writes between different threads need ordering
2228 * protection. Our use of DP writes is all about register
2229 * spilling within a thread.
2230 */
2231 if (devinfo->ver >= 6) {
2232 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2233 send_commit_msg = 0;
2234 } else {
2235 dest = src_header;
2236 send_commit_msg = 1;
2237 }
2238
2239 brw_set_dest(p, insn, dest);
2240 if (devinfo->ver >= 6) {
2241 brw_set_src0(p, insn, mrf);
2242 } else {
2243 brw_set_src0(p, insn, brw_null_reg());
2244 }
2245
2246 if (devinfo->ver >= 6)
2247 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2248 else
2249 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2250
2251 brw_set_desc(p, insn,
2252 brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2253 brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2254 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2255 msg_type, send_commit_msg));
2256 }
2257 }
2258
2259
2260 /**
2261 * Read a block of owords (half a GRF each) from the scratch buffer
2262 * using a constant index per channel.
2263 *
2264 * Offset must be aligned to oword size (16 bytes). Used for register
2265 * spilling.
2266 */
2267 void
2268 brw_oword_block_read_scratch(struct brw_codegen *p,
2269 struct brw_reg dest,
2270 struct brw_reg mrf,
2271 int num_regs,
2272 unsigned offset)
2273 {
2274 const struct intel_device_info *devinfo = p->devinfo;
2275 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2276
2277 if (devinfo->ver >= 6)
2278 offset /= 16;
2279
2280 if (p->devinfo->ver >= 7) {
2281    /* On gfx7 and above, we no longer have message registers and we can
2282 * send from any register we want. By using the destination register
2283 * for the message, we guarantee that the implied message write won't
2284 * accidentally overwrite anything. This has been a problem because
2285 * the MRF registers and source for the final FB write are both fixed
2286 * and may overlap.
2287 */
2288 mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2289 } else {
2290 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2291 }
2292 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2293
2294 const unsigned rlen = num_regs;
2295 const unsigned target_cache =
2296 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2297 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2298 BRW_SFID_DATAPORT_READ);
2299
2300 {
2301 brw_push_insn_state(p);
2302 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2303 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2304 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2305 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2306
2307 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2308
2309 /* set message header global offset field (reg 0, element 2) */
2310 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2311 brw_set_default_swsb(p, tgl_swsb_null());
2312 brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2313
2314 brw_pop_insn_state(p);
2315 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2316 }
2317
2318 {
2319 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2320
2321 brw_inst_set_sfid(devinfo, insn, target_cache);
2322 assert(brw_inst_pred_control(devinfo, insn) == 0);
2323 brw_inst_set_compression(devinfo, insn, false);
2324
2325 brw_set_dest(p, insn, dest); /* UW? */
2326 if (devinfo->ver >= 6) {
2327 brw_set_src0(p, insn, mrf);
2328 } else {
2329 brw_set_src0(p, insn, brw_null_reg());
2330 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2331 }
2332
2333 brw_set_desc(p, insn,
2334 brw_message_desc(devinfo, 1, rlen, true) |
2335 brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2336 BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2337 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2338 BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2339 }
2340 }
2341
2342 void
2343 gfx7_block_read_scratch(struct brw_codegen *p,
2344 struct brw_reg dest,
2345 int num_regs,
2346 unsigned offset)
2347 {
2348 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2349 assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2350
2351 brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2352
2353 /* The HW requires that the header is present; this is to get the g0.5
2354 * scratch offset.
2355 */
2356 brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2357
2358 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2359 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2360 * is 32 bytes, which happens to be the size of a register.
2361 */
2362 offset /= REG_SIZE;
2363 assert(offset < (1 << 12));
2364
2365 gfx7_set_dp_scratch_message(p, insn,
2366 false, /* scratch read */
2367 false, /* OWords */
2368 false, /* invalidate after read */
2369 num_regs,
2370 offset,
2371 1, /* mlen: just g0 */
2372 num_regs, /* rlen */
2373 true); /* header present */
2374 }
2375
2376 /**
2377 * Read float[4] vectors from the data port constant cache.
2378 * Location (in buffer) should be a multiple of 16.
2379 * Used for fetching shader constants.
2380 */
2381 void brw_oword_block_read(struct brw_codegen *p,
2382 struct brw_reg dest,
2383 struct brw_reg mrf,
2384 uint32_t offset,
2385 uint32_t bind_table_index)
2386 {
2387 const struct intel_device_info *devinfo = p->devinfo;
2388 const unsigned target_cache =
2389 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
2390 BRW_SFID_DATAPORT_READ);
2391 const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2392 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2393
2394 /* On newer hardware, offset is in units of owords. */
2395 if (devinfo->ver >= 6)
2396 offset /= 16;
2397
2398 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2399
2400 brw_push_insn_state(p);
2401 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2402 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2403 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2404
2405 brw_push_insn_state(p);
2406 brw_set_default_exec_size(p, BRW_EXECUTE_8);
2407 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2408 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2409
2410 /* set message header global offset field (reg 0, element 2) */
2411 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2412 brw_set_default_swsb(p, tgl_swsb_null());
2413 brw_MOV(p,
2414 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2415 mrf.nr,
2416 2), BRW_REGISTER_TYPE_UD),
2417 brw_imm_ud(offset));
2418 brw_pop_insn_state(p);
2419
2420 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2421
2422 brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2423
2424 brw_inst_set_sfid(devinfo, insn, target_cache);
2425
2426 /* cast dest to a uword[8] vector */
2427 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2428
2429 brw_set_dest(p, insn, dest);
2430 if (devinfo->ver >= 6) {
2431 brw_set_src0(p, insn, mrf);
2432 } else {
2433 brw_set_src0(p, insn, brw_null_reg());
2434 brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2435 }
2436
2437 brw_set_desc(p, insn,
2438 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2439 brw_dp_read_desc(devinfo, bind_table_index,
2440 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2441 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2442 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2443
2444 brw_pop_insn_state(p);
2445 }
2446
2447 brw_inst *
2448 brw_fb_WRITE(struct brw_codegen *p,
2449 struct brw_reg payload,
2450 struct brw_reg implied_header,
2451 unsigned msg_control,
2452 unsigned binding_table_index,
2453 unsigned msg_length,
2454 unsigned response_length,
2455 bool eot,
2456 bool last_render_target,
2457 bool header_present)
2458 {
2459 const struct intel_device_info *devinfo = p->devinfo;
2460 const unsigned target_cache =
2461 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2462 BRW_SFID_DATAPORT_WRITE);
2463 brw_inst *insn;
2464 struct brw_reg dest, src0;
2465
2466 if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2467 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2468 else
2469 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2470
2471 if (devinfo->ver >= 6) {
2472 insn = next_insn(p, BRW_OPCODE_SENDC);
2473 } else {
2474 insn = next_insn(p, BRW_OPCODE_SEND);
2475 }
2476 brw_inst_set_sfid(devinfo, insn, target_cache);
2477 brw_inst_set_compression(devinfo, insn, false);
2478
2479 if (devinfo->ver >= 6) {
2480 /* headerless version, just submit color payload */
2481 src0 = payload;
2482 } else {
2483 assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2484 brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2485 src0 = implied_header;
2486 }
2487
2488 brw_set_dest(p, insn, dest);
2489 brw_set_src0(p, insn, src0);
2490 brw_set_desc(p, insn,
2491 brw_message_desc(devinfo, msg_length, response_length,
2492 header_present) |
2493 brw_fb_write_desc(devinfo, binding_table_index, msg_control,
2494 last_render_target,
2495 false /* coarse_write */));
2496 brw_inst_set_eot(devinfo, insn, eot);
2497
2498 return insn;
2499 }
2500
2501 brw_inst *
2502 gfx9_fb_READ(struct brw_codegen *p,
2503 struct brw_reg dst,
2504 struct brw_reg payload,
2505 unsigned binding_table_index,
2506 unsigned msg_length,
2507 unsigned response_length,
2508 bool per_sample)
2509 {
2510 const struct intel_device_info *devinfo = p->devinfo;
2511 assert(devinfo->ver >= 9);
2512 brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2513
2514 brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
2515 brw_set_dest(p, insn, dst);
2516 brw_set_src0(p, insn, payload);
2517 brw_set_desc(
2518 p, insn,
2519 brw_message_desc(devinfo, msg_length, response_length, true) |
2520 brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
2521 1 << brw_get_default_exec_size(p), per_sample));
2522 brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2523
2524 return insn;
2525 }
2526
2527 /**
2528 * Texture sample instruction.
2529 * Note: the msg_type plus msg_length values determine exactly what kind
2530 * of sampling operation is performed. See volume 4, page 161 of docs.
2531 */
2532 void brw_SAMPLE(struct brw_codegen *p,
2533 struct brw_reg dest,
2534 unsigned msg_reg_nr,
2535 struct brw_reg src0,
2536 unsigned binding_table_index,
2537 unsigned sampler,
2538 unsigned msg_type,
2539 unsigned response_length,
2540 unsigned msg_length,
2541 unsigned header_present,
2542 unsigned simd_mode,
2543 unsigned return_format)
2544 {
2545 const struct intel_device_info *devinfo = p->devinfo;
2546 brw_inst *insn;
2547
2548 if (msg_reg_nr != -1)
2549 gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2550
2551 insn = next_insn(p, BRW_OPCODE_SEND);
2552 brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2553 brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2554
2555 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2556 *
2557 * "Instruction compression is not allowed for this instruction (that
2558 * is, send). The hardware behavior is undefined if this instruction is
2559 * set as compressed. However, compress control can be set to "SecHalf"
2560 * to affect the EMask generation."
2561 *
2562 * No similar wording is found in later PRMs, but there are examples
2563 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2564 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2565 * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2566 */
2567 brw_inst_set_compression(devinfo, insn, false);
2568
2569 if (devinfo->ver < 6)
2570 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2571
2572 brw_set_dest(p, insn, dest);
2573 brw_set_src0(p, insn, src0);
2574 brw_set_desc(p, insn,
2575 brw_message_desc(devinfo, msg_length, response_length,
2576 header_present) |
2577 brw_sampler_desc(devinfo, binding_table_index, sampler,
2578 msg_type, simd_mode, return_format));
2579 }
2580
2581 /* Adjust the message header's sampler state pointer to
2582 * select the correct group of 16 samplers.
2583 */
2584 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2585 struct brw_reg header,
2586 struct brw_reg sampler_index)
2587 {
2588 /* The "Sampler Index" field can only store values between 0 and 15.
2589 * However, we can add an offset to the "Sampler State Pointer"
2590 * field, effectively selecting a different set of 16 samplers.
2591 *
2592 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2593     * offset, and each sampler state is only 16 bytes, so we can't
2594     * exclusively use the offset; we have to use both.
2595 */
2596
2597 const struct intel_device_info *devinfo = p->devinfo;
2598
2599 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2600 const int sampler_state_size = 16; /* 16 bytes */
2601 uint32_t sampler = sampler_index.ud;
2602
2603 if (sampler >= 16) {
2604 assert(devinfo->verx10 >= 75);
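         /* Offset the sampler state pointer by whole groups of 16 samplers,
          * at 16 bytes of state each: e.g. sampler 20 adds 16 * 1 * 16 = 256
          * bytes, and the 4-bit sampler index field can then address it as
          * sampler 20 & 0xf = 4 within that group.
          */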
2605 brw_ADD(p,
2606 get_element_ud(header, 3),
2607 get_element_ud(brw_vec8_grf(0, 0), 3),
2608 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2609 }
2610 } else {
2611 /* Non-const sampler array indexing case */
2612 if (devinfo->verx10 <= 70) {
2613 return;
2614 }
2615
2616 struct brw_reg temp = get_element_ud(header, 3);
2617
2618 brw_push_insn_state(p);
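      /* (index & 0xf0) << 4 equals (index / 16) * 256: the byte offset of
       * the enclosing 16-sampler group, at 16 bytes of state per sampler.
       */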
2619 brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2620 brw_set_default_swsb(p, tgl_swsb_regdist(1));
2621 brw_SHL(p, temp, temp, brw_imm_ud(4));
2622 brw_ADD(p,
2623 get_element_ud(header, 3),
2624 get_element_ud(brw_vec8_grf(0, 0), 3),
2625 temp);
2626 brw_pop_insn_state(p);
2627 }
2628 }
2629
2630 /* All these variables are pretty confusing - we might be better off
2631 * using bitmasks and macros for this, in the old style. Or perhaps
2632 * just having the caller instantiate the fields in dword3 itself.
2633 */
2634 void brw_urb_WRITE(struct brw_codegen *p,
2635 struct brw_reg dest,
2636 unsigned msg_reg_nr,
2637 struct brw_reg src0,
2638 enum brw_urb_write_flags flags,
2639 unsigned msg_length,
2640 unsigned response_length,
2641 unsigned offset,
2642 unsigned swizzle)
2643 {
2644 const struct intel_device_info *devinfo = p->devinfo;
2645 brw_inst *insn;
2646
2647 gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2648
2649 if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2650 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2651 brw_push_insn_state(p);
2652 brw_set_default_access_mode(p, BRW_ALIGN_1);
2653 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2654 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2655 brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2656 BRW_REGISTER_TYPE_UD),
2657 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2658 brw_imm_ud(0xff00));
2659 brw_pop_insn_state(p);
2660 }
2661
2662 insn = next_insn(p, BRW_OPCODE_SEND);
2663
2664 assert(msg_length < BRW_MAX_MRF(devinfo->ver));
2665
2666 brw_set_dest(p, insn, dest);
2667 brw_set_src0(p, insn, src0);
2668 brw_set_src1(p, insn, brw_imm_d(0));
2669
2670 if (devinfo->ver < 6)
2671 brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2672
2673 brw_set_urb_message(p,
2674 insn,
2675 flags,
2676 msg_length,
2677 response_length,
2678 offset,
2679 swizzle);
2680 }
2681
2682 void
2683 brw_send_indirect_message(struct brw_codegen *p,
2684 unsigned sfid,
2685 struct brw_reg dst,
2686 struct brw_reg payload,
2687 struct brw_reg desc,
2688 unsigned desc_imm,
2689 bool eot)
2690 {
2691 const struct intel_device_info *devinfo = p->devinfo;
2692 struct brw_inst *send;
2693
2694 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2695
2696 assert(desc.type == BRW_REGISTER_TYPE_UD);
2697
2698 if (desc.file == BRW_IMMEDIATE_VALUE) {
2699 send = next_insn(p, BRW_OPCODE_SEND);
2700 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2701 brw_set_desc(p, send, desc.ud | desc_imm);
2702 } else {
2703 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2704 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2705
2706 brw_push_insn_state(p);
2707 brw_set_default_access_mode(p, BRW_ALIGN_1);
2708 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2709 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2710 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2711 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2712
2713 /* Load the indirect descriptor to an address register using OR so the
2714 * caller can specify additional descriptor bits with the desc_imm
2715 * immediate.
2716 */
2717 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2718
2719 brw_pop_insn_state(p);
2720
2721 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2722 send = next_insn(p, BRW_OPCODE_SEND);
2723 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2724
2725 if (devinfo->ver >= 12)
2726 brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
2727 else
2728 brw_set_src1(p, send, addr);
2729 }
2730
2731 brw_set_dest(p, send, dst);
2732 brw_inst_set_sfid(devinfo, send, sfid);
2733 brw_inst_set_eot(devinfo, send, eot);
2734 }
2735
2736 void
2737 brw_send_indirect_split_message(struct brw_codegen *p,
2738 unsigned sfid,
2739 struct brw_reg dst,
2740 struct brw_reg payload0,
2741 struct brw_reg payload1,
2742 struct brw_reg desc,
2743 unsigned desc_imm,
2744 struct brw_reg ex_desc,
2745 unsigned ex_desc_imm,
2746 bool eot)
2747 {
2748 const struct intel_device_info *devinfo = p->devinfo;
2749 struct brw_inst *send;
2750
2751 dst = retype(dst, BRW_REGISTER_TYPE_UW);
2752
2753 assert(desc.type == BRW_REGISTER_TYPE_UD);
2754
2755 if (desc.file == BRW_IMMEDIATE_VALUE) {
2756 desc.ud |= desc_imm;
2757 } else {
2758 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2759 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2760
2761 brw_push_insn_state(p);
2762 brw_set_default_access_mode(p, BRW_ALIGN_1);
2763 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2764 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2765 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2766 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2767
2768 /* Load the indirect descriptor to an address register using OR so the
2769 * caller can specify additional descriptor bits with the desc_imm
2770 * immediate.
2771 */
2772 brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2773
2774 brw_pop_insn_state(p);
2775 desc = addr;
2776
2777 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2778 }
2779
2780 if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
2781 (devinfo->ver >= 12 ||
2782 ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
2783 ex_desc.ud |= ex_desc_imm;
2784 } else {
2785 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2786 struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);
2787
2788 brw_push_insn_state(p);
2789 brw_set_default_access_mode(p, BRW_ALIGN_1);
2790 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2791 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2792 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2793 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2794
2795 /* Load the indirect extended descriptor to an address register using OR
2796 * so the caller can specify additional descriptor bits with the
2797 * desc_imm immediate.
2798 *
2799 * Even though the instruction dispatcher always pulls the SFID and EOT
2800        * fields from the instruction itself, the actual external unit that
2801 * processes the message gets the SFID and EOT from the extended
2802 * descriptor which comes from the address register. If we don't OR
2803 * those two bits in, the external unit may get confused and hang.
2804 */
2805 unsigned imm_part = ex_desc_imm | sfid | eot << 5;
2806
2807 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2808 /* ex_desc bits 15:12 don't exist in the instruction encoding prior
2809 * to Gfx12, so we may have fallen back to an indirect extended
2810 * descriptor.
2811 */
2812 brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
2813 } else {
2814 brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
2815 }
2816
2817 brw_pop_insn_state(p);
2818 ex_desc = addr;
2819
2820 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2821 }
2822
2823 send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
2824 brw_set_dest(p, send, dst);
2825 brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
2826 brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));
2827
2828 if (desc.file == BRW_IMMEDIATE_VALUE) {
2829 brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2830 brw_inst_set_send_desc(devinfo, send, desc.ud);
2831 } else {
2832 assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2833 assert(desc.nr == BRW_ARF_ADDRESS);
2834 assert(desc.subnr == 0);
2835 brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2836 }
2837
2838 if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2839 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2840 brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2841 } else {
2842 assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2843 assert(ex_desc.nr == BRW_ARF_ADDRESS);
2844 assert((ex_desc.subnr & 0x3) == 0);
2845 brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2846 brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
2847 }
2848
2849 brw_inst_set_sfid(devinfo, send, sfid);
2850 brw_inst_set_eot(devinfo, send, eot);
2851 }
2852
2853 static void
2854 brw_send_indirect_surface_message(struct brw_codegen *p,
2855 unsigned sfid,
2856 struct brw_reg dst,
2857 struct brw_reg payload,
2858 struct brw_reg surface,
2859 unsigned desc_imm)
2860 {
2861 if (surface.file != BRW_IMMEDIATE_VALUE) {
2862 const struct tgl_swsb swsb = brw_get_default_swsb(p);
2863 struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2864
2865 brw_push_insn_state(p);
2866 brw_set_default_access_mode(p, BRW_ALIGN_1);
2867 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2868 brw_set_default_exec_size(p, BRW_EXECUTE_1);
2869 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2870 brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2871
2872 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2873 * some surface array is accessed out of bounds.
2874 */
2875 brw_AND(p, addr,
2876 suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2877 BRW_GET_SWZ(surface.swizzle, 0)),
2878 brw_imm_ud(0xff));
2879
2880 brw_pop_insn_state(p);
2881
2882 surface = addr;
2883 brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2884 }
2885
2886 brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2887 }
2888
2889 static bool
2890 while_jumps_before_offset(const struct intel_device_info *devinfo,
2891 brw_inst *insn, int while_offset, int start_offset)
2892 {
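   /* JIP is stored in jump units; scale converts it back to bytes
    * (16 bytes per instruction divided by jump units per instruction).
    */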
2893 int scale = 16 / brw_jump_scale(devinfo);
2894 int jip = devinfo->ver == 6 ? brw_inst_gfx6_jump_count(devinfo, insn)
2895 : brw_inst_jip(devinfo, insn);
2896 assert(jip < 0);
2897 return while_offset + jip * scale <= start_offset;
2898 }
2899
2900
2901 static int
2902 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2903 {
2904 int offset;
2905 void *store = p->store;
2906 const struct intel_device_info *devinfo = p->devinfo;
2907
2908 int depth = 0;
2909
2910 for (offset = next_offset(devinfo, store, start_offset);
2911 offset < p->next_insn_offset;
2912 offset = next_offset(devinfo, store, offset)) {
2913 brw_inst *insn = store + offset;
2914
2915 switch (brw_inst_opcode(devinfo, insn)) {
2916 case BRW_OPCODE_IF:
2917 depth++;
2918 break;
2919 case BRW_OPCODE_ENDIF:
2920 if (depth == 0)
2921 return offset;
2922 depth--;
2923 break;
2924 case BRW_OPCODE_WHILE:
2925 /* If the while doesn't jump before our instruction, it's the end
2926 * of a sibling do...while loop. Ignore it.
2927 */
2928 if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2929 continue;
2930 FALLTHROUGH;
2931 case BRW_OPCODE_ELSE:
2932 case BRW_OPCODE_HALT:
2933 if (depth == 0)
2934 return offset;
2935 break;
2936 default:
2937 break;
2938 }
2939 }
2940
2941 return 0;
2942 }
2943
2944 /* There is no DO instruction on gfx6, so to find the end of the loop
2945 * we have to see if the loop is jumping back before our start
2946 * instruction.
2947 */
2948 static int
2949 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2950 {
2951 const struct intel_device_info *devinfo = p->devinfo;
2952 int offset;
2953 void *store = p->store;
2954
2955 assert(devinfo->ver >= 6);
2956
2957 /* Always start after the instruction (such as a WHILE) we're trying to fix
2958 * up.
2959 */
2960 for (offset = next_offset(devinfo, store, start_offset);
2961 offset < p->next_insn_offset;
2962 offset = next_offset(devinfo, store, offset)) {
2963 brw_inst *insn = store + offset;
2964
2965 if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2966 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2967 return offset;
2968 }
2969 }
2970 assert(!"not reached");
2971 return start_offset;
2972 }
2973
2974 /* After program generation, go back and update the UIP and JIP of
2975 * BREAK, CONT, and HALT instructions to their correct locations.
2976 */
2977 void
2978 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2979 {
2980 const struct intel_device_info *devinfo = p->devinfo;
2981 int offset;
2982 int br = brw_jump_scale(devinfo);
2983 int scale = 16 / br;
2984 void *store = p->store;
2985
2986 if (devinfo->ver < 6)
2987 return;
2988
2989 for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2990 brw_inst *insn = store + offset;
2991 assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2992
2993 int block_end_offset = brw_find_next_block_end(p, offset);
2994 switch (brw_inst_opcode(devinfo, insn)) {
2995 case BRW_OPCODE_BREAK:
2996 assert(block_end_offset != 0);
2997 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2998 /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
2999 brw_inst_set_uip(devinfo, insn,
3000 (brw_find_loop_end(p, offset) - offset +
3001 (devinfo->ver == 6 ? 16 : 0)) / scale);
3002 break;
3003 case BRW_OPCODE_CONTINUE:
3004 assert(block_end_offset != 0);
3005 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
3006 brw_inst_set_uip(devinfo, insn,
3007 (brw_find_loop_end(p, offset) - offset) / scale);
3008
3009 assert(brw_inst_uip(devinfo, insn) != 0);
3010 assert(brw_inst_jip(devinfo, insn) != 0);
3011 break;
3012
3013 case BRW_OPCODE_ENDIF: {
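         /* An ENDIF that closes the last block of the program has no
          * following block end (block_end_offset == 0); jump a single
          * instruction forward (1 * br) so execution simply falls through.
          */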
3014 int32_t jump = (block_end_offset == 0) ?
3015 1 * br : (block_end_offset - offset) / scale;
3016 if (devinfo->ver >= 7)
3017 brw_inst_set_jip(devinfo, insn, jump);
3018 else
3019 brw_inst_set_gfx6_jump_count(devinfo, insn, jump);
3020 break;
3021 }
3022
3023 case BRW_OPCODE_HALT:
3024 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
3025 *
3026 * "In case of the halt instruction not inside any conditional
3027 * code block, the value of <JIP> and <UIP> should be the
3028 * same. In case of the halt instruction inside conditional code
3029 * block, the <UIP> should be the end of the program, and the
3030 * <JIP> should be end of the most inner conditional code block."
3031 *
3032 * The uip will have already been set by whoever set up the
3033 * instruction.
3034 */
3035 if (block_end_offset == 0) {
3036 brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
3037 } else {
3038 brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
3039 }
3040 assert(brw_inst_uip(devinfo, insn) != 0);
3041 assert(brw_inst_jip(devinfo, insn) != 0);
3042 break;
3043
3044 default:
3045 break;
3046 }
3047 }
3048 }
3049
void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gfx6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write. According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver == 6);
   const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  send_commit_msg));
}

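/* A worked example of the sizing below (illustrative, assuming dword
 * channels, i.e. one GRF holds 8 of them): SIMD8 with 4 channels takes 4
 * payload/response registers, SIMD16 with 4 channels takes 8, and SIMD4x2
 * always fits in a single register.
 */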
static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}

void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
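   /* Note: brw_get_default_exec_size() returns the log2-encoded
    * BRW_EXECUTE_* value, so the shift above recovers the channel count
    * (e.g. BRW_EXECUTE_8 == 3 and 1 << 3 == 8); 0 is the SIMD4x2 marker
    * understood by brw_surface_payload_size().
    */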
   const unsigned response_length =
      brw_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}

void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
   const unsigned response_length =
      brw_surface_payload_size(num_channels, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);

   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
}

void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}

static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GFX6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GFX7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

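   /* Per our reading of the dataport memory-fence descriptor, bit 5 of the
    * message control field is the "commit enable" flag, which requests a
    * writeback once the fence has completed.
    */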
   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   assert(devinfo->ver >= 11 || bti == 0);
   brw_inst_set_binding_table_index(devinfo, insn, bti);
}

static void
gfx12_set_memory_fence_message(struct brw_codegen *p,
                               struct brw_inst *insn,
                               enum brw_message_target sfid)
{
   const unsigned mlen = 1; /* g0 header */
   /* Completion signaled by write to register. No data returned. */
   const unsigned rlen = 1;

   brw_inst_set_sfid(p->devinfo, insn, sfid);

   if (sfid == BRW_SFID_URB) {
      brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
                            brw_message_desc(p->devinfo, mlen, rlen, false));
   } else {
      enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
      enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;

      if (sfid == GFX12_SFID_TGM) {
         scope = LSC_FENCE_TILE;
         flush_type = LSC_FLUSH_TYPE_EVICT;
      }

      brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
                                               flush_type, false) |
                            brw_message_desc(p->devinfo, mlen, rlen, false));
   }
}

void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 enum brw_message_target sfid,
                 bool commit_enable,
                 unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking; the MEMORY_FENCE
    * message doesn't write anything back.
    */
   struct brw_inst *insn = next_insn(p, send_op);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);

   /* All DG2 hardware requires LSC for fence messages, even A-step */
   if (devinfo->has_lsc)
      gfx12_set_memory_fence_message(p, insn, sfid);
   else
      brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}

void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             bool coarse_pixel_rate,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const uint16_t exec_size = brw_get_default_exec_size(p);
   const unsigned slot_group = brw_get_default_group(p) / 16;
   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
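   /* Our understanding of the pixel interpolator descriptor: its SIMD mode
    * is a single-bit field (0 = SIMD8, 1 = SIMD16), which is why the
    * comparison result above can be used directly.
    */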
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate,
                            simd_mode, slot_group);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   brw_send_indirect_message(p,
                             GFX7_SFID_PIXEL_INTERPOLATOR,
                             dest,
                             mrf,
                             vec1(data),
                             desc,
                             false);
}

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
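   /* The default group is a channel offset; dividing by 8 gives the
    * quarter-control slot (Q1..Q4) used below to pick the relevant byte of
    * the dispatch mask and flag register.
    */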
   brw_inst *inst;

   assert(devinfo->ver >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8) {
         /* Getting the first active channel index is easy on Gfx8: Just find
          * the first bit set in the execution mask. The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n). Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0. We could use a single 32-wide move here if it
          * weren't for the hardware bug that causes channel enables to be
          * applied incorrectly to the second half of 32-wide instructions on
          * Gfx7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register. Note that ce0
          * doesn't take into account the dispatch mask, so the Gfx7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
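      /* stride(..., 0, 1, 0) below builds a <0;1,0> region, i.e. a scalar
       * that replicates component i to every channel; the align16 variant
       * uses <0;4,1> to replicate a whole 4-component vector instead.
       */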
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
            stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                 subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                 subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *     change the register address. The lower 5 bits of Address
       *     Immediate when added to lower 5 bits of address register gives
       *     the sub-register offset. The upper bits of Address Immediate
       *     when added to upper bits of address register gives the register
       *     address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));
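         /* A worked example (illustrative, assuming the log2+1 hstride
          * encoding): for a 4-byte type at unit horizontal stride (hstride
          * encoding 1) the shift count is log2(4) + 1 - 1 == 2, so addr ends
          * up holding idx * 4, the byte offset of component idx.
          */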

         /* We can only address up to limit bytes using the indirect
          * addressing immediate; account for the difference if the source
          * register is above this limit.
          */
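         /* For example (illustrative): a source at r70 sits at byte offset
          * 70 * 32 == 2240, so the ADD below folds 2048 into the address
          * register and only the remaining 192 bytes stay in the immediate.
          */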
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, indirect addressing must not be
             *     used."
             *
             * To work around this restriction, we do two integer MOVs
             * instead of one 64-bit MOV. Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just
 * write one u32. So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_codegen *p,
                         struct brw_reg payload,
                         uint32_t surf_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   assert(devinfo->ver >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                      payload.nr, 0));
   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
                                                     false)));

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_binding_table_index(devinfo, send, surf_index);

   brw_pop_insn_state(p);
}


/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->ver >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}


/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   /* From the Skylake PRM, Volume 7, page 760:
    *
    *    "Implementation Restriction on Register Access: When the control
    *     register is used as an explicit source and/or destination, hardware
    *     does not ensure execution pipeline coherency. Software must set the
    *     thread control field to ‘switch’ for an instruction that uses
    *     control register as an explicit operand."
    *
    * On Gfx12+ this is implemented in terms of SWSB annotations instead.
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));

   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
   if (p->devinfo->ver < 12)
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      if (p->devinfo->ver < 12)
         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }

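   /* Our understanding of the Gfx12+ path: the regdist annotation set above
    * orders the cr0 writes, and the trailing SYNC.NOP gives that dependency
    * an instruction to resolve against, so later code observes the updated
    * control register.
    */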
   if (p->devinfo->ver >= 12)
      brw_SYNC(p, TGL_SYNC_NOP);
}

void
brw_update_reloc_imm(const struct intel_device_info *devinfo,
                     brw_inst *inst,
                     uint32_t value)
{
   /* Sanity check that the instruction is a MOV of an immediate */
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV);
   assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);

   /* If it was compacted, we can't safely rewrite */
   assert(brw_inst_cmpt_control(devinfo, inst) == 0);

   brw_inst_set_imm_ud(devinfo, inst, value);
}

/* A default value for constants that will be patched at run-time.
 * We pick an arbitrary value that prevents instruction compaction.
 */
#define DEFAULT_PATCH_IMM 0x4a7cc037

void
brw_MOV_reloc_imm(struct brw_codegen *p,
                  struct brw_reg dst,
                  enum brw_reg_type src_type,
                  uint32_t id)
{
   assert(type_sz(src_type) == 4);
   assert(type_sz(dst.type) == 4);

   brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
                 p->next_insn_offset, 0);

   brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
}

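/* An illustrative (not actual driver API) sketch of how the two reloc
 * helpers above pair up: at compile time the generator emits
 *
 *    brw_MOV_reloc_imm(p, dst, BRW_REGISTER_TYPE_UD, MY_RELOC_ID);
 *
 * which records the instruction offset via brw_add_reloc(); later, once the
 * real value is known, the driver patches each recorded site with
 *
 *    brw_update_reloc_imm(devinfo, kernel + reloc_offset, resolved_value);
 *
 * where MY_RELOC_ID, kernel and reloc_offset are placeholders for whatever
 * bookkeeping the consumer uses.
 */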