/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "brw_eu_defines.h"
#include "brw_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gfx6_resolve_implied_move(struct brw_codegen *p,
                          struct brw_reg *src,
                          unsigned msg_reg_nr)
{
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      assert(devinfo->ver < 12);
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
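
/* Usage sketch (illustrative only; the register numbers are hypothetical):
 * a generator emitting a SEND on Gfx6 first resolves the implied move,
 * after which the source refers to the chosen message register.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);
 *    gfx6_resolve_implied_move(p, &payload, 1);
 *    // payload is now brw_message_reg(1); emit the SEND with it as src0.
 */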

static void
gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GFX7_MRF_HACK_START;
   }
}
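
/* For example (hypothetical register number): on Gfx7+, where there is no
 * real MRF file, a message register passed through the helper above is
 * remapped to the top of the GRF space:
 *
 *    struct brw_reg reg = brw_message_reg(3);
 *    gfx7_convert_mrf_to_grf(p, &reg);
 *    // reg.file == BRW_GENERAL_REGISTER_FILE
 *    // reg.nr   == GFX7_MRF_HACK_START + 3
 */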

void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) {
      assert(devinfo->ver < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
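
/* Sketch of the automatic exec-size reduction above (assumed register
 * numbers; vec2() builds a width-2 region): on Gfx6+, a destination
 * narrower than BRW_EXECUTE_4 shrinks the instruction from the default
 * SIMD8 down to the region width.
 *
 *    brw_set_default_exec_size(p, BRW_EXECUTE_8);
 *    brw_MOV(p, vec2(brw_vec8_grf(10, 0)), brw_imm_f(0.0f));
 *    // emitted with an exec size of 2, matching dest.width
 */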

void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) {
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(p->isa, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}


void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC))) {
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert(reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}

/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
void
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
          brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC);
   if (devinfo->ver < 12)
      brw_inst_set_src1_file_type(devinfo, inst,
                                  BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
   brw_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->ver >= 9)
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}
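
/* Usage sketch (the mlen/rlen values are assumed): brw_set_desc_ex() pairs
 * with brw_message_desc() to fill in the immediate descriptor of a SEND
 * that was just emitted; brw_set_desc() covers the common ex_desc == 0 case.
 *
 *    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
 *    brw_set_desc_ex(p, send, brw_message_desc(devinfo, 2, 1, true), 0);
 */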

static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct intel_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, msg_length, response_length, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(devinfo, inst, function);
   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
   brw_inst_set_saturate(devinfo, inst, 0);
}


static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct intel_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}

static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct intel_device_info *devinfo = p->devinfo;

   assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->ver < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   if (devinfo->ver < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}

static void
gfx7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->ver >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

static void
brw_inst_set_state(const struct brw_isa_info *isa,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   if (devinfo->ver >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (is_3src(isa, brw_inst_opcode(isa, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

static brw_inst *
brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
{
   assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
   assert(util_is_power_of_two_or_zero(align));
   const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
   const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
   const unsigned new_nr_insn = start_insn + nr_insn;

   if (p->store_size < new_nr_insn) {
      p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
   }

   /* Memset any padding due to alignment to 0. We don't want to be hashing
    * or caching a bunch of random bits we got from a memory allocation.
    */
   if (p->nr_insn < start_insn) {
      memset(&p->store[p->nr_insn], 0,
             (start_insn - p->nr_insn) * sizeof(brw_inst));
   }

   assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
   p->nr_insn = new_nr_insn;
   p->next_insn_offset = new_nr_insn * sizeof(brw_inst);

   return &p->store[start_insn];
}

void
brw_realign(struct brw_codegen *p, unsigned align)
{
   brw_append_insns(p, 0, align);
}

int
brw_append_data(struct brw_codegen *p, void *data,
                unsigned size, unsigned align)
{
   unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
   void *dst = brw_append_insns(p, nr_insn, align);
   memcpy(dst, data, size);

   /* If it's not a whole number of instructions, memset the end */
   if (size < nr_insn * sizeof(brw_inst))
      memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);

   return dst - (void *)p->store;
}
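
/* Example use (hypothetical data): append a constant table to the program
 * store on a 64-byte boundary and keep its byte offset for later patching.
 *
 *    static const uint32_t table[4] = { 1, 2, 4, 8 };
 *    int table_offset = brw_append_data(p, (void *)table, sizeof(table), 64);
 */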

#define next_insn brw_next_insn
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->isa, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(p->isa, insn, p->current);

   return insn;
}

void
brw_add_reloc(struct brw_codegen *p, uint32_t id,
              enum brw_shader_reloc_type type,
              uint32_t offset, uint32_t delta)
{
   if (p->num_relocs + 1 > p->reloc_array_size) {
      p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
      p->relocs = reralloc(p->mem_ctx, p->relocs,
                           struct brw_shader_reloc, p->reloc_array_size);
   }

   p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
      .id = id,
      .type = type,
      .offset = offset,
      .delta = delta,
   };
}

static brw_inst *
brw_alu1(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src)
{
   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static brw_inst *
brw_alu2(struct brw_codegen *p, unsigned opcode,
         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   brw_inst *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
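
/* E.g. a byte subnr of 12 encodes as component 12 / 4 == 3, i.e. the
 * fourth 32-bit element of the register.
 */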

static enum gfx10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct intel_device_info *devinfo,
                       enum brw_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_1:
      assert(devinfo->ver >= 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case BRW_VERTICAL_STRIDE_2:
      assert(devinfo->ver < 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}


static enum gfx10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_HORIZONTAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case BRW_HORIZONTAL_STRIDE_1:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case BRW_HORIZONTAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case BRW_HORIZONTAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}

static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->ver >= 12) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}

#define ALU2(OP)                                                  \
brw_inst *brw_##OP(struct brw_codegen *p,                         \
                   struct brw_reg dest,                           \
                   struct brw_reg src0,                           \
                   struct brw_reg src1)                           \
{                                                                 \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                        \
brw_inst *brw_##OP(struct brw_codegen *p,                               \
                   struct brw_reg dest,                                 \
                   struct brw_reg src0,                                 \
                   struct brw_reg src1,                                 \
                   struct brw_reg src2)                                 \
{                                                                       \
   if (p->current->access_mode == BRW_ALIGN_16) {                       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src0.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src1.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src2.swizzle = BRW_SWIZZLE_XXXX;                               \
   }                                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);         \
}

#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1,                         \
                   struct brw_reg src2)                         \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)

brw_inst *
brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->verx10 == 70 &&
       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
       dest.type == BRW_REGISTER_TYPE_DF &&
       (src0.type == BRW_REGISTER_TYPE_F ||
        src0.type == BRW_REGISTER_TYPE_D ||
        src0.type == BRW_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = BRW_WIDTH_2;
      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   }

   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
}
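
/* Illustrative numbers for the IVB/BYT F->DF workaround above: a packed
 * <4;4,1>:F source region (note vstride == width + hstride, matching the
 * assert) is rewritten to <1;2,0>:F, so each 32-bit element is read twice
 * and the odd channels the hardware ignores still carry valid data:
 *
 *    // hypothetical registers; devinfo->verx10 == 70, Align1 mode
 *    brw_MOV(p, retype(brw_vec4_grf(4, 0), BRW_REGISTER_TYPE_DF),
 *            brw_vec4_grf(2, 0));   // src emitted as g2.0<1;2,0>:F
 */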

brw_inst *
brw_ADD(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

brw_inst *
brw_AVG(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

brw_inst *
brw_MUL(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}

brw_inst *
brw_LINE(struct brw_codegen *p, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
}

brw_inst *
brw_PLN(struct brw_codegen *p, struct brw_reg dest,
        struct brw_reg src0, struct brw_reg src1)
{
   src0.vstride = BRW_VERTICAL_STRIDE_0;
   src0.width = BRW_WIDTH_1;
   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
   src1.vstride = BRW_VERTICAL_STRIDE_8;
   src1.width = BRW_WIDTH_8;
   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
}

brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gfx8 implementation in terms of a
    * converting MOV.  Gfx7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->ver >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      if (devinfo->ver <= 7) {
         assert(dst.type == BRW_REGISTER_TYPE_W ||
                dst.type == BRW_REGISTER_TYPE_UW);
      } else {
         assert(dst.type == BRW_REGISTER_TYPE_HF);
      }
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->ver >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->ver == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}

brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->ver >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->ver == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}


void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->isa, insn, BRW_OPCODE_NOP);
}

void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
   brw_inst_set_cond_modifier(p->devinfo, insn, func);
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

brw_inst *
brw_JMPI(struct brw_codegen *p, struct brw_reg index,
         unsigned predicate_control)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_reg ip = brw_ip_reg();
   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);

   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static brw_inst *
pop_if_stack(struct brw_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static brw_inst *
get_inner_do_insn(struct brw_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
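
/* Typical use (sketch; flag/predicate setup elided): the generator
 * brackets a conditional block with brw_IF()/brw_ELSE()/brw_ENDIF(), and
 * brw_ENDIF() patches the jump targets recorded on the if-stack above.
 *
 *    brw_IF(p, BRW_EXECUTE_8);
 *    ...emit then-block...
 *    brw_ELSE(p);
 *    ...emit else-block...
 *    brw_ENDIF(p);
 */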

/* This function is only used for gfx6-style IF instructions with an
 * embedded comparison (conditional modifier). It is not used on gfx7.
 */
brw_inst *
gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block. If there is no ELSE
    * block, point to where ENDIF would be. Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(p->isa, else_inst, BRW_OPCODE_ADD);

      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
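
/* Worked example of the immediates above (a sketch, assuming uncompacted
 * 16-byte instructions): with the IF at instruction index i, the ELSE at
 * i+3 and the would-be ENDIF at i+6, the IF becomes a predicate-inverted
 *
 *    ADD ip, ip, 64    // (3 + 1) * 16 bytes: lands just past the ELSE
 *
 * and the ELSE becomes an unconditional
 *
 *    ADD ip, ip, 48    // 3 * 16 bytes: lands where the ENDIF would be
 */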

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->ver < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(p->isa, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->ver < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst));
      } else {
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->ver < 6) {
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->ver < 6) {
         /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
          * matching ENDIF.
          */
         brw_inst_set_gfx4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->ver == 6) {
         /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->ver >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

void
brw_ELSE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP. On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs. So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /* A call to next_insn() may reallocate the instruction store and change
    * its base address (p->store), so call it first, before taking any
    * pointers into the store.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(p->isa, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
      brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->ver >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}

brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->ver >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}

brw_inst *
brw_HALT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->ver < 6) {
      /* From the Gfx4 PRM:
       *
       *    "IP register must be put (for example, by the assembler) at <dst>
       *     and <src0> locations."
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
   } else if (devinfo->ver < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->ver < 12) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop. We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gfx6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gfx6, there's no more mask stack, so no need for DO. WHILE
 * just points back to the first instruction of the loop.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (devinfo->ver >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}

/**
 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->ver < 6);

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst + 1));
      } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}

brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->ver >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->ver < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->ver == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(p->isa, do_insn) == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gfx4_pop_count(devinfo, insn, 0);

         brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
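
/* A minimal loop sketch (illustrative only; `counter` is a hypothetical
 * register owned by the caller):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ...loop body...
 *       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_Z, counter, brw_imm_d(0));
 *       brw_inst_set_pred_control(p->devinfo, brw_BREAK(p),
 *                                 BRW_PREDICATE_NORMAL);
 *    brw_WHILE(p);
 *
 * On gfx6+ brw_DO() emits nothing and only records the loop start, and
 * brw_WHILE() points its jump back at that instruction; the BREAK's own
 * targets are filled in later by brw_set_uip_jip().
 */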

/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
   unsigned jmpi = 1;

   if (devinfo->ver >= 5)
      jmpi = 2;

   assert(brw_inst_opcode(p->isa, jmp_insn) == BRW_OPCODE_JMPI);
   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);

   brw_inst_set_gfx4_jump_count(devinfo, jmp_insn,
                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
}
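
/* Typical use (a sketch): emit the predicated JMPI with a zero distance,
 * remember its index, emit the instructions to be skipped, then patch the
 * jump so it lands on the next instruction emitted:
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *    ...instructions to skip...
 *    brw_land_fwd_jump(p, jmp);
 */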

/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register. It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void brw_CMP(struct brw_codegen *p,
             struct brw_reg dest,
             unsigned conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->ver == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}

void brw_CMPN(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned conditional,
              struct brw_reg src0,
              struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);

   brw_inst_set_cond_modifier(devinfo, insn, conditional);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
    * says:
    *
    *    If the destination is the null register, the {Switch} instruction
    *    option must be used.
    *
    * Page 77 of the Haswell PRM Volume 2b contains the same text.
    */
   if (devinfo->ver == 7) {
      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == BRW_ARF_NULL) {
         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
      }
   }
}

/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 */
void gfx4_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               unsigned msg_reg_nr,
               struct brw_reg src,
               unsigned precision)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   unsigned data_type;
   if (has_scalar_region(src)) {
      data_type = BRW_MATH_DATA_SCALAR;
   } else {
      data_type = BRW_MATH_DATA_VECTOR;
   }

   assert(devinfo->ver < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   brw_inst_set_pred_control(devinfo, insn, 0);
   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   brw_set_math_message(p,
                        insn,
                        function,
                        src.type == BRW_REGISTER_TYPE_D,
                        precision,
                        data_type);
}

void gfx6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->ver >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->ver == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
      /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
       *    INT DIV function does not support source modifiers.
       */
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gfx6. */
   if (devinfo->ver == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
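
/* Example invocations (a sketch; `dst`, `a` and `b` are hypothetical
 * float-typed GRF registers). Single-operand functions pass the null
 * register, which is float-typed by default, as src1:
 *
 *    gfx6_math(p, dst, BRW_MATH_FUNCTION_INV, a, brw_null_reg());
 *    gfx6_math(p, dst, BRW_MATH_FUNCTION_POW, a, b);
 */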

/**
 * Return the right surface index to access the thread scratch space using
 * stateless dataport messages.
 */
unsigned
brw_scratch_surface_idx(const struct brw_codegen *p)
{
   /* The scratch space is thread-local so IA coherency is unnecessary. */
   if (p->devinfo->ver >= 8)
      return GFX8_BTI_STATELESS_NON_COHERENT;
   else
      return BRW_BTI_STATELESS;
}

/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes). Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   const unsigned mlen = 1 + num_regs;

   /* Set up the message header. This is g0, with g0.2 filled with
    * the offset. We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->ver < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gfx6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gfx6, only writes between different threads need ordering
       * protection. Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->ver >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->ver >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->ver >= 6)
         msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, send_commit_msg));
   }
}


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes). Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   if (devinfo->ver >= 6)
      offset /= 16;

   if (p->devinfo->ver >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want. By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything. This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   {
      brw_push_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest); /* UW? */
      if (devinfo->ver >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
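
/* Spill/fill round trip (a sketch; `dst` is a hypothetical destination and
 * m1 an arbitrary free message register): write two GRFs of data at byte
 * offset 64 of the scratch buffer, then read them back:
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 64);
 *    ...
 *    brw_oword_block_read_scratch(p, dst, brw_message_reg(1), 2, 64);
 *
 * Both helpers convert the byte offset into oword units on gfx6+ before
 * placing it in the message header.
 */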

void
gfx7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gfx7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1, /* mlen: just g0 */
                               num_regs, /* rlen */
                               true); /* header present */
}

/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_codegen *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->ver >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}

brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   struct brw_reg dest, src0;

   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_fb_write_desc(devinfo, binding_table_index, msg_control,
                                  last_render_target,
                                  false /* coarse_write */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}

brw_inst *
gfx9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver >= 9);
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
                       1 << brw_get_default_exec_size(p), per_sample));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}

/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed. See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_codegen *p,
                struct brw_reg dest,
                unsigned msg_reg_nr,
                struct brw_reg src0,
                unsigned binding_table_index,
                unsigned sampler,
                unsigned msg_type,
                unsigned response_length,
                unsigned msg_length,
                unsigned header_present,
                unsigned simd_mode,
                unsigned return_format)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   if (msg_reg_nr != -1)
      gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf. For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}

/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16 bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct intel_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->verx10 >= 75);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->verx10 <= 70) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      brw_push_insn_state(p);
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
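
/* Worked example of the immediate case above: sampler_index.ud == 20 adds
 * 16 * (20 / 16) * 16 = 256 bytes to the sampler state pointer in header
 * element 3, skipping one full group of 16 sampler states; the 4-bit
 * "Sampler Index" field in the message descriptor then selects
 * 20 % 16 == 4 within that group (that part is encoded by the caller).
 */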

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style. Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_codegen *p,
                   struct brw_reg dest,
                   unsigned msg_reg_nr,
                   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
                   unsigned msg_length,
                   unsigned response_length,
                   unsigned offset,
                   unsigned swizzle)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->ver));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
                       insn,
                       flags,
                       msg_length,
                       response_length,
                       offset,
                       swizzle);
}

void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->ver >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}

void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (devinfo->ver >= 12 ||
        ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, the actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register. If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}

static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}

static bool
while_jumps_before_offset(const struct intel_device_info *devinfo,
                          brw_inst *insn, int while_offset, int start_offset)
{
   int scale = 16 / brw_jump_scale(devinfo);
   int jip = devinfo->ver == 6 ? brw_inst_gfx6_jump_count(devinfo, insn)
                               : brw_inst_jip(devinfo, insn);
   assert(jip < 0);
   return while_offset + jip * scale <= start_offset;
}
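
/* Example of the scaling above (a sketch): on gfx7 brw_jump_scale() is 2
 * (jump targets are counted in 64-bit halves to support compaction), so
 * scale == 8 and a WHILE with jip == -10 jumps 80 bytes, i.e. five full
 * 16-byte instructions, backwards.
 */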


static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct intel_device_info *devinfo = p->devinfo;

   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(p->isa, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop. Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         FALLTHROUGH;
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
         break;
      default:
         break;
      }
   }

   return 0;
}

/* There is no DO instruction on gfx6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   assert(devinfo->ver >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_opcode(p->isa, insn) == BRW_OPCODE_WHILE) {
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, ENDIF, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->ver < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      switch (brw_inst_opcode(p->isa, insn)) {
      case BRW_OPCODE_BREAK: {
         int block_end_offset = brw_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset +
                           (devinfo->ver == 6 ? 16 : 0)) / scale);
         break;
      }

      case BRW_OPCODE_CONTINUE: {
         int block_end_offset = brw_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }

      case BRW_OPCODE_ENDIF: {
         int block_end_offset = brw_find_next_block_end(p, offset);
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->ver >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gfx6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT: {
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         int block_end_offset = brw_find_next_block_end(p, offset);
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }

      default:
         break;
      }
   }
}
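
/* A minimal usage sketch (hypothetical caller, not part of this file):
 * after the generator has emitted the whole program, it patches the
 * pending jump offsets in one pass, before any instruction compaction:
 *
 *    brw_set_uip_jip(p, 0);
 *
 * Ordering matters: the cmpt_control assertion above relies on every
 * instruction still being uncompacted, since compacted encodings lack
 * the full-width JIP/UIP fields this pass writes.
 */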

void brw_ff_sync(struct brw_codegen *p,
                 struct brw_reg dest,
                 unsigned msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gfx6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write. According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver == 6);
   const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  send_commit_msg));
}

static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}
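
/* Worked example: a SIMD16 read of num_channels == 4 occupies
 * 2 * 4 = 8 registers, because each 16-wide dword channel spans two
 * 32-byte GRFs; the same read at SIMD8 fits each channel in one GRF
 * (4 registers), and a SIMD4x2 message packs its whole response into a
 * single register.
 */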

void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}

void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
   const unsigned response_length =
      brw_surface_payload_size(num_channels, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);

   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
}

void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}

static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GFX6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GFX7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   assert(devinfo->ver >= 11 || bti == 0);
   brw_inst_set_binding_table_index(devinfo, insn, bti);
}

static void
gfx12_set_memory_fence_message(struct brw_codegen *p,
                               struct brw_inst *insn,
                               enum brw_message_target sfid,
                               uint32_t desc)
{
   const unsigned mlen = 1; /* g0 header */
   /* Completion signaled by write to register. No data returned. */
   const unsigned rlen = 1;

   brw_inst_set_sfid(p->devinfo, insn, sfid);

   if (sfid == BRW_SFID_URB) {
      brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
                            brw_message_desc(p->devinfo, mlen, rlen, true));
   } else {
      enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc);
      enum lsc_flush_type flush_type =
         lsc_fence_msg_desc_flush_type(p->devinfo, desc);

      if (sfid == GFX12_SFID_TGM) {
         scope = LSC_FENCE_TILE;
         flush_type = LSC_FLUSH_TYPE_EVICT;
      }

      /* Wa_14014435656:
       *
       *    "For any fence greater than local scope, always set flush type to
       *     at least invalidate so that fence goes on properly."
       *
       *    "The bug is if flush_type is 'None', the scope is always
       *     downgraded to 'local'."
       *
       * We set the scope to NONE_6 instead of NONE: it has the same effect
       * as NONE but avoids the downgrade to LOCAL scope.
       */
      if (intel_device_info_is_dg2(p->devinfo) &&
          scope > LSC_FENCE_LOCAL &&
          flush_type == LSC_FLUSH_TYPE_NONE) {
         flush_type = LSC_FLUSH_TYPE_NONE_6;
      }

      brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
                                               flush_type, false) |
                            brw_message_desc(p->devinfo, mlen, rlen, false));
   }
}

void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 enum brw_message_target sfid,
                 uint32_t desc,
                 bool commit_enable,
                 unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
    * message doesn't write anything back.
    */
   struct brw_inst *insn = next_insn(p, send_op);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);

   /* All DG2 hardware requires LSC for fence messages, even A-step. */
   if (devinfo->has_lsc)
      gfx12_set_memory_fence_message(p, insn, sfid, desc);
   else
      brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}
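
/* Hypothetical usage sketch: a caller that needs the fence to be observable
 * before continuing typically requests a commit return and then stalls on it
 * with a dummy MOV that sources the fence destination:
 *
 *    brw_memory_fence(p, fence_dst, brw_vec8_grf(0, 0), BRW_OPCODE_SEND,
 *                     GFX7_SFID_DATAPORT_DATA_CACHE, 0, true, 0);
 *    brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), fence_dst);
 *
 * fence_dst is an illustrative register name; the scoreboard cannot issue
 * the MOV until the commit write to fence_dst has landed.
 */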

void
brw_pixel_interpolator_query(struct brw_codegen *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             bool noperspective,
                             bool coarse_pixel_rate,
                             unsigned mode,
                             struct brw_reg data,
                             unsigned msg_length,
                             unsigned response_length)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const uint16_t exec_size = brw_get_default_exec_size(p);
   const unsigned slot_group = brw_get_default_group(p) / 16;
   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate,
                            simd_mode, slot_group);

   /* brw_send_indirect_message will automatically use a direct send message
    * if data is actually immediate.
    */
   brw_send_indirect_message(p,
                             GFX7_SFID_PIXEL_INTERPOLATOR,
                             dest,
                             mrf,
                             vec1(data),
                             desc,
                             false);
}

void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst, bool last)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->ver == 7);

   brw_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      const struct brw_reg flag = brw_flag_subreg(flag_subreg);

      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

      /* Run enough instructions returning zero with execution masking and
       * a conditional modifier enabled in order to get the full execution
       * mask in f1.0. We could use a single 32-wide move here if it
       * weren't for the hardware bug that causes channel enables to be
       * applied incorrectly to the second half of 32-wide instructions
       * on Gfx7.
       */
      const unsigned lower_size = MIN2(16, exec_size);
      for (unsigned i = 0; i < exec_size / lower_size; i++) {
         inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                        brw_imm_uw(0));
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
         brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
      }

      /* Find the first bit set in the exec_size-wide portion of the flag
       * register that was updated by the last sequence of MOV
       * instructions.
       */
      const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      if (!last) {
         inst = brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      } else {
         inst = brw_LZD(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
         struct brw_reg neg = vec1(dst);
         neg.negate = true;
         inst = brw_ADD(p, vec1(dst), neg, brw_imm_uw(31));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      /* Overwrite the destination first without and then with execution
       * masking, to find out which of the channels is active.
       */
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_4);
      brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
              brw_imm_ud(1));

      inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                     brw_imm_ud(0));
      brw_pop_insn_state(p);
      brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
   }

   brw_pop_insn_state(p);
}
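
/* Hypothetical usage sketch: a generator typically pairs this with
 * brw_broadcast() to read a value from one live channel into a scalar,
 * e.g. for divergent resource indexing:
 *
 *    brw_find_live_channel(p, chan_index, false);
 *    brw_broadcast(p, scalar_dst, value, chan_index);
 *
 * chan_index, scalar_dst and value are illustrative register names.
 */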

void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
                     stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *     change the register address. The lower 5 bits of Address
       *     Immediate when added to lower 5 bits of address register gives
       *     the sub-register offset. The upper bits of Address Immediate
       *     when added to upper bits of address register gives the register
       *     address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->platform == INTEL_PLATFORM_CHV ||
              intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, indirect addressing must not be
             *     used."
             *
             * To work around this restriction, we do two integer MOVs
             * instead of one 64-bit MOV. Because no double value should
             * ever cross a register boundary, it's safe to use the
             * immediate offset in the indirect here to handle adding 4
             * bytes to the offset and avoid the extra ADD to the register
             * file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    retype(brw_vec1_indirect(addr.subnr, offset),
                           BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    retype(brw_vec1_indirect(addr.subnr, offset + 4),
                           BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate
          * it to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
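
/* Worked example for the align1 indirect path above (illustrative): with a
 * float source living in r10 (hstride 1) and a dynamic index in idx, the
 * SHL computes a0.0 = idx << 2 (log2 of the 4-byte type size plus
 * hstride - 1), i.e. the byte offset of the selected channel, and the MOV
 * then reads g[a0.0 + 320] since offset = 10 * REG_SIZE = 320 bytes.
 * 320 < 512, so the extra ADD for sources beyond the immediate limit is
 * skipped.
 */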

/**
 * Emit the SEND message for a barrier
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   assert(devinfo->ver >= 7);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}

/**
 * Emit the wait instruction for a barrier
 */
void
brw_WAIT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *insn;

   struct brw_reg src = brw_notification_reg();

   insn = next_insn(p, BRW_OPCODE_WAIT);
   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());

   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   /* From the Skylake PRM, Volume 7, page 760:
    *
    *    "Implementation Restriction on Register Access: When the control
    *     register is used as an explicit source and/or destination, hardware
    *     does not ensure execution pipeline coherency. Software must set the
    *     thread control field to 'switch' for an instruction that uses
    *     control register as an explicit operand."
    *
    * On Gfx12+ this is implemented in terms of SWSB annotations instead.
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));

   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
   if (p->devinfo->ver < 12)
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      if (p->devinfo->ver < 12)
         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }

   if (p->devinfo->ver >= 12)
      brw_SYNC(p, TGL_SYNC_NOP);
}
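
/* Hypothetical usage, assuming the CR0 rounding-mode field definitions from
 * brw_eu_defines.h (BRW_CR0_RND_MODE_MASK/_SHIFT, enum brw_rnd_mode):
 * switching the EU to round-toward-zero would look like
 *
 *    brw_float_controls_mode(p, BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT,
 *                            BRW_CR0_RND_MODE_MASK);
 */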

void
brw_update_reloc_imm(const struct brw_isa_info *isa,
                     brw_inst *inst,
                     uint32_t value)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* Sanity check that the instruction is a MOV of an immediate */
   assert(brw_inst_opcode(isa, inst) == BRW_OPCODE_MOV);
   assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);

   /* If it was compacted, we can't safely rewrite */
   assert(brw_inst_cmpt_control(devinfo, inst) == 0);

   brw_inst_set_imm_ud(devinfo, inst, value);
}

/* A default value for constants that will be patched at run-time.
 * We pick an arbitrary value that prevents instruction compaction.
 */
#define DEFAULT_PATCH_IMM 0x4a7cc037

void
brw_MOV_reloc_imm(struct brw_codegen *p,
                  struct brw_reg dst,
                  enum brw_reg_type src_type,
                  uint32_t id)
{
   assert(type_sz(src_type) == 4);
   assert(type_sz(dst.type) == 4);

   brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
                 p->next_insn_offset, 0);

   brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
}
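
/* Sketch of the intended pairing (reloc_id and the patch loop are
 * illustrative): the generator emits a patchable constant, and at upload
 * time the driver walks the shader's relocation list and rewrites each
 * MOV with the run-time value:
 *
 *    brw_MOV_reloc_imm(p, dst, BRW_REGISTER_TYPE_UD, reloc_id);
 *    ...
 *    brw_update_reloc_imm(isa, (brw_inst *)(kernel + reloc->offset),
 *                         runtime_value);
 */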