1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keithw@vmware.com>
30 */
31
32
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35
36 #include "util/ralloc.h"
37
/**
 * Encode \p dest as the destination operand of the already-emitted
 * instruction \p inst.
 *
 * Three encodings are handled: the Gfx12+ SEND/SENDC form (register file
 * and number only), the pre-Gfx12 split-send (SENDS/SENDSC) form, and the
 * general ALU form with direct/indirect addressing in align1 or align16
 * access mode.  \p dest is taken by value and may be locally adjusted to
 * satisfy hardware region restrictions before encoding.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < XE2_MAX_GRF);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND(C): only file and register number are encoded; the
       * asserts reject modifiers/regions this form cannot express.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));

   } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split-send: the destination subregister is encoded in
       * 16-byte units (da16 subreg field).
       */
      assert(devinfo->ver < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* General ALU destination encoding. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));
            /* A zero hstride is not representable for a destination;
             * promote it to 1 before encoding.
             */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             * Although Dst.HorzStride is a don't care for Align16, HW needs
             * this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }
}
129
/**
 * Encode \p reg as source operand 0 of \p inst.
 *
 * SEND-family opcodes encode only the starting GRF of the message payload;
 * ordinary instructions get the full file/type/modifier/region encoding,
 * including the special handling of immediates (which on pre-Gfx12 also
 * mirror their type into the src1 fields for sub-8-byte values).
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < XE2_MAX_GRF);

   if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
       brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC ||
       brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND(C): file and register number only. */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));

   } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split-send: subregister is encoded in 16-byte units. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      /* General ALU source 0 encoding. */
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* Pick the immediate writer matching the operand width. */
         if (reg.type == BRW_REGISTER_TYPE_DF)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            /* Pre-Gfx12: mirror the immediate's type into the src1 fields
             * (the immediate occupies the src1 slot for <8-byte values).
             */
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* SIMD1 scalar sources are encoded with the canonical <0;1,0>
             * region regardless of the requested region.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
247
248
/**
 * Encode \p reg as source operand 1 of \p inst.
 *
 * SEND-family opcodes (SENDS/SENDSC everywhere; SEND/SENDC on Gfx12+)
 * encode only the second payload's file and register number.  Other
 * instructions get the full modifier/region/immediate encoding; note that
 * only src1 may be an immediate in two-source instructions, and the
 * accumulator may not be used explicitly as src1.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < XE2_MAX_GRF);

   if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC))) {
      /* Split-send second payload: file and register number only. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* SIMD1 scalar sources are encoded with the canonical <0;1,0>
             * region regardless of the requested region.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
340
341 /**
342 * Specify the descriptor and extended descriptor immediate for a SEND(C)
343 * message instruction.
344 */
345 void
brw_set_desc_ex(struct brw_codegen * p,brw_inst * inst,unsigned desc,unsigned ex_desc)346 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
347 unsigned desc, unsigned ex_desc)
348 {
349 const struct intel_device_info *devinfo = p->devinfo;
350 assert(brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
351 brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC);
352 if (devinfo->ver < 12)
353 brw_inst_set_src1_file_type(devinfo, inst,
354 BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
355 brw_inst_set_send_desc(devinfo, inst, desc);
356 if (devinfo->ver >= 9)
357 brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
358 }
359
360 static void
gfx7_set_dp_scratch_message(struct brw_codegen * p,brw_inst * inst,bool write,bool dword,bool invalidate_after_read,unsigned num_regs,unsigned addr_offset,unsigned mlen,unsigned rlen,bool header_present)361 gfx7_set_dp_scratch_message(struct brw_codegen *p,
362 brw_inst *inst,
363 bool write,
364 bool dword,
365 bool invalidate_after_read,
366 unsigned num_regs,
367 unsigned addr_offset,
368 unsigned mlen,
369 unsigned rlen,
370 bool header_present)
371 {
372 const struct intel_device_info *devinfo = p->devinfo;
373 assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
374 num_regs == 8);
375 const unsigned block_size = util_logbase2(num_regs);
376
377 brw_set_desc(p, inst, brw_message_desc(
378 devinfo, mlen, rlen, header_present));
379
380 brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
381 brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
382 brw_inst_set_scratch_read_write(devinfo, inst, write);
383 brw_inst_set_scratch_type(devinfo, inst, dword);
384 brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
385 brw_inst_set_scratch_block_size(devinfo, inst, block_size);
386 brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
387 }
388
/**
 * Copy the codegen default instruction state \p state into \p insn:
 * execution size/group, access mode, masking, SWSB scheduling info
 * (Gfx12+), saturate, predication, flag register selection, and
 * accumulator write control (pre-Gfx20).
 */
static void
brw_inst_set_state(const struct brw_isa_info *isa,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Software scoreboard dependency info only exists on Gfx12+. */
   if (devinfo->ver >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* Align16 3-src instructions keep the flag register selection in
    * dedicated fields; everything else uses the common fields.  The
    * single flag_subreg index is split into reg_nr (/2) and subreg (%2).
    */
   if (is_3src(isa, brw_inst_opcode(isa, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver < 20)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
418
419 static brw_inst *
brw_append_insns(struct brw_codegen * p,unsigned nr_insn,unsigned alignment)420 brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned alignment)
421 {
422 assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
423 assert(util_is_power_of_two_or_zero(alignment));
424 const unsigned align_insn = MAX2(alignment / sizeof(brw_inst), 1);
425 const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
426 const unsigned new_nr_insn = start_insn + nr_insn;
427
428 if (p->store_size < new_nr_insn) {
429 p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
430 p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
431 }
432
433 /* Memset any padding due to alignment to 0. We don't want to be hashing
434 * or caching a bunch of random bits we got from a memory allocation.
435 */
436 if (p->nr_insn < start_insn) {
437 memset(&p->store[p->nr_insn], 0,
438 (start_insn - p->nr_insn) * sizeof(brw_inst));
439 }
440
441 assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
442 p->nr_insn = new_nr_insn;
443 p->next_insn_offset = new_nr_insn * sizeof(brw_inst);
444
445 return &p->store[start_insn];
446 }
447
/* Advance the instruction cursor to the next \p alignment-byte boundary
 * without emitting anything (an append of zero instructions).
 */
void
brw_realign(struct brw_codegen *p, unsigned alignment)
{
   brw_append_insns(p, 0, alignment);
}
453
454 int
brw_append_data(struct brw_codegen * p,void * data,unsigned size,unsigned alignment)455 brw_append_data(struct brw_codegen *p, void *data,
456 unsigned size, unsigned alignment)
457 {
458 unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
459 void *dst = brw_append_insns(p, nr_insn, alignment);
460 memcpy(dst, data, size);
461
462 /* If it's not a whole number of instructions, memset the end */
463 if (size < nr_insn * sizeof(brw_inst))
464 memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);
465
466 return dst - (void *)p->store;
467 }
468
469 #define next_insn brw_next_insn
470 brw_inst *
brw_next_insn(struct brw_codegen * p,unsigned opcode)471 brw_next_insn(struct brw_codegen *p, unsigned opcode)
472 {
473 brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));
474
475 memset(insn, 0, sizeof(*insn));
476 brw_inst_set_opcode(p->isa, insn, opcode);
477
478 /* Apply the default instruction state */
479 brw_inst_set_state(p->isa, insn, p->current);
480
481 return insn;
482 }
483
484 void
brw_add_reloc(struct brw_codegen * p,uint32_t id,enum brw_shader_reloc_type type,uint32_t offset,uint32_t delta)485 brw_add_reloc(struct brw_codegen *p, uint32_t id,
486 enum brw_shader_reloc_type type,
487 uint32_t offset, uint32_t delta)
488 {
489 if (p->num_relocs + 1 > p->reloc_array_size) {
490 p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
491 p->relocs = reralloc(p->mem_ctx, p->relocs,
492 struct brw_shader_reloc, p->reloc_array_size);
493 }
494
495 p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
496 .id = id,
497 .type = type,
498 .offset = offset,
499 .delta = delta,
500 };
501 }
502
503 static brw_inst *
brw_alu1(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src)504 brw_alu1(struct brw_codegen *p, unsigned opcode,
505 struct brw_reg dest, struct brw_reg src)
506 {
507 brw_inst *insn = next_insn(p, opcode);
508 brw_set_dest(p, insn, dest);
509 brw_set_src0(p, insn, src);
510 return insn;
511 }
512
513 static brw_inst *
brw_alu2(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)514 brw_alu2(struct brw_codegen *p, unsigned opcode,
515 struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
516 {
517 /* 64-bit immediates are only supported on 1-src instructions */
518 assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
519 assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
520
521 brw_inst *insn = next_insn(p, opcode);
522 brw_set_dest(p, insn, dest);
523 brw_set_src0(p, insn, src0);
524 brw_set_src1(p, insn, src1);
525 return insn;
526 }
527
528 static int
get_3src_subreg_nr(struct brw_reg reg)529 get_3src_subreg_nr(struct brw_reg reg)
530 {
531 /* Normally, SubRegNum is in bytes (0..31). However, 3-src instructions
532 * use 32-bit units (components 0..7). Since they only support F/D/UD
533 * types, this doesn't lose any flexibility, but uses fewer bits.
534 */
535 return reg.subnr / 4;
536 }
537
/**
 * Map a generic vertical stride enum onto the narrower encoding used by
 * align1 3-src instructions.
 *
 * Strides 1 and 2 are mutually exclusive across generations (1 is Gfx12+,
 * 2 is pre-Gfx12 only), and strides 8 and 16 share a single encoding.
 * Unrepresentable strides abort via unreachable().
 */
static enum gfx10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct intel_device_info *devinfo,
                       enum brw_vertical_stride vstride)
{
   switch (vstride) {
   case BRW_VERTICAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case BRW_VERTICAL_STRIDE_1:
      /* Stride-1 encoding only exists on Gfx12+. */
      assert(devinfo->ver >= 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case BRW_VERTICAL_STRIDE_2:
      /* Stride-2 encoding was removed on Gfx12. */
      assert(devinfo->ver < 12);
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case BRW_VERTICAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case BRW_VERTICAL_STRIDE_8:
   case BRW_VERTICAL_STRIDE_16:
      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}
560
561
/**
 * Map a generic horizontal stride enum onto the encoding used by align1
 * 3-src source operands.  Only strides 0/1/2/4 are representable; anything
 * else aborts via unreachable().
 */
static enum gfx10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)
{
   switch (hstride) {
   case BRW_HORIZONTAL_STRIDE_0:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case BRW_HORIZONTAL_STRIDE_1:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case BRW_HORIZONTAL_STRIDE_2:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case BRW_HORIZONTAL_STRIDE_4:
      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}
578
/**
 * Emit a three-source ALU instruction (e.g. MAD, LRP, BFE) with
 * destination \p dest and sources \p src0..\p src2.
 *
 * Two encodings exist depending on the current access mode: the align1
 * form (with per-source regions, immediates on src0/src2, and the
 * accumulator allowed for src1) and the align16 form (GRF-only operands
 * with swizzles and replicate control).
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   assert(dest.nr < XE2_MAX_GRF);

   /* Gfx10+ forbids two immediate sources in the same 3-src instruction. */
   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < XE2_MAX_GRF);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < XE2_MAX_GRF);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < XE2_MAX_GRF);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* Align1 3-src encoding. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              dest.nr == BRW_ARF_ACCUMULATOR));

      if (devinfo->ver >= 12) {
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, phys_nr(devinfo, dest));
      } else {
         /* Pre-Gfx12 uses a dedicated 1-bit GRF/accumulator file encoding. */
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      /* Destination subregister is encoded in 8-byte units here. */
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest) / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* Execution type (int vs. float pipeline) follows the destination type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, phys_subnr(devinfo, src0));
         /* The NF type implies the accumulator as the source register. */
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, phys_nr(devinfo, src0));
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, phys_subnr(devinfo, src1));
      /* src1 may be the accumulator (an ARF) instead of a GRF. */
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, phys_nr(devinfo, src1));
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, phys_subnr(devinfo, src2));
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, phys_nr(devinfo, src2));
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (src1.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src1.nr == BRW_ARF_ACCUMULATOR));
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      /* Source register-file encoding differs between Gfx12+ (real file
       * enum plus is_imm flags) and earlier gens (dedicated 1-bit fields).
       */
      if (devinfo->ver >= 12) {
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      /* Align16 3-src encoding: GRF-only operands with swizzles. */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             dest.type == BRW_REGISTER_TYPE_HF);
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      /* rep_ctrl replicates a scalar (vstride 0) source to all channels. */
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      /* Set both the source and destination types based on dest.type,
       * ignoring the source register types. The MAD and LRP emitters ensure
       * that all four types are float. The BFE and BFI2 emitters, however,
       * may send us mixed D and UD types and want us to ignore that and use
       * the destination type.
       */
      brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

      /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
       *
       * "Three source instructions can use operands with mixed-mode
       *  precision. When SrcType field is set to :f or :hf it defines
       *  precision for source 0 only, and fields Src1Type and Src2Type
       *  define precision for other source operands:
       *
       *  0b = :f. Single precision Float (32-bit).
       *  1b = :hf. Half precision Float (16-bit)."
       */
      if (src1.type == BRW_REGISTER_TYPE_HF)
         brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

      if (src2.type == BRW_REGISTER_TYPE_HF)
         brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
   }

   return inst;
}
783
784 static brw_inst *
brw_dpas_three_src(struct brw_codegen * p,enum gfx12_systolic_depth opcode,unsigned sdepth,unsigned rcount,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1,struct brw_reg src2)785 brw_dpas_three_src(struct brw_codegen *p, enum gfx12_systolic_depth opcode,
786 unsigned sdepth, unsigned rcount, struct brw_reg dest,
787 struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
788 {
789 const struct intel_device_info *devinfo = p->devinfo;
790 brw_inst *inst = next_insn(p, opcode);
791
792 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
793 brw_inst_set_dpas_3src_dst_reg_file(devinfo, inst,
794 BRW_GENERAL_REGISTER_FILE);
795 brw_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr);
796 brw_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr);
797
798 if (brw_reg_type_is_floating_point(dest.type)) {
799 brw_inst_set_dpas_3src_exec_type(devinfo, inst,
800 BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
801 } else {
802 brw_inst_set_dpas_3src_exec_type(devinfo, inst,
803 BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
804 }
805
806 brw_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth);
807 brw_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1);
808
809 brw_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type);
810 brw_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type);
811 brw_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type);
812 brw_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type);
813
814 assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
815 (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
816 src0.nr == BRW_ARF_NULL));
817
818 brw_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file);
819 brw_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr);
820 brw_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr);
821
822 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
823
824 brw_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file);
825 brw_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, src1.nr);
826 brw_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr);
827 brw_inst_set_dpas_3src_src1_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE);
828
829 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
830
831 brw_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file);
832 brw_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr);
833 brw_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr);
834 brw_inst_set_dpas_3src_src2_subbyte(devinfo, inst, BRW_SUB_BYTE_PRECISION_NONE);
835
836 return inst;
837 }
838
839 /***********************************************************************
840 * Convenience routines.
841 */
/* Define a convenience emitter brw_<OP>(p, dest, src0) for a one-source
 * ALU instruction; it simply forwards to brw_alu1() with the matching
 * BRW_OPCODE_<OP>.
 */
#define ALU1(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0)                     \
{                                                           \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);         \
}
849
/* Define a convenience emitter brw_<OP>(p, dest, src0, src1) for a
 * two-source ALU instruction; forwards to brw_alu2().
 */
#define ALU2(OP)                                            \
brw_inst *brw_##OP(struct brw_codegen *p,                   \
                   struct brw_reg dest,                     \
                   struct brw_reg src0,                     \
                   struct brw_reg src1)                     \
{                                                           \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);   \
}
858
/* Define a convenience emitter brw_<OP>(p, dest, src0, src1, src2) for a
 * three-source ALU instruction.  In Align16 mode a scalar source
 * (vertical stride 0) has its swizzle forced to .xxxx so the single
 * component is replicated across all channels before brw_alu3() encodes
 * the operands.
 */
#define ALU3(OP)                                                 \
brw_inst *brw_##OP(struct brw_codegen *p,                        \
                   struct brw_reg dest,                          \
                   struct brw_reg src0,                          \
                   struct brw_reg src1,                          \
                   struct brw_reg src2)                          \
{                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {                \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                 \
         src0.swizzle = BRW_SWIZZLE_XXXX;                        \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                 \
         src1.swizzle = BRW_SWIZZLE_XXXX;                        \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                 \
         src2.swizzle = BRW_SWIZZLE_XXXX;                        \
   }                                                             \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);  \
}
876
/* Same as ALU3, but for three-source instructions that only operate on
 * floating-point data: all four operands must be uniformly F or uniformly
 * DF (asserted below).  The Align16 scalar-swizzle fixup matches ALU3.
 */
#define ALU3F(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                        \
                   struct brw_reg dest,                          \
                   struct brw_reg src0,                          \
                   struct brw_reg src1,                          \
                   struct brw_reg src2)                          \
{                                                                \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                    \
          dest.type == BRW_REGISTER_TYPE_DF);                    \
   if (dest.type == BRW_REGISTER_TYPE_F) {                       \
      assert(src0.type == BRW_REGISTER_TYPE_F);                  \
      assert(src1.type == BRW_REGISTER_TYPE_F);                  \
      assert(src2.type == BRW_REGISTER_TYPE_F);                  \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {               \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                 \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                 \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                 \
   }                                                             \
                                                                 \
   if (p->current->access_mode == BRW_ALIGN_16) {                \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                 \
         src0.swizzle = BRW_SWIZZLE_XXXX;                        \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                 \
         src1.swizzle = BRW_SWIZZLE_XXXX;                        \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                 \
         src2.swizzle = BRW_SWIZZLE_XXXX;                        \
   }                                                             \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);  \
}
906
/* Instantiate the brw_<OP>() convenience emitters using the ALU1/ALU2/
 * ALU3/ALU3F templates above.  Opcodes needing extra validation or
 * region fixups (ADD, AVG, MUL, LINE, PLN, ...) get hand-written
 * emitters below instead.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)
ALU1(MOV)
944
945 brw_inst *
946 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
947 struct brw_reg src0, struct brw_reg src1)
948 {
949 /* 6.2.2: add */
950 if (src0.type == BRW_REGISTER_TYPE_F ||
951 (src0.file == BRW_IMMEDIATE_VALUE &&
952 src0.type == BRW_REGISTER_TYPE_VF)) {
953 assert(src1.type != BRW_REGISTER_TYPE_UD);
954 assert(src1.type != BRW_REGISTER_TYPE_D);
955 }
956
957 if (src1.type == BRW_REGISTER_TYPE_F ||
958 (src1.file == BRW_IMMEDIATE_VALUE &&
959 src1.type == BRW_REGISTER_TYPE_VF)) {
960 assert(src0.type != BRW_REGISTER_TYPE_UD);
961 assert(src0.type != BRW_REGISTER_TYPE_D);
962 }
963
964 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
965 }
966
967 brw_inst *
brw_AVG(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)968 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
969 struct brw_reg src0, struct brw_reg src1)
970 {
971 assert(dest.type == src0.type);
972 assert(src0.type == src1.type);
973 switch (src0.type) {
974 case BRW_REGISTER_TYPE_B:
975 case BRW_REGISTER_TYPE_UB:
976 case BRW_REGISTER_TYPE_W:
977 case BRW_REGISTER_TYPE_UW:
978 case BRW_REGISTER_TYPE_D:
979 case BRW_REGISTER_TYPE_UD:
980 break;
981 default:
982 unreachable("Bad type for brw_AVG");
983 }
984
985 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
986 }
987
988 brw_inst *
brw_MUL(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)989 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
990 struct brw_reg src0, struct brw_reg src1)
991 {
992 /* 6.32.38: mul */
993 if (src0.type == BRW_REGISTER_TYPE_D ||
994 src0.type == BRW_REGISTER_TYPE_UD ||
995 src1.type == BRW_REGISTER_TYPE_D ||
996 src1.type == BRW_REGISTER_TYPE_UD) {
997 assert(dest.type != BRW_REGISTER_TYPE_F);
998 }
999
1000 if (src0.type == BRW_REGISTER_TYPE_F ||
1001 (src0.file == BRW_IMMEDIATE_VALUE &&
1002 src0.type == BRW_REGISTER_TYPE_VF)) {
1003 assert(src1.type != BRW_REGISTER_TYPE_UD);
1004 assert(src1.type != BRW_REGISTER_TYPE_D);
1005 }
1006
1007 if (src1.type == BRW_REGISTER_TYPE_F ||
1008 (src1.file == BRW_IMMEDIATE_VALUE &&
1009 src1.type == BRW_REGISTER_TYPE_VF)) {
1010 assert(src0.type != BRW_REGISTER_TYPE_UD);
1011 assert(src0.type != BRW_REGISTER_TYPE_D);
1012 }
1013
1014 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1015 src0.nr != BRW_ARF_ACCUMULATOR);
1016 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1017 src1.nr != BRW_ARF_ACCUMULATOR);
1018
1019 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1020 }
1021
1022 brw_inst *
brw_LINE(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1023 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1024 struct brw_reg src0, struct brw_reg src1)
1025 {
1026 src0.vstride = BRW_VERTICAL_STRIDE_0;
1027 src0.width = BRW_WIDTH_1;
1028 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1029 return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1030 }
1031
1032 brw_inst *
brw_PLN(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1033 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1034 struct brw_reg src0, struct brw_reg src1)
1035 {
1036 src0.vstride = BRW_VERTICAL_STRIDE_0;
1037 src0.width = BRW_WIDTH_1;
1038 src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1039 src1.vstride = BRW_VERTICAL_STRIDE_8;
1040 src1.width = BRW_WIDTH_8;
1041 src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1042 return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1043 }
1044
1045 brw_inst *
brw_DPAS(struct brw_codegen * p,enum gfx12_systolic_depth sdepth,unsigned rcount,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1,struct brw_reg src2)1046 brw_DPAS(struct brw_codegen *p, enum gfx12_systolic_depth sdepth,
1047 unsigned rcount, struct brw_reg dest, struct brw_reg src0,
1048 struct brw_reg src1, struct brw_reg src2)
1049 {
1050 return brw_dpas_three_src(p, BRW_OPCODE_DPAS, sdepth, rcount, dest, src0,
1051 src1, src2);
1052 }
1053
brw_NOP(struct brw_codegen * p)1054 void brw_NOP(struct brw_codegen *p)
1055 {
1056 brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1057 memset(insn, 0, sizeof(*insn));
1058 brw_inst_set_opcode(p->isa, insn, BRW_OPCODE_NOP);
1059 }
1060
brw_SYNC(struct brw_codegen * p,enum tgl_sync_function func)1061 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1062 {
1063 brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1064 brw_inst_set_cond_modifier(p->devinfo, insn, func);
1065 }
1066
1067 /***********************************************************************
1068 * Comparisons, if/else/endif
1069 */
1070
1071 brw_inst *
brw_JMPI(struct brw_codegen * p,struct brw_reg index,unsigned predicate_control)1072 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1073 unsigned predicate_control)
1074 {
1075 const struct intel_device_info *devinfo = p->devinfo;
1076 struct brw_reg ip = brw_ip_reg();
1077 brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1078
1079 brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1080 brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1081 brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1082 brw_inst_set_pred_control(devinfo, inst, predicate_control);
1083
1084 return inst;
1085 }
1086
/* Push \p inst onto the IF/ELSE tracking stack, growing the stack array
 * when the next push would overflow it.
 */
static void
push_if_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Store a store-relative index rather than the pointer itself, since
    * later instruction emission can reallocate and move p->store.
    */
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}
1099
1100 static brw_inst *
pop_if_stack(struct brw_codegen * p)1101 pop_if_stack(struct brw_codegen *p)
1102 {
1103 p->if_stack_depth--;
1104 return &p->store[p->if_stack[p->if_stack_depth]];
1105 }
1106
/* Push \p inst (the loop's DO position) onto the loop stack, growing the
 * loop-tracking arrays if needed, and reset the IF-nesting counter for the
 * new loop level.
 */
static void
push_loop_stack(struct brw_codegen *p, brw_inst *inst)
{
   /* Unlike push_if_stack(), this grows when depth + 1 would overflow,
    * because if_depth_in_loop below is indexed at the *new* depth.
    */
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   /* Store an index, not a pointer: p->store can move on reallocation. */
   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}
1122
1123 static brw_inst *
get_inner_do_insn(struct brw_codegen * p)1124 get_inner_do_insn(struct brw_codegen *p)
1125 {
1126 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1127 }
1128
1129 /* EU takes the value from the flag register and pushes it onto some
1130 * sort of a stack (presumably merging with any flag value already on
1131 * the stack). Within an if block, the flags at the top of the stack
1132 * control execution on each channel of the unit, eg. on each of the
1133 * 16 pixel values in our wm programs.
1134 *
1135 * When the matching 'else' instruction is reached (presumably by
1136 * countdown of the instruction count patched in by our ELSE/ENDIF
1137 * functions), the relevant flags are inverted.
1138 *
1139 * When the matching 'endif' instruction is reached, the flags are
1140 * popped off. If the stack is now empty, normal execution resumes.
1141 */
/**
 * Emit an IF instruction with the given execution size.  JIP and UIP are
 * written as 0 here; they are patched to the real jump offsets by
 * patch_IF_ELSE() when the matching ENDIF is emitted.  The instruction is
 * also pushed onto the IF stack so brw_ELSE()/brw_ENDIF() can find it.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   if (devinfo->ver < 12)
      brw_set_src0(p, insn, brw_imm_d(0));
   brw_inst_set_jip(devinfo, insn, 0);
   brw_inst_set_uip(devinfo, insn, 0);

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);

   push_if_stack(p, insn);
   /* Track IF nesting per loop level so brw_ENDIF can maintain it. */
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1167
1168 /**
1169 * Patch IF and ELSE instructions with appropriate jump targets.
1170 */
1171 static void
patch_IF_ELSE(struct brw_codegen * p,brw_inst * if_inst,brw_inst * else_inst,brw_inst * endif_inst)1172 patch_IF_ELSE(struct brw_codegen *p,
1173 brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1174 {
1175 const struct intel_device_info *devinfo = p->devinfo;
1176
1177 assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF);
1178 assert(endif_inst != NULL);
1179 assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE);
1180
1181 unsigned br = brw_jump_scale(devinfo);
1182
1183 assert(brw_inst_opcode(p->isa, endif_inst) == BRW_OPCODE_ENDIF);
1184 brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1185
1186 if (else_inst == NULL) {
1187 /* Patch IF -> ENDIF */
1188 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1189 brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1190 } else {
1191 brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1192
1193 /* Patch ELSE -> ENDIF */
1194 /* The IF instruction's JIP should point just past the ELSE */
1195 brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1196 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1197 brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1198
1199 if (devinfo->ver < 11) {
1200 /* Set the ELSE instruction to use branch_ctrl with a join
1201 * jump target pointing at the NOP inserted right before
1202 * the ENDIF instruction in order to make sure it is
1203 * executed in all cases, since attempting to do the same
1204 * as on other generations could cause the EU to jump at
1205 * the instruction immediately after the ENDIF due to
1206 * Wa_220160235, which could cause the program to continue
1207 * running with all channels disabled.
1208 */
1209 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst - 1));
1210 brw_inst_set_branch_control(devinfo, else_inst, true);
1211 } else {
1212 brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1213 }
1214
1215 /* Since we don't set branch_ctrl on Gfx11+, the ELSE's
1216 * JIP and UIP both should point to ENDIF on those
1217 * platforms.
1218 */
1219 brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1220 }
1221 }
1222
/**
 * Emit an ELSE instruction for the innermost open IF.  Like brw_IF(), the
 * jump offsets are left as 0 and patched later by patch_IF_ELSE(); the
 * instruction is pushed onto the IF stack for brw_ENDIF() to find.
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->ver < 12)
      brw_set_src0(p, insn, brw_imm_d(0));
   brw_inst_set_jip(devinfo, insn, 0);
   brw_inst_set_uip(devinfo, insn, 0);

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);

   push_if_stack(p, insn);
}
1242
/**
 * Emit an ENDIF, pop the matching IF (and optional ELSE) off the IF stack,
 * and patch their jump offsets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;

   assert(p->if_stack_depth > 0);

   /* Peek at the top of the IF stack (stored as an index into p->store)
    * to see whether this ENDIF closes an ELSE block.
    */
   if (devinfo->ver < 11 &&
       brw_inst_opcode(p->isa, &p->store[p->if_stack[
                             p->if_stack_depth - 1]]) == BRW_OPCODE_ELSE) {
      /* Insert a NOP to be specified as join instruction within the
       * ELSE block, which is valid for an ELSE instruction with
       * branch_ctrl on. The ELSE instruction will be set to jump
       * here instead of to the ENDIF instruction, since attempting to
       * do the latter would prevent the ENDIF from being executed in
       * some cases due to Wa_220160235, which could cause the program
       * to continue running with all channels disabled.
       */
      brw_NOP(p);
   }

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(p->isa, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   brw_set_src0(p, insn, brw_imm_d(0));

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);

   brw_inst_set_jip(devinfo, insn, 2);
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1292
1293 brw_inst *
brw_BREAK(struct brw_codegen * p)1294 brw_BREAK(struct brw_codegen *p)
1295 {
1296 const struct intel_device_info *devinfo = p->devinfo;
1297 brw_inst *insn;
1298
1299 insn = next_insn(p, BRW_OPCODE_BREAK);
1300 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1301 brw_set_src0(p, insn, brw_imm_d(0x0));
1302 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1303 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1304
1305 return insn;
1306 }
1307
1308 brw_inst *
brw_CONT(struct brw_codegen * p)1309 brw_CONT(struct brw_codegen *p)
1310 {
1311 const struct intel_device_info *devinfo = p->devinfo;
1312 brw_inst *insn;
1313
1314 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1315 brw_set_dest(p, insn, brw_ip_reg());
1316 brw_set_src0(p, insn, brw_imm_d(0x0));
1317
1318 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1319 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1320 return insn;
1321 }
1322
1323 brw_inst *
brw_HALT(struct brw_codegen * p)1324 brw_HALT(struct brw_codegen *p)
1325 {
1326 const struct intel_device_info *devinfo = p->devinfo;
1327 brw_inst *insn;
1328
1329 insn = next_insn(p, BRW_OPCODE_HALT);
1330 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1331 if (devinfo->ver < 12) {
1332 brw_set_src0(p, insn, brw_imm_d(0x0));
1333 }
1334
1335 brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1336 brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1337 return insn;
1338 }
1339
1340 /* DO/WHILE loop:
1341 *
1342 * The DO/WHILE is just an unterminated loop -- break or continue are
1343 * used for control within the loop. We have a few ways they can be
1344 * done.
1345 *
1346 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1347 * jip and no DO instruction.
1348 *
1349 * For gfx6, there's no more mask stack, so no need for DO. WHILE
1350 * just points back to the first instruction of the loop.
1351 */
1352 brw_inst *
brw_DO(struct brw_codegen * p,unsigned execute_size)1353 brw_DO(struct brw_codegen *p, unsigned execute_size)
1354 {
1355 push_loop_stack(p, &p->store[p->nr_insn]);
1356 return &p->store[p->nr_insn];
1357 }
1358
/**
 * Emit the WHILE closing the innermost DO loop.  JIP is set to the
 * (negative) scaled distance back to the position recorded by brw_DO(),
 * and the loop is popped off the loop stack.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   /* next_insn() may move p->store, so emit the WHILE before resolving
    * the DO position from its stored index.
    */
   insn = next_insn(p, BRW_OPCODE_WHILE);
   do_insn = get_inner_do_insn(p);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->ver < 12)
      brw_set_src0(p, insn, brw_imm_d(0));
   brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));

   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1382
brw_CMP(struct brw_codegen * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)1383 void brw_CMP(struct brw_codegen *p,
1384 struct brw_reg dest,
1385 unsigned conditional,
1386 struct brw_reg src0,
1387 struct brw_reg src1)
1388 {
1389 const struct intel_device_info *devinfo = p->devinfo;
1390 brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1391
1392 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1393 brw_set_dest(p, insn, dest);
1394 brw_set_src0(p, insn, src0);
1395 brw_set_src1(p, insn, src1);
1396 }
1397
brw_CMPN(struct brw_codegen * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)1398 void brw_CMPN(struct brw_codegen *p,
1399 struct brw_reg dest,
1400 unsigned conditional,
1401 struct brw_reg src0,
1402 struct brw_reg src1)
1403 {
1404 const struct intel_device_info *devinfo = p->devinfo;
1405 brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);
1406
1407 brw_inst_set_cond_modifier(devinfo, insn, conditional);
1408 brw_set_dest(p, insn, dest);
1409 brw_set_src0(p, insn, src0);
1410 brw_set_src1(p, insn, src1);
1411 }
1412
1413 /***********************************************************************
1414 * Helpers for the various SEND message types:
1415 */
1416
/**
 * Emit an extended-math (MATH) instruction computing \p function of
 * src0/src1 into \p dest.
 *
 * The INT DIV functions require integer sources without source modifiers;
 * every other function requires F (or, on Gfx9+, HF) sources.  The
 * destination must be a packed GRF region.
 */
void gfx6_math(struct brw_codegen *p,
               struct brw_reg dest,
               unsigned function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_IMMEDIATE_VALUE);
      /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
       *     INT DIV function does not support source modifiers.
       */
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
1457
1458 /**
1459 * Return the right surface index to access the thread scratch space using
1460 * stateless dataport messages.
1461 */
1462 unsigned
brw_scratch_surface_idx(const struct brw_codegen * p)1463 brw_scratch_surface_idx(const struct brw_codegen *p)
1464 {
1465 /* The scratch space is thread-local so IA coherency is unnecessary. */
1466 return GFX8_BTI_STATELESS_NON_COHERENT;
1467 }
1468
/**
 * Emit a SEND reading \p num_regs registers from the scratch buffer at the
 * given byte \p offset into \p dest, using the gfx7 dataport scratch-block
 * message.  The message must not be predicated (asserted below).
 */
void
gfx7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gfx7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
1502
/**
 * Emit a SENDC render-cache framebuffer-read message (Gfx9+).
 *
 * \param binding_table_index  render target to read from
 * \param msg_length           payload length in registers
 * \param response_length      expected reply length in registers
 * \param per_sample           whether to read individual samples
 *
 * The render-target slot group is derived from the current default
 * quarter-control group.
 */
brw_inst *
gfx9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver >= 9);
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
                       1 << brw_get_default_exec_size(p), per_sample));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
1528
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* Bump the pointer in header dword 3 by the byte size of the
          * group of 16 sampler states being selected.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      struct brw_reg temp = get_element_ud(header, 3);

      /* (sampler & 0xf0) << 4 == 16 * (sampler / 16) * 16, i.e. the same
       * byte offset as the immediate path above, computed at runtime.
       */
      brw_push_insn_state(p);
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
1570
/**
 * Emit a SEND with a possibly register-indirect message descriptor.
 *
 * If \p desc is an immediate it is simply OR'ed with \p desc_imm and
 * encoded in the instruction.  Otherwise the combined descriptor is first
 * loaded into address register a0.0 with an unmasked SIMD1 OR, and the
 * SEND sources its descriptor from there (via src1 before Gfx12, or the
 * reg32-descriptor select bit on Gfx12+).
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The descriptor load must be a scalar, unpredicated, unmasked
       * write to a0.0; save and override the current default state.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_flag_reg(p, 0, 0);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      /* The SEND must wait for the a0.0 write (SWSB regdist 1). */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->ver >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
1625
1626 void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool ex_desc_scratch,
                                bool ex_bso,
                                bool eot)
{
   /* Emit a split SEND/SENDS message with two payload sources.  The message
    * descriptor and extended descriptor may each be an immediate or a
    * register; when a register form is required, the register and immediate
    * parts are ORed together into an address register (a0.0 for the
    * descriptor, a0.2 for the extended descriptor), which is where the SEND
    * instruction takes its indirect descriptors from.
    */
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Fully immediate descriptor: just fold the extra bits in. */
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* Emit the descriptor setup as a scalar, unmasked, unpredicated ALU
       * instruction regardless of the caller's defaults.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_flag_reg(p, 0, 0);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      /* The SEND below must wait for the OR writing a0.0 above. */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       !ex_desc_scratch &&
       (devinfo->ver >= 12 ||
        ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
      /* ATS-M PRMs, Volume 2d: Command Reference: Structures,
       * EU_INSTRUCTION_SEND instruction
       *
       * "ExBSO: Exists If: ([ExDesc.IsReg]==true)"
       */
      assert(!ex_bso);
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_flag_reg(p, 0, 0);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register. If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5);

      if (ex_desc_scratch) {
         /* Or the scratch surface offset together with the immediate part of
          * the extended descriptor.
          */
         assert(devinfo->verx10 >= 125);
         /* r0.5 bits 31:10 hold the scratch surface offset. */
         brw_AND(p, addr,
                 retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(INTEL_MASK(31, 10)));
         brw_OR(p, addr, addr, brw_imm_ud(imm_part));
      } else if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      /* The SEND below must wait for the write to a0.2 above. */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* An indirect descriptor must live at a0.0. */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* An indirect extended descriptor must live at a DWord-aligned
       * address subregister.
       */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2);
   }

   if (ex_bso) {
      /* The send instruction ExBSO field does not exist with UGM on Gfx20+,
       * it is assumed.
       *
       * BSpec 56890
       */
      if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM)
         brw_inst_set_send_ex_bso(devinfo, send, true);
      brw_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6));
   }
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
1772
/**
 * Emit a surface-access SEND whose binding table index may come from a
 * register.  A register surface index is masked to 8 bits and loaded into
 * a0.0 so the message descriptor can be taken from there; an immediate
 * surface index is passed through to brw_send_indirect_message() directly.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* Build the descriptor with a scalar, unmasked, unpredicated AND. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_flag_reg(p, 0, 0);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      /* The SEND emitted below must wait for the AND writing a0.0. */
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
1809
1810 static bool
while_jumps_before_offset(const struct intel_device_info * devinfo,brw_inst * insn,int while_offset,int start_offset)1811 while_jumps_before_offset(const struct intel_device_info *devinfo,
1812 brw_inst *insn, int while_offset, int start_offset)
1813 {
1814 int scale = 16 / brw_jump_scale(devinfo);
1815 int jip = brw_inst_jip(devinfo, insn);
1816 assert(jip < 0);
1817 return while_offset + jip * scale <= start_offset;
1818 }
1819
1820
/**
 * Scan forward from start_offset for the end of the current control-flow
 * block: the matching ENDIF, an ELSE or HALT at the same nesting depth, or
 * the WHILE closing the innermost enclosing loop.  Returns the byte offset
 * of that instruction, or 0 if none is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct intel_device_info *devinfo = p->devinfo;

   /* Depth of nested IF/ENDIF pairs encountered during the scan. */
   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(p->isa, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop. Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         FALLTHROUGH;
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
         break;
      default:
         break;
      }
   }

   return 0;
}
1863
/* There is no DO instruction on gfx6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 *
 * Returns the byte offset of the WHILE that closes the loop containing
 * start_offset; asserts (and returns start_offset) if none is found.
 */
static int
brw_find_loop_end(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      if (brw_inst_opcode(p->isa, insn) == BRW_OPCODE_WHILE) {
         /* The first WHILE that jumps back past start_offset closes our
          * loop; WHILEs of sibling loops jump to points after it.
          */
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   /* Every loop is terminated by a WHILE, so we should never get here. */
   assert(!"not reached");
   return start_offset;
}
1891
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Jump targets are stored in units of brw_jump_scale() relative to the
 * jumping instruction; the scan assumes no instruction has been compacted
 * (each is 16 bytes, checked by the assert below).
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      switch (brw_inst_opcode(p->isa, insn)) {
      case BRW_OPCODE_BREAK: {
         int block_end_offset = brw_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);
         break;
      }

      case BRW_OPCODE_CONTINUE: {
         int block_end_offset = brw_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
                          (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end just falls through to the
          * next instruction (a jump of one instruction, i.e. 1 * br).
          */
         int block_end_offset = brw_find_next_block_end(p, offset);
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         brw_inst_set_jip(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT: {
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         int block_end_offset = brw_find_next_block_end(p, offset);
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }

      default:
         break;
      }
   }
}
1967
/**
 * Number of message payload registers needed for a surface access of
 * \p num_channels components at the given execution size.
 */
static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   /* SIMD4x2 packs everything into a single register. */
   if (exec_size == 0)
      return 1;

   /* SIMD16 needs two registers per channel, SIMD8 or narrower just one. */
   const unsigned regs_per_channel = (exec_size > 8) ? 2 : 1;
   return regs_per_channel * num_channels;
}
1979
/**
 * Emit an untyped atomic operation message to data cache 1.
 *
 * \param atomic_op          dataport atomic opcode
 * \param msg_length         payload length in registers
 * \param response_expected  whether the previous value is returned; also
 *                           serves as the response channel count (0 or 1)
 *                           passed to brw_surface_payload_size()
 * \param header_present     whether the payload starts with a message header
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* In Align16 mode this is a SIMD4x2 message (exec_size == 0). */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
   const unsigned response_length =
      brw_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
2011
2012 void
brw_untyped_surface_read(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels)2013 brw_untyped_surface_read(struct brw_codegen *p,
2014 struct brw_reg dst,
2015 struct brw_reg payload,
2016 struct brw_reg surface,
2017 unsigned msg_length,
2018 unsigned num_channels)
2019 {
2020 const struct intel_device_info *devinfo = p->devinfo;
2021 const unsigned sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2022 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2023 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
2024 const unsigned response_length =
2025 brw_surface_payload_size(num_channels, exec_size);
2026 const unsigned desc =
2027 brw_message_desc(devinfo, msg_length, response_length, false) |
2028 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
2029
2030 brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
2031 }
2032
2033 void
brw_untyped_surface_write(struct brw_codegen * p,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels,bool header_present)2034 brw_untyped_surface_write(struct brw_codegen *p,
2035 struct brw_reg payload,
2036 struct brw_reg surface,
2037 unsigned msg_length,
2038 unsigned num_channels,
2039 bool header_present)
2040 {
2041 const struct intel_device_info *devinfo = p->devinfo;
2042 const unsigned sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
2043 const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
2044 const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
2045 const unsigned desc =
2046 brw_message_desc(devinfo, msg_length, 0, header_present) |
2047 brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
2048
2049 brw_send_indirect_surface_message(p, sfid, brw_null_reg(),
2050 payload, surface, desc);
2051 }
2052
/**
 * Configure \p insn as a pre-LSC memory fence message to the render or
 * data cache.  With \p commit_enable set, one register is written back
 * when the fence has completed.
 */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* mlen = 1 (header only); rlen = 1 only when a commit write-back is
    * requested.
    */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GFX6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GFX7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   /* Commit enable is bit 5 of the message control field. */
   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   /* A non-zero binding table index is only valid on Gfx11+. */
   assert(devinfo->ver >= 11 || bti == 0);
   brw_inst_set_binding_table_index(devinfo, insn, bti);
}
2084
/**
 * Configure \p insn as an LSC fence (or, pre-Gfx20, URB fence) message.
 * The fence scope and flush type are decoded from \p desc and may be
 * overridden for typed (TGM) fences and for Wa_14012437816.
 */
static void
gfx12_set_memory_fence_message(struct brw_codegen *p,
                               struct brw_inst *insn,
                               enum brw_message_target sfid,
                               uint32_t desc)
{
   const unsigned mlen = 1 * reg_unit(p->devinfo); /* g0 header */
   /* Completion signaled by write to register. No data returned. */
   const unsigned rlen = 1 * reg_unit(p->devinfo);

   brw_inst_set_sfid(p->devinfo, insn, sfid);

   if (sfid == BRW_SFID_URB && p->devinfo->ver < 20) {
      /* Pre-Gfx20, URB fences use a dedicated URB fence descriptor rather
       * than an LSC one.
       */
      brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
                            brw_message_desc(p->devinfo, mlen, rlen, true));
   } else {
      enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc);
      enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc);

      /* Typed fences always flush the whole tile's typed cache. */
      if (sfid == GFX12_SFID_TGM) {
         scope = LSC_FENCE_TILE;
         flush_type = LSC_FLUSH_TYPE_EVICT;
      }

      /* Wa_14012437816:
       *
       *   "For any fence greater than local scope, always set flush type to
       *    at least invalidate so that fence goes on properly."
       *
       *   "The bug is if flush_type is 'None', the scope is always downgraded
       *    to 'local'."
       *
       * Here set scope to NONE_6 instead of NONE, which has the same effect
       * as NONE but avoids the downgrade to scope LOCAL.
       */
      if (intel_needs_workaround(p->devinfo, 14012437816) &&
          scope > LSC_FENCE_LOCAL &&
          flush_type == LSC_FLUSH_TYPE_NONE) {
         flush_type = LSC_FLUSH_TYPE_NONE_6;
      }

      brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
                                               flush_type, false) |
                            brw_message_desc(p->devinfo, mlen, rlen, false));
   }
}
2131
/**
 * Emit a memory fence SEND through \p sfid.
 *
 * \param dst            register written back on completion; also serves as
 *                       the destination for dependency tracking
 * \param src            payload source (message header)
 * \param send_op        SEND opcode variant to emit
 * \param desc           LSC fence descriptor bits (LSC platforms only)
 * \param commit_enable  request a commit write-back (pre-LSC platforms only)
 * \param bti            fence target binding table index (pre-LSC, Gfx11+)
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 enum brw_message_target sfid,
                 uint32_t desc,
                 bool commit_enable,
                 unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   struct brw_inst *insn = next_insn(p, send_op);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);

   /* All DG2 hardware requires LSC for fence messages, even A-step */
   if (devinfo->has_lsc)
      gfx12_set_memory_fence_message(p, insn, sfid, desc);
   else
      brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}
2162
/**
 * Copy the dynamically-selected component src[idx] to all enabled channels
 * of dst (a "broadcast").  \p idx may be an immediate or a register; \p src
 * must be a direct GRF without source modifiers.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *    and Quad-Word data must not be used."
    *
    * We require the source and destination types to match so stomp to an
    * unsigned integer type.
    */
   assert(src.type == dst.type);
   src.type = dst.type = brw_reg_type_from_bit_size(type_sz(src.type) * 8,
                                                    BRW_REGISTER_TYPE_UD);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
                     stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_int) {
         /* No 64-bit integer MOV available: copy the value as two DWords. */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address. The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         brw_set_default_flag_reg(p, 0, 0);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* The indirect MOV below must wait for the address setup above. */
         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_int)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around this issue, we do two integer MOVs
             * instead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
2308
2309
/**
 * Emit the SEND message for a barrier
 *
 * The payload \p src carries the message header; no response is returned
 * (the barrier is awaited separately, see brw_WAIT()).
 */
void
brw_barrier(struct brw_codegen *p, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   inst = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, inst, src);
   brw_set_src1(p, inst, brw_null_reg());
   /* One register of payload, zero response length. */
   brw_set_desc(p, inst, brw_message_desc(devinfo,
                                          1 * reg_unit(devinfo), 0, false));

   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
   brw_inst_set_gateway_subfuncid(devinfo, inst,
                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   /* Run regardless of channel enables so the message is sent exactly once. */
   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
   brw_pop_insn_state(p);
}
2335
2336
2337 /**
2338 * Emit the wait instruction for a barrier
2339 */
2340 void
brw_WAIT(struct brw_codegen * p)2341 brw_WAIT(struct brw_codegen *p)
2342 {
2343 const struct intel_device_info *devinfo = p->devinfo;
2344 struct brw_inst *insn;
2345
2346 struct brw_reg src = brw_notification_reg();
2347
2348 insn = next_insn(p, BRW_OPCODE_WAIT);
2349 brw_set_dest(p, insn, src);
2350 brw_set_src0(p, insn, src);
2351 brw_set_src1(p, insn, brw_null_reg());
2352
2353 brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
2354 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
2355 }
2356
/**
 * Update the cr0 control register: clear the bits selected by \p mask and
 * then set the bits of \p mode.  The OR is skipped when \p mode is zero.
 * Must be emitted with channel masking disabled (asserted below).
 */
void
brw_float_controls_mode(struct brw_codegen *p,
                        unsigned mode, unsigned mask)
{
   assert(p->current->mask_control == BRW_MASK_DISABLE);

   /* From the Skylake PRM, Volume 7, page 760:
    *  "Implementation Restriction on Register Access: When the control
    *   register is used as an explicit source and/or destination, hardware
    *   does not ensure execution pipeline coherency. Software must set the
    *   thread control field to ‘switch’ for an instruction that uses
    *   control register as an explicit operand."
    *
    * On Gfx12+ this is implemented in terms of SWSB annotations instead.
    */
   brw_set_default_swsb(p, tgl_swsb_regdist(1));

   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
                            brw_imm_ud(~mask));
   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
   if (p->devinfo->ver < 12)
      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);

   if (mode) {
      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
                                 brw_imm_ud(mode));
      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
      if (p->devinfo->ver < 12)
         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
   }

   /* Gfx12+ uses a SYNC NOP instead of BRW_THREAD_SWITCH to order the cr0
    * update against subsequent instructions.
    */
   if (p->devinfo->ver >= 12)
      brw_SYNC(p, TGL_SYNC_NOP);
}
2391
2392 void
brw_update_reloc_imm(const struct brw_isa_info * isa,brw_inst * inst,uint32_t value)2393 brw_update_reloc_imm(const struct brw_isa_info *isa,
2394 brw_inst *inst,
2395 uint32_t value)
2396 {
2397 const struct intel_device_info *devinfo = isa->devinfo;
2398
2399 /* Sanity check that the instruction is a MOV of an immediate */
2400 assert(brw_inst_opcode(isa, inst) == BRW_OPCODE_MOV);
2401 assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);
2402
2403 /* If it was compacted, we can't safely rewrite */
2404 assert(brw_inst_cmpt_control(devinfo, inst) == 0);
2405
2406 brw_inst_set_imm_ud(devinfo, inst, value);
2407 }
2408
2409 /* A default value for constants that will be patched at run-time.
2410 * We pick an arbitrary value that prevents instruction compaction.
2411 */
2412 #define DEFAULT_PATCH_IMM 0x4a7cc037
2413
/**
 * Emit a MOV of the placeholder immediate DEFAULT_PATCH_IMM and record a
 * relocation for \p id at the current instruction offset, so the real
 * 32-bit value can be patched in later via brw_update_reloc_imm().
 */
void
brw_MOV_reloc_imm(struct brw_codegen *p,
                  struct brw_reg dst,
                  enum brw_reg_type src_type,
                  uint32_t id)
{
   /* Relocated immediates are always 32-bit. */
   assert(type_sz(src_type) == 4);
   assert(type_sz(dst.type) == 4);

   brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
                 p->next_insn_offset, 0);

   brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
}
2428