/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keithw@vmware.com>
 */


#include "elk_eu_defines.h"
#include "elk_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
elk_gfx6_resolve_implied_move(struct elk_codegen *p,
                              struct elk_reg *src,
                              unsigned msg_reg_nr)
{
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver < 6)
      return;

   if (src->file == ELK_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != ELK_ARCHITECTURE_REGISTER_FILE || src->nr != ELK_ARF_NULL) {
      assert(devinfo->ver < 12);
      elk_push_insn_state(p);
      elk_set_default_exec_size(p, ELK_EXECUTE_8);
      elk_set_default_mask_control(p, ELK_MASK_DISABLE);
      elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
      elk_MOV(p, retype(elk_message_reg(msg_reg_nr), ELK_REGISTER_TYPE_UD),
              retype(*src, ELK_REGISTER_TYPE_UD));
      elk_pop_insn_state(p);
   }
   *src = elk_message_reg(msg_reg_nr);
}
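
/* A minimal usage sketch (illustrative, not from this file): a generator
 * about to emit a SEND on Gfx6+ resolves the implied move first, so the
 * payload ends up in the chosen message register.  The register numbers
 * here are hypothetical.
 *
 *    struct elk_reg payload = elk_vec8_grf(2, 0);
 *    elk_gfx6_resolve_implied_move(p, &payload, 1);
 *    // payload now names m1; emit the SEND with it as src0.
 */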

static void
gfx7_convert_mrf_to_grf(struct elk_codegen *p, struct elk_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver >= 7 && reg->file == ELK_MESSAGE_REGISTER_FILE) {
      reg->file = ELK_GENERAL_REGISTER_FILE;
      reg->nr += GFX7_MRF_HACK_START;
   }
}
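
/* Worked example (illustrative, assuming GFX7_MRF_HACK_START is 112,
 * consistent with the R112-R127 range quoted above): a reference to m4 on
 * Gfx7+ is rewritten to g116, inside the register space reserved for
 * messages with EOT.
 */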

void
elk_set_dest(struct elk_codegen *p, elk_inst *inst, struct elk_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (dest.file == ELK_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
   else if (dest.file == ELK_GENERAL_REGISTER_FILE)
      assert(dest.nr < XE2_MAX_GRF);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == ELK_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == ELK_HORIZONTAL_STRIDE_1) {
      dest.hstride = ELK_HORIZONTAL_STRIDE_2;
   }

   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC)) {
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             dest.file == ELK_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == ELK_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1 ||
             (dest.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      elk_inst_set_dst_reg_file(devinfo, inst, dest.file);
      elk_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));

   } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
              elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC) {
      assert(devinfo->ver < 12);
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             dest.file == ELK_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == ELK_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == ELK_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      elk_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      elk_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      elk_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      elk_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      elk_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == ELK_ADDRESS_DIRECT) {
         elk_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));

         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            elk_inst_set_dst_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));
            if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
               dest.hstride = ELK_HORIZONTAL_STRIDE_1;
            elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            elk_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            elk_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == ELK_GENERAL_REGISTER_FILE ||
                dest.file == ELK_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW
             *    needs this to be programmed as "01".
             */
            elk_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         elk_inst_set_dst_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));

         /* These are different sizes in align1 vs align16:
          */
         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            elk_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
               dest.hstride = ELK_HORIZONTAL_STRIDE_1;
            elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            elk_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            elk_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < ELK_EXECUTE_4;
      else
         fix_exec_size = dest.width < ELK_EXECUTE_8;

      if (fix_exec_size)
         elk_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
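
/* Worked example of the automatic reduction above (illustrative): with
 * p->automatic_exec_sizes set on Gfx6+, a destination whose region width is
 * 2 (ELK_WIDTH_2, numerically equal to ELK_EXECUTE_2 and below ELK_EXECUTE_4)
 * narrows the instruction's exec size from the generator's default down to 2,
 * matching the register.
 */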

void
elk_set_src0(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == ELK_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
   else if (reg.file == ELK_GENERAL_REGISTER_FILE)
      assert(reg.nr < XE2_MAX_GRF);

   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC)) {
      assert(reg.file != ELK_IMMEDIATE_VALUE);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      elk_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      elk_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));

   } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
              elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC) {
      assert(reg.file == ELK_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      elk_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      elk_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      elk_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      elk_inst_set_src0_abs(devinfo, inst, reg.abs);
      elk_inst_set_src0_negate(devinfo, inst, reg.negate);
      elk_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == ELK_IMMEDIATE_VALUE) {
         if (reg.type == ELK_REGISTER_TYPE_DF ||
             elk_inst_opcode(p->isa, inst) == ELK_OPCODE_DIM)
            elk_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == ELK_REGISTER_TYPE_UQ ||
                  reg.type == ELK_REGISTER_TYPE_Q)
            elk_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            elk_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            elk_inst_set_src1_reg_file(devinfo, inst,
                                       ELK_ARCHITECTURE_REGISTER_FILE);
            elk_inst_set_src1_reg_hw_type(devinfo, inst,
                                          elk_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == ELK_ADDRESS_DIRECT) {
            elk_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
            if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
               elk_inst_set_src0_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
            } else {
               elk_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            elk_inst_set_src0_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));

            if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
               elk_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               elk_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            if (reg.width == ELK_WIDTH_1 &&
                elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
               elk_inst_set_src0_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
               elk_inst_set_src0_width(devinfo, inst, ELK_WIDTH_1);
               elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
            } else {
               elk_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               elk_inst_set_src0_width(devinfo, inst, reg.width);
               elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            elk_inst_set_src0_da16_swiz_x(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
            elk_inst_set_src0_da16_swiz_y(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
            elk_inst_set_src0_da16_swiz_z(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
            elk_inst_set_src0_da16_swiz_w(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));

            if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == ELK_REGISTER_TYPE_DF &&
                       reg.vstride == ELK_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else {
               elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}


void
elk_set_src1(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == ELK_GENERAL_REGISTER_FILE)
      assert(reg.nr < XE2_MAX_GRF);

   if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
       elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
         elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC))) {
      assert(reg.file == ELK_GENERAL_REGISTER_FILE ||
             reg.file == ELK_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      elk_inst_set_send_src1_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
      elk_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *     operands only."
       */
      assert(reg.file != ELK_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != ELK_ARF_ACCUMULATOR);

      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != ELK_MESSAGE_REGISTER_FILE);

      elk_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      elk_inst_set_src1_abs(devinfo, inst, reg.abs);
      elk_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(elk_inst_src0_reg_file(devinfo, inst) != ELK_IMMEDIATE_VALUE);

      if (reg.file == ELK_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         elk_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert(reg.address_mode == ELK_ADDRESS_DIRECT);
         /* assert (reg.file == ELK_GENERAL_REGISTER_FILE); */

         elk_inst_set_src1_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            elk_inst_set_src1_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
         } else {
            elk_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            if (reg.width == ELK_WIDTH_1 &&
                elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
               elk_inst_set_src1_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
               elk_inst_set_src1_width(devinfo, inst, ELK_WIDTH_1);
               elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
            } else {
               elk_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               elk_inst_set_src1_width(devinfo, inst, reg.width);
               elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            elk_inst_set_src1_da16_swiz_x(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
            elk_inst_set_src1_da16_swiz_y(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
            elk_inst_set_src1_da16_swiz_z(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
            elk_inst_set_src1_da16_swiz_w(devinfo, inst,
                                          ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));

            if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == ELK_REGISTER_TYPE_DF &&
                       reg.vstride == ELK_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else {
               elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}

/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
void
elk_set_desc_ex(struct elk_codegen *p, elk_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
          elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC);
   if (devinfo->ver < 12)
      elk_inst_set_src1_file_type(devinfo, inst,
                                  ELK_IMMEDIATE_VALUE, ELK_REGISTER_TYPE_UD);
   elk_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->ver >= 9)
      elk_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}
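
/* A minimal sketch of how a generator pairs this with the src/dest setters
 * above (illustrative; dst, payload, mlen and rlen are hypothetical, and
 * elk_set_desc() is assumed to be the ex_desc == 0 convenience wrapper used
 * throughout this file):
 *
 *    elk_inst *send = next_insn(p, ELK_OPCODE_SEND);
 *    elk_set_dest(p, send, dst);
 *    elk_set_src0(p, send, payload);
 *    elk_set_desc_ex(p, send,
 *                    elk_message_desc(devinfo, mlen, rlen, true), 0);
 */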

static void elk_set_math_message( struct elk_codegen *p,
                                  elk_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct intel_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case ELK_MATH_FUNCTION_POW:
   case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case ELK_MATH_FUNCTION_INT_DIV_REMAINDER:
   case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case ELK_MATH_FUNCTION_SINCOS:
   case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   elk_set_desc(p, inst, elk_message_desc(
                   devinfo, msg_length, response_length, false));

   elk_inst_set_sfid(devinfo, inst, ELK_SFID_MATH);
   elk_inst_set_math_msg_function(devinfo, inst, function);
   elk_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   elk_inst_set_math_msg_precision(devinfo, inst, low_precision);
   elk_inst_set_math_msg_saturate(devinfo, inst, elk_inst_saturate(devinfo, inst));
   elk_inst_set_math_msg_data_type(devinfo, inst, dataType);
   elk_inst_set_saturate(devinfo, inst, 0);
}
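
/* Worked example of the inference above (illustrative): POW takes two
 * operands, so its payload is two registers (msg_length = 2) but it returns
 * one result (response_length = 1); SINCOS takes one operand (msg_length = 1)
 * but writes both sin and cos back (response_length = 2).
 */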


static void elk_set_ff_sync_message(struct elk_codegen *p,
                                    elk_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct intel_device_info *devinfo = p->devinfo;

   elk_set_desc(p, insn, elk_message_desc(
                   devinfo, 1, response_length, true));

   elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
   elk_inst_set_eot(devinfo, insn, end_of_thread);
   elk_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   elk_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   elk_inst_set_urb_global_offset(devinfo, insn, 0);
   elk_inst_set_urb_swizzle_control(devinfo, insn, 0);
   elk_inst_set_urb_used(devinfo, insn, 0);
   elk_inst_set_urb_complete(devinfo, insn, 0);
}

static void elk_set_urb_message( struct elk_codegen *p,
                                 elk_inst *insn,
                                 enum elk_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct intel_device_info *devinfo = p->devinfo;

   assert(devinfo->ver < 7 || swizzle_control != ELK_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->ver < 7 || !(flags & ELK_URB_WRITE_ALLOCATE));
   assert(devinfo->ver >= 7 || !(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));

   elk_set_desc(p, insn, elk_message_desc(
                   devinfo, msg_length, response_length, true));

   elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
   elk_inst_set_eot(devinfo, insn, !!(flags & ELK_URB_WRITE_EOT));

   if (flags & ELK_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_OWORD);
   } else {
      elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_HWORD);
   }

   elk_inst_set_urb_global_offset(devinfo, insn, offset);
   elk_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->ver < 8) {
      elk_inst_set_urb_complete(devinfo, insn, !!(flags & ELK_URB_WRITE_COMPLETE));
   }

   if (devinfo->ver < 7) {
      elk_inst_set_urb_allocate(devinfo, insn, !!(flags & ELK_URB_WRITE_ALLOCATE));
      elk_inst_set_urb_used(devinfo, insn, !(flags & ELK_URB_WRITE_UNUSED));
   } else {
      elk_inst_set_urb_per_slot_offset(devinfo, insn,
                                       !!(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));
   }
}

static void
gfx7_set_dp_scratch_message(struct elk_codegen *p,
                            elk_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->ver >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   elk_set_desc(p, inst, elk_message_desc(
                   devinfo, mlen, rlen, header_present));

   elk_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
   elk_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   elk_inst_set_scratch_read_write(devinfo, inst, write);
   elk_inst_set_scratch_type(devinfo, inst, dword);
   elk_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   elk_inst_set_scratch_block_size(devinfo, inst, block_size);
   elk_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
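
/* Worked example of the block_size encoding above (illustrative): for
 * num_regs == 4, Gfx8+ encodes log2(4) == 2, while Gfx7 encodes
 * num_regs - 1 == 3; both mean a four-register scratch block.
 */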

static void
elk_inst_set_state(const struct elk_isa_info *isa,
                   elk_inst *insn,
                   const struct elk_insn_state *state)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   elk_inst_set_exec_size(devinfo, insn, state->exec_size);
   elk_inst_set_group(devinfo, insn, state->group);
   elk_inst_set_compression(devinfo, insn, state->compressed);
   elk_inst_set_access_mode(devinfo, insn, state->access_mode);
   elk_inst_set_mask_control(devinfo, insn, state->mask_control);
   if (devinfo->ver >= 12)
      elk_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   elk_inst_set_saturate(devinfo, insn, state->saturate);
   elk_inst_set_pred_control(devinfo, insn, state->predicate);
   elk_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (elk_is_3src(isa, elk_inst_opcode(isa, insn)) &&
       state->access_mode == ELK_ALIGN_16) {
      elk_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         elk_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      elk_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         elk_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6 && devinfo->ver < 20)
      elk_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

static elk_inst *
elk_append_insns(struct elk_codegen *p, unsigned nr_insn, unsigned alignment)
{
   assert(util_is_power_of_two_or_zero(sizeof(elk_inst)));
   assert(util_is_power_of_two_or_zero(alignment));
   const unsigned align_insn = MAX2(alignment / sizeof(elk_inst), 1);
   const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
   const unsigned new_nr_insn = start_insn + nr_insn;

   if (p->store_size < new_nr_insn) {
      p->store_size = util_next_power_of_two(new_nr_insn * sizeof(elk_inst));
      p->store = reralloc(p->mem_ctx, p->store, elk_inst, p->store_size);
   }

   /* Memset any padding due to alignment to 0.  We don't want to be hashing
    * or caching a bunch of random bits we got from a memory allocation.
    */
   if (p->nr_insn < start_insn) {
      memset(&p->store[p->nr_insn], 0,
             (start_insn - p->nr_insn) * sizeof(elk_inst));
   }

   assert(p->next_insn_offset == p->nr_insn * sizeof(elk_inst));
   p->nr_insn = new_nr_insn;
   p->next_insn_offset = new_nr_insn * sizeof(elk_inst);

   return &p->store[start_insn];
}
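
/* Worked example of the alignment handling above (illustrative, assuming a
 * 16-byte elk_inst): with p->nr_insn == 3, a request for one instruction at
 * 32-byte alignment gives align_insn == 2 and start_insn == 4, so slot 3 is
 * zero-filled padding and the caller's instruction lands in slot 4.
 */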

void
elk_realign(struct elk_codegen *p, unsigned alignment)
{
   elk_append_insns(p, 0, alignment);
}

int
elk_append_data(struct elk_codegen *p, void *data,
                unsigned size, unsigned alignment)
{
   unsigned nr_insn = DIV_ROUND_UP(size, sizeof(elk_inst));
   void *dst = elk_append_insns(p, nr_insn, alignment);
   memcpy(dst, data, size);

   /* If it's not a whole number of instructions, memset the end */
   if (size < nr_insn * sizeof(elk_inst))
      memset(dst + size, 0, nr_insn * sizeof(elk_inst) - size);

   return dst - (void *)p->store;
}

#define next_insn elk_next_insn
elk_inst *
elk_next_insn(struct elk_codegen *p, unsigned opcode)
{
   elk_inst *insn = elk_append_insns(p, 1, sizeof(elk_inst));

   memset(insn, 0, sizeof(*insn));
   elk_inst_set_opcode(p->isa, insn, opcode);

   /* Apply the default instruction state */
   elk_inst_set_state(p->isa, insn, p->current);

   return insn;
}

void
elk_add_reloc(struct elk_codegen *p, uint32_t id,
              enum elk_shader_reloc_type type,
              uint32_t offset, uint32_t delta)
{
   if (p->num_relocs + 1 > p->reloc_array_size) {
      p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
      p->relocs = reralloc(p->mem_ctx, p->relocs,
                           struct elk_shader_reloc, p->reloc_array_size);
   }

   p->relocs[p->num_relocs++] = (struct elk_shader_reloc) {
      .id = id,
      .type = type,
      .offset = offset,
      .delta = delta,
   };
}

static elk_inst *
elk_alu1(struct elk_codegen *p, unsigned opcode,
         struct elk_reg dest, struct elk_reg src)
{
   elk_inst *insn = next_insn(p, opcode);
   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src);
   return insn;
}

static elk_inst *
elk_alu2(struct elk_codegen *p, unsigned opcode,
         struct elk_reg dest, struct elk_reg src0, struct elk_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != ELK_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != ELK_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   elk_inst *insn = next_insn(p, opcode);
   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src0);
   elk_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct elk_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
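
/* Worked example (illustrative): a register with subnr == 8 (byte units)
 * encodes as component 8 / 4 == 2, i.e. the third 32-bit channel of the
 * register.
 */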

static enum gfx10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct intel_device_info *devinfo,
                       enum elk_vertical_stride vstride)
{
   switch (vstride) {
   case ELK_VERTICAL_STRIDE_0:
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case ELK_VERTICAL_STRIDE_1:
      assert(devinfo->ver >= 12);
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case ELK_VERTICAL_STRIDE_2:
      assert(devinfo->ver < 12);
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case ELK_VERTICAL_STRIDE_4:
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case ELK_VERTICAL_STRIDE_8:
   case ELK_VERTICAL_STRIDE_16:
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}


static enum gfx10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum elk_horizontal_stride hstride)
{
   switch (hstride) {
   case ELK_HORIZONTAL_STRIDE_0:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case ELK_HORIZONTAL_STRIDE_1:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case ELK_HORIZONTAL_STRIDE_2:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case ELK_HORIZONTAL_STRIDE_4:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}

static elk_inst *
elk_alu3(struct elk_codegen *p, unsigned opcode, struct elk_reg dest,
         struct elk_reg src0, struct elk_reg src1, struct elk_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < XE2_MAX_GRF);

   if (devinfo->ver >= 10)
      assert(!(src0.file == ELK_IMMEDIATE_VALUE &&
               src2.file == ELK_IMMEDIATE_VALUE));

   assert(src0.file == ELK_IMMEDIATE_VALUE || src0.nr < XE2_MAX_GRF);
   assert(src1.file != ELK_IMMEDIATE_VALUE && src1.nr < XE2_MAX_GRF);
   assert(src2.file == ELK_IMMEDIATE_VALUE || src2.nr < XE2_MAX_GRF);
   assert(dest.address_mode == ELK_ADDRESS_DIRECT);
   assert(src0.address_mode == ELK_ADDRESS_DIRECT);
   assert(src1.address_mode == ELK_ADDRESS_DIRECT);
   assert(src2.address_mode == ELK_ADDRESS_DIRECT);

   if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
              dest.nr == ELK_ARF_ACCUMULATOR));

      if (devinfo->ver >= 12) {
         elk_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         elk_inst_set_3src_dst_reg_nr(devinfo, inst, phys_nr(devinfo, dest));
      } else {
         if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE) {
            elk_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              ELK_ALIGN1_3SRC_ACCUMULATOR);
            elk_inst_set_3src_dst_reg_nr(devinfo, inst, ELK_ARF_ACCUMULATOR);
         } else {
            elk_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            elk_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      elk_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest) / 8);

      elk_inst_set_3src_a1_dst_hstride(devinfo, inst, ELK_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (elk_reg_type_is_floating_point(dest.type)) {
         elk_inst_set_3src_a1_exec_type(devinfo, inst,
                                        ELK_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         elk_inst_set_3src_a1_exec_type(devinfo, inst,
                                        ELK_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      elk_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      elk_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      elk_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      elk_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == ELK_IMMEDIATE_VALUE) {
         elk_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         elk_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         elk_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         elk_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, phys_subnr(devinfo, src0));
         if (src0.type == ELK_REGISTER_TYPE_NF) {
            elk_inst_set_3src_src0_reg_nr(devinfo, inst, ELK_ARF_ACCUMULATOR);
         } else {
            elk_inst_set_3src_src0_reg_nr(devinfo, inst, phys_nr(devinfo, src0));
         }
         elk_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         elk_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      elk_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      elk_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      elk_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, phys_subnr(devinfo, src1));
      if (src1.file == ELK_ARCHITECTURE_REGISTER_FILE) {
         elk_inst_set_3src_src1_reg_nr(devinfo, inst, ELK_ARF_ACCUMULATOR);
      } else {
         elk_inst_set_3src_src1_reg_nr(devinfo, inst, phys_nr(devinfo, src1));
      }
      elk_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      elk_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == ELK_IMMEDIATE_VALUE) {
         elk_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         elk_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         elk_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, phys_subnr(devinfo, src2));
         elk_inst_set_3src_src2_reg_nr(devinfo, inst, phys_nr(devinfo, src2));
         elk_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         elk_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == ELK_GENERAL_REGISTER_FILE ||
             src0.file == ELK_IMMEDIATE_VALUE ||
             (src0.file == ELK_ARCHITECTURE_REGISTER_FILE &&
              src0.type == ELK_REGISTER_TYPE_NF));
      assert(src1.file == ELK_GENERAL_REGISTER_FILE ||
             (src1.file == ELK_ARCHITECTURE_REGISTER_FILE &&
              src1.nr == ELK_ARF_ACCUMULATOR));
      assert(src2.file == ELK_GENERAL_REGISTER_FILE ||
             src2.file == ELK_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         if (src0.file == ELK_IMMEDIATE_VALUE) {
            elk_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            elk_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         elk_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == ELK_IMMEDIATE_VALUE) {
            elk_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            elk_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         elk_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == ELK_GENERAL_REGISTER_FILE ?
                                            ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            ELK_ALIGN1_3SRC_IMMEDIATE_VALUE);
         elk_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == ELK_GENERAL_REGISTER_FILE ?
                                            ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            ELK_ALIGN1_3SRC_ACCUMULATOR);
         elk_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == ELK_GENERAL_REGISTER_FILE ?
                                            ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            ELK_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             dest.file == ELK_MESSAGE_REGISTER_FILE);
      assert(dest.type == ELK_REGISTER_TYPE_F ||
             dest.type == ELK_REGISTER_TYPE_DF ||
             dest.type == ELK_REGISTER_TYPE_D ||
             dest.type == ELK_REGISTER_TYPE_UD ||
             (dest.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         elk_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == ELK_MESSAGE_REGISTER_FILE);
      }
      elk_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      elk_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      elk_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == ELK_GENERAL_REGISTER_FILE);
      elk_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      elk_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      elk_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      elk_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      elk_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      elk_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == ELK_VERTICAL_STRIDE_0);

      assert(src1.file == ELK_GENERAL_REGISTER_FILE);
      elk_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      elk_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      elk_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      elk_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      elk_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      elk_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == ELK_VERTICAL_STRIDE_0);

      assert(src2.file == ELK_GENERAL_REGISTER_FILE);
      elk_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      elk_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      elk_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      elk_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      elk_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      elk_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == ELK_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters
          * ensure that all four types are float.  The BFE and BFI2 emitters,
          * however, may send us mixed D and UD types and want us to ignore
          * that and use the destination type.
          */
         elk_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         elk_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == ELK_REGISTER_TYPE_HF)
            elk_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == ELK_REGISTER_TYPE_HF)
            elk_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}

static elk_inst *
elk_dpas_three_src(struct elk_codegen *p, enum elk_gfx12_systolic_depth opcode,
                   unsigned sdepth, unsigned rcount, struct elk_reg dest,
                   struct elk_reg src0, struct elk_reg src1, struct elk_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *inst = next_insn(p, opcode);

   assert(dest.file == ELK_GENERAL_REGISTER_FILE);
   elk_inst_set_dpas_3src_dst_reg_file(devinfo, inst,
                                       ELK_GENERAL_REGISTER_FILE);
   elk_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr);
   elk_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr);

   if (elk_reg_type_is_floating_point(dest.type)) {
      elk_inst_set_dpas_3src_exec_type(devinfo, inst,
                                       ELK_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
   } else {
      elk_inst_set_dpas_3src_exec_type(devinfo, inst,
                                       ELK_ALIGN1_3SRC_EXEC_TYPE_INT);
   }

   elk_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth);
   elk_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1);

   elk_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type);
   elk_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type);
   elk_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type);
   elk_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type);

   assert(src0.file == ELK_GENERAL_REGISTER_FILE ||
          (src0.file == ELK_ARCHITECTURE_REGISTER_FILE &&
           src0.nr == ELK_ARF_NULL));

   elk_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file);
   elk_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr);
   elk_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr);

   assert(src1.file == ELK_GENERAL_REGISTER_FILE);

   elk_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file);
   elk_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, src1.nr);
   elk_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr);
   elk_inst_set_dpas_3src_src1_subbyte(devinfo, inst, ELK_SUB_BYTE_PRECISION_NONE);

   assert(src2.file == ELK_GENERAL_REGISTER_FILE);

   elk_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file);
   elk_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr);
   elk_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr);
   elk_inst_set_dpas_3src_src2_subbyte(devinfo, inst, ELK_SUB_BYTE_PRECISION_NONE);

   return inst;
}

/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                                  \
elk_inst *elk_##OP(struct elk_codegen *p,                         \
                   struct elk_reg dest,                           \
                   struct elk_reg src0)                           \
{                                                                 \
   return elk_alu1(p, ELK_OPCODE_##OP, dest, src0);               \
}

#define ALU2(OP)                                                  \
elk_inst *elk_##OP(struct elk_codegen *p,                         \
                   struct elk_reg dest,                           \
                   struct elk_reg src0,                           \
                   struct elk_reg src1)                           \
{                                                                 \
   return elk_alu2(p, ELK_OPCODE_##OP, dest, src0, src1);         \
}

#define ALU3(OP)                                                  \
elk_inst *elk_##OP(struct elk_codegen *p,                         \
                   struct elk_reg dest,                           \
                   struct elk_reg src0,                           \
                   struct elk_reg src1,                           \
                   struct elk_reg src2)                           \
{                                                                 \
   if (p->current->access_mode == ELK_ALIGN_16) {                 \
      if (src0.vstride == ELK_VERTICAL_STRIDE_0)                  \
         src0.swizzle = ELK_SWIZZLE_XXXX;                         \
      if (src1.vstride == ELK_VERTICAL_STRIDE_0)                  \
         src1.swizzle = ELK_SWIZZLE_XXXX;                         \
      if (src2.vstride == ELK_VERTICAL_STRIDE_0)                  \
         src2.swizzle = ELK_SWIZZLE_XXXX;                         \
   }                                                              \
   return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2);   \
}

#define ALU3F(OP)                                                 \
elk_inst *elk_##OP(struct elk_codegen *p,                         \
                   struct elk_reg dest,                           \
                   struct elk_reg src0,                           \
                   struct elk_reg src1,                           \
                   struct elk_reg src2)                           \
{                                                                 \
   assert(dest.type == ELK_REGISTER_TYPE_F ||                     \
          dest.type == ELK_REGISTER_TYPE_DF);                     \
   if (dest.type == ELK_REGISTER_TYPE_F) {                        \
      assert(src0.type == ELK_REGISTER_TYPE_F);                   \
      assert(src1.type == ELK_REGISTER_TYPE_F);                   \
      assert(src2.type == ELK_REGISTER_TYPE_F);                   \
   } else if (dest.type == ELK_REGISTER_TYPE_DF) {                \
      assert(src0.type == ELK_REGISTER_TYPE_DF);                  \
      assert(src1.type == ELK_REGISTER_TYPE_DF);                  \
      assert(src2.type == ELK_REGISTER_TYPE_DF);                  \
   }                                                              \
                                                                  \
   if (p->current->access_mode == ELK_ALIGN_16) {                 \
      if (src0.vstride == ELK_VERTICAL_STRIDE_0)                  \
         src0.swizzle = ELK_SWIZZLE_XXXX;                         \
      if (src1.vstride == ELK_VERTICAL_STRIDE_0)                  \
         src1.swizzle = ELK_SWIZZLE_XXXX;                         \
      if (src2.vstride == ELK_VERTICAL_STRIDE_0)                  \
         src2.swizzle = ELK_SWIZZLE_XXXX;                         \
   }                                                              \
   return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2);   \
}
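
/* For reference (illustrative): ALU2(AND) below expands to a public emitter
 * with the signature
 *
 *    elk_inst *elk_AND(struct elk_codegen *p, struct elk_reg dest,
 *                      struct elk_reg src0, struct elk_reg src1);
 *
 * which simply forwards to elk_alu2() with ELK_OPCODE_AND.
 */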

ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)

elk_inst *
elk_MOV(struct elk_codegen *p, struct elk_reg dest, struct elk_reg src0)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
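   /* Worked example (illustrative): an <8;8,1>:F source region becomes
    * <1;2,0>:F here, which reads elements 0,0,1,1,..., fetching each float
    * twice so the ignored odd channels do no harm.
    */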
   if (devinfo->verx10 == 70 &&
       elk_get_default_access_mode(p) == ELK_ALIGN_1 &&
       dest.type == ELK_REGISTER_TYPE_DF &&
       (src0.type == ELK_REGISTER_TYPE_F ||
        src0.type == ELK_REGISTER_TYPE_D ||
        src0.type == ELK_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = ELK_WIDTH_2;
      src0.hstride = ELK_HORIZONTAL_STRIDE_0;
   }

   return elk_alu1(p, ELK_OPCODE_MOV, dest, src0);
}

elk_inst *
elk_ADD(struct elk_codegen *p, struct elk_reg dest,
        struct elk_reg src0, struct elk_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == ELK_REGISTER_TYPE_F ||
       (src0.file == ELK_IMMEDIATE_VALUE &&
        src0.type == ELK_REGISTER_TYPE_VF)) {
      assert(src1.type != ELK_REGISTER_TYPE_UD);
      assert(src1.type != ELK_REGISTER_TYPE_D);
   }

   if (src1.type == ELK_REGISTER_TYPE_F ||
       (src1.file == ELK_IMMEDIATE_VALUE &&
        src1.type == ELK_REGISTER_TYPE_VF)) {
      assert(src0.type != ELK_REGISTER_TYPE_UD);
      assert(src0.type != ELK_REGISTER_TYPE_D);
   }

   return elk_alu2(p, ELK_OPCODE_ADD, dest, src0, src1);
}

elk_inst *
elk_AVG(struct elk_codegen *p, struct elk_reg dest,
        struct elk_reg src0, struct elk_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case ELK_REGISTER_TYPE_B:
   case ELK_REGISTER_TYPE_UB:
   case ELK_REGISTER_TYPE_W:
   case ELK_REGISTER_TYPE_UW:
   case ELK_REGISTER_TYPE_D:
   case ELK_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for elk_AVG");
   }

   return elk_alu2(p, ELK_OPCODE_AVG, dest, src0, src1);
}

elk_inst *
elk_MUL(struct elk_codegen *p, struct elk_reg dest,
        struct elk_reg src0, struct elk_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == ELK_REGISTER_TYPE_D ||
       src0.type == ELK_REGISTER_TYPE_UD ||
       src1.type == ELK_REGISTER_TYPE_D ||
       src1.type == ELK_REGISTER_TYPE_UD) {
      assert(dest.type != ELK_REGISTER_TYPE_F);
   }

   if (src0.type == ELK_REGISTER_TYPE_F ||
       (src0.file == ELK_IMMEDIATE_VALUE &&
        src0.type == ELK_REGISTER_TYPE_VF)) {
      assert(src1.type != ELK_REGISTER_TYPE_UD);
      assert(src1.type != ELK_REGISTER_TYPE_D);
   }

   if (src1.type == ELK_REGISTER_TYPE_F ||
       (src1.file == ELK_IMMEDIATE_VALUE &&
        src1.type == ELK_REGISTER_TYPE_VF)) {
      assert(src0.type != ELK_REGISTER_TYPE_UD);
      assert(src0.type != ELK_REGISTER_TYPE_D);
   }

   assert(src0.file != ELK_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != ELK_ARF_ACCUMULATOR);
   assert(src1.file != ELK_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != ELK_ARF_ACCUMULATOR);

   return elk_alu2(p, ELK_OPCODE_MUL, dest, src0, src1);
}

elk_inst *
elk_LINE(struct elk_codegen *p, struct elk_reg dest,
         struct elk_reg src0, struct elk_reg src1)
{
   src0.vstride = ELK_VERTICAL_STRIDE_0;
   src0.width = ELK_WIDTH_1;
   src0.hstride = ELK_HORIZONTAL_STRIDE_0;
   return elk_alu2(p, ELK_OPCODE_LINE, dest, src0, src1);
}

elk_inst *
elk_PLN(struct elk_codegen *p, struct elk_reg dest,
        struct elk_reg src0, struct elk_reg src1)
{
   src0.vstride = ELK_VERTICAL_STRIDE_0;
   src0.width = ELK_WIDTH_1;
   src0.hstride = ELK_HORIZONTAL_STRIDE_0;
   src1.vstride = ELK_VERTICAL_STRIDE_8;
   src1.width = ELK_WIDTH_8;
   src1.hstride = ELK_HORIZONTAL_STRIDE_1;
   return elk_alu2(p, ELK_OPCODE_PLN, dest, src0, src1);
}

elk_inst *
elk_DPAS(struct elk_codegen *p, enum elk_gfx12_systolic_depth sdepth,
         unsigned rcount, struct elk_reg dest, struct elk_reg src0,
         struct elk_reg src1, struct elk_reg src2)
{
   return elk_dpas_three_src(p, ELK_OPCODE_DPAS, sdepth, rcount, dest, src0,
                             src1, src2);
}

elk_inst *
elk_F32TO16(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
{
   assert(p->devinfo->ver == 7);

   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode.  Gfx7 (only) does zero out the high 16 bits in Align16
    * mode as an undocumented feature.
    */
   if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
      assert(dst.type == ELK_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == ELK_REGISTER_TYPE_W ||
             dst.type == ELK_REGISTER_TYPE_UW);
   }

   return elk_alu1(p, ELK_OPCODE_F32TO16, dst, src);
}

elk_inst *
elk_F16TO32(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
{
   assert(p->devinfo->ver == 7);

   if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
      assert(src.type == ELK_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *    Because this instruction does not have a 16-bit floating-point
       *    type, the source data type must be Word (W). The destination type
       *    must be F (Float).
       */
      assert(src.type == ELK_REGISTER_TYPE_W ||
             src.type == ELK_REGISTER_TYPE_UW);
   }

   return elk_alu1(p, ELK_OPCODE_F16TO32, dst, src);
}


void elk_NOP(struct elk_codegen *p)
{
   elk_inst *insn = next_insn(p, ELK_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   elk_inst_set_opcode(p->isa, insn, ELK_OPCODE_NOP);
}

void elk_SYNC(struct elk_codegen *p, enum tgl_sync_function func)
{
   elk_inst *insn = next_insn(p, ELK_OPCODE_SYNC);
   elk_inst_set_cond_modifier(p->devinfo, insn, func);
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

elk_inst *
elk_JMPI(struct elk_codegen *p, struct elk_reg index,
         unsigned predicate_control)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct elk_reg ip = elk_ip_reg();
   elk_inst *inst = elk_alu2(p, ELK_OPCODE_JMPI, ip, ip, index);

   elk_inst_set_exec_size(devinfo, inst, ELK_EXECUTE_1);
   elk_inst_set_qtr_control(devinfo, inst, ELK_COMPRESSION_NONE);
   elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
   elk_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct elk_codegen *p, elk_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static elk_inst *
pop_if_stack(struct elk_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct elk_codegen *p, elk_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static elk_inst *
get_inner_do_insn(struct elk_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, eg. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
elk_inst *
elk_IF(struct elk_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   insn = next_insn(p, ELK_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      elk_set_dest(p, insn, elk_ip_reg());
      elk_set_src0(p, insn, elk_ip_reg());
      elk_set_src1(p, insn, elk_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      elk_set_dest(p, insn, elk_imm_w(0));
      elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
      elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      elk_set_src1(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      elk_set_src1(p, insn, elk_imm_w(0));
      elk_inst_set_jip(devinfo, insn, 0);
      elk_inst_set_uip(devinfo, insn, 0);
   } else {
      elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         elk_set_src0(p, insn, elk_imm_d(0));
      elk_inst_set_jip(devinfo, insn, 0);
      elk_inst_set_uip(devinfo, insn, 0);
   }

   elk_inst_set_exec_size(devinfo, insn, execute_size);
   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
   elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NORMAL);
   elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1487
1488 /* This function is only used for gfx6-style IF instructions with an
1489 * embedded comparison (conditional modifier). It is not used on gfx7.
1490 */
1491 elk_inst *
1492 elk_gfx6_IF(struct elk_codegen *p, enum elk_conditional_mod conditional,
1493 struct elk_reg src0, struct elk_reg src1)
1494 {
1495 const struct intel_device_info *devinfo = p->devinfo;
1496 elk_inst *insn;
1497
1498 insn = next_insn(p, ELK_OPCODE_IF);
1499
1500 elk_set_dest(p, insn, elk_imm_w(0));
1501 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1502 elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1503 elk_set_src0(p, insn, src0);
1504 elk_set_src1(p, insn, src1);
1505
1506 assert(elk_inst_qtr_control(devinfo, insn) == ELK_COMPRESSION_NONE);
1507 assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
1508 elk_inst_set_cond_modifier(devinfo, insn, conditional);
1509
1510 push_if_stack(p, insn);
1511 return insn;
1512 }
1513
1514 /**
1515 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1516 */
1517 static void
1518 convert_IF_ELSE_to_ADD(struct elk_codegen *p,
1519 elk_inst *if_inst, elk_inst *else_inst)
1520 {
1521 const struct intel_device_info *devinfo = p->devinfo;
1522
1523 /* The next instruction (where the ENDIF would be, if it existed) */
1524 elk_inst *next_inst = &p->store[p->nr_insn];
1525
1526 assert(p->single_program_flow);
1527 assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
1528 assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);
1529 assert(elk_inst_exec_size(devinfo, if_inst) == ELK_EXECUTE_1);
1530
1531 /* Convert IF to an ADD instruction that moves the instruction pointer
1532 * to the first instruction of the ELSE block. If there is no ELSE
1533 * block, point to where ENDIF would be. Reverse the predicate.
1534 *
1535 * There's no need to execute an ENDIF since we don't need to do any
1536 * stack operations, and if we're currently executing, we just want to
1537 * continue normally.
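 *
 * For example (hypothetical instruction stream; the jump immediates are
 * byte offsets, 16 bytes per native instruction):
 *
 *    (+f0.0) if             becomes   (-f0.0) add ip, ip, <past else>
 *            <then block>                     <then block>
 *            else                             add ip, ip, <to endif>
 *            <else block>                     <else block>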
1538 */
1539 elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_ADD);
1540 elk_inst_set_pred_inv(devinfo, if_inst, true);
1541
1542 if (else_inst != NULL) {
1543 /* Convert ELSE to an ADD instruction that points where the ENDIF
1544 * would be.
1545 */
1546 elk_inst_set_opcode(p->isa, else_inst, ELK_OPCODE_ADD);
1547
1548 elk_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1549 elk_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1550 } else {
1551 elk_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1552 }
1553 }
1554
1555 /**
1556 * Patch IF and ELSE instructions with appropriate jump targets.
1557 */
1558 static void
1559 patch_IF_ELSE(struct elk_codegen *p,
1560 elk_inst *if_inst, elk_inst *else_inst, elk_inst *endif_inst)
1561 {
1562 const struct intel_device_info *devinfo = p->devinfo;
1563
1564 /* We shouldn't be patching IF and ELSE instructions in single program flow
1565 * mode when gen < 6, because in single program flow mode on those
1566 * platforms, we convert flow control instructions to conditional ADDs that
1567 * operate on IP (see elk_ENDIF).
1568 *
1569 * However, on Gfx6, writing to IP doesn't work in single program flow mode
1570 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1571 * not be updated by non-flow control instructions."). And on later
1572 * platforms, there is no significant benefit to converting control flow
1573 * instructions to conditional ADDs. So we do patch IF and ELSE
1574 * instructions in single program flow mode on those platforms.
1575 */
1576 if (devinfo->ver < 6)
1577 assert(!p->single_program_flow);
1578
1579 assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
1580 assert(endif_inst != NULL);
1581 assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);
1582
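/* elk_jump_scale() returns the number of jump units per 16-byte
 * instruction: whole instructions on gfx4, 64-bit compaction chunks on
 * gfx5-7, and bytes on gfx8+.
 */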
1583 unsigned br = elk_jump_scale(devinfo);
1584
1585 assert(elk_inst_opcode(p->isa, endif_inst) == ELK_OPCODE_ENDIF);
1586 elk_inst_set_exec_size(devinfo, endif_inst, elk_inst_exec_size(devinfo, if_inst));
1587
1588 if (else_inst == NULL) {
1589 /* Patch IF -> ENDIF */
1590 if (devinfo->ver < 6) {
1591 /* Turn it into an IFF, which means no mask stack operations for
1592 * all-false and jumping past the ENDIF.
1593 */
1594 elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_IFF);
1595 elk_inst_set_gfx4_jump_count(devinfo, if_inst,
1596 br * (endif_inst - if_inst + 1));
1597 elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1598 } else if (devinfo->ver == 6) {
1599 /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
1600 elk_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1601 } else {
1602 elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1603 elk_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1604 }
1605 } else {
1606 elk_inst_set_exec_size(devinfo, else_inst, elk_inst_exec_size(devinfo, if_inst));
1607
1608 /* Patch IF -> ELSE */
1609 if (devinfo->ver < 6) {
1610 elk_inst_set_gfx4_jump_count(devinfo, if_inst,
1611 br * (else_inst - if_inst));
1612 elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1613 } else if (devinfo->ver == 6) {
1614 elk_inst_set_gfx6_jump_count(devinfo, if_inst,
1615 br * (else_inst - if_inst + 1));
1616 }
1617
1618 /* Patch ELSE -> ENDIF */
1619 if (devinfo->ver < 6) {
1620 /* ELK_OPCODE_ELSE pre-gfx6 should point just past the
1621 * matching ENDIF.
1622 */
1623 elk_inst_set_gfx4_jump_count(devinfo, else_inst,
1624 br * (endif_inst - else_inst + 1));
1625 elk_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
1626 } else if (devinfo->ver == 6) {
1627 /* ELK_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
1628 elk_inst_set_gfx6_jump_count(devinfo, else_inst,
1629 br * (endif_inst - else_inst));
1630 } else {
1631 /* The IF instruction's JIP should point just past the ELSE */
1632 elk_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1633 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1634 elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1635
1636 if (devinfo->ver >= 8 && devinfo->ver < 11) {
1637 /* Set the ELSE instruction to use branch_ctrl with a join
1638 * jump target pointing at the NOP inserted right before
1639 * the ENDIF instruction in order to make sure it is
1640 * executed in all cases, since attempting to do the same
1641 * as on other generations could cause the EU to jump to
1642 * the instruction immediately after the ENDIF due to
1643 * Wa_220160235, which could cause the program to continue
1644 * running with all channels disabled.
1645 */
1646 elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst - 1));
1647 elk_inst_set_branch_control(devinfo, else_inst, true);
1648 } else {
1649 elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1650 }
1651
1652 if (devinfo->ver >= 8) {
1653 /* Since we don't set branch_ctrl on Gfx11+, the ELSE's
1654 * JIP and UIP both should point to ENDIF on those
1655 * platforms.
1656 */
1657 elk_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1658 }
1659 }
1660 }
1661 }
1662
1663 void
1664 elk_ELSE(struct elk_codegen *p)
1665 {
1666 const struct intel_device_info *devinfo = p->devinfo;
1667 elk_inst *insn;
1668
1669 insn = next_insn(p, ELK_OPCODE_ELSE);
1670
1671 if (devinfo->ver < 6) {
1672 elk_set_dest(p, insn, elk_ip_reg());
1673 elk_set_src0(p, insn, elk_ip_reg());
1674 elk_set_src1(p, insn, elk_imm_d(0x0));
1675 } else if (devinfo->ver == 6) {
1676 elk_set_dest(p, insn, elk_imm_w(0));
1677 elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
1678 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1679 elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1680 } else if (devinfo->ver == 7) {
1681 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1682 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1683 elk_set_src1(p, insn, elk_imm_w(0));
1684 elk_inst_set_jip(devinfo, insn, 0);
1685 elk_inst_set_uip(devinfo, insn, 0);
1686 } else {
1687 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1688 if (devinfo->ver < 12)
1689 elk_set_src0(p, insn, elk_imm_d(0));
1690 elk_inst_set_jip(devinfo, insn, 0);
1691 elk_inst_set_uip(devinfo, insn, 0);
1692 }
1693
1694 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1695 elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1696 if (!p->single_program_flow && devinfo->ver < 6)
1697 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1698
1699 push_if_stack(p, insn);
1700 }
1701
1702 void
1703 elk_ENDIF(struct elk_codegen *p)
1704 {
1705 const struct intel_device_info *devinfo = p->devinfo;
1706 elk_inst *insn = NULL;
1707 elk_inst *else_inst = NULL;
1708 elk_inst *if_inst = NULL;
1709 elk_inst *tmp;
1710 bool emit_endif = true;
1711
1712 assert(p->if_stack_depth > 0);
1713
1714 if (devinfo->ver >= 8 && devinfo->ver < 11 &&
1715 elk_inst_opcode(p->isa, &p->store[p->if_stack[
1716 p->if_stack_depth - 1]]) == ELK_OPCODE_ELSE) {
1717 /* Insert a NOP to be specified as the join instruction within the
1718 * ELSE block, which is valid for an ELSE instruction with
1719 * branch_ctrl on. The ELSE instruction will be set to jump
1720 * here instead of to the ENDIF instruction, since attempting to
1721 * do the latter would prevent the ENDIF from being executed in
1722 * some cases due to Wa_220160235, which could cause the program
1723 * to continue running with all channels disabled.
1724 */
1725 elk_NOP(p);
1726 }
1727
1728 /* In single program flow mode, we can express IF and ELSE instructions
1729 * equivalently as ADD instructions that operate on IP. On platforms prior
1730 * to Gfx6, flow control instructions cause an implied thread switch, so
1731 * this is a significant savings.
1732 *
1733 * However, on Gfx6, writing to IP doesn't work in single program flow mode
1734 * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1735 * not be updated by non-flow control instructions."). And on later
1736 * platforms, there is no significant benefit to converting control flow
1737 * instructions to conditional ADDs. So we only do this trick on Gfx4 and
1738 * Gfx5.
1739 */
1740 if (devinfo->ver < 6 && p->single_program_flow)
1741 emit_endif = false;
1742
1743 /*
1744 * A single next_insn() may change the base address of the instruction
1745 * store memory (p->store), so call it first, before turning a stored
1746 * index back into an instruction pointer.
1747 */
1748 if (emit_endif)
1749 insn = next_insn(p, ELK_OPCODE_ENDIF);
1750
1751 /* Pop the IF and (optional) ELSE instructions from the stack */
1752 p->if_depth_in_loop[p->loop_stack_depth]--;
1753 tmp = pop_if_stack(p);
1754 if (elk_inst_opcode(p->isa, tmp) == ELK_OPCODE_ELSE) {
1755 else_inst = tmp;
1756 tmp = pop_if_stack(p);
1757 }
1758 if_inst = tmp;
1759
1760 if (!emit_endif) {
1761 /* ENDIF is useless; don't bother emitting it. */
1762 convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1763 return;
1764 }
1765
1766 if (devinfo->ver < 6) {
1767 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1768 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1769 elk_set_src1(p, insn, elk_imm_d(0x0));
1770 } else if (devinfo->ver == 6) {
1771 elk_set_dest(p, insn, elk_imm_w(0));
1772 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1773 elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1774 } else if (devinfo->ver == 7) {
1775 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1776 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1777 elk_set_src1(p, insn, elk_imm_w(0));
1778 } else {
1779 elk_set_src0(p, insn, elk_imm_d(0));
1780 }
1781
1782 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1783 elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
1784 if (devinfo->ver < 6)
1785 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
1786
1787 /* Also pop item off the stack in the endif instruction: */
1788 if (devinfo->ver < 6) {
1789 elk_inst_set_gfx4_jump_count(devinfo, insn, 0);
1790 elk_inst_set_gfx4_pop_count(devinfo, insn, 1);
1791 } else if (devinfo->ver == 6) {
1792 elk_inst_set_gfx6_jump_count(devinfo, insn, 2);
1793 } else {
1794 elk_inst_set_jip(devinfo, insn, 2);
1795 }
1796 patch_IF_ELSE(p, if_inst, else_inst, insn);
1797 }
1798
1799 elk_inst *
1800 elk_BREAK(struct elk_codegen *p)
1801 {
1802 const struct intel_device_info *devinfo = p->devinfo;
1803 elk_inst *insn;
1804
1805 insn = next_insn(p, ELK_OPCODE_BREAK);
1806 if (devinfo->ver >= 8) {
1807 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1808 elk_set_src0(p, insn, elk_imm_d(0x0));
1809 } else if (devinfo->ver >= 6) {
1810 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1811 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1812 elk_set_src1(p, insn, elk_imm_d(0x0));
1813 } else {
1814 elk_set_dest(p, insn, elk_ip_reg());
1815 elk_set_src0(p, insn, elk_ip_reg());
1816 elk_set_src1(p, insn, elk_imm_d(0x0));
1817 elk_inst_set_gfx4_pop_count(devinfo, insn,
1818 p->if_depth_in_loop[p->loop_stack_depth]);
1819 }
1820 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1821 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1822
1823 return insn;
1824 }
1825
1826 elk_inst *
1827 elk_CONT(struct elk_codegen *p)
1828 {
1829 const struct intel_device_info *devinfo = p->devinfo;
1830 elk_inst *insn;
1831
1832 insn = next_insn(p, ELK_OPCODE_CONTINUE);
1833 elk_set_dest(p, insn, elk_ip_reg());
1834 if (devinfo->ver >= 8) {
1835 elk_set_src0(p, insn, elk_imm_d(0x0));
1836 } else {
1837 elk_set_src0(p, insn, elk_ip_reg());
1838 elk_set_src1(p, insn, elk_imm_d(0x0));
1839 }
1840
1841 if (devinfo->ver < 6) {
1842 elk_inst_set_gfx4_pop_count(devinfo, insn,
1843 p->if_depth_in_loop[p->loop_stack_depth]);
1844 }
1845 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1846 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1847 return insn;
1848 }
1849
1850 elk_inst *
1851 elk_HALT(struct elk_codegen *p)
1852 {
1853 const struct intel_device_info *devinfo = p->devinfo;
1854 elk_inst *insn;
1855
1856 insn = next_insn(p, ELK_OPCODE_HALT);
1857 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1858 if (devinfo->ver < 6) {
1859 /* From the Gfx4 PRM:
1860 *
1861 * "IP register must be put (for example, by the assembler) at <dst>
1862 * and <src0> locations."
1863 */
1864 elk_set_dest(p, insn, elk_ip_reg());
1865 elk_set_src0(p, insn, elk_ip_reg());
1866 elk_set_src1(p, insn, elk_imm_d(0x0)); /* exitcode updated later. */
1867 } else if (devinfo->ver < 8) {
1868 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1869 elk_set_src1(p, insn, elk_imm_d(0x0)); /* UIP and JIP, updated later. */
1870 } else if (devinfo->ver < 12) {
1871 elk_set_src0(p, insn, elk_imm_d(0x0));
1872 }
1873
1874 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1875 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1876 return insn;
1877 }
1878
1879 /* DO/WHILE loop:
1880 *
1881 * The DO/WHILE is just an unterminated loop -- break or continue are
1882 * used for control within the loop. We have a few ways they can be
1883 * done.
1884 *
1885 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1886 * jip and no DO instruction.
1887 *
1888 * For non-uniform control flow pre-gfx6, there's a DO instruction to
1889 * push the mask, and a WHILE to jump back, and BREAK to get out and
1890 * pop the mask.
1891 *
1892 * For gfx6, there's no more mask stack, so no need for DO. WHILE
1893 * just points back to the first instruction of the loop.
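 *
 * Typical loop emission (hypothetical usage sketch):
 *
 *    elk_DO(p, ELK_EXECUTE_8);
 *       ... loop body, using elk_BREAK(p) / elk_CONT(p) as needed ...
 *    elk_WHILE(p);
 *
 * elk_WHILE() pops the loop stack and, pre-gfx6, patches the jump counts
 * of the enclosed BREAK/CONT instructions via elk_patch_break_cont().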
1894 */
1895 elk_inst *
1896 elk_DO(struct elk_codegen *p, unsigned execute_size)
1897 {
1898 const struct intel_device_info *devinfo = p->devinfo;
1899
1900 if (devinfo->ver >= 6 || p->single_program_flow) {
1901 push_loop_stack(p, &p->store[p->nr_insn]);
1902 return &p->store[p->nr_insn];
1903 } else {
1904 elk_inst *insn = next_insn(p, ELK_OPCODE_DO);
1905
1906 push_loop_stack(p, insn);
1907
1908 /* Override the defaults for this instruction:
1909 */
1910 elk_set_dest(p, insn, elk_null_reg());
1911 elk_set_src0(p, insn, elk_null_reg());
1912 elk_set_src1(p, insn, elk_null_reg());
1913
1914 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
1915 elk_inst_set_exec_size(devinfo, insn, execute_size);
1916 elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE);
1917
1918 return insn;
1919 }
1920 }
1921
1922 /**
1923 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
1924 * instruction here.
1925 *
1926 * For gfx6+, see elk_set_uip_jip(), which doesn't care so much about the loop
1927 * nesting, since it can always just point to the end of the block/current loop.
1928 */
1929 static void
1930 elk_patch_break_cont(struct elk_codegen *p, elk_inst *while_inst)
1931 {
1932 const struct intel_device_info *devinfo = p->devinfo;
1933 elk_inst *do_inst = get_inner_do_insn(p);
1934 elk_inst *inst;
1935 unsigned br = elk_jump_scale(devinfo);
1936
1937 assert(devinfo->ver < 6);
1938
1939 for (inst = while_inst - 1; inst != do_inst; inst--) {
1940 /* If the jump count is != 0, that means that this instruction has already
1941 * been patched because it's part of a loop inside of the one we're
1942 * patching.
1943 */
1944 if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_BREAK &&
1945 elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
1946 elk_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1947 } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_CONTINUE &&
1948 elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
1949 elk_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
1950 }
1951 }
1952 }
1953
1954 elk_inst *
1955 elk_WHILE(struct elk_codegen *p)
1956 {
1957 const struct intel_device_info *devinfo = p->devinfo;
1958 elk_inst *insn, *do_insn;
1959 unsigned br = elk_jump_scale(devinfo);
1960
1961 if (devinfo->ver >= 6) {
1962 insn = next_insn(p, ELK_OPCODE_WHILE);
1963 do_insn = get_inner_do_insn(p);
1964
1965 if (devinfo->ver >= 8) {
1966 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1967 if (devinfo->ver < 12)
1968 elk_set_src0(p, insn, elk_imm_d(0));
1969 elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1970 } else if (devinfo->ver == 7) {
1971 elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1972 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1973 elk_set_src1(p, insn, elk_imm_w(0));
1974 elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1975 } else {
1976 elk_set_dest(p, insn, elk_imm_w(0));
1977 elk_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
1978 elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1979 elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
1980 }
1981
1982 elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
1983
1984 } else {
1985 if (p->single_program_flow) {
1986 insn = next_insn(p, ELK_OPCODE_ADD);
1987 do_insn = get_inner_do_insn(p);
1988
1989 elk_set_dest(p, insn, elk_ip_reg());
1990 elk_set_src0(p, insn, elk_ip_reg());
1991 elk_set_src1(p, insn, elk_imm_d((do_insn - insn) * 16));
1992 elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
1993 } else {
1994 insn = next_insn(p, ELK_OPCODE_WHILE);
1995 do_insn = get_inner_do_insn(p);
1996
1997 assert(elk_inst_opcode(p->isa, do_insn) == ELK_OPCODE_DO);
1998
1999 elk_set_dest(p, insn, elk_ip_reg());
2000 elk_set_src0(p, insn, elk_ip_reg());
2001 elk_set_src1(p, insn, elk_imm_d(0));
2002
2003 elk_inst_set_exec_size(devinfo, insn, elk_inst_exec_size(devinfo, do_insn));
2004 elk_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
2005 elk_inst_set_gfx4_pop_count(devinfo, insn, 0);
2006
2007 elk_patch_break_cont(p, insn);
2008 }
2009 }
2010 elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
2011
2012 p->loop_stack_depth--;
2013
2014 return insn;
2015 }
2016
2017 /* FORWARD JUMPS:
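 *
 * elk_land_fwd_jump() patches a previously emitted JMPI so that it lands
 * on the next instruction to be emitted. Hypothetical usage sketch:
 *
 *    ... emit a predicated JMPI ...
 *    int jmp_idx = p->nr_insn - 1;   // index of the JMPI just emitted
 *    ... emit the instructions to be skipped over ...
 *    elk_land_fwd_jump(p, jmp_idx);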
2018 */
2019 void elk_land_fwd_jump(struct elk_codegen *p, int jmp_insn_idx)
2020 {
2021 const struct intel_device_info *devinfo = p->devinfo;
2022 elk_inst *jmp_insn = &p->store[jmp_insn_idx];
2023 unsigned jmpi = 1;
2024
2025 if (devinfo->ver >= 5)
2026 jmpi = 2;
2027
2028 assert(elk_inst_opcode(p->isa, jmp_insn) == ELK_OPCODE_JMPI);
2029 assert(elk_inst_src1_reg_file(devinfo, jmp_insn) == ELK_IMMEDIATE_VALUE);
2030
2031 elk_inst_set_gfx4_jump_count(devinfo, jmp_insn,
2032 jmpi * (p->nr_insn - jmp_insn_idx - 1));
2033 }
2034
2035 /* To integrate with the above, it makes sense that the comparison
2036 * instruction should populate the flag register. It might be simpler
2037 * just to use the flag reg for most WM tasks?
2038 */
2039 void elk_CMP(struct elk_codegen *p,
2040 struct elk_reg dest,
2041 unsigned conditional,
2042 struct elk_reg src0,
2043 struct elk_reg src1)
2044 {
2045 const struct intel_device_info *devinfo = p->devinfo;
2046 elk_inst *insn = next_insn(p, ELK_OPCODE_CMP);
2047
2048 elk_inst_set_cond_modifier(devinfo, insn, conditional);
2049 elk_set_dest(p, insn, dest);
2050 elk_set_src0(p, insn, src0);
2051 elk_set_src1(p, insn, src1);
2052
2053 /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
2054 * page says:
2055 * "Any CMP instruction with a null destination must use a {switch}."
2056 *
2057 * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
2058 * mentioned on their work-arounds pages.
2059 */
2060 if (devinfo->ver == 7) {
2061 if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
2062 dest.nr == ELK_ARF_NULL) {
2063 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
2064 }
2065 }
2066 }
2067
2068 void elk_CMPN(struct elk_codegen *p,
2069 struct elk_reg dest,
2070 unsigned conditional,
2071 struct elk_reg src0,
2072 struct elk_reg src1)
2073 {
2074 const struct intel_device_info *devinfo = p->devinfo;
2075 elk_inst *insn = next_insn(p, ELK_OPCODE_CMPN);
2076
2077 elk_inst_set_cond_modifier(devinfo, insn, conditional);
2078 elk_set_dest(p, insn, dest);
2079 elk_set_src0(p, insn, src0);
2080 elk_set_src1(p, insn, src1);
2081
2082 /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
2083 * says:
2084 *
2085 * If the destination is the null register, the {Switch} instruction
2086 * option must be used.
2087 *
2088 * Page 77 of the Haswell PRM Volume 2b contains the same text.
2089 */
2090 if (devinfo->ver == 7) {
2091 if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
2092 dest.nr == ELK_ARF_NULL) {
2093 elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
2094 }
2095 }
2096 }
2097
2098 /***********************************************************************
2099 * Helpers for the various SEND message types:
2100 */
2101
2102 /** Extended math function, float[8].
2103 */
2104 void elk_gfx4_math(struct elk_codegen *p,
2105 struct elk_reg dest,
2106 unsigned function,
2107 unsigned msg_reg_nr,
2108 struct elk_reg src,
2109 unsigned precision)
2110 {
2111 const struct intel_device_info *devinfo = p->devinfo;
2112 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2113 unsigned data_type;
2114 if (has_scalar_region(src)) {
2115 data_type = ELK_MATH_DATA_SCALAR;
2116 } else {
2117 data_type = ELK_MATH_DATA_VECTOR;
2118 }
2119
2120 assert(devinfo->ver < 6);
2121
2122 /* Example code doesn't set predicate_control for send
2123 * instructions.
2124 */
2125 elk_inst_set_pred_control(devinfo, insn, 0);
2126 elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2127
2128 elk_set_dest(p, insn, dest);
2129 elk_set_src0(p, insn, src);
2130 elk_set_math_message(p,
2131 insn,
2132 function,
2133 src.type == ELK_REGISTER_TYPE_D,
2134 precision,
2135 data_type);
2136 }
2137
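/** Extended math function (gfx6+), using the dedicated MATH instruction
 * rather than a message to the shared math unit as on gfx4/5.
 */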
2138 void elk_gfx6_math(struct elk_codegen *p,
2139 struct elk_reg dest,
2140 unsigned function,
2141 struct elk_reg src0,
2142 struct elk_reg src1)
2143 {
2144 const struct intel_device_info *devinfo = p->devinfo;
2145 elk_inst *insn = next_insn(p, ELK_OPCODE_MATH);
2146
2147 assert(devinfo->ver >= 6);
2148
2149 assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
2150 (devinfo->ver >= 7 && dest.file == ELK_MESSAGE_REGISTER_FILE));
2151
2152 assert(dest.hstride == ELK_HORIZONTAL_STRIDE_1);
2153 if (devinfo->ver == 6) {
2154 assert(src0.hstride == ELK_HORIZONTAL_STRIDE_1);
2155 assert(src1.hstride == ELK_HORIZONTAL_STRIDE_1);
2156 }
2157
2158 if (function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2159 function == ELK_MATH_FUNCTION_INT_DIV_REMAINDER ||
2160 function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2161 assert(src0.type != ELK_REGISTER_TYPE_F);
2162 assert(src1.type != ELK_REGISTER_TYPE_F);
2163 assert(src1.file == ELK_GENERAL_REGISTER_FILE ||
2164 (devinfo->ver >= 8 && src1.file == ELK_IMMEDIATE_VALUE));
2165 /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
2166 * INT DIV function does not support source modifiers.
2167 */
2168 assert(!src0.negate);
2169 assert(!src0.abs);
2170 assert(!src1.negate);
2171 assert(!src1.abs);
2172 } else {
2173 assert(src0.type == ELK_REGISTER_TYPE_F ||
2174 (src0.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 9));
2175 assert(src1.type == ELK_REGISTER_TYPE_F ||
2176 (src1.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 9));
2177 }
2178
2179 /* Source modifiers are ignored for extended math instructions on Gfx6. */
2180 if (devinfo->ver == 6) {
2181 assert(!src0.negate);
2182 assert(!src0.abs);
2183 assert(!src1.negate);
2184 assert(!src1.abs);
2185 }
2186
2187 elk_inst_set_math_function(devinfo, insn, function);
2188
2189 elk_set_dest(p, insn, dest);
2190 elk_set_src0(p, insn, src0);
2191 elk_set_src1(p, insn, src1);
2192 }
2193
2194 /**
2195 * Return the right surface index to access the thread scratch space using
2196 * stateless dataport messages.
2197 */
2198 unsigned
2199 elk_scratch_surface_idx(const struct elk_codegen *p)
2200 {
2201 /* The scratch space is thread-local so IA coherency is unnecessary. */
2202 if (p->devinfo->ver >= 8)
2203 return GFX8_BTI_STATELESS_NON_COHERENT;
2204 else
2205 return ELK_BTI_STATELESS;
2206 }
2207
2208 /**
2209 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2210 * using a constant offset per channel.
2211 *
2212 * The offset must be aligned to oword size (16 bytes). Used for
2213 * register spilling.
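 *
 * Hypothetical usage sketch: spill two GRFs' worth of data, whose payload
 * the caller has placed in m14..m15, to scratch offset 64, staging the
 * header in m13:
 *
 *    elk_oword_block_write_scratch(p, elk_message_reg(13), 2, 64);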
2214 */
2215 void elk_oword_block_write_scratch(struct elk_codegen *p,
2216 struct elk_reg mrf,
2217 int num_regs,
2218 unsigned offset)
2219 {
2220 const struct intel_device_info *devinfo = p->devinfo;
2221 const unsigned target_cache =
2222 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2223 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2224 ELK_SFID_DATAPORT_WRITE);
2225 const struct tgl_swsb swsb = elk_get_default_swsb(p);
2226 uint32_t msg_type;
2227
2228 if (devinfo->ver >= 6)
2229 offset /= 16;
2230
2231 mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
2232
2233 const unsigned mlen = 1 + num_regs;
2234
2235 /* Set up the message header. This is g0, with g0.2 filled with
2236 * the offset. We don't want to leave our offset around in g0 or
2237 * it'll screw up texture samples, so set it up inside the message
2238 * reg.
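 *
 * Resulting message layout: m[mrf] holds this header and
 * m[mrf+1 .. mrf+num_regs] hold the spill data (already placed there
 * by the caller), hence mlen = 1 + num_regs above.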
2239 */
2240 {
2241 elk_push_insn_state(p);
2242 elk_set_default_exec_size(p, ELK_EXECUTE_8);
2243 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2244 elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
2245 elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2246
2247 elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2248
2249 /* set message header global offset field (reg 0, element 2) */
2250 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2251 elk_set_default_swsb(p, tgl_swsb_null());
2252 elk_MOV(p,
2253 retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
2254 mrf.nr,
2255 2), ELK_REGISTER_TYPE_UD),
2256 elk_imm_ud(offset));
2257
2258 elk_pop_insn_state(p);
2259 elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2260 }
2261
2262 {
2263 struct elk_reg dest;
2264 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2265 int send_commit_msg;
2266 struct elk_reg src_header = retype(elk_vec8_grf(0, 0),
2267 ELK_REGISTER_TYPE_UW);
2268
2269 elk_inst_set_sfid(devinfo, insn, target_cache);
2270 elk_inst_set_compression(devinfo, insn, false);
2271
2272 if (elk_inst_exec_size(devinfo, insn) >= 16)
2273 src_header = vec16(src_header);
2274
2275 assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
2276 if (devinfo->ver < 6)
2277 elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
2278
2279 /* Until gfx6, writes followed by reads from the same location
2280 * are not guaranteed to be ordered unless write_commit is set.
2281 * If set, then a no-op write is issued to the destination
2282 * register to set a dependency, and a read from the destination
2283 * can be used to ensure the ordering.
2284 *
2285 * For gfx6, only writes between different threads need ordering
2286 * protection. Our use of DP writes is all about register
2287 * spilling within a thread.
2288 */
2289 if (devinfo->ver >= 6) {
2290 dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2291 send_commit_msg = 0;
2292 } else {
2293 dest = src_header;
2294 send_commit_msg = 1;
2295 }
2296
2297 elk_set_dest(p, insn, dest);
2298 if (devinfo->ver >= 6) {
2299 elk_set_src0(p, insn, mrf);
2300 } else {
2301 elk_set_src0(p, insn, elk_null_reg());
2302 }
2303
2304 if (devinfo->ver >= 6)
2305 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2306 else
2307 msg_type = ELK_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2308
2309 elk_set_desc(p, insn,
2310 elk_message_desc(devinfo, mlen, send_commit_msg, true) |
2311 elk_dp_write_desc(devinfo, elk_scratch_surface_idx(p),
2312 ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2313 msg_type, send_commit_msg));
2314 }
2315 }
2316
2317
2318 /**
2319 * Read a block of owords (half a GRF each) from the scratch buffer
2320 * using a constant index per channel.
2321 *
2322 * Offset must be aligned to oword size (16 bytes). Used for register
2323 * spilling.
2324 */
2325 void
2326 elk_oword_block_read_scratch(struct elk_codegen *p,
2327 struct elk_reg dest,
2328 struct elk_reg mrf,
2329 int num_regs,
2330 unsigned offset)
2331 {
2332 const struct intel_device_info *devinfo = p->devinfo;
2333 const struct tgl_swsb swsb = elk_get_default_swsb(p);
2334
2335 if (devinfo->ver >= 6)
2336 offset /= 16;
2337
2338 if (p->devinfo->ver >= 7) {
2339 /* On gfx7 and above, we no longer have message registers and we can
2340 * send from any register we want. By using the destination register
2341 * for the message, we guarantee that the implied message write won't
2342 * accidentally overwrite anything. This has been a problem because
2343 * the MRF registers and source for the final FB write are both fixed
2344 * and may overlap.
2345 */
2346 mrf = retype(dest, ELK_REGISTER_TYPE_UD);
2347 } else {
2348 mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
2349 }
2350 dest = retype(dest, ELK_REGISTER_TYPE_UW);
2351
2352 const unsigned rlen = num_regs;
2353 const unsigned target_cache =
2354 (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2355 devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2356 ELK_SFID_DATAPORT_READ);
2357
2358 {
2359 elk_push_insn_state(p);
2360 elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2361 elk_set_default_exec_size(p, ELK_EXECUTE_8);
2362 elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
2363 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2364
2365 elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2366
2367 /* set message header global offset field (reg 0, element 2) */
2368 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2369 elk_set_default_swsb(p, tgl_swsb_null());
2370 elk_MOV(p, get_element_ud(mrf, 2), elk_imm_ud(offset));
2371
2372 elk_pop_insn_state(p);
2373 elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2374 }
2375
2376 {
2377 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2378
2379 elk_inst_set_sfid(devinfo, insn, target_cache);
2380 assert(elk_inst_pred_control(devinfo, insn) == 0);
2381 elk_inst_set_compression(devinfo, insn, false);
2382
2383 elk_set_dest(p, insn, dest); /* UW? */
2384 if (devinfo->ver >= 6) {
2385 elk_set_src0(p, insn, mrf);
2386 } else {
2387 elk_set_src0(p, insn, elk_null_reg());
2388 elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
2389 }
2390
2391 elk_set_desc(p, insn,
2392 elk_message_desc(devinfo, 1, rlen, true) |
2393 elk_dp_read_desc(devinfo, elk_scratch_surface_idx(p),
2394 ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2395 ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2396 ELK_DATAPORT_READ_TARGET_RENDER_CACHE));
2397 }
2398 }
2399
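/**
 * Read a block of registers from the scratch buffer using the gfx7+
 * dataport scratch-read message, with the HWord offset taken from the
 * message descriptor and the scratch base address from g0.5 of the header.
 */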
2400 void
2401 elk_gfx7_block_read_scratch(struct elk_codegen *p,
2402 struct elk_reg dest,
2403 int num_regs,
2404 unsigned offset)
2405 {
2406 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2407 assert(elk_inst_pred_control(p->devinfo, insn) == ELK_PREDICATE_NONE);
2408
2409 elk_set_dest(p, insn, retype(dest, ELK_REGISTER_TYPE_UW));
2410
2411 /* The HW requires that the header is present; this is to get the g0.5
2412 * scratch offset.
2413 */
2414 elk_set_src0(p, insn, elk_vec8_grf(0, 0));
2415
2416 /* According to the docs, offset is "A 12-bit HWord offset into the memory
2417 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
2418 * is 32 bytes, which happens to be the size of a register.
2419 */
2420 offset /= REG_SIZE;
2421 assert(offset < (1 << 12));
2422
2423 gfx7_set_dp_scratch_message(p, insn,
2424 false, /* scratch read */
2425 false, /* OWords */
2426 false, /* invalidate after read */
2427 num_regs,
2428 offset,
2429 1, /* mlen: just g0 */
2430 num_regs, /* rlen */
2431 true); /* header present */
2432 }
2433
2434 /**
2435 * Read float[4] vectors from the data port constant cache.
2436 * Location (in buffer) should be a multiple of 16.
2437 * Used for fetching shader constants.
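 *
 * Hypothetical usage sketch: with the default SIMD8 exec size, fetch eight
 * dwords of constants from offset 32 of the buffer at binding table
 * index 0 into g2, staging the message header in m1:
 *
 *    elk_oword_block_read(p, elk_vec8_grf(2, 0), elk_message_reg(1), 32, 0);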
2438 */
2439 void elk_oword_block_read(struct elk_codegen *p,
2440 struct elk_reg dest,
2441 struct elk_reg mrf,
2442 uint32_t offset,
2443 uint32_t bind_table_index)
2444 {
2445 const struct intel_device_info *devinfo = p->devinfo;
2446 const unsigned target_cache =
2447 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
2448 ELK_SFID_DATAPORT_READ);
2449 const unsigned exec_size = 1 << elk_get_default_exec_size(p);
2450 const struct tgl_swsb swsb = elk_get_default_swsb(p);
2451
2452 /* On newer hardware, offset is in units of owords. */
2453 if (devinfo->ver >= 6)
2454 offset /= 16;
2455
2456 mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
2457
2458 elk_push_insn_state(p);
2459 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2460 elk_set_default_flag_reg(p, 0, 0);
2461 elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
2462 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2463
2464 elk_push_insn_state(p);
2465 elk_set_default_exec_size(p, ELK_EXECUTE_8);
2466 elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2467 elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));
2468
2469 /* set message header global offset field (reg 0, element 2) */
2470 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2471 elk_set_default_swsb(p, tgl_swsb_null());
2472 elk_MOV(p,
2473 retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
2474 mrf.nr,
2475 2), ELK_REGISTER_TYPE_UD),
2476 elk_imm_ud(offset));
2477 elk_pop_insn_state(p);
2478
2479 elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2480
2481 elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
2482
2483 elk_inst_set_sfid(devinfo, insn, target_cache);
2484
2485 /* cast dest to a uword[8] vector */
2486 dest = retype(vec8(dest), ELK_REGISTER_TYPE_UW);
2487
2488 elk_set_dest(p, insn, dest);
2489 if (devinfo->ver >= 6) {
2490 elk_set_src0(p, insn, mrf);
2491 } else {
2492 elk_set_src0(p, insn, elk_null_reg());
2493 elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
2494 }
2495
2496 elk_set_desc(p, insn,
2497 elk_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2498 elk_dp_read_desc(devinfo, bind_table_index,
2499 ELK_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2500 ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2501 ELK_DATAPORT_READ_TARGET_DATA_CACHE));
2502
2503 elk_pop_insn_state(p);
2504 }
2505
2506 elk_inst *
2507 elk_fb_WRITE(struct elk_codegen *p,
2508 struct elk_reg payload,
2509 struct elk_reg implied_header,
2510 unsigned msg_control,
2511 unsigned binding_table_index,
2512 unsigned msg_length,
2513 unsigned response_length,
2514 bool eot,
2515 bool last_render_target,
2516 bool header_present)
2517 {
2518 const struct intel_device_info *devinfo = p->devinfo;
2519 const unsigned target_cache =
2520 (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2521 ELK_SFID_DATAPORT_WRITE);
2522 elk_inst *insn;
2523 struct elk_reg dest, src0;
2524
2525 if (elk_get_default_exec_size(p) >= ELK_EXECUTE_16)
2526 dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2527 else
2528 dest = retype(vec8(elk_null_reg()), ELK_REGISTER_TYPE_UW);
2529
2530 if (devinfo->ver >= 6) {
2531 insn = next_insn(p, ELK_OPCODE_SENDC);
2532 } else {
2533 insn = next_insn(p, ELK_OPCODE_SEND);
2534 }
2535 elk_inst_set_sfid(devinfo, insn, target_cache);
2536 elk_inst_set_compression(devinfo, insn, false);
2537
2538 if (devinfo->ver >= 6) {
2539 /* headerless version, just submit color payload */
2540 src0 = payload;
2541 } else {
2542 assert(payload.file == ELK_MESSAGE_REGISTER_FILE);
2543 elk_inst_set_base_mrf(devinfo, insn, payload.nr);
2544 src0 = implied_header;
2545 }
2546
2547 elk_set_dest(p, insn, dest);
2548 elk_set_src0(p, insn, src0);
2549 elk_set_desc(p, insn,
2550 elk_message_desc(devinfo, msg_length, response_length,
2551 header_present) |
2552 elk_fb_write_desc(devinfo, binding_table_index, msg_control,
2553 last_render_target,
2554 false /* coarse_write */));
2555 elk_inst_set_eot(devinfo, insn, eot);
2556
2557 return insn;
2558 }
2559
2560 elk_inst *
2561 elk_gfx9_fb_READ(struct elk_codegen *p,
2562 struct elk_reg dst,
2563 struct elk_reg payload,
2564 unsigned binding_table_index,
2565 unsigned msg_length,
2566 unsigned response_length,
2567 bool per_sample)
2568 {
2569 const struct intel_device_info *devinfo = p->devinfo;
2570 assert(devinfo->ver >= 9);
2571 elk_inst *insn = next_insn(p, ELK_OPCODE_SENDC);
2572
2573 elk_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
2574 elk_set_dest(p, insn, dst);
2575 elk_set_src0(p, insn, payload);
2576 elk_set_desc(
2577 p, insn,
2578 elk_message_desc(devinfo, msg_length, response_length, true) |
2579 elk_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
2580 1 << elk_get_default_exec_size(p), per_sample));
2581 elk_inst_set_rt_slot_group(devinfo, insn, elk_get_default_group(p) / 16);
2582
2583 return insn;
2584 }
2585
2586 /**
2587 * Texture sample instruction.
2588 * Note: the msg_type plus msg_length values determine exactly what kind
2589 * of sampling operation is performed. See volume 4, page 161 of docs.
2590 */
2591 void elk_SAMPLE(struct elk_codegen *p,
2592 struct elk_reg dest,
2593 unsigned msg_reg_nr,
2594 struct elk_reg src0,
2595 unsigned binding_table_index,
2596 unsigned sampler,
2597 unsigned msg_type,
2598 unsigned response_length,
2599 unsigned msg_length,
2600 unsigned header_present,
2601 unsigned simd_mode,
2602 unsigned return_format)
2603 {
2604 const struct intel_device_info *devinfo = p->devinfo;
2605 elk_inst *insn;
2606
2607 if (msg_reg_nr != -1)
2608 elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2609
2610 insn = next_insn(p, ELK_OPCODE_SEND);
2611 elk_inst_set_sfid(devinfo, insn, ELK_SFID_SAMPLER);
2612 elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE); /* XXX */
2613
2614 /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2615 *
2616 * "Instruction compression is not allowed for this instruction (that
2617 * is, send). The hardware behavior is undefined if this instruction is
2618 * set as compressed. However, compress control can be set to "SecHalf"
2619 * to affect the EMask generation."
2620 *
2621 * No similar wording is found in later PRMs, but there are examples
2622 * utilizing send with SecHalf. More importantly, SIMD8 sampler messages
2623 * are allowed in SIMD16 mode and they could not work without SecHalf. For
2624 * these reasons, we allow ELK_COMPRESSION_2NDHALF here.
2625 */
2626 elk_inst_set_compression(devinfo, insn, false);
2627
2628 if (devinfo->ver < 6)
2629 elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2630
2631 elk_set_dest(p, insn, dest);
2632 elk_set_src0(p, insn, src0);
2633 elk_set_desc(p, insn,
2634 elk_message_desc(devinfo, msg_length, response_length,
2635 header_present) |
2636 elk_sampler_desc(devinfo, binding_table_index, sampler,
2637 msg_type, simd_mode, return_format));
2638 }
2639
2640 /* Adjust the message header's sampler state pointer to
2641 * select the correct group of 16 samplers.
2642 */
2643 void elk_adjust_sampler_state_pointer(struct elk_codegen *p,
2644 struct elk_reg header,
2645 struct elk_reg sampler_index)
2646 {
2647 /* The "Sampler Index" field can only store values between 0 and 15.
2648 * However, we can add an offset to the "Sampler State Pointer"
2649 * field, effectively selecting a different set of 16 samplers.
2650 *
2651 * The "Sampler State Pointer" needs to be aligned to a 32-byte
2652 * offset, and each sampler state is only 16-bytes, so we can't
2653 * exclusively use the offset - we have to use both.
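 *
 * Worked example (hypothetical): an immediate sampler index of 20 makes
 * the code below add 16 * (20 / 16) * 16 = 256 bytes to the Sampler State
 * Pointer, selecting the second group of 16 samplers; the 4-bit Sampler
 * Index field is then expected to carry 20 % 16 = 4.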
2654 */
2655
2656 const struct intel_device_info *devinfo = p->devinfo;
2657
2658 if (sampler_index.file == ELK_IMMEDIATE_VALUE) {
2659 const int sampler_state_size = 16; /* 16 bytes */
2660 uint32_t sampler = sampler_index.ud;
2661
2662 if (sampler >= 16) {
2663 assert(devinfo->verx10 >= 75);
2664 elk_ADD(p,
2665 get_element_ud(header, 3),
2666 get_element_ud(elk_vec8_grf(0, 0), 3),
2667 elk_imm_ud(16 * (sampler / 16) * sampler_state_size));
2668 }
2669 } else {
2670 /* Non-const sampler array indexing case */
2671 if (devinfo->verx10 <= 70) {
2672 return;
2673 }
2674
2675 struct elk_reg temp = get_element_ud(header, 3);
2676
2677 elk_push_insn_state(p);
2678 elk_AND(p, temp, get_element_ud(sampler_index, 0), elk_imm_ud(0x0f0));
2679 elk_set_default_swsb(p, tgl_swsb_regdist(1));
2680 elk_SHL(p, temp, temp, elk_imm_ud(4));
2681 elk_ADD(p,
2682 get_element_ud(header, 3),
2683 get_element_ud(elk_vec8_grf(0, 0), 3),
2684 temp);
2685 elk_pop_insn_state(p);
2686 }
2687 }
2688
2689 /* All these variables are pretty confusing - we might be better off
2690 * using bitmasks and macros for this, in the old style. Or perhaps
2691 * just having the caller instantiate the fields in dword3 itself.
2692 */
2693 void elk_urb_WRITE(struct elk_codegen *p,
2694 struct elk_reg dest,
2695 unsigned msg_reg_nr,
2696 struct elk_reg src0,
2697 enum elk_urb_write_flags flags,
2698 unsigned msg_length,
2699 unsigned response_length,
2700 unsigned offset,
2701 unsigned swizzle)
2702 {
2703 const struct intel_device_info *devinfo = p->devinfo;
2704 elk_inst *insn;
2705
2706 elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2707
2708 if (devinfo->ver >= 7 && !(flags & ELK_URB_WRITE_USE_CHANNEL_MASKS)) {
2709 /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2710 elk_push_insn_state(p);
2711 elk_set_default_access_mode(p, ELK_ALIGN_1);
2712 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2713 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2714 elk_OR(p, retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2715 ELK_REGISTER_TYPE_UD),
2716 retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2717 elk_imm_ud(0xff00));
2718 elk_pop_insn_state(p);
2719 }
2720
2721 insn = next_insn(p, ELK_OPCODE_SEND);
2722
2723 assert(msg_length < ELK_MAX_MRF(devinfo->ver));
2724
2725 elk_set_dest(p, insn, dest);
2726 elk_set_src0(p, insn, src0);
2727 elk_set_src1(p, insn, elk_imm_d(0));
2728
2729 if (devinfo->ver < 6)
2730 elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2731
2732 elk_set_urb_message(p,
2733 insn,
2734 flags,
2735 msg_length,
2736 response_length,
2737 offset,
2738 swizzle);
2739 }
2740
2741 void
2742 elk_send_indirect_message(struct elk_codegen *p,
2743 unsigned sfid,
2744 struct elk_reg dst,
2745 struct elk_reg payload,
2746 struct elk_reg desc,
2747 unsigned desc_imm,
2748 bool eot)
2749 {
2750 const struct intel_device_info *devinfo = p->devinfo;
2751 struct elk_inst *send;
2752
2753 dst = retype(dst, ELK_REGISTER_TYPE_UW);
2754
2755 assert(desc.type == ELK_REGISTER_TYPE_UD);
2756
2757 if (desc.file == ELK_IMMEDIATE_VALUE) {
2758 send = next_insn(p, ELK_OPCODE_SEND);
2759 elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2760 elk_set_desc(p, send, desc.ud | desc_imm);
2761 } else {
2762 const struct tgl_swsb swsb = elk_get_default_swsb(p);
2763 struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2764
2765 elk_push_insn_state(p);
2766 elk_set_default_access_mode(p, ELK_ALIGN_1);
2767 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2768 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2769 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2770 elk_set_default_flag_reg(p, 0, 0);
2771 elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2772
2773 /* Load the indirect descriptor to an address register using OR so the
2774 * caller can specify additional descriptor bits with the desc_imm
2775 * immediate.
2776 */
2777 elk_OR(p, addr, desc, elk_imm_ud(desc_imm));
2778
2779 elk_pop_insn_state(p);
2780
2781 elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2782 send = next_insn(p, ELK_OPCODE_SEND);
2783 elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2784
2785 if (devinfo->ver >= 12)
2786 elk_inst_set_send_sel_reg32_desc(devinfo, send, true);
2787 else
2788 elk_set_src1(p, send, addr);
2789 }
2790
2791 elk_set_dest(p, send, dst);
2792 elk_inst_set_sfid(devinfo, send, sfid);
2793 elk_inst_set_eot(devinfo, send, eot);
2794 }
2795
2796 void
2797 elk_send_indirect_split_message(struct elk_codegen *p,
2798 unsigned sfid,
2799 struct elk_reg dst,
2800 struct elk_reg payload0,
2801 struct elk_reg payload1,
2802 struct elk_reg desc,
2803 unsigned desc_imm,
2804 struct elk_reg ex_desc,
2805 unsigned ex_desc_imm,
2806 bool ex_desc_scratch,
2807 bool ex_bso,
2808 bool eot)
2809 {
2810 const struct intel_device_info *devinfo = p->devinfo;
2811 struct elk_inst *send;
2812
2813 dst = retype(dst, ELK_REGISTER_TYPE_UW);
2814
2815 assert(desc.type == ELK_REGISTER_TYPE_UD);
2816
2817 if (desc.file == ELK_IMMEDIATE_VALUE) {
2818 desc.ud |= desc_imm;
2819 } else {
2820 const struct tgl_swsb swsb = elk_get_default_swsb(p);
2821 struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2822
2823 elk_push_insn_state(p);
2824 elk_set_default_access_mode(p, ELK_ALIGN_1);
2825 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2826 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2827 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2828 elk_set_default_flag_reg(p, 0, 0);
2829 elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2830
2831 /* Load the indirect descriptor to an address register using OR so the
2832 * caller can specify additional descriptor bits with the desc_imm
2833 * immediate.
2834 */
2835 elk_OR(p, addr, desc, elk_imm_ud(desc_imm));
2836
2837 elk_pop_insn_state(p);
2838 desc = addr;
2839
2840 elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2841 }
2842
2843 if (ex_desc.file == ELK_IMMEDIATE_VALUE &&
2844 !ex_desc_scratch &&
2845 (devinfo->ver >= 12 ||
2846 ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
2847 /* ATS-M PRMs, Volume 2d: Command Reference: Structures,
2848 * EU_INSTRUCTION_SEND instruction
2849 *
2850 * "ExBSO: Exists If: ([ExDesc.IsReg]==true)"
2851 */
2852 assert(!ex_bso);
2853 ex_desc.ud |= ex_desc_imm;
2854 } else {
2855 const struct tgl_swsb swsb = elk_get_default_swsb(p);
2856 struct elk_reg addr = retype(elk_address_reg(2), ELK_REGISTER_TYPE_UD);
2857
2858 elk_push_insn_state(p);
2859 elk_set_default_access_mode(p, ELK_ALIGN_1);
2860 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2861 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2862 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2863 elk_set_default_flag_reg(p, 0, 0);
2864 elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2865
2866 /* Load the indirect extended descriptor to an address register using OR
2867 * so the caller can specify additional descriptor bits with the
2868 * desc_imm immediate.
2869 *
2870 * Even though the instruction dispatcher always pulls the SFID and EOT
2871 * fields from the instruction itself, the actual external unit that
2872 * processes the message gets the SFID and EOT from the extended
2873 * descriptor which comes from the address register. If we don't OR
2874 * those two bits in, the external unit may get confused and hang.
2875 */
2876 unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5);
2877
2878 if (ex_desc_scratch) {
2879 /* Or the scratch surface offset together with the immediate part of
2880 * the extended descriptor.
2881 */
2882 assert(devinfo->verx10 >= 125);
2883 elk_AND(p, addr,
2884 retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2885 elk_imm_ud(INTEL_MASK(31, 10)));
2886 elk_OR(p, addr, addr, elk_imm_ud(imm_part));
2887 } else if (ex_desc.file == ELK_IMMEDIATE_VALUE) {
2888 /* ex_desc bits 15:12 don't exist in the instruction encoding prior
2889 * to Gfx12, so we may have fallen back to an indirect extended
2890 * descriptor.
2891 */
2892 elk_MOV(p, addr, elk_imm_ud(ex_desc.ud | imm_part));
2893 } else {
2894 elk_OR(p, addr, ex_desc, elk_imm_ud(imm_part));
2895 }
2896
2897 elk_pop_insn_state(p);
2898 ex_desc = addr;
2899
2900 elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2901 }
2902
2903 send = next_insn(p, devinfo->ver >= 12 ? ELK_OPCODE_SEND : ELK_OPCODE_SENDS);
2904 elk_set_dest(p, send, dst);
2905 elk_set_src0(p, send, retype(payload0, ELK_REGISTER_TYPE_UD));
2906 elk_set_src1(p, send, retype(payload1, ELK_REGISTER_TYPE_UD));
2907
2908 if (desc.file == ELK_IMMEDIATE_VALUE) {
2909 elk_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2910 elk_inst_set_send_desc(devinfo, send, desc.ud);
2911 } else {
2912 assert(desc.file == ELK_ARCHITECTURE_REGISTER_FILE);
2913 assert(desc.nr == ELK_ARF_ADDRESS);
2914 assert(desc.subnr == 0);
2915 elk_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2916 }
2917
2918 if (ex_desc.file == ELK_IMMEDIATE_VALUE) {
2919 elk_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2920 elk_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2921 } else {
2922 assert(ex_desc.file == ELK_ARCHITECTURE_REGISTER_FILE);
2923 assert(ex_desc.nr == ELK_ARF_ADDRESS);
2924 assert((ex_desc.subnr & 0x3) == 0);
2925 elk_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2926 elk_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2);
2927 }
2928
2929 if (ex_bso) {
2930 /* The send instruction ExBSO field does not exist with UGM on Gfx20+;
2931 * it is implicitly assumed.
2932 *
2933 * BSpec 56890
2934 */
2935 if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM)
2936 elk_inst_set_send_ex_bso(devinfo, send, true);
2937 elk_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6));
2938 }
2939 elk_inst_set_sfid(devinfo, send, sfid);
2940 elk_inst_set_eot(devinfo, send, eot);
2941 }
2942
2943 static void
2944 elk_send_indirect_surface_message(struct elk_codegen *p,
2945 unsigned sfid,
2946 struct elk_reg dst,
2947 struct elk_reg payload,
2948 struct elk_reg surface,
2949 unsigned desc_imm)
2950 {
2951 if (surface.file != ELK_IMMEDIATE_VALUE) {
2952 const struct tgl_swsb swsb = elk_get_default_swsb(p);
2953 struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2954
2955 elk_push_insn_state(p);
2956 elk_set_default_access_mode(p, ELK_ALIGN_1);
2957 elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2958 elk_set_default_exec_size(p, ELK_EXECUTE_1);
2959 elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2960 elk_set_default_flag_reg(p, 0, 0);
2961 elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2962
2963 /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2964 * some surface array is accessed out of bounds.
2965 */
2966 elk_AND(p, addr,
2967 suboffset(vec1(retype(surface, ELK_REGISTER_TYPE_UD)),
2968 ELK_GET_SWZ(surface.swizzle, 0)),
2969 elk_imm_ud(0xff));
2970
2971 elk_pop_insn_state(p);
2972
2973 surface = addr;
2974 elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2975 }
2976
2977 elk_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2978 }
2979
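/* A WHILE always jumps backwards (its JIP is negative). It closes the loop
 * containing start_offset only if its jump target lands at or before
 * start_offset; otherwise it terminates a sibling loop and can be ignored.
 */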
2980 static bool
2981 while_jumps_before_offset(const struct intel_device_info *devinfo,
2982 elk_inst *insn, int while_offset, int start_offset)
2983 {
2984 int scale = 16 / elk_jump_scale(devinfo);
2985 int jip = devinfo->ver == 6 ? elk_inst_gfx6_jump_count(devinfo, insn)
2986 : elk_inst_jip(devinfo, insn);
2987 assert(jip < 0);
2988 return while_offset + jip * scale <= start_offset;
2989 }
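
/* Worked example (illustrative numbers only): on a platform where
 * elk_jump_scale() returns 2, each JIP unit covers 16 / 2 = 8 bytes.
 * A WHILE at byte offset 128 whose JIP is -10 therefore targets
 * 128 + (-10 * 8) = 48, so it "jumps before" any start_offset of 48 or
 * greater.
 */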

static int
elk_find_next_block_end(struct elk_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct intel_device_info *devinfo = p->devinfo;

   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      elk_inst *insn = store + offset;

      switch (elk_inst_opcode(p->isa, insn)) {
      case ELK_OPCODE_IF:
         depth++;
         break;
      case ELK_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case ELK_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop. Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         FALLTHROUGH;
      case ELK_OPCODE_ELSE:
      case ELK_OPCODE_HALT:
         if (depth == 0)
            return offset;
         break;
      default:
         break;
      }
   }

   return 0;
}
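
/* For instance, scanning forward from the outer IF in
 *
 *    IF ... IF ... ENDIF ... ELSE ... ENDIF
 *
 * the inner IF/ENDIF pair is skipped by the depth counter and the offset
 * of the ELSE is returned, since it is the first depth-0 instruction that
 * terminates the block started by the outer IF.
 */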

/* There is no DO instruction on gfx6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
elk_find_loop_end(struct elk_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   void *store = p->store;

   assert(devinfo->ver >= 6);

   /* Always start after the instruction (such as a WHILE) we're trying to
    * fix up.
    */
   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      elk_inst *insn = store + offset;

      if (elk_inst_opcode(p->isa, insn) == ELK_OPCODE_WHILE) {
         if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
            return offset;
      }
   }
   assert(!"not reached");
   return start_offset;
}

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
elk_set_uip_jip(struct elk_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   int br = elk_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   if (devinfo->ver < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      elk_inst *insn = store + offset;
      assert(elk_inst_cmpt_control(devinfo, insn) == 0);

      switch (elk_inst_opcode(p->isa, insn)) {
      case ELK_OPCODE_BREAK: {
         int block_end_offset = elk_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
         elk_inst_set_uip(devinfo, insn,
                          (elk_find_loop_end(p, offset) - offset +
                           (devinfo->ver == 6 ? 16 : 0)) / scale);
         break;
      }

      case ELK_OPCODE_CONTINUE: {
         int block_end_offset = elk_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         elk_inst_set_uip(devinfo, insn,
                          (elk_find_loop_end(p, offset) - offset) / scale);

         assert(elk_inst_uip(devinfo, insn) != 0);
         assert(elk_inst_jip(devinfo, insn) != 0);
         break;
      }

      case ELK_OPCODE_ENDIF: {
         int block_end_offset = elk_find_next_block_end(p, offset);
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->ver >= 7)
            elk_inst_set_jip(devinfo, insn, jump);
         else
            elk_inst_set_gfx6_jump_count(devinfo, insn, jump);
         break;
      }

      case ELK_OPCODE_HALT: {
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         int block_end_offset = elk_find_next_block_end(p, offset);
         if (block_end_offset == 0) {
            elk_inst_set_jip(devinfo, insn, elk_inst_uip(devinfo, insn));
         } else {
            elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(elk_inst_uip(devinfo, insn) != 0);
         assert(elk_inst_jip(devinfo, insn) != 0);
         break;
      }

      default:
         break;
      }
   }
}
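
/* Worked example (illustrative offsets, assuming a JIP unit of 8 bytes,
 * i.e. elk_jump_scale() == 2): a BREAK at byte offset 64 inside a loop
 * whose WHILE sits at offset 112, with the enclosing block ending at
 * offset 96, is patched to
 *
 *    JIP = (96 - 64) / 8 = 4     (jump to the end of the block)
 *    UIP = (112 - 64) / 8 = 6    (jump to the WHILE)
 *
 * On Gfx6 the UIP would instead be computed 16 bytes past the WHILE.
 */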

void elk_ff_sync(struct elk_codegen *p,
                 struct elk_reg dest,
                 unsigned msg_reg_nr,
                 struct elk_reg src0,
                 bool allocate,
                 unsigned response_length,
                 bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, ELK_OPCODE_SEND);
   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src0);
   elk_set_src1(p, insn, elk_imm_d(0));

   if (devinfo->ver < 6)
      elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   elk_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gfx6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write. According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
elk_svb_write(struct elk_codegen *p,
              struct elk_reg dest,
              unsigned msg_reg_nr,
              struct elk_reg src0,
              unsigned binding_table_index,
              bool send_commit_msg)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver == 6);
   const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
   elk_inst *insn;

   elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, ELK_OPCODE_SEND);
   elk_inst_set_sfid(devinfo, insn, target_cache);
   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src0);
   elk_set_desc(p, insn,
                elk_message_desc(devinfo, 1, send_commit_msg, true) |
                elk_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  send_commit_msg));
}

static unsigned
elk_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 */
   else if (exec_size <= 8)
      return num_channels;
   else
      return 2 * num_channels;
}
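
/* For example, an untyped SIMD16 access of 4 channels occupies
 * 2 * 4 = 8 payload registers, a SIMD8 access of the same 4 channels
 * occupies 4, and a SIMD4x2 access (exec_size == 0) always occupies a
 * single register.
 */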

void
elk_untyped_atomic(struct elk_codegen *p,
                   struct elk_reg dst,
                   struct elk_reg payload,
                   struct elk_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      elk_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      elk_message_desc(devinfo, msg_length, response_length, header_present) |
      elk_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   elk_send_indirect_surface_message(p, sfid, elk_writemask(dst, mask),
                                     payload, surface, desc);
}
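
/* A minimal usage sketch (register numbers, message length and the
 * ELK_AOP_* name are illustrative assumptions, not taken from a real
 * caller): a SIMD8 atomic increment on surface 1 returning the old
 * value to g2, with the address payload in g4, might look like
 *
 *    elk_untyped_atomic(p, elk_vec8_grf(2, 0), elk_vec8_grf(4, 0),
 *                       elk_imm_ud(1), ELK_AOP_INC, 1, true, false);
 */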

void
elk_untyped_surface_read(struct elk_codegen *p,
                         struct elk_reg dst,
                         struct elk_reg payload,
                         struct elk_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
   const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) : 0;
   const unsigned response_length =
      elk_surface_payload_size(num_channels, exec_size);
   const unsigned desc =
      elk_message_desc(devinfo, msg_length, response_length, false) |
      elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);

   elk_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
}

void
elk_untyped_surface_write(struct elk_codegen *p,
                          struct elk_reg payload,
                          struct elk_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned desc =
      elk_message_desc(devinfo, msg_length, 0, header_present) |
      elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in elk_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   elk_send_indirect_surface_message(p, sfid,
                                     elk_writemask(elk_null_reg(), mask),
                                     payload, surface, desc);
}

static void
elk_set_memory_fence_message(struct elk_codegen *p,
                             struct elk_inst *insn,
                             enum elk_message_target sfid,
                             bool commit_enable,
                             unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   elk_set_desc(p, insn, elk_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   elk_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GFX6_SFID_DATAPORT_RENDER_CACHE:
      elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GFX7_SFID_DATAPORT_DATA_CACHE:
      elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   if (commit_enable)
      elk_inst_set_dp_msg_control(devinfo, insn, 1 << 5);

   assert(devinfo->ver >= 11 || bti == 0);
   elk_inst_set_binding_table_index(devinfo, insn, bti);
}

static void
gfx12_set_memory_fence_message(struct elk_codegen *p,
                               struct elk_inst *insn,
                               enum elk_message_target sfid,
                               uint32_t desc)
{
   const unsigned mlen = 1 * reg_unit(p->devinfo); /* g0 header */
   /* Completion signaled by write to register. No data returned. */
   const unsigned rlen = 1 * reg_unit(p->devinfo);

   elk_inst_set_sfid(p->devinfo, insn, sfid);

   if (sfid == ELK_SFID_URB && p->devinfo->ver < 20) {
      elk_set_desc(p, insn, elk_urb_fence_desc(p->devinfo) |
                            elk_message_desc(p->devinfo, mlen, rlen, true));
   } else {
      enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc);
      enum lsc_flush_type flush_type =
         lsc_fence_msg_desc_flush_type(p->devinfo, desc);

      if (sfid == GFX12_SFID_TGM) {
         scope = LSC_FENCE_TILE;
         flush_type = LSC_FLUSH_TYPE_EVICT;
      }

      /* Wa_14012437816:
       *
       *    "For any fence greater than local scope, always set flush type to
       *     at least invalidate so that fence goes on properly."
       *
       *    "The bug is if flush_type is 'None', the scope is always
       *     downgraded to 'local'."
       *
       * Here we set the scope to NONE_6 instead of NONE, which has the same
       * effect as NONE but avoids the downgrade to scope LOCAL.
       */
      if (intel_needs_workaround(p->devinfo, 14012437816) &&
          scope > LSC_FENCE_LOCAL &&
          flush_type == LSC_FLUSH_TYPE_NONE) {
         flush_type = LSC_FLUSH_TYPE_NONE_6;
      }

      elk_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
                                               flush_type, false) |
                            elk_message_desc(p->devinfo, mlen, rlen, false));
   }
}

void
elk_memory_fence(struct elk_codegen *p,
                 struct elk_reg dst,
                 struct elk_reg src,
                 enum elk_opcode send_op,
                 enum elk_message_target sfid,
                 uint32_t desc,
                 bool commit_enable,
                 unsigned bti)
{
   const struct intel_device_info *devinfo = p->devinfo;

   dst = retype(vec1(dst), ELK_REGISTER_TYPE_UW);
   src = retype(vec1(src), ELK_REGISTER_TYPE_UD);

   /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
    * message doesn't actually write anything back.
    */
   struct elk_inst *insn = next_insn(p, send_op);
   elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
   elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
   elk_set_dest(p, insn, dst);
   elk_set_src0(p, insn, src);

   /* All DG2 hardware requires LSC for fence messages, even A-step. */
   if (devinfo->has_lsc)
      gfx12_set_memory_fence_message(p, insn, sfid, desc);
   else
      elk_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
}
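
/* A minimal usage sketch (operands are illustrative assumptions): a
 * data-cache fence with commit tracked through g1 on a pre-LSC part
 * could be emitted as
 *
 *    elk_memory_fence(p, elk_vec1_grf(1, 0), elk_vec1_grf(1, 0),
 *                     ELK_OPCODE_SEND, GFX7_SFID_DATAPORT_DATA_CACHE,
 *                     0, true, 0);
 */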

void
elk_find_live_channel(struct elk_codegen *p, struct elk_reg dst, bool last)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << elk_get_default_exec_size(p);
   const unsigned qtr_control = elk_get_default_group(p) / 8;
   elk_inst *inst;

   assert(devinfo->ver == 7);

   elk_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   elk_set_default_flag_reg(p, 0, 0);

   if (elk_get_default_access_mode(p) == ELK_ALIGN_1) {
      elk_set_default_mask_control(p, ELK_MASK_DISABLE);

      const struct elk_reg flag = elk_flag_subreg(flag_subreg);

      elk_set_default_exec_size(p, ELK_EXECUTE_1);
      elk_MOV(p, retype(flag, ELK_REGISTER_TYPE_UD), elk_imm_ud(0));

      /* Run enough instructions returning zero with execution masking and
       * a conditional modifier enabled in order to get the full execution
       * mask in f1.0. We could use a single 32-wide move here if it
       * weren't for the hardware bug that causes channel enables to be
       * applied incorrectly to the second half of 32-wide instructions
       * on Gfx7.
       */
      const unsigned lower_size = MIN2(16, exec_size);
      for (unsigned i = 0; i < exec_size / lower_size; i++) {
         inst = elk_MOV(p, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW),
                        elk_imm_uw(0));
         elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
         elk_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
         elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_Z);
         elk_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         elk_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
         elk_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
      }

      /* Find the first bit set in the exec_size-wide portion of the flag
       * register that was updated by the last sequence of MOV
       * instructions.
       */
      const enum elk_reg_type type = elk_int_type(exec_size / 8, false);
      elk_set_default_exec_size(p, ELK_EXECUTE_1);
      if (!last) {
         inst = elk_FBL(p, vec1(dst),
                        byte_offset(retype(flag, type), qtr_control));
      } else {
         inst = elk_LZD(p, vec1(dst),
                        byte_offset(retype(flag, type), qtr_control));
         struct elk_reg neg = vec1(dst);
         neg.negate = true;
         inst = elk_ADD(p, vec1(dst), neg, elk_imm_uw(31));
      }
   } else {
      elk_set_default_mask_control(p, ELK_MASK_DISABLE);

      /* Overwrite the destination first without and then with execution
       * masking, to find out which of the channels is active.
       */
      elk_push_insn_state(p);
      elk_set_default_exec_size(p, ELK_EXECUTE_4);
      elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
              elk_imm_ud(1));

      inst = elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
                     elk_imm_ud(0));
      elk_pop_insn_state(p);
      elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
   }

   elk_pop_insn_state(p);
}
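
/* Worked example: if channels 2, 3 and 5 of a SIMD8 invocation are
 * active, the flag register ends up holding 0b101100. FBL then yields
 * 2, the first live channel, while the LZD path computes
 * 31 - LZD(0b101100) = 31 - 26 = 5, the last live channel.
 */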

void
elk_broadcast(struct elk_codegen *p,
              struct elk_reg dst,
              struct elk_reg src,
              struct elk_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
   elk_inst *inst;

   elk_push_insn_state(p);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);
   elk_set_default_exec_size(p, align1 ? ELK_EXECUTE_1 : ELK_EXECUTE_4);

   assert(src.file == ELK_GENERAL_REGISTER_FILE &&
          src.address_mode == ELK_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);

   /* Gen12.5 adds the following region restriction:
    *
    *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
    *     and Quad-Word data must not be used."
    *
    * We require the source and destination types to match, so stomp to an
    * unsigned integer type.
    */
   assert(src.type == dst.type);
   src.type = dst.type = elk_reg_type_from_bit_size(type_sz(src.type) * 8,
                                                    ELK_REGISTER_TYPE_UD);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == ELK_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job,
       * but asserting would be mean.
       */
      const unsigned i = idx.file == ELK_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
                     stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_int) {
         elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
                 subscript(src, ELK_REGISTER_TYPE_D, 0));
         elk_set_default_swsb(p, tgl_swsb_null());
         elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
                 subscript(src, ELK_REGISTER_TYPE_D, 1));
      } else {
         elk_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *     change the register address. The lower 5 bits of Address
       *     Immediate when added to lower 5 bits of address register gives
       *     the sub-register offset. The upper bits of Address Immediate
       *     when added to upper bits of address register gives the register
       *     address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct elk_reg addr =
            retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         elk_push_insn_state(p);
         elk_set_default_mask_control(p, ELK_MASK_DISABLE);
         elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
         elk_set_default_flag_reg(p, 0, 0);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
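         /* For example, for a tightly packed float source (type size 4,
          * hstride == 1) the shift below computes log2(4) + 1 - 1 == 2,
          * i.e. addr = idx << 2 == idx * 4, the byte offset of
          * component idx.
          */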
         elk_SHL(p, addr, vec1(idx),
                 elk_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            elk_set_default_swsb(p, tgl_swsb_regdist(1));
            elk_ADD(p, addr, addr, elk_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         elk_pop_insn_state(p);

         elk_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->platform == INTEL_PLATFORM_CHV ||
              intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_int)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, indirect addressing must not be
             *     used."
             *
             * To work around both of these issues, we do two integer MOVs
             * instead of one 64-bit MOV. Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
                    retype(elk_vec1_indirect(addr.subnr, offset),
                           ELK_REGISTER_TYPE_D));
            elk_set_default_swsb(p, tgl_swsb_null());
            elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
                    retype(elk_vec1_indirect(addr.subnr, offset + 4),
                           ELK_REGISTER_TYPE_D));
         } else {
            elk_MOV(p, dst,
                    retype(elk_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = elk_MOV(p,
                        elk_null_reg(),
                        stride(elk_swizzle(idx, ELK_SWIZZLE_XXXX), 4, 4, 1));
         elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NONE);
         elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_NZ);
         elk_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = elk_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NORMAL);
         elk_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   elk_pop_insn_state(p);
}
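
/* A minimal usage sketch (register numbers are illustrative
 * assumptions): picking the channel of g4 selected by the index in g7.0
 * and writing it to the scalar destination g2.0 might look like
 *
 *    elk_broadcast(p, elk_vec1_grf(2, 0), elk_vec8_grf(4, 0),
 *                  elk_vec1_grf(7, 0));
 */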

/**
 * Emit the SEND message for a barrier.
 */
void
elk_barrier(struct elk_codegen *p, struct elk_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct elk_inst *inst;

   assert(devinfo->ver >= 7);

   elk_push_insn_state(p);
   elk_set_default_access_mode(p, ELK_ALIGN_1);
   inst = next_insn(p, ELK_OPCODE_SEND);
   elk_set_dest(p, inst, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW));
   elk_set_src0(p, inst, src);
   elk_set_src1(p, inst, elk_null_reg());
   elk_set_desc(p, inst, elk_message_desc(devinfo,
                                          1 * reg_unit(devinfo), 0, false));

   elk_inst_set_sfid(devinfo, inst, ELK_SFID_MESSAGE_GATEWAY);
   elk_inst_set_gateway_subfuncid(devinfo, inst,
                                  ELK_MESSAGE_GATEWAY_SFID_BARRIER_MSG);

   elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
   elk_pop_insn_state(p);
}


/**
 * Emit the WAIT instruction for a barrier.
 */
void
elk_WAIT(struct elk_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct elk_inst *insn;

   struct elk_reg src = elk_notification_reg();

   insn = next_insn(p, ELK_OPCODE_WAIT);
   elk_set_dest(p, insn, src);
   elk_set_src0(p, insn, src);
   elk_set_src1(p, insn, elk_null_reg());

   elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
   elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
}

void
elk_float_controls_mode(struct elk_codegen *p,
                        unsigned mode, unsigned mask)
{
   assert(p->current->mask_control == ELK_MASK_DISABLE);

   /* From the Skylake PRM, Volume 7, page 760:
    *
    *    "Implementation Restriction on Register Access: When the control
    *     register is used as an explicit source and/or destination, hardware
    *     does not ensure execution pipeline coherency. Software must set the
    *     thread control field to 'switch' for an instruction that uses
    *     control register as an explicit operand."
    *
    * On Gfx12+ this is implemented in terms of SWSB annotations instead.
    */
   elk_set_default_swsb(p, tgl_swsb_regdist(1));

   elk_inst *inst = elk_AND(p, elk_cr0_reg(0), elk_cr0_reg(0),
                            elk_imm_ud(~mask));
   elk_inst_set_exec_size(p->devinfo, inst, ELK_EXECUTE_1);
   if (p->devinfo->ver < 12)
      elk_inst_set_thread_control(p->devinfo, inst, ELK_THREAD_SWITCH);

   if (mode) {
      elk_inst *inst_or = elk_OR(p, elk_cr0_reg(0), elk_cr0_reg(0),
                                 elk_imm_ud(mode));
      elk_inst_set_exec_size(p->devinfo, inst_or, ELK_EXECUTE_1);
      if (p->devinfo->ver < 12)
         elk_inst_set_thread_control(p->devinfo, inst_or, ELK_THREAD_SWITCH);
   }

   if (p->devinfo->ver >= 12)
      elk_SYNC(p, TGL_SYNC_NOP);
}
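
/* A minimal usage sketch (the ELK_CR0_* names are assumptions by
 * analogy with the upstream brw encoder, not verified against this
 * tree): switching the rounding mode in cr0 might look like
 *
 *    elk_float_controls_mode(p, ELK_CR0_RND_MODE_RTZ,
 *                            ELK_CR0_RND_MODE_MASK);
 */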

void
elk_update_reloc_imm(const struct elk_isa_info *isa,
                     elk_inst *inst,
                     uint32_t value)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   /* Sanity check that the instruction is a MOV of an immediate */
   assert(elk_inst_opcode(isa, inst) == ELK_OPCODE_MOV);
   assert(elk_inst_src0_reg_file(devinfo, inst) == ELK_IMMEDIATE_VALUE);

   /* If it was compacted, we can't safely rewrite it. */
   assert(elk_inst_cmpt_control(devinfo, inst) == 0);

   elk_inst_set_imm_ud(devinfo, inst, value);
}

/* A default value for constants that will be patched at run-time.
 * We pick an arbitrary value that prevents instruction compaction.
 */
#define DEFAULT_PATCH_IMM 0x4a7cc037

void
elk_MOV_reloc_imm(struct elk_codegen *p,
                  struct elk_reg dst,
                  enum elk_reg_type src_type,
                  uint32_t id)
{
   assert(type_sz(src_type) == 4);
   assert(type_sz(dst.type) == 4);

   elk_add_reloc(p, id, ELK_SHADER_RELOC_TYPE_MOV_IMM,
                 p->next_insn_offset, 0);

   elk_MOV(p, dst, retype(elk_imm_ud(DEFAULT_PATCH_IMM), src_type));
}

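/* Typical pairing (sketch; MY_RELOC_ID, inst and real_value are
 * placeholders for whatever the caller tracks): the generator emits
 *
 *    elk_MOV_reloc_imm(p, dst, ELK_REGISTER_TYPE_UD, MY_RELOC_ID);
 *
 * and once the actual value is known at upload time, the runtime walks
 * the recorded relocations and patches the instruction with
 *
 *    elk_update_reloc_imm(isa, inst, real_value);
 */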