/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
 /*
  * Authors:
  *   Keith Whitwell <keithw@vmware.com>
  */


#include "elk_eu_defines.h"
#include "elk_eu.h"

#include "util/ralloc.h"

/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
elk_gfx6_resolve_implied_move(struct elk_codegen *p,
                              struct elk_reg *src,
                              unsigned msg_reg_nr)
{
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver < 6)
      return;

   if (src->file == ELK_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != ELK_ARCHITECTURE_REGISTER_FILE || src->nr != ELK_ARF_NULL) {
      assert(devinfo->ver < 12);
      elk_push_insn_state(p);
      elk_set_default_exec_size(p, ELK_EXECUTE_8);
      elk_set_default_mask_control(p, ELK_MASK_DISABLE);
      elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
      elk_MOV(p, retype(elk_message_reg(msg_reg_nr), ELK_REGISTER_TYPE_UD),
              retype(*src, ELK_REGISTER_TYPE_UD));
      elk_pop_insn_state(p);
   }
   *src = elk_message_reg(msg_reg_nr);
}
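
/* Usage sketch (an illustration, not a call site from this file; the
 * elk_vec8_grf() helper is assumed to come from the elk register headers):
 *
 *    struct elk_reg payload = elk_vec8_grf(2, 0);
 *    elk_gfx6_resolve_implied_move(p, &payload, 1);
 *    // payload now refers to m1; emit the SEND with msg_reg_nr == 1.
 */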

static void
gfx7_convert_mrf_to_grf(struct elk_codegen *p, struct elk_reg *reg)
{
   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
    * "The send with EOT should use register space R112-R127 for <src>. This is
    *  to enable loading of a new thread into the same slot while the message
    *  with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   const struct intel_device_info *devinfo = p->devinfo;
   if (devinfo->ver >= 7 && reg->file == ELK_MESSAGE_REGISTER_FILE) {
      reg->file = ELK_GENERAL_REGISTER_FILE;
      reg->nr += GFX7_MRF_HACK_START;
   }
}
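
/* Concretely: on Gfx7+ an MRF reference such as m4 is rewritten to
 * g(GFX7_MRF_HACK_START + 4).  Given the R112-R127 range quoted above,
 * GFX7_MRF_HACK_START is presumably 112, making m4 land in g116.
 */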

void
elk_set_dest(struct elk_codegen *p, elk_inst *inst, struct elk_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (dest.file == ELK_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
   else if (dest.file == ELK_GENERAL_REGISTER_FILE)
      assert(dest.nr < XE2_MAX_GRF);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == ELK_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == ELK_HORIZONTAL_STRIDE_1) {
      dest.hstride = ELK_HORIZONTAL_STRIDE_2;
   }

   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC)) {
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             dest.file == ELK_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == ELK_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1 ||
             (dest.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      elk_inst_set_dst_reg_file(devinfo, inst, dest.file);
      elk_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));

   } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
              elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC) {
      assert(devinfo->ver < 12);
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             dest.file == ELK_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == ELK_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == ELK_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      elk_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      elk_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      elk_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      elk_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      elk_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == ELK_ADDRESS_DIRECT) {
         elk_inst_set_dst_da_reg_nr(devinfo, inst, phys_nr(devinfo, dest));

         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            elk_inst_set_dst_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));
            if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
               dest.hstride = ELK_HORIZONTAL_STRIDE_1;
            elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            elk_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            elk_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == ELK_GENERAL_REGISTER_FILE ||
                dest.file == ELK_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            elk_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         elk_inst_set_dst_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest));

         /* These are different sizes in align1 vs align16:
          */
         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            elk_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == ELK_HORIZONTAL_STRIDE_0)
               dest.hstride = ELK_HORIZONTAL_STRIDE_1;
            elk_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            elk_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            elk_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < ELK_EXECUTE_4;
      else
         fix_exec_size = dest.width < ELK_EXECUTE_8;

      if (fix_exec_size)
         elk_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
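
/* For example: with automatic_exec_sizes enabled on Gfx6+, emitting a MOV
 * to a width-1 scalar destination while the default state says SIMD8 will
 * narrow the instruction's exec size down to 1 to match the register.
 */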

void
elk_set_src0(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == ELK_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
   else if (reg.file == ELK_GENERAL_REGISTER_FILE)
      assert(reg.nr < XE2_MAX_GRF);

   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
        elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC)) {
      assert(reg.file != ELK_IMMEDIATE_VALUE);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      elk_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      elk_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));

   } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
              elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC) {
      assert(reg.file == ELK_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      elk_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      elk_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      elk_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      elk_inst_set_src0_abs(devinfo, inst, reg.abs);
      elk_inst_set_src0_negate(devinfo, inst, reg.negate);
      elk_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == ELK_IMMEDIATE_VALUE) {
         if (reg.type == ELK_REGISTER_TYPE_DF ||
             elk_inst_opcode(p->isa, inst) == ELK_OPCODE_DIM)
            elk_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == ELK_REGISTER_TYPE_UQ ||
                  reg.type == ELK_REGISTER_TYPE_Q)
            elk_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            elk_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            elk_inst_set_src1_reg_file(devinfo, inst,
                                       ELK_ARCHITECTURE_REGISTER_FILE);
            elk_inst_set_src1_reg_hw_type(devinfo, inst,
                                          elk_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == ELK_ADDRESS_DIRECT) {
            elk_inst_set_src0_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
            if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
               elk_inst_set_src0_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
            } else {
               elk_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            elk_inst_set_src0_ia_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));

            if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
               elk_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               elk_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            if (reg.width == ELK_WIDTH_1 &&
                elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
               elk_inst_set_src0_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
               elk_inst_set_src0_width(devinfo, inst, ELK_WIDTH_1);
               elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
            } else {
               elk_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               elk_inst_set_src0_width(devinfo, inst, reg.width);
               elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            elk_inst_set_src0_da16_swiz_x(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
            elk_inst_set_src0_da16_swiz_y(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
            elk_inst_set_src0_da16_swiz_z(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
            elk_inst_set_src0_da16_swiz_w(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));

            if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == ELK_REGISTER_TYPE_DF &&
                       reg.vstride == ELK_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               elk_inst_set_src0_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else {
               elk_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}


void
elk_set_src1(struct elk_codegen *p, elk_inst *inst, struct elk_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == ELK_GENERAL_REGISTER_FILE)
      assert(reg.nr < XE2_MAX_GRF);

   if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDS ||
       elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
         elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC))) {
      assert(reg.file == ELK_GENERAL_REGISTER_FILE ||
             reg.file == ELK_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == ELK_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == ELK_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      elk_inst_set_send_src1_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
      elk_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != ELK_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != ELK_ARF_ACCUMULATOR);

      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != ELK_MESSAGE_REGISTER_FILE);

      elk_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      elk_inst_set_src1_abs(devinfo, inst, reg.abs);
      elk_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(elk_inst_src0_reg_file(devinfo, inst) != ELK_IMMEDIATE_VALUE);

      if (reg.file == ELK_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         elk_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert(reg.address_mode == ELK_ADDRESS_DIRECT);
         /* assert (reg.file == ELK_GENERAL_REGISTER_FILE); */

         elk_inst_set_src1_da_reg_nr(devinfo, inst, phys_nr(devinfo, reg));
         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            elk_inst_set_src1_da1_subreg_nr(devinfo, inst, phys_subnr(devinfo, reg));
         } else {
            elk_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
            if (reg.width == ELK_WIDTH_1 &&
                elk_inst_exec_size(devinfo, inst) == ELK_EXECUTE_1) {
               elk_inst_set_src1_hstride(devinfo, inst, ELK_HORIZONTAL_STRIDE_0);
               elk_inst_set_src1_width(devinfo, inst, ELK_WIDTH_1);
               elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_0);
            } else {
               elk_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               elk_inst_set_src1_width(devinfo, inst, reg.width);
               elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            elk_inst_set_src1_da16_swiz_x(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_X));
            elk_inst_set_src1_da16_swiz_y(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Y));
            elk_inst_set_src1_da16_swiz_z(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_Z));
            elk_inst_set_src1_da16_swiz_w(devinfo, inst,
               ELK_GET_SWZ(reg.swizzle, ELK_CHANNEL_W));

            if (reg.vstride == ELK_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == ELK_REGISTER_TYPE_DF &&
                       reg.vstride == ELK_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               elk_inst_set_src1_vstride(devinfo, inst, ELK_VERTICAL_STRIDE_4);
            } else {
               elk_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}

/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 */
void
elk_set_desc_ex(struct elk_codegen *p, elk_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SEND ||
          elk_inst_opcode(p->isa, inst) == ELK_OPCODE_SENDC);
   if (devinfo->ver < 12)
      elk_inst_set_src1_file_type(devinfo, inst,
                                  ELK_IMMEDIATE_VALUE, ELK_REGISTER_TYPE_UD);
   elk_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->ver >= 9)
      elk_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}
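
/* Usage sketch (elk_set_desc(), used by the message helpers below, is
 * assumed to be the ex_desc == 0 wrapper around this function):
 *
 *    elk_inst *send = elk_next_insn(p, ELK_OPCODE_SEND);
 *    elk_set_desc(p, send, elk_message_desc(devinfo, mlen, rlen, true));
 */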

static void elk_set_math_message( struct elk_codegen *p,
                                  elk_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct intel_device_info *devinfo = p->devinfo;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case ELK_MATH_FUNCTION_POW:
   case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case ELK_MATH_FUNCTION_INT_DIV_REMAINDER:
   case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case ELK_MATH_FUNCTION_SINCOS:
   case ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   elk_set_desc(p, inst, elk_message_desc(
                   devinfo, msg_length, response_length, false));

   elk_inst_set_sfid(devinfo, inst, ELK_SFID_MATH);
   elk_inst_set_math_msg_function(devinfo, inst, function);
   elk_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
   elk_inst_set_math_msg_precision(devinfo, inst, low_precision);
   elk_inst_set_math_msg_saturate(devinfo, inst, elk_inst_saturate(devinfo, inst));
   elk_inst_set_math_msg_data_type(devinfo, inst, dataType);
   elk_inst_set_saturate(devinfo, inst, 0);
}
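
/* The inference above follows operand/result counts: two-operand functions
 * (POW, the INT_DIV variants) need a two-register payload, and functions
 * returning two values (SINCOS, INT_DIV_QUOTIENT_AND_REMAINDER) need a
 * two-register response; everything else is 1/1.
 */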


static void elk_set_ff_sync_message(struct elk_codegen *p,
                                    elk_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct intel_device_info *devinfo = p->devinfo;

   elk_set_desc(p, insn, elk_message_desc(
                   devinfo, 1, response_length, true));

   elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
   elk_inst_set_eot(devinfo, insn, end_of_thread);
   elk_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   elk_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   elk_inst_set_urb_global_offset(devinfo, insn, 0);
   elk_inst_set_urb_swizzle_control(devinfo, insn, 0);
   elk_inst_set_urb_used(devinfo, insn, 0);
   elk_inst_set_urb_complete(devinfo, insn, 0);
}

static void elk_set_urb_message( struct elk_codegen *p,
                                 elk_inst *insn,
                                 enum elk_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct intel_device_info *devinfo = p->devinfo;

   assert(devinfo->ver < 7 || swizzle_control != ELK_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->ver < 7 || !(flags & ELK_URB_WRITE_ALLOCATE));
   assert(devinfo->ver >= 7 || !(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));

   elk_set_desc(p, insn, elk_message_desc(
                   devinfo, msg_length, response_length, true));

   elk_inst_set_sfid(devinfo, insn, ELK_SFID_URB);
   elk_inst_set_eot(devinfo, insn, !!(flags & ELK_URB_WRITE_EOT));

   if (flags & ELK_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_OWORD);
   } else {
      elk_inst_set_urb_opcode(devinfo, insn, ELK_URB_OPCODE_WRITE_HWORD);
   }

   elk_inst_set_urb_global_offset(devinfo, insn, offset);
   elk_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   if (devinfo->ver < 8) {
      elk_inst_set_urb_complete(devinfo, insn, !!(flags & ELK_URB_WRITE_COMPLETE));
   }

   if (devinfo->ver < 7) {
      elk_inst_set_urb_allocate(devinfo, insn, !!(flags & ELK_URB_WRITE_ALLOCATE));
      elk_inst_set_urb_used(devinfo, insn, !(flags & ELK_URB_WRITE_UNUSED));
   } else {
      elk_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & ELK_URB_WRITE_PER_SLOT_OFFSET));
   }
}

static void
gfx7_set_dp_scratch_message(struct elk_codegen *p,
                            elk_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->ver >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   elk_set_desc(p, inst, elk_message_desc(
                   devinfo, mlen, rlen, header_present));

   elk_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
   elk_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   elk_inst_set_scratch_read_write(devinfo, inst, write);
   elk_inst_set_scratch_type(devinfo, inst, dword);
   elk_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   elk_inst_set_scratch_block_size(devinfo, inst, block_size);
   elk_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}

static void
elk_inst_set_state(const struct elk_isa_info *isa,
                   elk_inst *insn,
                   const struct elk_insn_state *state)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   elk_inst_set_exec_size(devinfo, insn, state->exec_size);
   elk_inst_set_group(devinfo, insn, state->group);
   elk_inst_set_compression(devinfo, insn, state->compressed);
   elk_inst_set_access_mode(devinfo, insn, state->access_mode);
   elk_inst_set_mask_control(devinfo, insn, state->mask_control);
   if (devinfo->ver >= 12)
      elk_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   elk_inst_set_saturate(devinfo, insn, state->saturate);
   elk_inst_set_pred_control(devinfo, insn, state->predicate);
   elk_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   if (elk_is_3src(isa, elk_inst_opcode(isa, insn)) &&
       state->access_mode == ELK_ALIGN_16) {
      elk_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         elk_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      elk_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         elk_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6 && devinfo->ver < 20)
      elk_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}

static elk_inst *
elk_append_insns(struct elk_codegen *p, unsigned nr_insn, unsigned alignment)
{
   assert(util_is_power_of_two_or_zero(sizeof(elk_inst)));
   assert(util_is_power_of_two_or_zero(alignment));
   const unsigned align_insn = MAX2(alignment / sizeof(elk_inst), 1);
   const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
   const unsigned new_nr_insn = start_insn + nr_insn;

   if (p->store_size < new_nr_insn) {
      p->store_size = util_next_power_of_two(new_nr_insn * sizeof(elk_inst));
      p->store = reralloc(p->mem_ctx, p->store, elk_inst, p->store_size);
   }

   /* Memset any padding due to alignment to 0.  We don't want to be hashing
    * or caching a bunch of random bits we got from a memory allocation.
    */
   if (p->nr_insn < start_insn) {
      memset(&p->store[p->nr_insn], 0,
             (start_insn - p->nr_insn) * sizeof(elk_inst));
   }

   assert(p->next_insn_offset == p->nr_insn * sizeof(elk_inst));
   p->nr_insn = new_nr_insn;
   p->next_insn_offset = new_nr_insn * sizeof(elk_inst);

   return &p->store[start_insn];
}
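
/* Worked example (assuming sizeof(elk_inst) == 16): appending one
 * instruction with alignment == 64 gives align_insn == 4, so start_insn is
 * p->nr_insn rounded up to a multiple of 4 and any skipped slots are
 * zero-filled so the store stays deterministic for hashing/caching.
 */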

void
elk_realign(struct elk_codegen *p, unsigned alignment)
{
   elk_append_insns(p, 0, alignment);
}

int
elk_append_data(struct elk_codegen *p, void *data,
                unsigned size, unsigned alignment)
{
   unsigned nr_insn = DIV_ROUND_UP(size, sizeof(elk_inst));
   void *dst = elk_append_insns(p, nr_insn, alignment);
   memcpy(dst, data, size);

   /* If it's not a whole number of instructions, memset the end */
   if (size < nr_insn * sizeof(elk_inst))
      memset(dst + size, 0, nr_insn * sizeof(elk_inst) - size);

   return dst - (void *)p->store;
}
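
/* The return value is the byte offset of the copied blob within p->store,
 * so a caller can record where out-of-band data (e.g. a table) landed:
 *
 *    int off = elk_append_data(p, table, sizeof(table), 32);
 */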

#define next_insn elk_next_insn
elk_inst *
elk_next_insn(struct elk_codegen *p, unsigned opcode)
{
   elk_inst *insn = elk_append_insns(p, 1, sizeof(elk_inst));

   memset(insn, 0, sizeof(*insn));
   elk_inst_set_opcode(p->isa, insn, opcode);

   /* Apply the default instruction state */
   elk_inst_set_state(p->isa, insn, p->current);

   return insn;
}

void
elk_add_reloc(struct elk_codegen *p, uint32_t id,
              enum elk_shader_reloc_type type,
              uint32_t offset, uint32_t delta)
{
   if (p->num_relocs + 1 > p->reloc_array_size) {
      p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
      p->relocs = reralloc(p->mem_ctx, p->relocs,
                           struct elk_shader_reloc, p->reloc_array_size);
   }

   p->relocs[p->num_relocs++] = (struct elk_shader_reloc) {
      .id = id,
      .type = type,
      .offset = offset,
      .delta = delta,
   };
}

static elk_inst *
elk_alu1(struct elk_codegen *p, unsigned opcode,
         struct elk_reg dest, struct elk_reg src)
{
   elk_inst *insn = next_insn(p, opcode);
   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src);
   return insn;
}

static elk_inst *
elk_alu2(struct elk_codegen *p, unsigned opcode,
         struct elk_reg dest, struct elk_reg src0, struct elk_reg src1)
{
   /* 64-bit immediates are only supported on 1-src instructions */
   assert(src0.file != ELK_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
   assert(src1.file != ELK_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);

   elk_inst *insn = next_insn(p, opcode);
   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src0);
   elk_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct elk_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
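
/* E.g. a subregister at byte offset 8 encodes as component 2 here, since
 * 8 bytes is two 32-bit channels into the register.
 */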

static enum gfx10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct intel_device_info *devinfo,
                       enum elk_vertical_stride vstride)
{
   switch (vstride) {
   case ELK_VERTICAL_STRIDE_0:
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_0;
   case ELK_VERTICAL_STRIDE_1:
      assert(devinfo->ver >= 12);
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_1;
   case ELK_VERTICAL_STRIDE_2:
      assert(devinfo->ver < 12);
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_2;
   case ELK_VERTICAL_STRIDE_4:
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_4;
   case ELK_VERTICAL_STRIDE_8:
   case ELK_VERTICAL_STRIDE_16:
      return ELK_ALIGN1_3SRC_VERTICAL_STRIDE_8;
   default:
      unreachable("invalid vstride");
   }
}


static enum gfx10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum elk_horizontal_stride hstride)
{
   switch (hstride) {
   case ELK_HORIZONTAL_STRIDE_0:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
   case ELK_HORIZONTAL_STRIDE_1:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
   case ELK_HORIZONTAL_STRIDE_2:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
   case ELK_HORIZONTAL_STRIDE_4:
      return ELK_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
   default:
      unreachable("invalid hstride");
   }
}

static elk_inst *
elk_alu3(struct elk_codegen *p, unsigned opcode, struct elk_reg dest,
         struct elk_reg src0, struct elk_reg src1, struct elk_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < XE2_MAX_GRF);

   if (devinfo->ver >= 10)
      assert(!(src0.file == ELK_IMMEDIATE_VALUE &&
               src2.file == ELK_IMMEDIATE_VALUE));

   assert(src0.file == ELK_IMMEDIATE_VALUE || src0.nr < XE2_MAX_GRF);
   assert(src1.file != ELK_IMMEDIATE_VALUE && src1.nr < XE2_MAX_GRF);
   assert(src2.file == ELK_IMMEDIATE_VALUE || src2.nr < XE2_MAX_GRF);
   assert(dest.address_mode == ELK_ADDRESS_DIRECT);
   assert(src0.address_mode == ELK_ADDRESS_DIRECT);
   assert(src1.address_mode == ELK_ADDRESS_DIRECT);
   assert(src2.address_mode == ELK_ADDRESS_DIRECT);

   if (elk_inst_access_mode(devinfo, inst) == ELK_ALIGN_1) {
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
              dest.nr == ELK_ARF_ACCUMULATOR));

      if (devinfo->ver >= 12) {
         elk_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         elk_inst_set_3src_dst_reg_nr(devinfo, inst, phys_nr(devinfo, dest));
      } else {
         if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE) {
            elk_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              ELK_ALIGN1_3SRC_ACCUMULATOR);
            elk_inst_set_3src_dst_reg_nr(devinfo, inst, ELK_ARF_ACCUMULATOR);
         } else {
            elk_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            elk_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      elk_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, phys_subnr(devinfo, dest) / 8);

      elk_inst_set_3src_a1_dst_hstride(devinfo, inst, ELK_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      if (elk_reg_type_is_floating_point(dest.type)) {
         elk_inst_set_3src_a1_exec_type(devinfo, inst,
                                        ELK_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         elk_inst_set_3src_a1_exec_type(devinfo, inst,
                                        ELK_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      elk_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      elk_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      elk_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      elk_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == ELK_IMMEDIATE_VALUE) {
         elk_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         elk_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         elk_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         elk_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, phys_subnr(devinfo, src0));
         if (src0.type == ELK_REGISTER_TYPE_NF) {
            elk_inst_set_3src_src0_reg_nr(devinfo, inst, ELK_ARF_ACCUMULATOR);
         } else {
            elk_inst_set_3src_src0_reg_nr(devinfo, inst, phys_nr(devinfo, src0));
         }
         elk_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         elk_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      elk_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      elk_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      elk_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, phys_subnr(devinfo, src1));
      if (src1.file == ELK_ARCHITECTURE_REGISTER_FILE) {
         elk_inst_set_3src_src1_reg_nr(devinfo, inst, ELK_ARF_ACCUMULATOR);
      } else {
         elk_inst_set_3src_src1_reg_nr(devinfo, inst, phys_nr(devinfo, src1));
      }
      elk_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      elk_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == ELK_IMMEDIATE_VALUE) {
         elk_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         elk_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         elk_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, phys_subnr(devinfo, src2));
         elk_inst_set_3src_src2_reg_nr(devinfo, inst, phys_nr(devinfo, src2));
         elk_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         elk_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == ELK_GENERAL_REGISTER_FILE ||
             src0.file == ELK_IMMEDIATE_VALUE ||
             (src0.file == ELK_ARCHITECTURE_REGISTER_FILE &&
              src0.type == ELK_REGISTER_TYPE_NF));
      assert(src1.file == ELK_GENERAL_REGISTER_FILE ||
             (src1.file == ELK_ARCHITECTURE_REGISTER_FILE &&
              src1.nr == ELK_ARF_ACCUMULATOR));
      assert(src2.file == ELK_GENERAL_REGISTER_FILE ||
             src2.file == ELK_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         if (src0.file == ELK_IMMEDIATE_VALUE) {
            elk_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            elk_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         elk_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == ELK_IMMEDIATE_VALUE) {
            elk_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            elk_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         elk_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == ELK_GENERAL_REGISTER_FILE ?
                                            ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            ELK_ALIGN1_3SRC_IMMEDIATE_VALUE);
         elk_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == ELK_GENERAL_REGISTER_FILE ?
                                            ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            ELK_ALIGN1_3SRC_ACCUMULATOR);
         elk_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == ELK_GENERAL_REGISTER_FILE ?
                                            ELK_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            ELK_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
             dest.file == ELK_MESSAGE_REGISTER_FILE);
      assert(dest.type == ELK_REGISTER_TYPE_F  ||
             dest.type == ELK_REGISTER_TYPE_DF ||
             dest.type == ELK_REGISTER_TYPE_D  ||
             dest.type == ELK_REGISTER_TYPE_UD ||
             (dest.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         elk_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == ELK_MESSAGE_REGISTER_FILE);
      }
      elk_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      elk_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      elk_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      assert(src0.file == ELK_GENERAL_REGISTER_FILE);
      elk_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      elk_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      elk_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      elk_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      elk_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      elk_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == ELK_VERTICAL_STRIDE_0);

      assert(src1.file == ELK_GENERAL_REGISTER_FILE);
      elk_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      elk_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      elk_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      elk_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      elk_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      elk_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == ELK_VERTICAL_STRIDE_0);

      assert(src2.file == ELK_GENERAL_REGISTER_FILE);
      elk_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      elk_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      elk_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      elk_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      elk_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      elk_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == ELK_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         elk_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         elk_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == ELK_REGISTER_TYPE_HF)
            elk_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == ELK_REGISTER_TYPE_HF)
            elk_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}

static elk_inst *
elk_dpas_three_src(struct elk_codegen *p, enum elk_gfx12_systolic_depth opcode,
                   unsigned sdepth, unsigned rcount, struct elk_reg dest,
                   struct elk_reg src0, struct elk_reg src1, struct elk_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *inst = next_insn(p, opcode);

   assert(dest.file == ELK_GENERAL_REGISTER_FILE);
   elk_inst_set_dpas_3src_dst_reg_file(devinfo, inst,
                                       ELK_GENERAL_REGISTER_FILE);
   elk_inst_set_dpas_3src_dst_reg_nr(devinfo, inst, dest.nr);
   elk_inst_set_dpas_3src_dst_subreg_nr(devinfo, inst, dest.subnr);

   if (elk_reg_type_is_floating_point(dest.type)) {
      elk_inst_set_dpas_3src_exec_type(devinfo, inst,
                                       ELK_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
   } else {
      elk_inst_set_dpas_3src_exec_type(devinfo, inst,
                                       ELK_ALIGN1_3SRC_EXEC_TYPE_INT);
   }

   elk_inst_set_dpas_3src_sdepth(devinfo, inst, sdepth);
   elk_inst_set_dpas_3src_rcount(devinfo, inst, rcount - 1);

   elk_inst_set_dpas_3src_dst_type(devinfo, inst, dest.type);
   elk_inst_set_dpas_3src_src0_type(devinfo, inst, src0.type);
   elk_inst_set_dpas_3src_src1_type(devinfo, inst, src1.type);
   elk_inst_set_dpas_3src_src2_type(devinfo, inst, src2.type);

   assert(src0.file == ELK_GENERAL_REGISTER_FILE ||
          (src0.file == ELK_ARCHITECTURE_REGISTER_FILE &&
           src0.nr == ELK_ARF_NULL));

   elk_inst_set_dpas_3src_src0_reg_file(devinfo, inst, src0.file);
   elk_inst_set_dpas_3src_src0_reg_nr(devinfo, inst, src0.nr);
   elk_inst_set_dpas_3src_src0_subreg_nr(devinfo, inst, src0.subnr);

   assert(src1.file == ELK_GENERAL_REGISTER_FILE);

   elk_inst_set_dpas_3src_src1_reg_file(devinfo, inst, src1.file);
   elk_inst_set_dpas_3src_src1_reg_nr(devinfo, inst, src1.nr);
   elk_inst_set_dpas_3src_src1_subreg_nr(devinfo, inst, src1.subnr);
   elk_inst_set_dpas_3src_src1_subbyte(devinfo, inst, ELK_SUB_BYTE_PRECISION_NONE);

   assert(src2.file == ELK_GENERAL_REGISTER_FILE);

   elk_inst_set_dpas_3src_src2_reg_file(devinfo, inst, src2.file);
   elk_inst_set_dpas_3src_src2_reg_nr(devinfo, inst, src2.nr);
   elk_inst_set_dpas_3src_src2_subreg_nr(devinfo, inst, src2.subnr);
   elk_inst_set_dpas_3src_src2_subbyte(devinfo, inst, ELK_SUB_BYTE_PRECISION_NONE);

   return inst;
}

/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                        \
elk_inst *elk_##OP(struct elk_codegen *p,               \
              struct elk_reg dest,                      \
              struct elk_reg src0)                      \
{                                                       \
   return elk_alu1(p, ELK_OPCODE_##OP, dest, src0);     \
}

#define ALU2(OP)                                                \
elk_inst *elk_##OP(struct elk_codegen *p,                       \
              struct elk_reg dest,                              \
              struct elk_reg src0,                              \
              struct elk_reg src1)                              \
{                                                               \
   return elk_alu2(p, ELK_OPCODE_##OP, dest, src0, src1);       \
}

#define ALU3(OP)                                                \
elk_inst *elk_##OP(struct elk_codegen *p,                       \
              struct elk_reg dest,                              \
              struct elk_reg src0,                              \
              struct elk_reg src1,                              \
              struct elk_reg src2)                              \
{                                                               \
   if (p->current->access_mode == ELK_ALIGN_16) {               \
      if (src0.vstride == ELK_VERTICAL_STRIDE_0)                \
         src0.swizzle = ELK_SWIZZLE_XXXX;                       \
      if (src1.vstride == ELK_VERTICAL_STRIDE_0)                \
         src1.swizzle = ELK_SWIZZLE_XXXX;                       \
      if (src2.vstride == ELK_VERTICAL_STRIDE_0)                \
         src2.swizzle = ELK_SWIZZLE_XXXX;                       \
   }                                                            \
   return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2); \
}

#define ALU3F(OP)                                               \
elk_inst *elk_##OP(struct elk_codegen *p,                       \
              struct elk_reg dest,                              \
              struct elk_reg src0,                              \
              struct elk_reg src1,                              \
              struct elk_reg src2)                              \
{                                                               \
   assert(dest.type == ELK_REGISTER_TYPE_F ||                   \
          dest.type == ELK_REGISTER_TYPE_DF);                   \
   if (dest.type == ELK_REGISTER_TYPE_F) {                      \
      assert(src0.type == ELK_REGISTER_TYPE_F);                 \
      assert(src1.type == ELK_REGISTER_TYPE_F);                 \
      assert(src2.type == ELK_REGISTER_TYPE_F);                 \
   } else if (dest.type == ELK_REGISTER_TYPE_DF) {              \
      assert(src0.type == ELK_REGISTER_TYPE_DF);                \
      assert(src1.type == ELK_REGISTER_TYPE_DF);                \
      assert(src2.type == ELK_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == ELK_ALIGN_16) {               \
      if (src0.vstride == ELK_VERTICAL_STRIDE_0)                \
         src0.swizzle = ELK_SWIZZLE_XXXX;                       \
      if (src1.vstride == ELK_VERTICAL_STRIDE_0)                \
         src1.swizzle = ELK_SWIZZLE_XXXX;                       \
      if (src2.vstride == ELK_VERTICAL_STRIDE_0)                \
         src2.swizzle = ELK_SWIZZLE_XXXX;                       \
   }                                                            \
   return elk_alu3(p, ELK_OPCODE_##OP, dest, src0, src1, src2); \
}
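
/* Each invocation below expands to a public emitter; e.g. ALU2(SEL)
 * defines elk_inst *elk_SEL(struct elk_codegen *p, struct elk_reg dest,
 * struct elk_reg src0, struct elk_reg src1).
 */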

ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)

elk_inst *
elk_MOV(struct elk_codegen *p, struct elk_reg dest, struct elk_reg src0)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
    * To avoid the problems that causes, we use an <X,2,0> source region to
    * read each element twice.
    */
   if (devinfo->verx10 == 70 &&
       elk_get_default_access_mode(p) == ELK_ALIGN_1 &&
       dest.type == ELK_REGISTER_TYPE_DF &&
       (src0.type == ELK_REGISTER_TYPE_F ||
        src0.type == ELK_REGISTER_TYPE_D ||
        src0.type == ELK_REGISTER_TYPE_UD) &&
       !has_scalar_region(src0)) {
      assert(src0.vstride == src0.width + src0.hstride);
      src0.vstride = src0.hstride;
      src0.width = ELK_WIDTH_2;
      src0.hstride = ELK_HORIZONTAL_STRIDE_0;
   }

   return elk_alu1(p, ELK_OPCODE_MOV, dest, src0);
}
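
/* E.g. an <8;8,1>:F source hit by the workaround above becomes <1;2,0>:F,
 * so consecutive channels read each source element twice.
 */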
1204 
1205 elk_inst *
elk_ADD(struct elk_codegen * p,struct elk_reg dest,struct elk_reg src0,struct elk_reg src1)1206 elk_ADD(struct elk_codegen *p, struct elk_reg dest,
1207         struct elk_reg src0, struct elk_reg src1)
1208 {
1209    /* 6.2.2: add */
1210    if (src0.type == ELK_REGISTER_TYPE_F ||
1211        (src0.file == ELK_IMMEDIATE_VALUE &&
1212 	src0.type == ELK_REGISTER_TYPE_VF)) {
1213       assert(src1.type != ELK_REGISTER_TYPE_UD);
1214       assert(src1.type != ELK_REGISTER_TYPE_D);
1215    }
1216 
1217    if (src1.type == ELK_REGISTER_TYPE_F ||
1218        (src1.file == ELK_IMMEDIATE_VALUE &&
1219 	src1.type == ELK_REGISTER_TYPE_VF)) {
1220       assert(src0.type != ELK_REGISTER_TYPE_UD);
1221       assert(src0.type != ELK_REGISTER_TYPE_D);
1222    }
1223 
1224    return elk_alu2(p, ELK_OPCODE_ADD, dest, src0, src1);
1225 }

elk_inst *
elk_AVG(struct elk_codegen *p, struct elk_reg dest,
        struct elk_reg src0, struct elk_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case ELK_REGISTER_TYPE_B:
   case ELK_REGISTER_TYPE_UB:
   case ELK_REGISTER_TYPE_W:
   case ELK_REGISTER_TYPE_UW:
   case ELK_REGISTER_TYPE_D:
   case ELK_REGISTER_TYPE_UD:
      break;
   default:
      unreachable("Bad type for elk_AVG");
   }

   return elk_alu2(p, ELK_OPCODE_AVG, dest, src0, src1);
}

elk_inst *
elk_MUL(struct elk_codegen *p, struct elk_reg dest,
        struct elk_reg src0, struct elk_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == ELK_REGISTER_TYPE_D ||
       src0.type == ELK_REGISTER_TYPE_UD ||
       src1.type == ELK_REGISTER_TYPE_D ||
       src1.type == ELK_REGISTER_TYPE_UD) {
      assert(dest.type != ELK_REGISTER_TYPE_F);
   }

   if (src0.type == ELK_REGISTER_TYPE_F ||
       (src0.file == ELK_IMMEDIATE_VALUE &&
        src0.type == ELK_REGISTER_TYPE_VF)) {
      assert(src1.type != ELK_REGISTER_TYPE_UD);
      assert(src1.type != ELK_REGISTER_TYPE_D);
   }

   if (src1.type == ELK_REGISTER_TYPE_F ||
       (src1.file == ELK_IMMEDIATE_VALUE &&
        src1.type == ELK_REGISTER_TYPE_VF)) {
      assert(src0.type != ELK_REGISTER_TYPE_UD);
      assert(src0.type != ELK_REGISTER_TYPE_D);
   }

   assert(src0.file != ELK_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != ELK_ARF_ACCUMULATOR);
   assert(src1.file != ELK_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != ELK_ARF_ACCUMULATOR);

   return elk_alu2(p, ELK_OPCODE_MUL, dest, src0, src1);
}

elk_inst *
elk_LINE(struct elk_codegen *p, struct elk_reg dest,
         struct elk_reg src0, struct elk_reg src1)
{
   src0.vstride = ELK_VERTICAL_STRIDE_0;
   src0.width = ELK_WIDTH_1;
   src0.hstride = ELK_HORIZONTAL_STRIDE_0;
   return elk_alu2(p, ELK_OPCODE_LINE, dest, src0, src1);
}

elk_inst *
elk_PLN(struct elk_codegen *p, struct elk_reg dest,
        struct elk_reg src0, struct elk_reg src1)
{
   src0.vstride = ELK_VERTICAL_STRIDE_0;
   src0.width = ELK_WIDTH_1;
   src0.hstride = ELK_HORIZONTAL_STRIDE_0;
   src1.vstride = ELK_VERTICAL_STRIDE_8;
   src1.width = ELK_WIDTH_8;
   src1.hstride = ELK_HORIZONTAL_STRIDE_1;
   return elk_alu2(p, ELK_OPCODE_PLN, dest, src0, src1);
}
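
/* Note (informal): both helpers force src0 to a scalar <0;1,0> region
 * because LINE and PLN take their plane coefficients from a single
 * four-component vector, while PLN's src1 is forced to a full <8;8,1>
 * region since it reads the per-channel interpolants there (for SIMD8,
 * u in src1 and v in the register after it).
 */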

elk_inst *
elk_DPAS(struct elk_codegen *p, enum elk_gfx12_systolic_depth sdepth,
         unsigned rcount, struct elk_reg dest, struct elk_reg src0,
         struct elk_reg src1, struct elk_reg src2)
{
   return elk_dpas_three_src(p, ELK_OPCODE_DPAS, sdepth, rcount, dest, src0,
                             src1, src2);
}

elk_inst *
elk_F32TO16(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
{
   assert(p->devinfo->ver == 7);

   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode.  Gfx7 (and only Gfx7) zeroes out the high 16 bits in
    * Align16 mode as an undocumented feature.
    */
   if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
      assert(dst.type == ELK_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == ELK_REGISTER_TYPE_W ||
             dst.type == ELK_REGISTER_TYPE_UW);
   }

   return elk_alu1(p, ELK_OPCODE_F32TO16, dst, src);
}

elk_inst *
elk_F16TO32(struct elk_codegen *p, struct elk_reg dst, struct elk_reg src)
{
   assert(p->devinfo->ver == 7);

   if (ELK_ALIGN_16 == elk_get_default_access_mode(p)) {
      assert(src.type == ELK_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      assert(src.type == ELK_REGISTER_TYPE_W ||
             src.type == ELK_REGISTER_TYPE_UW);
   }

   return elk_alu1(p, ELK_OPCODE_F16TO32, dst, src);
}


void elk_NOP(struct elk_codegen *p)
{
   elk_inst *insn = next_insn(p, ELK_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   elk_inst_set_opcode(p->isa, insn, ELK_OPCODE_NOP);
}

void elk_SYNC(struct elk_codegen *p, enum tgl_sync_function func)
{
   elk_inst *insn = next_insn(p, ELK_OPCODE_SYNC);
   elk_inst_set_cond_modifier(p->devinfo, insn, func);
}

/***********************************************************************
 * Comparisons, if/else/endif
 */

elk_inst *
elk_JMPI(struct elk_codegen *p, struct elk_reg index,
         unsigned predicate_control)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct elk_reg ip = elk_ip_reg();
   elk_inst *inst = elk_alu2(p, ELK_OPCODE_JMPI, ip, ip, index);

   elk_inst_set_exec_size(devinfo, inst, ELK_EXECUTE_1);
   elk_inst_set_qtr_control(devinfo, inst, ELK_COMPRESSION_NONE);
   elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
   elk_inst_set_pred_control(devinfo, inst, predicate_control);

   return inst;
}

static void
push_if_stack(struct elk_codegen *p, elk_inst *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static elk_inst *
pop_if_stack(struct elk_codegen *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct elk_codegen *p, elk_inst *inst)
{
   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static elk_inst *
get_inner_do_insn(struct elk_codegen *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* The EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
elk_inst *
elk_IF(struct elk_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   insn = next_insn(p, ELK_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      elk_set_dest(p, insn, elk_ip_reg());
      elk_set_src0(p, insn, elk_ip_reg());
      elk_set_src1(p, insn, elk_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      elk_set_dest(p, insn, elk_imm_w(0));
      elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
      elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      elk_set_src1(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      elk_set_src0(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      elk_set_src1(p, insn, elk_imm_w(0));
      elk_inst_set_jip(devinfo, insn, 0);
      elk_inst_set_uip(devinfo, insn, 0);
   } else {
      elk_set_dest(p, insn, vec1(retype(elk_null_reg(), ELK_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         elk_set_src0(p, insn, elk_imm_d(0));
      elk_inst_set_jip(devinfo, insn, 0);
      elk_inst_set_uip(devinfo, insn, 0);
   }

   elk_inst_set_exec_size(devinfo, insn, execute_size);
   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
   elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NORMAL);
   elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}

/* This function is only used for gfx6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gfx7.
 */
elk_inst *
elk_gfx6_IF(struct elk_codegen *p, enum elk_conditional_mod conditional,
            struct elk_reg src0, struct elk_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   insn = next_insn(p, ELK_OPCODE_IF);

   elk_set_dest(p, insn, elk_imm_w(0));
   elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
   elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
   elk_set_src0(p, insn, src0);
   elk_set_src1(p, insn, src1);

   assert(elk_inst_qtr_control(devinfo, insn) == ELK_COMPRESSION_NONE);
   assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
   elk_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct elk_codegen *p,
                       elk_inst *if_inst, elk_inst *else_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   elk_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
   assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);
   assert(elk_inst_exec_size(devinfo, if_inst) == ELK_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_ADD);
   elk_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      elk_inst_set_opcode(p->isa, else_inst, ELK_OPCODE_ADD);

      /* The ADD immediates are byte offsets added to IP; each native
       * (uncompacted) instruction is 16 bytes, hence the scaling.
       */
      elk_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      elk_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      elk_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct elk_codegen *p,
              elk_inst *if_inst, elk_inst *else_inst, elk_inst *endif_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see elk_ENDIF).
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->ver < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && elk_inst_opcode(p->isa, if_inst) == ELK_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || elk_inst_opcode(p->isa, else_inst) == ELK_OPCODE_ELSE);

   unsigned br = elk_jump_scale(devinfo);

   assert(elk_inst_opcode(p->isa, endif_inst) == ELK_OPCODE_ENDIF);
   elk_inst_set_exec_size(devinfo, endif_inst, elk_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->ver < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         elk_inst_set_opcode(p->isa, if_inst, ELK_OPCODE_IFF);
         elk_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
         elk_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         elk_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      elk_inst_set_exec_size(devinfo, else_inst, elk_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->ver < 6) {
         elk_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         elk_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         elk_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->ver < 6) {
         /* ELK_OPCODE_ELSE pre-gfx6 should point just past the
          * matching ENDIF.
          */
         elk_inst_set_gfx4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         elk_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->ver == 6) {
         /* ELK_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
         elk_inst_set_gfx6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         elk_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         elk_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));

         if (devinfo->ver >= 8 && devinfo->ver < 11) {
            /* Set the ELSE instruction to use branch_ctrl with a join
             * jump target pointing at the NOP inserted right before
             * the ENDIF instruction in order to make sure it is
             * executed in all cases, since attempting to do the same
             * as on other generations could cause the EU to jump at
             * the instruction immediately after the ENDIF due to
             * Wa_220160235, which could cause the program to continue
             * running with all channels disabled.
             */
            elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst - 1));
            elk_inst_set_branch_control(devinfo, else_inst, true);
         } else {
            elk_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         }

         if (devinfo->ver >= 8) {
            /* Since we don't set branch_ctrl on Gfx11+, the ELSE's
             * JIP and UIP both should point to ENDIF on those
             * platforms.
             */
            elk_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}

void
elk_ELSE(struct elk_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   insn = next_insn(p, ELK_OPCODE_ELSE);

   if (devinfo->ver < 6) {
      elk_set_dest(p, insn, elk_ip_reg());
      elk_set_src0(p, insn, elk_ip_reg());
      elk_set_src1(p, insn, elk_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      elk_set_dest(p, insn, elk_imm_w(0));
      elk_inst_set_gfx6_jump_count(devinfo, insn, 0);
      elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src1(p, insn, elk_imm_w(0));
      elk_inst_set_jip(devinfo, insn, 0);
      elk_inst_set_uip(devinfo, insn, 0);
   } else {
      elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      if (devinfo->ver < 12)
         elk_set_src0(p, insn, elk_imm_d(0));
      elk_inst_set_jip(devinfo, insn, 0);
      elk_inst_set_uip(devinfo, insn, 0);
   }

   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
   elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);

   push_if_stack(p, insn);
}

void
elk_ENDIF(struct elk_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn = NULL;
   elk_inst *else_inst = NULL;
   elk_inst *if_inst = NULL;
   elk_inst *tmp;
   bool emit_endif = true;

   assert(p->if_stack_depth > 0);

   if (devinfo->ver >= 8 && devinfo->ver < 11 &&
       elk_inst_opcode(p->isa, &p->store[p->if_stack[
                             p->if_stack_depth - 1]]) == ELK_OPCODE_ELSE) {
      /* Insert a NOP to be specified as join instruction within the
       * ELSE block, which is valid for an ELSE instruction with
       * branch_ctrl on.  The ELSE instruction will be set to jump
       * here instead of to the ENDIF instruction, since attempting to
       * do the latter would prevent the ENDIF from being executed in
       * some cases due to Wa_220160235, which could cause the program
       * to continue running with all channels disabled.
       */
      elk_NOP(p);
   }

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /* A call to next_insn() may reallocate the instruction store (p->store),
    * so call it before computing any pointers into the store from an index.
    */
   if (emit_endif)
      insn = next_insn(p, ELK_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (elk_inst_opcode(p->isa, tmp) == ELK_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (devinfo->ver < 6) {
      elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src1(p, insn, elk_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      elk_set_dest(p, insn, elk_imm_w(0));
      elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src1(p, insn, elk_imm_w(0));
   } else {
      elk_set_src0(p, insn, elk_imm_d(0));
   }

   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
   elk_inst_set_mask_control(devinfo, insn, ELK_MASK_ENABLE);
   if (devinfo->ver < 6)
      elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      elk_inst_set_gfx4_jump_count(devinfo, insn, 0);
      elk_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      elk_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      elk_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
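
/* Putting the pieces together (illustrative sketch): a divergent region of
 * code is emitted by comparing into the flag register, opening an IF block,
 * and closing it with ELSE/ENDIF.  All names other than the emitters are
 * hypothetical:
 *
 *    elk_CMP(p, elk_null_reg(), ELK_CONDITIONAL_NZ, value, elk_imm_ud(0));
 *    elk_IF(p, ELK_EXECUTE_8);
 *    ...emit the "then" instructions...
 *    elk_ELSE(p);
 *    ...emit the "else" instructions...
 *    elk_ENDIF(p);
 *
 * elk_ENDIF() patches the IF/ELSE jump targets retroactively via
 * patch_IF_ELSE(), which is why the caller records nothing itself.
 */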

elk_inst *
elk_BREAK(struct elk_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   insn = next_insn(p, ELK_OPCODE_BREAK);
   if (devinfo->ver >= 8) {
      elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src0(p, insn, elk_imm_d(0x0));
   } else if (devinfo->ver >= 6) {
      elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src1(p, insn, elk_imm_d(0x0));
   } else {
      elk_set_dest(p, insn, elk_ip_reg());
      elk_set_src0(p, insn, elk_ip_reg());
      elk_set_src1(p, insn, elk_imm_d(0x0));
      elk_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
   elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));

   return insn;
}

elk_inst *
elk_CONT(struct elk_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   insn = next_insn(p, ELK_OPCODE_CONTINUE);
   elk_set_dest(p, insn, elk_ip_reg());
   if (devinfo->ver >= 8) {
      elk_set_src0(p, insn, elk_imm_d(0x0));
   } else {
      elk_set_src0(p, insn, elk_ip_reg());
      elk_set_src1(p, insn, elk_imm_d(0x0));
   }

   if (devinfo->ver < 6) {
      elk_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
   elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
   return insn;
}

elk_inst *
elk_HALT(struct elk_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn;

   insn = next_insn(p, ELK_OPCODE_HALT);
   elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
   if (devinfo->ver < 6) {
      /* From the Gfx4 PRM:
       *
       *    "IP register must be put (for example, by the assembler) at <dst>
       *    and <src0> locations."
       */
      elk_set_dest(p, insn, elk_ip_reg());
      elk_set_src0(p, insn, elk_ip_reg());
      elk_set_src1(p, insn, elk_imm_d(0x0)); /* exitcode updated later. */
   } else if (devinfo->ver < 8) {
      elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      elk_set_src1(p, insn, elk_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->ver < 12) {
      elk_set_src0(p, insn, elk_imm_d(0x0));
   }

   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
   elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gfx6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gfx6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
elk_inst *
elk_DO(struct elk_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (devinfo->ver >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      elk_inst *insn = next_insn(p, ELK_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      elk_set_dest(p, insn, elk_null_reg());
      elk_set_src0(p, insn, elk_null_reg());
      elk_set_src1(p, insn, elk_null_reg());

      elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);
      elk_inst_set_exec_size(devinfo, insn, execute_size);
      elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE);

      return insn;
   }
}

/**
 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gfx6+, see elk_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
elk_patch_break_cont(struct elk_codegen *p, elk_inst *while_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *do_inst = get_inner_do_insn(p);
   elk_inst *inst;
   unsigned br = elk_jump_scale(devinfo);

   assert(devinfo->ver < 6);

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_BREAK &&
          elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
         elk_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (elk_inst_opcode(p->isa, inst) == ELK_OPCODE_CONTINUE &&
                 elk_inst_gfx4_jump_count(devinfo, inst) == 0) {
         elk_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}

elk_inst *
elk_WHILE(struct elk_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn, *do_insn;
   unsigned br = elk_jump_scale(devinfo);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, ELK_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->ver >= 8) {
         elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
         if (devinfo->ver < 12)
            elk_set_src0(p, insn, elk_imm_d(0));
         elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->ver == 7) {
         elk_set_dest(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
         elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
         elk_set_src1(p, insn, elk_imm_w(0));
         elk_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         elk_set_dest(p, insn, elk_imm_w(0));
         elk_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
         elk_set_src0(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
         elk_set_src1(p, insn, retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      }

      elk_inst_set_exec_size(devinfo, insn, elk_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, ELK_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         elk_set_dest(p, insn, elk_ip_reg());
         elk_set_src0(p, insn, elk_ip_reg());
         elk_set_src1(p, insn, elk_imm_d((do_insn - insn) * 16));
         elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
      } else {
         insn = next_insn(p, ELK_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(elk_inst_opcode(p->isa, do_insn) == ELK_OPCODE_DO);

         elk_set_dest(p, insn, elk_ip_reg());
         elk_set_src0(p, insn, elk_ip_reg());
         elk_set_src1(p, insn, elk_imm_d(0));

         elk_inst_set_exec_size(devinfo, insn, elk_inst_exec_size(devinfo, do_insn));
         elk_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         elk_inst_set_gfx4_pop_count(devinfo, insn, 0);

         elk_patch_break_cont(p, insn);
      }
   }
   elk_inst_set_qtr_control(devinfo, insn, ELK_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
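
/* Loop emission sketch (illustrative; `counter` and the loop body are
 * hypothetical):
 *
 *    elk_DO(p, ELK_EXECUTE_8);
 *    ...emit the loop body...
 *    elk_CMP(p, elk_null_reg(), ELK_CONDITIONAL_NZ, counter, elk_imm_ud(0));
 *    elk_inst_set_pred_control(p->devinfo, elk_WHILE(p),
 *                              ELK_PREDICATE_NORMAL);
 *
 * elk_BREAK()/elk_CONT() may be emitted inside the body; pre-gfx6 they are
 * patched to point at the WHILE by elk_patch_break_cont() above.
 */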

/* FORWARD JUMPS:
 */
void elk_land_fwd_jump(struct elk_codegen *p, int jmp_insn_idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *jmp_insn = &p->store[jmp_insn_idx];
   unsigned jmpi = 1;

   if (devinfo->ver >= 5)
      jmpi = 2;

   assert(elk_inst_opcode(p->isa, jmp_insn) == ELK_OPCODE_JMPI);
   assert(elk_inst_src1_reg_file(devinfo, jmp_insn) == ELK_IMMEDIATE_VALUE);

   elk_inst_set_gfx4_jump_count(devinfo, jmp_insn,
                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
}
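
/* Forward-jump usage sketch (illustrative): emit a predicated JMPI with a
 * zero immediate, remember its index, and patch it once the landing point
 * is known:
 *
 *    int jmp = p->nr_insn;
 *    elk_JMPI(p, elk_imm_d(0), ELK_PREDICATE_NORMAL);
 *    ...emit the instructions being jumped over...
 *    elk_land_fwd_jump(p, jmp);
 *
 * The jmpi scale of 2 above reflects that gfx5+ counts jump targets in
 * 64-bit chunks rather than whole 128-bit instructions.
 */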

/* To integrate with the above, it makes sense that the comparison
 * instruction should populate the flag register.  It might be simpler
 * just to use the flag reg for most WM tasks?
 */
void elk_CMP(struct elk_codegen *p,
             struct elk_reg dest,
             unsigned conditional,
             struct elk_reg src0,
             struct elk_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn = next_insn(p, ELK_OPCODE_CMP);

   elk_inst_set_cond_modifier(devinfo, insn, conditional);
   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src0);
   elk_set_src1(p, insn, src1);

   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
    * page says:
    *    "Any CMP instruction with a null destination must use a {switch}."
    *
    * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
    * mentioned on their work-arounds pages.
    */
   if (devinfo->ver == 7) {
      if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
          dest.nr == ELK_ARF_NULL) {
         elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
      }
   }
}
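
/* Typical use (sketch): compare to set the flag register, then predicate a
 * following instruction on the result.  `a`, `b` and `dst` are hypothetical:
 *
 *    elk_CMP(p, elk_null_reg(), ELK_CONDITIONAL_GE, a, b);
 *    elk_inst_set_pred_control(p->devinfo,
 *                              elk_MOV(p, dst, a),
 *                              ELK_PREDICATE_NORMAL);
 */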
2067 
elk_CMPN(struct elk_codegen * p,struct elk_reg dest,unsigned conditional,struct elk_reg src0,struct elk_reg src1)2068 void elk_CMPN(struct elk_codegen *p,
2069               struct elk_reg dest,
2070               unsigned conditional,
2071               struct elk_reg src0,
2072               struct elk_reg src1)
2073 {
2074    const struct intel_device_info *devinfo = p->devinfo;
2075    elk_inst *insn = next_insn(p, ELK_OPCODE_CMPN);
2076 
2077    elk_inst_set_cond_modifier(devinfo, insn, conditional);
2078    elk_set_dest(p, insn, dest);
2079    elk_set_src0(p, insn, src0);
2080    elk_set_src1(p, insn, src1);
2081 
2082    /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
2083     * says:
2084     *
2085     *    If the destination is the null register, the {Switch} instruction
2086     *    option must be used.
2087     *
2088     * Page 77 of the Haswell PRM Volume 2b contains the same text.
2089     */
2090    if (devinfo->ver == 7) {
2091       if (dest.file == ELK_ARCHITECTURE_REGISTER_FILE &&
2092           dest.nr == ELK_ARF_NULL) {
2093          elk_inst_set_thread_control(devinfo, insn, ELK_THREAD_SWITCH);
2094       }
2095    }
2096 }

/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 */
void elk_gfx4_math(struct elk_codegen *p,
                   struct elk_reg dest,
                   unsigned function,
                   unsigned msg_reg_nr,
                   struct elk_reg src,
                   unsigned precision)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
   unsigned data_type;
   if (has_scalar_region(src)) {
      data_type = ELK_MATH_DATA_SCALAR;
   } else {
      data_type = ELK_MATH_DATA_VECTOR;
   }

   assert(devinfo->ver < 6);

   /* Example code doesn't set predicate_control for send
    * instructions.
    */
   elk_inst_set_pred_control(devinfo, insn, 0);
   elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src);
   elk_set_math_message(p,
                        insn,
                        function,
                        src.type == ELK_REGISTER_TYPE_D,
                        precision,
                        data_type);
}

void elk_gfx6_math(struct elk_codegen *p,
                   struct elk_reg dest,
                   unsigned function,
                   struct elk_reg src0,
                   struct elk_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   elk_inst *insn = next_insn(p, ELK_OPCODE_MATH);

   assert(devinfo->ver >= 6);

   assert(dest.file == ELK_GENERAL_REGISTER_FILE ||
          (devinfo->ver >= 7 && dest.file == ELK_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == ELK_HORIZONTAL_STRIDE_1);
   if (devinfo->ver == 6) {
      assert(src0.hstride == ELK_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == ELK_HORIZONTAL_STRIDE_1);
   }

   if (function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == ELK_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == ELK_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != ELK_REGISTER_TYPE_F);
      assert(src1.type != ELK_REGISTER_TYPE_F);
      assert(src1.file == ELK_GENERAL_REGISTER_FILE ||
             (devinfo->ver >= 8 && src1.file == ELK_IMMEDIATE_VALUE));
      /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
       *     INT DIV function does not support source modifiers.
       */
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   } else {
      assert(src0.type == ELK_REGISTER_TYPE_F ||
             (src0.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 9));
      assert(src1.type == ELK_REGISTER_TYPE_F ||
             (src1.type == ELK_REGISTER_TYPE_HF && devinfo->ver >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gfx6. */
   if (devinfo->ver == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   elk_inst_set_math_function(devinfo, insn, function);

   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src0);
   elk_set_src1(p, insn, src1);
}
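
/* Usage sketch (illustrative): on gfx6+, extended math is a regular MATH
 * instruction rather than a SEND to the shared unit, so a reciprocal is a
 * single call (dst and src hypothetical; src1 is unused by one-operand
 * functions and passed as the null register):
 *
 *    elk_gfx6_math(p, dst, ELK_MATH_FUNCTION_INV, src, elk_null_reg());
 */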

/**
 * Return the right surface index to access the thread scratch space using
 * stateless dataport messages.
 */
unsigned
elk_scratch_surface_idx(const struct elk_codegen *p)
{
   /* The scratch space is thread-local so IA coherency is unnecessary. */
   if (p->devinfo->ver >= 8)
      return GFX8_BTI_STATELESS_NON_COHERENT;
   else
      return ELK_BTI_STATELESS;
}

/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void elk_oword_block_write_scratch(struct elk_codegen *p,
                                   struct elk_reg mrf,
                                   int num_regs,
                                   unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       ELK_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = elk_get_default_swsb(p);
   uint32_t msg_type;

   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, ELK_REGISTER_TYPE_UD);

   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      elk_push_insn_state(p);
      elk_set_default_exec_size(p, ELK_EXECUTE_8);
      elk_set_default_mask_control(p, ELK_MASK_DISABLE);
      elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
      elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      elk_set_default_exec_size(p, ELK_EXECUTE_1);
      elk_set_default_swsb(p, tgl_swsb_null());
      elk_MOV(p,
              retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), ELK_REGISTER_TYPE_UD),
              elk_imm_ud(offset));

      elk_pop_insn_state(p);
      elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct elk_reg dest;
      elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
      int send_commit_msg;
      struct elk_reg src_header = retype(elk_vec8_grf(0, 0),
                                         ELK_REGISTER_TYPE_UW);

      elk_inst_set_sfid(devinfo, insn, target_cache);
      elk_inst_set_compression(devinfo, insn, false);

      if (elk_inst_exec_size(devinfo, insn) >= 16)
         src_header = vec16(src_header);

      assert(elk_inst_pred_control(devinfo, insn) == ELK_PREDICATE_NONE);
      if (devinfo->ver < 6)
         elk_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gfx6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gfx6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->ver >= 6) {
         dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      elk_set_dest(p, insn, dest);
      if (devinfo->ver >= 6) {
         elk_set_src0(p, insn, mrf);
      } else {
         elk_set_src0(p, insn, elk_null_reg());
      }

      if (devinfo->ver >= 6)
         msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = ELK_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      elk_set_desc(p, insn,
                   elk_message_desc(devinfo, mlen, send_commit_msg, true) |
                   elk_dp_write_desc(devinfo, elk_scratch_surface_idx(p),
                                     ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, send_commit_msg));
   }
}


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
elk_oword_block_read_scratch(struct elk_codegen *p,
                             struct elk_reg dest,
                             struct elk_reg mrf,
                             int num_regs,
                             unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = elk_get_default_swsb(p);

   if (devinfo->ver >= 6)
      offset /= 16;

   if (p->devinfo->ver >= 7) {
      /* On gfx7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, ELK_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, ELK_REGISTER_TYPE_UD);
   }
   dest = retype(dest, ELK_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       ELK_SFID_DATAPORT_READ);

   {
      elk_push_insn_state(p);
      elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      elk_set_default_exec_size(p, ELK_EXECUTE_8);
      elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
      elk_set_default_mask_control(p, ELK_MASK_DISABLE);

      elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      elk_set_default_exec_size(p, ELK_EXECUTE_1);
      elk_set_default_swsb(p, tgl_swsb_null());
      elk_MOV(p, get_element_ud(mrf, 2), elk_imm_ud(offset));

      elk_pop_insn_state(p);
      elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);

      elk_inst_set_sfid(devinfo, insn, target_cache);
      assert(elk_inst_pred_control(devinfo, insn) == 0);
      elk_inst_set_compression(devinfo, insn, false);

      elk_set_dest(p, insn, dest);   /* UW? */
      if (devinfo->ver >= 6) {
         elk_set_src0(p, insn, mrf);
      } else {
         elk_set_src0(p, insn, elk_null_reg());
         elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      elk_set_desc(p, insn,
                   elk_message_desc(devinfo, 1, rlen, true) |
                   elk_dp_read_desc(devinfo, elk_scratch_surface_idx(p),
                                    ELK_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    ELK_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
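
/* Spill/fill pairing sketch (illustrative): a register spilled with the
 * writer above is restored with the reader, using the same byte offset.
 * `tmp`, `spill_offset` and the MRF number are hypothetical:
 *
 *    elk_oword_block_write_scratch(p, elk_message_reg(1), 1, spill_offset);
 *    ...
 *    elk_oword_block_read_scratch(p, tmp, elk_message_reg(1), 1, spill_offset);
 *
 * Both helpers build the message header themselves, so the caller only
 * provides a free MRF (pre-gfx7) and the oword-aligned offset.
 */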

void
elk_gfx7_block_read_scratch(struct elk_codegen *p,
                            struct elk_reg dest,
                            int num_regs,
                            unsigned offset)
{
   elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);
   assert(elk_inst_pred_control(p->devinfo, insn) == ELK_PREDICATE_NONE);

   elk_set_dest(p, insn, retype(dest, ELK_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   elk_set_src0(p, insn, elk_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gfx7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}

/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void elk_oword_block_read(struct elk_codegen *p,
                          struct elk_reg dest,
                          struct elk_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
       ELK_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << elk_get_default_exec_size(p);
   const struct tgl_swsb swsb = elk_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, ELK_REGISTER_TYPE_UD);

   elk_push_insn_state(p);
   elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
   elk_set_default_flag_reg(p, 0, 0);
   elk_set_default_compression_control(p, ELK_COMPRESSION_NONE);
   elk_set_default_mask_control(p, ELK_MASK_DISABLE);

   elk_push_insn_state(p);
   elk_set_default_exec_size(p, ELK_EXECUTE_8);
   elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   elk_MOV(p, mrf, retype(elk_vec8_grf(0, 0), ELK_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   elk_set_default_exec_size(p, ELK_EXECUTE_1);
   elk_set_default_swsb(p, tgl_swsb_null());
   elk_MOV(p,
           retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), ELK_REGISTER_TYPE_UD),
           elk_imm_ud(offset));
   elk_pop_insn_state(p);

   elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   elk_inst *insn = next_insn(p, ELK_OPCODE_SEND);

   elk_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), ELK_REGISTER_TYPE_UW);

   elk_set_dest(p, insn, dest);
   if (devinfo->ver >= 6) {
      elk_set_src0(p, insn, mrf);
   } else {
      elk_set_src0(p, insn, elk_null_reg());
      elk_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   elk_set_desc(p, insn,
                elk_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                elk_dp_read_desc(devinfo, bind_table_index,
                                 ELK_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 ELK_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 ELK_DATAPORT_READ_TARGET_DATA_CACHE));

   elk_pop_insn_state(p);
}
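
/* Constant-fetch sketch (illustrative; `dst`, `const_offset` and `surf` are
 * hypothetical): pull a float[4] constant from the buffer bound at binding
 * table index `surf`:
 *
 *    elk_oword_block_read(p, dst, elk_message_reg(1), const_offset, surf);
 *
 * Per the comment above, const_offset must be a multiple of 16 bytes (one
 * oword).
 */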

elk_inst *
elk_fb_WRITE(struct elk_codegen *p,
             struct elk_reg payload,
             struct elk_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       ELK_SFID_DATAPORT_WRITE);
   elk_inst *insn;
   struct elk_reg dest, src0;

   if (elk_get_default_exec_size(p) >= ELK_EXECUTE_16)
      dest = retype(vec16(elk_null_reg()), ELK_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(elk_null_reg()), ELK_REGISTER_TYPE_UW);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, ELK_OPCODE_SENDC);
   } else {
      insn = next_insn(p, ELK_OPCODE_SEND);
   }
   elk_inst_set_sfid(devinfo, insn, target_cache);
   elk_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;
   } else {
      assert(payload.file == ELK_MESSAGE_REGISTER_FILE);
      elk_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;
   }

   elk_set_dest(p, insn, dest);
   elk_set_src0(p, insn, src0);
   elk_set_desc(p, insn,
                elk_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                elk_fb_write_desc(devinfo, binding_table_index, msg_control,
                                  last_render_target,
                                  false /* coarse_write */));
   elk_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2559 
2560 elk_inst *
elk_gfx9_fb_READ(struct elk_codegen * p,struct elk_reg dst,struct elk_reg payload,unsigned binding_table_index,unsigned msg_length,unsigned response_length,bool per_sample)2561 elk_gfx9_fb_READ(struct elk_codegen *p,
2562              struct elk_reg dst,
2563              struct elk_reg payload,
2564              unsigned binding_table_index,
2565              unsigned msg_length,
2566              unsigned response_length,
2567              bool per_sample)
2568 {
2569    const struct intel_device_info *devinfo = p->devinfo;
2570    assert(devinfo->ver >= 9);
2571    elk_inst *insn = next_insn(p, ELK_OPCODE_SENDC);
2572 
2573    elk_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
2574    elk_set_dest(p, insn, dst);
2575    elk_set_src0(p, insn, payload);
2576    elk_set_desc(
2577       p, insn,
2578       elk_message_desc(devinfo, msg_length, response_length, true) |
2579       elk_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
2580                        1 << elk_get_default_exec_size(p), per_sample));
2581    elk_inst_set_rt_slot_group(devinfo, insn, elk_get_default_group(p) / 16);
2582 
2583    return insn;
2584 }
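
/* For reference, the "RT slot group" above selects which 16-channel half
 * of the dispatch the read services: group 0 (the common case) maps to
 * slot group 0, while a message emitted for channels 16..31 (default
 * group 16) maps to slot group 1.
 */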
2585 
2586 /**
2587  * Texture sample instruction.
2588  * Note: the msg_type plus msg_length values determine exactly what kind
2589  * of sampling operation is performed.  See volume 4, page 161 of docs.
2590  */
2591 void elk_SAMPLE(struct elk_codegen *p,
2592 		struct elk_reg dest,
2593 		unsigned msg_reg_nr,
2594 		struct elk_reg src0,
2595 		unsigned binding_table_index,
2596 		unsigned sampler,
2597 		unsigned msg_type,
2598 		unsigned response_length,
2599 		unsigned msg_length,
2600 		unsigned header_present,
2601 		unsigned simd_mode,
2602 		unsigned return_format)
2603 {
2604    const struct intel_device_info *devinfo = p->devinfo;
2605    elk_inst *insn;
2606 
2607    if (msg_reg_nr != -1)
2608       elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2609 
2610    insn = next_insn(p, ELK_OPCODE_SEND);
2611    elk_inst_set_sfid(devinfo, insn, ELK_SFID_SAMPLER);
2612    elk_inst_set_pred_control(devinfo, insn, ELK_PREDICATE_NONE); /* XXX */
2613 
2614    /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2615     *
2616     *    "Instruction compression is not allowed for this instruction (that
2617     *     is, send). The hardware behavior is undefined if this instruction is
2618     *     set as compressed. However, compress control can be set to "SecHalf"
2619     *     to affect the EMask generation."
2620     *
2621     * No similar wording is found in later PRMs, but there are examples
2622     * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2623     * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2624     * these reasons, we allow ELK_COMPRESSION_2NDHALF here.
2625     */
2626    elk_inst_set_compression(devinfo, insn, false);
2627 
2628    if (devinfo->ver < 6)
2629       elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2630 
2631    elk_set_dest(p, insn, dest);
2632    elk_set_src0(p, insn, src0);
2633    elk_set_desc(p, insn,
2634                 elk_message_desc(devinfo, msg_length, response_length,
2635                                  header_present) |
2636                 elk_sampler_desc(devinfo, binding_table_index, sampler,
2637                                  msg_type, simd_mode, return_format));
2638 }
2639 
2640 /* Adjust the message header's sampler state pointer to
2641  * select the correct group of 16 samplers.
2642  */
2643 void elk_adjust_sampler_state_pointer(struct elk_codegen *p,
2644                                       struct elk_reg header,
2645                                       struct elk_reg sampler_index)
2646 {
2647    /* The "Sampler Index" field can only store values between 0 and 15.
2648     * However, we can add an offset to the "Sampler State Pointer"
2649     * field, effectively selecting a different set of 16 samplers.
2650     *
2651     * The "Sampler State Pointer" needs to be aligned to a 32-byte
2652     * offset, and each sampler state is only 16 bytes, so we can't
2653     * exclusively use the offset - we have to use both.
2654     */
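
   /* Worked example: for an immediate sampler_index of 20 the group is
    * 20 / 16 = 1, so the code below adds 16 * 1 * 16 = 256 bytes (one
    * group of 16 samplers at 16 bytes each) to the pointer taken from
    * g0.3, and the message's 4-bit "Sampler Index" then selects
    * 20 % 16 = 4 within that group.  The register-indexed path computes
    * the same offset as ((index & 0xf0) << 4) == (index / 16) * 256.
    */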
2655 
2656    const struct intel_device_info *devinfo = p->devinfo;
2657 
2658    if (sampler_index.file == ELK_IMMEDIATE_VALUE) {
2659       const int sampler_state_size = 16; /* 16 bytes */
2660       uint32_t sampler = sampler_index.ud;
2661 
2662       if (sampler >= 16) {
2663          assert(devinfo->verx10 >= 75);
2664          elk_ADD(p,
2665                  get_element_ud(header, 3),
2666                  get_element_ud(elk_vec8_grf(0, 0), 3),
2667                  elk_imm_ud(16 * (sampler / 16) * sampler_state_size));
2668       }
2669    } else {
2670       /* Non-const sampler array indexing case */
2671       if (devinfo->verx10 <= 70) {
2672          return;
2673       }
2674 
2675       struct elk_reg temp = get_element_ud(header, 3);
2676 
2677       elk_push_insn_state(p);
2678       elk_AND(p, temp, get_element_ud(sampler_index, 0), elk_imm_ud(0x0f0));
2679       elk_set_default_swsb(p, tgl_swsb_regdist(1));
2680       elk_SHL(p, temp, temp, elk_imm_ud(4));
2681       elk_ADD(p,
2682               get_element_ud(header, 3),
2683               get_element_ud(elk_vec8_grf(0, 0), 3),
2684               temp);
2685       elk_pop_insn_state(p);
2686    }
2687 }
2688 
2689 /* All these variables are pretty confusing - we might be better off
2690  * using bitmasks and macros for this, in the old style.  Or perhaps
2691  * just having the caller instantiate the fields in dword3 itself.
2692  */
2693 void elk_urb_WRITE(struct elk_codegen *p,
2694 		   struct elk_reg dest,
2695 		   unsigned msg_reg_nr,
2696 		   struct elk_reg src0,
2697                    enum elk_urb_write_flags flags,
2698 		   unsigned msg_length,
2699 		   unsigned response_length,
2700 		   unsigned offset,
2701 		   unsigned swizzle)
2702 {
2703    const struct intel_device_info *devinfo = p->devinfo;
2704    elk_inst *insn;
2705 
2706    elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2707 
2708    if (devinfo->ver >= 7 && !(flags & ELK_URB_WRITE_USE_CHANNEL_MASKS)) {
2709       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
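      /* The OR below writes 0xff00 on top of the value inherited from
       * g0.5, setting bits 15:8 of header dword 5 so that all eight
       * channel-mask enables are on when the caller supplied no explicit
       * masks.
       */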
2710       elk_push_insn_state(p);
2711       elk_set_default_access_mode(p, ELK_ALIGN_1);
2712       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2713       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2714       elk_OR(p, retype(elk_vec1_reg(ELK_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2715 		       ELK_REGISTER_TYPE_UD),
2716 	        retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2717 		elk_imm_ud(0xff00));
2718       elk_pop_insn_state(p);
2719    }
2720 
2721    insn = next_insn(p, ELK_OPCODE_SEND);
2722 
2723    assert(msg_length < ELK_MAX_MRF(devinfo->ver));
2724 
2725    elk_set_dest(p, insn, dest);
2726    elk_set_src0(p, insn, src0);
2727    elk_set_src1(p, insn, elk_imm_d(0));
2728 
2729    if (devinfo->ver < 6)
2730       elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2731 
2732    elk_set_urb_message(p,
2733 		       insn,
2734 		       flags,
2735 		       msg_length,
2736 		       response_length,
2737 		       offset,
2738 		       swizzle);
2739 }
2740 
2741 void
2742 elk_send_indirect_message(struct elk_codegen *p,
2743                           unsigned sfid,
2744                           struct elk_reg dst,
2745                           struct elk_reg payload,
2746                           struct elk_reg desc,
2747                           unsigned desc_imm,
2748                           bool eot)
2749 {
2750    const struct intel_device_info *devinfo = p->devinfo;
2751    struct elk_inst *send;
2752 
2753    dst = retype(dst, ELK_REGISTER_TYPE_UW);
2754 
2755    assert(desc.type == ELK_REGISTER_TYPE_UD);
2756 
2757    if (desc.file == ELK_IMMEDIATE_VALUE) {
2758       send = next_insn(p, ELK_OPCODE_SEND);
2759       elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2760       elk_set_desc(p, send, desc.ud | desc_imm);
2761    } else {
2762       const struct tgl_swsb swsb = elk_get_default_swsb(p);
2763       struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2764 
2765       elk_push_insn_state(p);
2766       elk_set_default_access_mode(p, ELK_ALIGN_1);
2767       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2768       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2769       elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2770       elk_set_default_flag_reg(p, 0, 0);
2771       elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2772 
2773       /* Load the indirect descriptor to an address register using OR so the
2774        * caller can specify additional descriptor bits with the desc_imm
2775        * immediate.
2776        */
2777       elk_OR(p, addr, desc, elk_imm_ud(desc_imm));
2778 
2779       elk_pop_insn_state(p);
2780 
2781       elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2782       send = next_insn(p, ELK_OPCODE_SEND);
2783       elk_set_src0(p, send, retype(payload, ELK_REGISTER_TYPE_UD));
2784 
2785       if (devinfo->ver >= 12)
2786          elk_inst_set_send_sel_reg32_desc(devinfo, send, true);
2787       else
2788          elk_set_src1(p, send, addr);
2789    }
2790 
2791    elk_set_dest(p, send, dst);
2792    elk_inst_set_sfid(devinfo, send, sfid);
2793    elk_inst_set_eot(devinfo, send, eot);
2794 }
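
/* Usage sketch (register choices here are hypothetical): sending a
 * message whose descriptor was computed at run time into g2.0:
 *
 *    elk_send_indirect_message(p, sfid, dst, payload,
 *                              retype(elk_vec1_grf(2, 0), ELK_REGISTER_TYPE_UD),
 *                              desc_imm, false);
 *
 * desc_imm carries the statically known descriptor bits (message and
 * response lengths, header-present flag), which the helper ORs together
 * with the register into a0.0; the register part typically holds values
 * only known at run time, such as a dynamically indexed binding table
 * entry.
 */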
2795 
2796 void
2797 elk_send_indirect_split_message(struct elk_codegen *p,
2798                                 unsigned sfid,
2799                                 struct elk_reg dst,
2800                                 struct elk_reg payload0,
2801                                 struct elk_reg payload1,
2802                                 struct elk_reg desc,
2803                                 unsigned desc_imm,
2804                                 struct elk_reg ex_desc,
2805                                 unsigned ex_desc_imm,
2806                                 bool ex_desc_scratch,
2807                                 bool ex_bso,
2808                                 bool eot)
2809 {
2810    const struct intel_device_info *devinfo = p->devinfo;
2811    struct elk_inst *send;
2812 
2813    dst = retype(dst, ELK_REGISTER_TYPE_UW);
2814 
2815    assert(desc.type == ELK_REGISTER_TYPE_UD);
2816 
2817    if (desc.file == ELK_IMMEDIATE_VALUE) {
2818       desc.ud |= desc_imm;
2819    } else {
2820       const struct tgl_swsb swsb = elk_get_default_swsb(p);
2821       struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2822 
2823       elk_push_insn_state(p);
2824       elk_set_default_access_mode(p, ELK_ALIGN_1);
2825       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2826       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2827       elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2828       elk_set_default_flag_reg(p, 0, 0);
2829       elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2830 
2831       /* Load the indirect descriptor to an address register using OR so the
2832        * caller can specify additional descriptor bits with the desc_imm
2833        * immediate.
2834        */
2835       elk_OR(p, addr, desc, elk_imm_ud(desc_imm));
2836 
2837       elk_pop_insn_state(p);
2838       desc = addr;
2839 
2840       elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2841    }
2842 
2843    if (ex_desc.file == ELK_IMMEDIATE_VALUE &&
2844        !ex_desc_scratch &&
2845        (devinfo->ver >= 12 ||
2846         ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
2847       /* ATS-M PRMs, Volume 2d: Command Reference: Structures,
2848        * EU_INSTRUCTION_SEND instruction
2849        *
2850        *    "ExBSO: Exists If: ([ExDesc.IsReg]==true)"
2851        */
2852       assert(!ex_bso);
2853       ex_desc.ud |= ex_desc_imm;
2854    } else {
2855       const struct tgl_swsb swsb = elk_get_default_swsb(p);
2856       struct elk_reg addr = retype(elk_address_reg(2), ELK_REGISTER_TYPE_UD);
2857 
2858       elk_push_insn_state(p);
2859       elk_set_default_access_mode(p, ELK_ALIGN_1);
2860       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2861       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2862       elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2863       elk_set_default_flag_reg(p, 0, 0);
2864       elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2865 
2866       /* Load the indirect extended descriptor to an address register using OR
2867        * so the caller can specify additional descriptor bits with the
2868        * desc_imm immediate.
2869        *
2870        * Even though the instruction dispatcher always pulls the SFID and EOT
2871        * fields from the instruction itself, the actual external unit which
2872        * processes the message gets the SFID and EOT from the extended
2873        * descriptor, which comes from the address register.  If we don't OR
2874        * those two bits in, the external unit may get confused and hang.
2875        */
2876       unsigned imm_part = ex_bso ? 0 : (ex_desc_imm | sfid | eot << 5);
2877 
2878       if (ex_desc_scratch) {
2879          /* Or the scratch surface offset together with the immediate part of
2880           * the extended descriptor.
2881           */
2882          assert(devinfo->verx10 >= 125);
2883          elk_AND(p, addr,
2884                  retype(elk_vec1_grf(0, 5), ELK_REGISTER_TYPE_UD),
2885                  elk_imm_ud(INTEL_MASK(31, 10)));
2886          elk_OR(p, addr, addr, elk_imm_ud(imm_part));
2887       } else if (ex_desc.file == ELK_IMMEDIATE_VALUE) {
2888          /* ex_desc bits 15:12 don't exist in the instruction encoding prior
2889           * to Gfx12, so we may have fallen back to an indirect extended
2890           * descriptor.
2891           */
2892          elk_MOV(p, addr, elk_imm_ud(ex_desc.ud | imm_part));
2893       } else {
2894          elk_OR(p, addr, ex_desc, elk_imm_ud(imm_part));
2895       }
2896 
2897       elk_pop_insn_state(p);
2898       ex_desc = addr;
2899 
2900       elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2901    }
2902 
2903    send = next_insn(p, devinfo->ver >= 12 ? ELK_OPCODE_SEND : ELK_OPCODE_SENDS);
2904    elk_set_dest(p, send, dst);
2905    elk_set_src0(p, send, retype(payload0, ELK_REGISTER_TYPE_UD));
2906    elk_set_src1(p, send, retype(payload1, ELK_REGISTER_TYPE_UD));
2907 
2908    if (desc.file == ELK_IMMEDIATE_VALUE) {
2909       elk_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2910       elk_inst_set_send_desc(devinfo, send, desc.ud);
2911    } else {
2912       assert(desc.file == ELK_ARCHITECTURE_REGISTER_FILE);
2913       assert(desc.nr == ELK_ARF_ADDRESS);
2914       assert(desc.subnr == 0);
2915       elk_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2916    }
2917 
2918    if (ex_desc.file == ELK_IMMEDIATE_VALUE) {
2919       elk_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2920       elk_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2921    } else {
2922       assert(ex_desc.file == ELK_ARCHITECTURE_REGISTER_FILE);
2923       assert(ex_desc.nr == ELK_ARF_ADDRESS);
2924       assert((ex_desc.subnr & 0x3) == 0);
2925       elk_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2926       elk_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, phys_subnr(devinfo, ex_desc) >> 2);
2927    }
2928 
2929    if (ex_bso) {
2930       /* The send instruction ExBSO field does not exist with UGM on Gfx20+,
2931        * it is assumed.
2932        *
2933        * BSpec 56890
2934        */
2935       if (devinfo->ver < 20 || sfid != GFX12_SFID_UGM)
2936          elk_inst_set_send_ex_bso(devinfo, send, true);
2937       elk_inst_set_send_src1_len(devinfo, send, GET_BITS(ex_desc_imm, 10, 6));
2938    }
2939    elk_inst_set_sfid(devinfo, send, sfid);
2940    elk_inst_set_eot(devinfo, send, eot);
2941 }
2942 
2943 static void
2944 elk_send_indirect_surface_message(struct elk_codegen *p,
2945                                   unsigned sfid,
2946                                   struct elk_reg dst,
2947                                   struct elk_reg payload,
2948                                   struct elk_reg surface,
2949                                   unsigned desc_imm)
2950 {
2951    if (surface.file != ELK_IMMEDIATE_VALUE) {
2952       const struct tgl_swsb swsb = elk_get_default_swsb(p);
2953       struct elk_reg addr = retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
2954 
2955       elk_push_insn_state(p);
2956       elk_set_default_access_mode(p, ELK_ALIGN_1);
2957       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
2958       elk_set_default_exec_size(p, ELK_EXECUTE_1);
2959       elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
2960       elk_set_default_flag_reg(p, 0, 0);
2961       elk_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2962 
2963       /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2964        * some surface array is accessed out of bounds.
2965        */
2966       elk_AND(p, addr,
2967               suboffset(vec1(retype(surface, ELK_REGISTER_TYPE_UD)),
2968                         ELK_GET_SWZ(surface.swizzle, 0)),
2969               elk_imm_ud(0xff));
2970 
2971       elk_pop_insn_state(p);
2972 
2973       surface = addr;
2974       elk_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2975    }
2976 
2977    elk_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2978 }
2979 
2980 static bool
2981 while_jumps_before_offset(const struct intel_device_info *devinfo,
2982                           elk_inst *insn, int while_offset, int start_offset)
2983 {
2984    int scale = 16 / elk_jump_scale(devinfo);
2985    int jip = devinfo->ver == 6 ? elk_inst_gfx6_jump_count(devinfo, insn)
2986                                : elk_inst_jip(devinfo, insn);
2987    assert(jip < 0);
2988    return while_offset + jip * scale <= start_offset;
2989 }
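
/* Example of the scaling above, assuming the usual Gfx5..7 jump scale of
 * 2: scale = 16 / 2 = 8 bytes per jump count, so a WHILE at byte offset
 * 256 with a JIP of -16 targets 256 + (-16 * 8) = 128, and the function
 * returns true for any start_offset of 128 or more.
 */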
2990 
2991 
2992 static int
2993 elk_find_next_block_end(struct elk_codegen *p, int start_offset)
2994 {
2995    int offset;
2996    void *store = p->store;
2997    const struct intel_device_info *devinfo = p->devinfo;
2998 
2999    int depth = 0;
3000 
3001    for (offset = next_offset(devinfo, store, start_offset);
3002         offset < p->next_insn_offset;
3003         offset = next_offset(devinfo, store, offset)) {
3004       elk_inst *insn = store + offset;
3005 
3006       switch (elk_inst_opcode(p->isa, insn)) {
3007       case ELK_OPCODE_IF:
3008          depth++;
3009          break;
3010       case ELK_OPCODE_ENDIF:
3011          if (depth == 0)
3012             return offset;
3013          depth--;
3014          break;
3015       case ELK_OPCODE_WHILE:
3016          /* If the while doesn't jump before our instruction, it's the end
3017           * of a sibling do...while loop.  Ignore it.
3018           */
3019          if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
3020             continue;
3021          FALLTHROUGH;
3022       case ELK_OPCODE_ELSE:
3023       case ELK_OPCODE_HALT:
3024          if (depth == 0)
3025             return offset;
3026          break;
3027       default:
3028          break;
3029       }
3030    }
3031 
3032    return 0;
3033 }
3034 
3035 /* There is no DO instruction on gfx6, so to find the end of the loop
3036  * we have to see if the loop is jumping back before our start
3037  * instruction.
3038  */
3039 static int
3040 elk_find_loop_end(struct elk_codegen *p, int start_offset)
3041 {
3042    const struct intel_device_info *devinfo = p->devinfo;
3043    int offset;
3044    void *store = p->store;
3045 
3046    assert(devinfo->ver >= 6);
3047 
3048    /* Always start after the instruction (such as a WHILE) we're trying to fix
3049     * up.
3050     */
3051    for (offset = next_offset(devinfo, store, start_offset);
3052         offset < p->next_insn_offset;
3053         offset = next_offset(devinfo, store, offset)) {
3054       elk_inst *insn = store + offset;
3055 
3056       if (elk_inst_opcode(p->isa, insn) == ELK_OPCODE_WHILE) {
3057 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
3058 	    return offset;
3059       }
3060    }
3061    assert(!"not reached");
3062    return start_offset;
3063 }
3064 
3065 /* After program generation, go back and update the UIP and JIP of
3066  * BREAK, CONT, and HALT instructions to their correct locations.
3067  */
3068 void
3069 elk_set_uip_jip(struct elk_codegen *p, int start_offset)
3070 {
3071    const struct intel_device_info *devinfo = p->devinfo;
3072    int offset;
3073    int br = elk_jump_scale(devinfo);
3074    int scale = 16 / br;
3075    void *store = p->store;
3076 
3077    if (devinfo->ver < 6)
3078       return;
3079 
3080    for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
3081       elk_inst *insn = store + offset;
3082       assert(elk_inst_cmpt_control(devinfo, insn) == 0);
3083 
3084       switch (elk_inst_opcode(p->isa, insn)) {
3085       case ELK_OPCODE_BREAK: {
3086          int block_end_offset = elk_find_next_block_end(p, offset);
3087          assert(block_end_offset != 0);
3088          elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
3089 	 /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
3090          elk_inst_set_uip(devinfo, insn,
3091 	    (elk_find_loop_end(p, offset) - offset +
3092              (devinfo->ver == 6 ? 16 : 0)) / scale);
3093 	 break;
3094       }
3095 
3096       case ELK_OPCODE_CONTINUE: {
3097          int block_end_offset = elk_find_next_block_end(p, offset);
3098          assert(block_end_offset != 0);
3099          elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
3100          elk_inst_set_uip(devinfo, insn,
3101             (elk_find_loop_end(p, offset) - offset) / scale);
3102 
3103          assert(elk_inst_uip(devinfo, insn) != 0);
3104          assert(elk_inst_jip(devinfo, insn) != 0);
3105 	 break;
3106       }
3107 
3108       case ELK_OPCODE_ENDIF: {
3109          int block_end_offset = elk_find_next_block_end(p, offset);
3110          int32_t jump = (block_end_offset == 0) ?
3111                         1 * br : (block_end_offset - offset) / scale;
3112          if (devinfo->ver >= 7)
3113             elk_inst_set_jip(devinfo, insn, jump);
3114          else
3115             elk_inst_set_gfx6_jump_count(devinfo, insn, jump);
3116 	 break;
3117       }
3118 
3119       case ELK_OPCODE_HALT: {
3120 	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
3121 	  *
3122 	  *    "In case of the halt instruction not inside any conditional
3123 	  *     code block, the value of <JIP> and <UIP> should be the
3124 	  *     same. In case of the halt instruction inside conditional code
3125 	  *     block, the <UIP> should be the end of the program, and the
3126 	  *     <JIP> should be end of the most inner conditional code block."
3127 	  *
3128 	  * The uip will have already been set by whoever set up the
3129 	  * instruction.
3130 	  */
3131          int block_end_offset = elk_find_next_block_end(p, offset);
3132 	 if (block_end_offset == 0) {
3133             elk_inst_set_jip(devinfo, insn, elk_inst_uip(devinfo, insn));
3134 	 } else {
3135             elk_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
3136 	 }
3137          assert(elk_inst_uip(devinfo, insn) != 0);
3138          assert(elk_inst_jip(devinfo, insn) != 0);
3139 	 break;
3140       }
3141 
3142       default:
3143          break;
3144       }
3145    }
3146 }
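
/* Worked example of the patching above, assuming Gfx7 (scale = 8): a
 * BREAK at byte offset 64 whose enclosing block ends at offset 96,
 * inside a loop whose WHILE sits at offset 128, is patched to
 *
 *    JIP = (96 - 64) / 8 = 4     jump to the block end
 *    UIP = (128 - 64) / 8 = 8    jump to the WHILE (Gfx6 adds another
 *                                16 bytes to point just past it)
 */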
3147 
3148 void elk_ff_sync(struct elk_codegen *p,
3149 		   struct elk_reg dest,
3150 		   unsigned msg_reg_nr,
3151 		   struct elk_reg src0,
3152 		   bool allocate,
3153 		   unsigned response_length,
3154 		   bool eot)
3155 {
3156    const struct intel_device_info *devinfo = p->devinfo;
3157    elk_inst *insn;
3158 
3159    elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
3160 
3161    insn = next_insn(p, ELK_OPCODE_SEND);
3162    elk_set_dest(p, insn, dest);
3163    elk_set_src0(p, insn, src0);
3164    elk_set_src1(p, insn, elk_imm_d(0));
3165 
3166    if (devinfo->ver < 6)
3167       elk_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
3168 
3169    elk_set_ff_sync_message(p,
3170 			   insn,
3171 			   allocate,
3172 			   response_length,
3173 			   eot);
3174 }
3175 
3176 /**
3177  * Emit the SEND instruction necessary to generate stream output data on Gfx6
3178  * (for transform feedback).
3179  *
3180  * If send_commit_msg is true, this is the last piece of stream output data
3181  * from this thread, so send the data as a committed write.  According to the
3182  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
3183  *
3184  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3185  *   writes are complete by sending the final write as a committed write."
3186  */
3187 void
3188 elk_svb_write(struct elk_codegen *p,
3189               struct elk_reg dest,
3190               unsigned msg_reg_nr,
3191               struct elk_reg src0,
3192               unsigned binding_table_index,
3193               bool   send_commit_msg)
3194 {
3195    const struct intel_device_info *devinfo = p->devinfo;
3196    assert(devinfo->ver == 6);
3197    const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
3198    elk_inst *insn;
3199 
3200    elk_gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
3201 
3202    insn = next_insn(p, ELK_OPCODE_SEND);
3203    elk_inst_set_sfid(devinfo, insn, target_cache);
3204    elk_set_dest(p, insn, dest);
3205    elk_set_src0(p, insn, src0);
3206    elk_set_desc(p, insn,
3207                 elk_message_desc(devinfo, 1, send_commit_msg, true) |
3208                 elk_dp_write_desc(devinfo, binding_table_index,
3209                                   0, /* msg_control: ignored */
3210                                   GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3211                                   send_commit_msg)); /* send_commit_msg */
3212 }
3213 
3214 static unsigned
3215 elk_surface_payload_size(unsigned num_channels,
3216                          unsigned exec_size /**< 0 for SIMD4x2 */)
3217 {
3218    if (exec_size == 0)
3219       return 1; /* SIMD4x2 */
3220    else if (exec_size <= 8)
3221       return num_channels;
3222    else
3223       return 2 * num_channels;
3224 }
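
/* Examples: a SIMD16 access of 4 channels moves 2 * 4 = 8 payload or
 * response registers, the same access in SIMD8 moves 4, and the SIMD4x2
 * case always packs both 4-component vectors into a single register.
 */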
3225 
3226 void
3227 elk_untyped_atomic(struct elk_codegen *p,
3228                    struct elk_reg dst,
3229                    struct elk_reg payload,
3230                    struct elk_reg surface,
3231                    unsigned atomic_op,
3232                    unsigned msg_length,
3233                    bool response_expected,
3234                    bool header_present)
3235 {
3236    const struct intel_device_info *devinfo = p->devinfo;
3237    const unsigned sfid = (devinfo->verx10 >= 75 ?
3238                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3239                           GFX7_SFID_DATAPORT_DATA_CACHE);
3240    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
3241    /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3242    const bool has_simd4x2 = devinfo->verx10 >= 75;
3243    const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
3244                               has_simd4x2 ? 0 : 8;
3245    const unsigned response_length =
3246       elk_surface_payload_size(response_expected, exec_size);
3247    const unsigned desc =
3248       elk_message_desc(devinfo, msg_length, response_length, header_present) |
3249       elk_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3250                                  response_expected);
3251    /* Mask out unused components -- This is especially important in Align16
3252     * mode on generations that don't have native support for SIMD4x2 atomics,
3253     * because unused but enabled components will cause the dataport to perform
3254     * additional atomic operations on the addresses that happen to be in the
3255     * uninitialized Y, Z and W coordinates of the payload.
3256     */
3257    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3258 
3259    elk_send_indirect_surface_message(p, sfid, elk_writemask(dst, mask),
3260                                      payload, surface, desc);
3261 }
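
/* Sizing note: response_expected is 0 or 1, so at most one channel of
 * return data is read back.  In SIMD16 that is
 * elk_surface_payload_size(1, 16) = 2 registers and in SIMD8 it is 1;
 * note that elk_surface_payload_size() reports 1 for the SIMD4x2 case
 * even when no response is expected.
 */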
3262 
3263 void
3264 elk_untyped_surface_read(struct elk_codegen *p,
3265                          struct elk_reg dst,
3266                          struct elk_reg payload,
3267                          struct elk_reg surface,
3268                          unsigned msg_length,
3269                          unsigned num_channels)
3270 {
3271    const struct intel_device_info *devinfo = p->devinfo;
3272    const unsigned sfid = (devinfo->verx10 >= 75 ?
3273                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3274                           GFX7_SFID_DATAPORT_DATA_CACHE);
3275    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
3276    const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) : 0;
3277    const unsigned response_length =
3278       elk_surface_payload_size(num_channels, exec_size);
3279    const unsigned desc =
3280       elk_message_desc(devinfo, msg_length, response_length, false) |
3281       elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3282 
3283    elk_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3284 }
3285 
3286 void
3287 elk_untyped_surface_write(struct elk_codegen *p,
3288                           struct elk_reg payload,
3289                           struct elk_reg surface,
3290                           unsigned msg_length,
3291                           unsigned num_channels,
3292                           bool header_present)
3293 {
3294    const struct intel_device_info *devinfo = p->devinfo;
3295    const unsigned sfid = (devinfo->verx10 >= 75 ?
3296                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3297                           GFX7_SFID_DATAPORT_DATA_CACHE);
3298    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
3299    /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3300    const bool has_simd4x2 = devinfo->verx10 >= 75;
3301    const unsigned exec_size = align1 ? 1 << elk_get_default_exec_size(p) :
3302                               has_simd4x2 ? 0 : 8;
3303    const unsigned desc =
3304       elk_message_desc(devinfo, msg_length, 0, header_present) |
3305       elk_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3306    /* Mask out unused components -- See comment in elk_untyped_atomic(). */
3307    const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3308 
3309    elk_send_indirect_surface_message(p, sfid, elk_writemask(elk_null_reg(), mask),
3310                                      payload, surface, desc);
3311 }
3312 
3313 static void
3314 elk_set_memory_fence_message(struct elk_codegen *p,
3315                              struct elk_inst *insn,
3316                              enum elk_message_target sfid,
3317                              bool commit_enable,
3318                              unsigned bti)
3319 {
3320    const struct intel_device_info *devinfo = p->devinfo;
3321 
3322    elk_set_desc(p, insn, elk_message_desc(
3323                    devinfo, 1, (commit_enable ? 1 : 0), true));
3324 
3325    elk_inst_set_sfid(devinfo, insn, sfid);
3326 
3327    switch (sfid) {
3328    case GFX6_SFID_DATAPORT_RENDER_CACHE:
3329       elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
3330       break;
3331    case GFX7_SFID_DATAPORT_DATA_CACHE:
3332       elk_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
3333       break;
3334    default:
3335       unreachable("Not reached");
3336    }
3337 
3338    if (commit_enable)
3339       elk_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3340 
3341    assert(devinfo->ver >= 11 || bti == 0);
3342    elk_inst_set_binding_table_index(devinfo, insn, bti);
3343 }
3344 
3345 static void
3346 gfx12_set_memory_fence_message(struct elk_codegen *p,
3347                                struct elk_inst *insn,
3348                                enum elk_message_target sfid,
3349                                uint32_t desc)
3350 {
3351    const unsigned mlen = 1 * reg_unit(p->devinfo); /* g0 header */
3352    /* Completion signaled by write to register. No data returned. */
3353    const unsigned rlen = 1 * reg_unit(p->devinfo);
3354 
3355    elk_inst_set_sfid(p->devinfo, insn, sfid);
3356 
3357    if (sfid == ELK_SFID_URB && p->devinfo->ver < 20) {
3358       elk_set_desc(p, insn, elk_urb_fence_desc(p->devinfo) |
3359                             elk_message_desc(p->devinfo, mlen, rlen, true));
3360    } else {
3361       enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc);
3362       enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc);
3363 
3364       if (sfid == GFX12_SFID_TGM) {
3365          scope = LSC_FENCE_TILE;
3366          flush_type = LSC_FLUSH_TYPE_EVICT;
3367       }
3368 
3369       /* Wa_14012437816:
3370        *
3371        *   "For any fence greater than local scope, always set flush type to
3372        *    at least invalidate so that fence goes on properly."
3373        *
3374        *   "The bug is if flush_type is 'None', the scope is always downgraded
3375        *    to 'local'."
3376        *
3377        * Here set scope to NONE_6 instead of NONE, which has the same effect
3378        * as NONE but avoids the downgrade to scope LOCAL.
3379        */
3380       if (intel_needs_workaround(p->devinfo, 14012437816) &&
3381           scope > LSC_FENCE_LOCAL &&
3382           flush_type == LSC_FLUSH_TYPE_NONE) {
3383          flush_type = LSC_FLUSH_TYPE_NONE_6;
3384       }
3385 
3386       elk_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
3387                                                flush_type, false) |
3388                             elk_message_desc(p->devinfo, mlen, rlen, false));
3389    }
3390 }
3391 
3392 void
3393 elk_memory_fence(struct elk_codegen *p,
3394                  struct elk_reg dst,
3395                  struct elk_reg src,
3396                  enum elk_opcode send_op,
3397                  enum elk_message_target sfid,
3398                  uint32_t desc,
3399                  bool commit_enable,
3400                  unsigned bti)
3401 {
3402    const struct intel_device_info *devinfo = p->devinfo;
3403 
3404    dst = retype(vec1(dst), ELK_REGISTER_TYPE_UW);
3405    src = retype(vec1(src), ELK_REGISTER_TYPE_UD);
3406 
3407    /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3408     * message doesn't write anything back.
3409     */
3410    struct elk_inst *insn = next_insn(p, send_op);
3411    elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
3412    elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
3413    elk_set_dest(p, insn, dst);
3414    elk_set_src0(p, insn, src);
3415 
3416    /* All DG2 hardware requires LSC for fence messages, even A-step */
3417    if (devinfo->has_lsc)
3418       gfx12_set_memory_fence_message(p, insn, sfid, desc);
3419    else
3420       elk_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
3421 }
3422 
3423 void
3424 elk_find_live_channel(struct elk_codegen *p, struct elk_reg dst, bool last)
3425 {
3426    const struct intel_device_info *devinfo = p->devinfo;
3427    const unsigned exec_size = 1 << elk_get_default_exec_size(p);
3428    const unsigned qtr_control = elk_get_default_group(p) / 8;
3429    elk_inst *inst;
3430 
3431    assert(devinfo->ver == 7);
3432 
3433    elk_push_insn_state(p);
3434 
3435    /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
3436     * unnecessary bits in the instruction words, get the information we need
3437     * and reset the default flag register. This allows more instructions to be
3438     * compacted.
3439     */
3440    const unsigned flag_subreg = p->current->flag_subreg;
3441    elk_set_default_flag_reg(p, 0, 0);
3442 
3443    if (elk_get_default_access_mode(p) == ELK_ALIGN_1) {
3444       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
3445 
3446       const struct elk_reg flag = elk_flag_subreg(flag_subreg);
3447 
3448       elk_set_default_exec_size(p, ELK_EXECUTE_1);
3449       elk_MOV(p, retype(flag, ELK_REGISTER_TYPE_UD), elk_imm_ud(0));
3450 
3451       /* Run enough instructions returning zero with execution masking and
3452        * a conditional modifier enabled in order to get the full execution
3453        * mask in f1.0.  We could use a single 32-wide move here if it
3454        * weren't because of the hardware bug that causes channel enables to
3455        * be applied incorrectly to the second half of 32-wide instructions
3456        * on Gfx7.
3457        */
3458       const unsigned lower_size = MIN2(16, exec_size);
3459       for (unsigned i = 0; i < exec_size / lower_size; i++) {
3460          inst = elk_MOV(p, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW),
3461                         elk_imm_uw(0));
3462          elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
3463          elk_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3464          elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_Z);
3465          elk_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3466          elk_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
3467          elk_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
3468       }
3469 
3470       /* Find the first bit set in the exec_size-wide portion of the flag
3471        * register that was updated by the last sequence of MOV
3472        * instructions.
3473        */
3474       const enum elk_reg_type type = elk_int_type(exec_size / 8, false);
3475       elk_set_default_exec_size(p, ELK_EXECUTE_1);
3476       if (!last) {
3477          inst = elk_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3478       } else {
3479          inst = elk_LZD(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3480          struct elk_reg neg = vec1(dst);
3481          neg.negate = true;
3482          inst = elk_ADD(p, vec1(dst), neg, elk_imm_uw(31));
3483       }
3484    } else {
3485       elk_set_default_mask_control(p, ELK_MASK_DISABLE);
3486 
3487       /* Overwrite the destination first without and then with execution
3488        * masking to find out which of the channels is active.
3489        */
3490       elk_push_insn_state(p);
3491       elk_set_default_exec_size(p, ELK_EXECUTE_4);
3492       elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
3493               elk_imm_ud(1));
3494 
3495       inst = elk_MOV(p, elk_writemask(vec4(dst), WRITEMASK_X),
3496                      elk_imm_ud(0));
3497       elk_pop_insn_state(p);
3498       elk_inst_set_mask_control(devinfo, inst, ELK_MASK_ENABLE);
3499    }
3500 
3501    elk_pop_insn_state(p);
3502 }
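
/* Example of the align1 path above, assuming a SIMD16 dispatch in which
 * only channels 3..15 survived: the flag-building MOVs leave 0xfff8 in
 * the flag subregister, FBL then returns 3 for the first live channel,
 * and the last-channel path computes 31 - LZD(0xfff8) = 31 - 16 = 15.
 */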
3503 
3504 void
3505 elk_broadcast(struct elk_codegen *p,
3506               struct elk_reg dst,
3507               struct elk_reg src,
3508               struct elk_reg idx)
3509 {
3510    const struct intel_device_info *devinfo = p->devinfo;
3511    const bool align1 = elk_get_default_access_mode(p) == ELK_ALIGN_1;
3512    elk_inst *inst;
3513 
3514    elk_push_insn_state(p);
3515    elk_set_default_mask_control(p, ELK_MASK_DISABLE);
3516    elk_set_default_exec_size(p, align1 ? ELK_EXECUTE_1 : ELK_EXECUTE_4);
3517 
3518    assert(src.file == ELK_GENERAL_REGISTER_FILE &&
3519           src.address_mode == ELK_ADDRESS_DIRECT);
3520    assert(!src.abs && !src.negate);
3521 
3522    /* Gfx12.5 adds the following region restriction:
3523     *
3524     *    "Vx1 and VxH indirect addressing for Float, Half-Float, Double-Float
3525     *    and Quad-Word data must not be used."
3526     *
3527     * We require the source and destination types to match so stomp to an
3528     * unsigned integer type.
3529     */
3530    assert(src.type == dst.type);
3531    src.type = dst.type = elk_reg_type_from_bit_size(type_sz(src.type) * 8,
3532                                                     ELK_REGISTER_TYPE_UD);
3533 
3534    if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3535        idx.file == ELK_IMMEDIATE_VALUE) {
3536       /* Trivial, the source is already uniform or the index is a constant.
3537        * We will typically not get here if the optimizer is doing its job, but
3538        * asserting would be mean.
3539        */
3540       const unsigned i = idx.file == ELK_IMMEDIATE_VALUE ? idx.ud : 0;
3541       src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
3542                      stride(suboffset(src, 4 * i), 0, 4, 1);
3543 
3544       if (type_sz(src.type) > 4 && !devinfo->has_64bit_int) {
3545          elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
3546                     subscript(src, ELK_REGISTER_TYPE_D, 0));
3547          elk_set_default_swsb(p, tgl_swsb_null());
3548          elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
3549                     subscript(src, ELK_REGISTER_TYPE_D, 1));
3550       } else {
3551          elk_MOV(p, dst, src);
3552       }
3553    } else {
3554       /* From the Haswell PRM section "Register Region Restrictions":
3555        *
3556        *    "The lower bits of the AddressImmediate must not overflow to
3557        *    change the register address.  The lower 5 bits of Address
3558        *    Immediate when added to lower 5 bits of address register gives
3559        *    the sub-register offset. The upper bits of Address Immediate
3560        *    when added to upper bits of address register gives the register
3561        *    address. Any overflow from sub-register offset is dropped."
3562        *
3563        * Fortunately, for broadcast, we never have a sub-register offset so
3564        * this isn't an issue.
3565        */
3566       assert(src.subnr == 0);
3567 
3568       if (align1) {
3569          const struct elk_reg addr =
3570             retype(elk_address_reg(0), ELK_REGISTER_TYPE_UD);
3571          unsigned offset = src.nr * REG_SIZE + src.subnr;
3572          /* Limit in bytes of the signed indirect addressing immediate. */
3573          const unsigned limit = 512;
3574 
3575          elk_push_insn_state(p);
3576          elk_set_default_mask_control(p, ELK_MASK_DISABLE);
3577          elk_set_default_predicate_control(p, ELK_PREDICATE_NONE);
3578          elk_set_default_flag_reg(p, 0, 0);
3579 
3580          /* Take into account the component size and horizontal stride. */
3581          assert(src.vstride == src.hstride + src.width);
3582          elk_SHL(p, addr, vec1(idx),
3583                  elk_imm_ud(util_logbase2(type_sz(src.type)) +
3584                             src.hstride - 1));
3585 
3586          /* We can only address up to limit bytes using the indirect
3587           * addressing immediate, account for the difference if the source
3588           * register is above this limit.
3589           */
3590          if (offset >= limit) {
3591             elk_set_default_swsb(p, tgl_swsb_regdist(1));
3592             elk_ADD(p, addr, addr, elk_imm_ud(offset - offset % limit));
3593             offset = offset % limit;
3594          }
3595 
3596          elk_pop_insn_state(p);
3597 
3598          elk_set_default_swsb(p, tgl_swsb_regdist(1));
3599 
3600          /* Use indirect addressing to fetch the specified component. */
3601          if (type_sz(src.type) > 4 &&
3602              (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) ||
3603               !devinfo->has_64bit_int)) {
3604             /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3605              *
3606              *    "When source or destination datatype is 64b or operation is
3607              *    integer DWord multiply, indirect addressing must not be
3608              *    used."
3609              *
3610              * To work around both of these issues, we do two integer MOVs
3611              * instead of one 64-bit MOV.  Because no double value should ever
3612              * cross a register boundary, it's safe to use the immediate
3613              * offset in the indirect here to handle adding 4 bytes to the
3614              * offset and avoid the extra ADD to the register file.
3615              */
3616             elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 0),
3617                        retype(elk_vec1_indirect(addr.subnr, offset),
3618                               ELK_REGISTER_TYPE_D));
3619             elk_set_default_swsb(p, tgl_swsb_null());
3620             elk_MOV(p, subscript(dst, ELK_REGISTER_TYPE_D, 1),
3621                        retype(elk_vec1_indirect(addr.subnr, offset + 4),
3622                               ELK_REGISTER_TYPE_D));
3623          } else {
3624             elk_MOV(p, dst,
3625                     retype(elk_vec1_indirect(addr.subnr, offset), src.type));
3626          }
3627       } else {
3628          /* In SIMD4x2 mode the index can be either zero or one; replicate it
3629           * to all bits of a flag register,
3630           */
3631          inst = elk_MOV(p,
3632                         elk_null_reg(),
3633                         stride(elk_swizzle(idx, ELK_SWIZZLE_XXXX), 4, 4, 1));
3634          elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NONE);
3635          elk_inst_set_cond_modifier(devinfo, inst, ELK_CONDITIONAL_NZ);
3636          elk_inst_set_flag_reg_nr(devinfo, inst, 1);
3637 
3638          /* and use predicated SEL to pick the right channel. */
3639          inst = elk_SEL(p, dst,
3640                         stride(suboffset(src, 4), 4, 4, 1),
3641                         stride(src, 4, 4, 1));
3642          elk_inst_set_pred_control(devinfo, inst, ELK_PREDICATE_NORMAL);
3643          elk_inst_set_flag_reg_nr(devinfo, inst, 1);
3644       }
3645    }
3646 
3647    elk_pop_insn_state(p);
3648 }
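
/* Example of the align1 indirect path above: broadcasting dword channel
 * idx of a packed source (encoded hstride 1) computes
 * a0.0 = idx << (log2(4) + 1 - 1), i.e. idx * 4 bytes, and then reads
 * the source register file indirectly at that offset, so idx = 5
 * fetches the sixth dword of the source region.
 */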
3649 
3650 
3651 /**
3652  * Emit the SEND message for a barrier
3653  */
3654 void
3655 elk_barrier(struct elk_codegen *p, struct elk_reg src)
3656 {
3657    const struct intel_device_info *devinfo = p->devinfo;
3658    struct elk_inst *inst;
3659 
3660    assert(devinfo->ver >= 7);
3661 
3662    elk_push_insn_state(p);
3663    elk_set_default_access_mode(p, ELK_ALIGN_1);
3664    inst = next_insn(p, ELK_OPCODE_SEND);
3665    elk_set_dest(p, inst, retype(elk_null_reg(), ELK_REGISTER_TYPE_UW));
3666    elk_set_src0(p, inst, src);
3667    elk_set_src1(p, inst, elk_null_reg());
3668    elk_set_desc(p, inst, elk_message_desc(devinfo,
3669                                           1 * reg_unit(devinfo), 0, false));
3670 
3671    elk_inst_set_sfid(devinfo, inst, ELK_SFID_MESSAGE_GATEWAY);
3672    elk_inst_set_gateway_subfuncid(devinfo, inst,
3673                                   ELK_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3674 
3675    elk_inst_set_mask_control(devinfo, inst, ELK_MASK_DISABLE);
3676    elk_pop_insn_state(p);
3677 }
3678 
3679 
3680 /**
3681  * Emit the wait instruction for a barrier
3682  */
3683 void
3684 elk_WAIT(struct elk_codegen *p)
3685 {
3686    const struct intel_device_info *devinfo = p->devinfo;
3687    struct elk_inst *insn;
3688 
3689    struct elk_reg src = elk_notification_reg();
3690 
3691    insn = next_insn(p, ELK_OPCODE_WAIT);
3692    elk_set_dest(p, insn, src);
3693    elk_set_src0(p, insn, src);
3694    elk_set_src1(p, insn, elk_null_reg());
3695 
3696    elk_inst_set_exec_size(devinfo, insn, ELK_EXECUTE_1);
3697    elk_inst_set_mask_control(devinfo, insn, ELK_MASK_DISABLE);
3698 }
3699 
3700 void
3701 elk_float_controls_mode(struct elk_codegen *p,
3702                         unsigned mode, unsigned mask)
3703 {
3704    assert(p->current->mask_control == ELK_MASK_DISABLE);
3705 
3706    /* From the Skylake PRM, Volume 7, page 760:
3707     *  "Implementation Restriction on Register Access: When the control
3708     *   register is used as an explicit source and/or destination, hardware
3709     *   does not ensure execution pipeline coherency. Software must set the
3710     *   thread control field to ‘switch’ for an instruction that uses
3711     *   control register as an explicit operand."
3712     *
3713     * On Gfx12+ this is implemented in terms of SWSB annotations instead.
3714     */
3715    elk_set_default_swsb(p, tgl_swsb_regdist(1));
3716 
3717    elk_inst *inst = elk_AND(p, elk_cr0_reg(0), elk_cr0_reg(0),
3718                             elk_imm_ud(~mask));
3719    elk_inst_set_exec_size(p->devinfo, inst, ELK_EXECUTE_1);
3720    if (p->devinfo->ver < 12)
3721       elk_inst_set_thread_control(p->devinfo, inst, ELK_THREAD_SWITCH);
3722 
3723    if (mode) {
3724       elk_inst *inst_or = elk_OR(p, elk_cr0_reg(0), elk_cr0_reg(0),
3725                                  elk_imm_ud(mode));
3726       elk_inst_set_exec_size(p->devinfo, inst_or, ELK_EXECUTE_1);
3727       if (p->devinfo->ver < 12)
3728          elk_inst_set_thread_control(p->devinfo, inst_or, ELK_THREAD_SWITCH);
3729    }
3730 
3731    if (p->devinfo->ver >= 12)
3732       elk_SYNC(p, TGL_SYNC_NOP);
3733 }
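
/* Usage sketch (the ELK_CR0_* names are hypothetical stand-ins for the
 * build's cr0 field definitions): switching the rounding mode while
 * leaving every other control bit alone:
 *
 *    elk_float_controls_mode(p, ELK_CR0_RND_MODE_RTZ, ELK_CR0_RND_MODE_MASK);
 *
 * mask must cover every bit that mode may set, since the sequence first
 * ANDs the mask away and then ORs the new mode in.
 */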
3734 
3735 void
3736 elk_update_reloc_imm(const struct elk_isa_info *isa,
3737                      elk_inst *inst,
3738                      uint32_t value)
3739 {
3740    const struct intel_device_info *devinfo = isa->devinfo;
3741 
3742    /* Sanity check that the instruction is a MOV of an immediate */
3743    assert(elk_inst_opcode(isa, inst) == ELK_OPCODE_MOV);
3744    assert(elk_inst_src0_reg_file(devinfo, inst) == ELK_IMMEDIATE_VALUE);
3745 
3746    /* If it was compacted, we can't safely rewrite */
3747    assert(elk_inst_cmpt_control(devinfo, inst) == 0);
3748 
3749    elk_inst_set_imm_ud(devinfo, inst, value);
3750 }
3751 
3752 /* A default value for constants that will be patched at run-time.
3753  * We pick an arbitrary value that prevents instruction compaction.
3754  */
3755 #define DEFAULT_PATCH_IMM 0x4a7cc037
3756 
3757 void
3758 elk_MOV_reloc_imm(struct elk_codegen *p,
3759                   struct elk_reg dst,
3760                   enum elk_reg_type src_type,
3761                   uint32_t id)
3762 {
3763    assert(type_sz(src_type) == 4);
3764    assert(type_sz(dst.type) == 4);
3765 
3766    elk_add_reloc(p, id, ELK_SHADER_RELOC_TYPE_MOV_IMM,
3767                  p->next_insn_offset, 0);
3768 
3769    elk_MOV(p, dst, retype(elk_imm_ud(DEFAULT_PATCH_IMM), src_type));
3770 }
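
/* A sketch of the relocation flow these two helpers implement: the
 * generator emits a placeholder MOV,
 *
 *    elk_MOV_reloc_imm(p, dst, ELK_REGISTER_TYPE_UD, MY_CONST_ID);
 *
 * where MY_CONST_ID is a hypothetical caller-defined identifier, and
 * whoever uploads the binary later resolves the id and patches the
 * instruction in place with
 *
 *    elk_update_reloc_imm(isa, inst, actual_value);
 *
 * DEFAULT_PATCH_IMM is chosen so the MOV can never be compacted, which
 * keeps the full 32-bit immediate field available for patching.
 */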
3771