• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35 
36 #include "util/ralloc.h"
37 
38 /**
39  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40  * registers, implicitly moving the operand to a message register.
41  *
42  * On Sandybridge, this is no longer the case.  This function performs the
43  * explicit move; it should be called before emitting a SEND instruction.
44  */
45 void
gfx6_resolve_implied_move(struct brw_codegen * p,struct brw_reg * src,unsigned msg_reg_nr)46 gfx6_resolve_implied_move(struct brw_codegen *p,
47 			  struct brw_reg *src,
48 			  unsigned msg_reg_nr)
49 {
50    const struct intel_device_info *devinfo = p->devinfo;
51    if (devinfo->ver < 6)
52       return;
53 
54    if (src->file == BRW_MESSAGE_REGISTER_FILE)
55       return;
56 
57    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58       assert(devinfo->ver < 12);
59       brw_push_insn_state(p);
60       brw_set_default_exec_size(p, BRW_EXECUTE_8);
61       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 	      retype(*src, BRW_REGISTER_TYPE_UD));
65       brw_pop_insn_state(p);
66    }
67    *src = brw_message_reg(msg_reg_nr);
68 }
69 
70 static void
gfx7_convert_mrf_to_grf(struct brw_codegen * p,struct brw_reg * reg)71 gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74     * "The send with EOT should use register space R112-R127 for <src>. This is
75     *  to enable loading of a new thread into the same slot while the message
76     *  with EOT for current thread is pending dispatch."
77     *
78     * Since we're pretending to have 16 MRFs anyway, we may as well use the
79     * registers required for messages with EOT.
80     */
81    const struct intel_device_info *devinfo = p->devinfo;
82    if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83       reg->file = BRW_GENERAL_REGISTER_FILE;
84       reg->nr += GFX7_MRF_HACK_START;
85    }
86 }
87 
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Three encodings are handled:
 *  - Gfx12+ SEND/SENDC: direct GRF/ARF register number only, no region;
 *  - pre-Gfx12 split sends (SENDS/SENDSC): subregister in 16-byte units;
 *  - the general ALU form (direct or indirect, Align1 or Align16).
 *
 * When p->automatic_exec_sizes is set, the instruction's exec size may be
 * shrunk to match a narrow destination (see comment near the bottom).
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against its file's range. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   /* On Gfx7+ MRFs are fake; remap to the reserved GRF range. */
   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND(C): only a direct, unmodified GRF/ARF destination with
       * subnr 0 can be encoded; region must be trivial unless SIMD1.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split send: the destination subregister is encoded in
       * 16-byte units (hence the /16 below).
       */
      assert(devinfo->ver < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* General ALU destination encoding. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* Stride 0 is not encodable for a destination; use 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            /* Align16: subregister in 16-byte units, plus writemask. */
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         /* Register-indirect destination. */
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
207 
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * Like brw_set_dest(), this handles the Gfx12+ SEND(C) form, the
 * pre-Gfx12 split-send (SENDS/SENDSC) form, and the general ALU form
 * (immediate, direct, or indirect; Align1 or Align16).
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against its file's range. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   /* On Gfx7+ MRFs are fake; remap to the reserved GRF range. */
   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND(C): only a direct register with subnr 0 and a trivial
       * (or scalar) region is encodable.
       */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split send: subregister encoded in 16-byte units. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      /* General ALU src0 encoding. */
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* Pick the immediate encoding width from the type. */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(p->isa, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* Pre-Gfx12, a sub-64-bit immediate also requires src1's file and
          * type fields to mirror src0's (the immediate occupies src1's
          * encoding space).
          */
         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
                brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               /* Align16: subregister in 16-byte units. */
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            /* The address-immediate field has different widths in align1
             * and align16.
             */
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A SIMD1 read of a single element gets the canonical scalar
             * region <0;1,0>.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            /* Align16: encode the per-channel swizzle. */
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
342 
343 
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * Handles the split-send / Gfx12+ SEND(C) src1 form and the general ALU
 * form (immediate, or direct register; indirect src1 is not supported).
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC))) {
      /* Send-payload src1: only a direct, unmodified GRF/ARF register with
       * subnr 0 and a trivial (or scalar) region is encodable.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      /* On Gfx7+ MRFs are fake; remap to the reserved GRF range. */
      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16: subregister in 16-byte units. */
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A SIMD1 read of a single element gets the canonical scalar
             * region <0;1,0>.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            /* Align16: encode the per-channel swizzle. */
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
449 
450 /**
451  * Specify the descriptor and extended descriptor immediate for a SEND(C)
452  * message instruction.
453  */
454 void
brw_set_desc_ex(struct brw_codegen * p,brw_inst * inst,unsigned desc,unsigned ex_desc)455 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
456                 unsigned desc, unsigned ex_desc)
457 {
458    const struct intel_device_info *devinfo = p->devinfo;
459    assert(brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SEND ||
460           brw_inst_opcode(p->isa, inst) == BRW_OPCODE_SENDC);
461    if (devinfo->ver < 12)
462       brw_inst_set_src1_file_type(devinfo, inst,
463                                   BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
464    brw_inst_set_send_desc(devinfo, inst, desc);
465    if (devinfo->ver >= 9)
466       brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
467 }
468 
/**
 * Encode a (pre-Gfx6-style) MATH shared-function message descriptor.
 *
 * Message and response lengths are inferred from the function: POW and the
 * integer-divide variants read two operand registers; SINCOS and
 * INT_DIV_QUOTIENT_AND_REMAINDER write back two result registers.
 */
static void brw_set_math_message( struct brw_codegen *p,
                                  brw_inst *inst,
                                  unsigned function,
                                  unsigned integer_type,
                                  bool low_precision,
                                  unsigned dataType )
{
   const struct intel_device_info *info = p->devinfo;

   const bool two_operands =
      function == BRW_MATH_FUNCTION_POW ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
      function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
   const unsigned msg_length = two_operands ? 2 : 1;

   const bool two_results =
      function == BRW_MATH_FUNCTION_SINCOS ||
      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
   const unsigned response_length = two_results ? 2 : 1;

   brw_set_desc(p, inst, brw_message_desc(
                   info, msg_length, response_length, false));

   brw_inst_set_sfid(info, inst, BRW_SFID_MATH);
   brw_inst_set_math_msg_function(info, inst, function);
   brw_inst_set_math_msg_signed_int(info, inst, integer_type);
   brw_inst_set_math_msg_precision(info, inst, low_precision);
   /* Move the EU saturate bit into the message (read it before clearing). */
   brw_inst_set_math_msg_saturate(info, inst, brw_inst_saturate(info, inst));
   brw_inst_set_math_msg_data_type(info, inst, dataType);
   brw_inst_set_saturate(info, inst, 0);
}
515 
516 
/**
 * Encode an FF_SYNC URB message descriptor.
 *
 * The message payload is always a single register (the header), so the
 * message length is hard-coded to 1 with header_present = true.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
                                    brw_inst *insn,
                                    bool allocate,
                                    unsigned response_length,
                                    bool end_of_thread)
{
   const struct intel_device_info *devinfo = p->devinfo;

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
538 
/**
 * Encode a URB write message descriptor.
 *
 * Feature availability varies by generation: TRANSPOSE swizzling and
 * ALLOCATE only exist before Gfx7, PER_SLOT_OFFSET only from Gfx7 on,
 * and the "complete" bit disappears on Gfx8.
 */
static void brw_set_urb_message( struct brw_codegen *p,
                                 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
                                 unsigned msg_length,
                                 unsigned response_length,
                                 unsigned offset,
                                 unsigned swizzle_control )
{
   const struct intel_device_info *info = p->devinfo;

   assert(info->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(info->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(info->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   info, msg_length, response_length, true));

   brw_inst_set_sfid(info, insn, BRW_SFID_URB);
   brw_inst_set_eot(info, insn, (flags & BRW_URB_WRITE_EOT) != 0);

   /* OWORD writes carry exactly one OWORD of data after the header. */
   if (flags & BRW_URB_WRITE_OWORD)
      assert(msg_length == 2);
   brw_inst_set_urb_opcode(info, insn,
                           (flags & BRW_URB_WRITE_OWORD) ?
                           BRW_URB_OPCODE_WRITE_OWORD :
                           BRW_URB_OPCODE_WRITE_HWORD);

   brw_inst_set_urb_global_offset(info, insn, offset);
   brw_inst_set_urb_swizzle_control(info, insn, swizzle_control);

   if (info->ver < 8)
      brw_inst_set_urb_complete(info, insn, (flags & BRW_URB_WRITE_COMPLETE) != 0);

   if (info->ver < 7) {
      brw_inst_set_urb_allocate(info, insn, (flags & BRW_URB_WRITE_ALLOCATE) != 0);
      brw_inst_set_urb_used(info, insn, (flags & BRW_URB_WRITE_UNUSED) == 0);
   } else {
      brw_inst_set_urb_per_slot_offset(info, insn,
         (flags & BRW_URB_WRITE_PER_SLOT_OFFSET) != 0);
   }
}
581 
/**
 * Encode a Gfx7+ data-port scratch-block read/write message descriptor.
 *
 * \param write                 true for a scratch write, false for a read
 * \param dword                 sets the descriptor's "scratch type" bit
 *                              (presumably DWord-scattered vs. OWord-block
 *                              access — confirm against the PRM)
 * \param invalidate_after_read invalidate the line after reading
 * \param num_regs              1, 2 or 4 registers; 8 also allowed on Gfx8+.
 *                              Encoded as log2(num_regs) on Gfx8+,
 *                              num_regs - 1 before.
 * \param addr_offset           offset within scratch space, encoded directly
 *                              into the descriptor
 * \param mlen / rlen           message / response lengths in registers
 * \param header_present        whether a header register precedes the data
 */
static void
gfx7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->ver >= 8 && num_regs == 8));
   const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
611 
/**
 * Copy the generator's default instruction state (*state) onto \p insn:
 * exec size/group/compression, access mode, mask control, SWSB (Gfx12+),
 * saturate, predication, flag-register selection, and accumulator
 * write-control (Gfx6+).
 */
static void
brw_inst_set_state(const struct brw_isa_info *isa,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   const struct intel_device_info *devinfo = isa->devinfo;

   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Software-scoreboard dependency info only exists on Gfx12+. */
   if (devinfo->ver >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* state->flag_subreg packs (flag reg, subreg) as reg * 2 + subreg;
    * 3-src Align16 instructions encode those in dedicated fields.
    */
   if (is_3src(isa, brw_inst_opcode(isa, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
644 
645 static brw_inst *
brw_append_insns(struct brw_codegen * p,unsigned nr_insn,unsigned align)646 brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
647 {
648    assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
649    assert(util_is_power_of_two_or_zero(align));
650    const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
651    const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
652    const unsigned new_nr_insn = start_insn + nr_insn;
653 
654    if (p->store_size < new_nr_insn) {
655       p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
656       p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
657    }
658 
659    /* Memset any padding due to alignment to 0.  We don't want to be hashing
660     * or caching a bunch of random bits we got from a memory allocation.
661     */
662    if (p->nr_insn < start_insn) {
663       memset(&p->store[p->nr_insn], 0,
664              (start_insn - p->nr_insn) * sizeof(brw_inst));
665    }
666 
667    assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
668    p->nr_insn = new_nr_insn;
669    p->next_insn_offset = new_nr_insn * sizeof(brw_inst);
670 
671    return &p->store[start_insn];
672 }
673 
/**
 * Advance the instruction store to the next \p align byte boundary.
 * Any padding this introduces is zero-filled by brw_append_insns().
 */
void
brw_realign(struct brw_codegen *p, unsigned align)
{
   brw_append_insns(p, 0, align);
}
679 
680 int
brw_append_data(struct brw_codegen * p,void * data,unsigned size,unsigned align)681 brw_append_data(struct brw_codegen *p, void *data,
682                 unsigned size, unsigned align)
683 {
684    unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
685    void *dst = brw_append_insns(p, nr_insn, align);
686    memcpy(dst, data, size);
687 
688    /* If it's not a whole number of instructions, memset the end */
689    if (size < nr_insn * sizeof(brw_inst))
690       memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);
691 
692    return dst - (void *)p->store;
693 }
694 
695 #define next_insn brw_next_insn
696 brw_inst *
brw_next_insn(struct brw_codegen * p,unsigned opcode)697 brw_next_insn(struct brw_codegen *p, unsigned opcode)
698 {
699    brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));
700 
701    memset(insn, 0, sizeof(*insn));
702    brw_inst_set_opcode(p->isa, insn, opcode);
703 
704    /* Apply the default instruction state */
705    brw_inst_set_state(p->isa, insn, p->current);
706 
707    return insn;
708 }
709 
710 void
brw_add_reloc(struct brw_codegen * p,uint32_t id,enum brw_shader_reloc_type type,uint32_t offset,uint32_t delta)711 brw_add_reloc(struct brw_codegen *p, uint32_t id,
712               enum brw_shader_reloc_type type,
713               uint32_t offset, uint32_t delta)
714 {
715    if (p->num_relocs + 1 > p->reloc_array_size) {
716       p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
717       p->relocs = reralloc(p->mem_ctx, p->relocs,
718                            struct brw_shader_reloc, p->reloc_array_size);
719    }
720 
721    p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
722       .id = id,
723       .type = type,
724       .offset = offset,
725       .delta = delta,
726    };
727 }
728 
729 static brw_inst *
brw_alu1(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src)730 brw_alu1(struct brw_codegen *p, unsigned opcode,
731          struct brw_reg dest, struct brw_reg src)
732 {
733    brw_inst *insn = next_insn(p, opcode);
734    brw_set_dest(p, insn, dest);
735    brw_set_src0(p, insn, src);
736    return insn;
737 }
738 
739 static brw_inst *
brw_alu2(struct brw_codegen * p,unsigned opcode,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)740 brw_alu2(struct brw_codegen *p, unsigned opcode,
741          struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
742 {
743    /* 64-bit immediates are only supported on 1-src instructions */
744    assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
745    assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
746 
747    brw_inst *insn = next_insn(p, opcode);
748    brw_set_dest(p, insn, dest);
749    brw_set_src0(p, insn, src0);
750    brw_set_src1(p, insn, src1);
751    return insn;
752 }
753 
754 static int
get_3src_subreg_nr(struct brw_reg reg)755 get_3src_subreg_nr(struct brw_reg reg)
756 {
757    /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
758     * use 32-bit units (components 0..7).  Since they only support F/D/UD
759     * types, this doesn't lose any flexibility, but uses fewer bits.
760     */
761    return reg.subnr / 4;
762 }
763 
764 static enum gfx10_align1_3src_vertical_stride
to_3src_align1_vstride(const struct intel_device_info * devinfo,enum brw_vertical_stride vstride)765 to_3src_align1_vstride(const struct intel_device_info *devinfo,
766                        enum brw_vertical_stride vstride)
767 {
768    switch (vstride) {
769    case BRW_VERTICAL_STRIDE_0:
770       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
771    case BRW_VERTICAL_STRIDE_1:
772       assert(devinfo->ver >= 12);
773       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
774    case BRW_VERTICAL_STRIDE_2:
775       assert(devinfo->ver < 12);
776       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
777    case BRW_VERTICAL_STRIDE_4:
778       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
779    case BRW_VERTICAL_STRIDE_8:
780    case BRW_VERTICAL_STRIDE_16:
781       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
782    default:
783       unreachable("invalid vstride");
784    }
785 }
786 
787 
788 static enum gfx10_align1_3src_src_horizontal_stride
to_3src_align1_hstride(enum brw_horizontal_stride hstride)789 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
790 {
791    switch (hstride) {
792    case BRW_HORIZONTAL_STRIDE_0:
793       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
794    case BRW_HORIZONTAL_STRIDE_1:
795       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
796    case BRW_HORIZONTAL_STRIDE_2:
797       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
798    case BRW_HORIZONTAL_STRIDE_4:
799       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
800    default:
801       unreachable("invalid hstride");
802    }
803 }
804 
/**
 * Emit a three-source ALU instruction (MAD, LRP, BFE, ...), encoding the
 * operands for whichever access mode (Align1 or Align16) is currently the
 * default.  The two modes use entirely different 3-src encodings, hence
 * the two large branches below.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   /* Gfx10+ cannot have both outer sources be immediates. */
   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   /* Only src0/src2 may be immediates; all operands must be direct. */
   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* --- Align1 (Gfx10+ style) 3-src encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->ver >= 12) {
         /* Gfx12+ encodes the register file directly. */
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         /* Pre-Gfx12: ARF destination can only be the accumulator. */
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      /* Destination subregister is encoded in 8-byte units here. */
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* Execution type (float vs. int pipeline) follows the dest type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         /* The NF (native float) type lives in the accumulator. */
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      /* ARF src1 can only be the accumulator. */
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      if (devinfo->ver >= 12) {
         /* Gfx12+: immediates are flagged via a dedicated "is_imm" bit
          * instead of a register-file encoding.
          */
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      /* --- Align16 (legacy) 3-src encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F  ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D  ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      /* All three sources must be GRFs; rep_ctrl marks a scalar region. */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}
1016 
1017 
1018 /***********************************************************************
1019  * Convenience routines.
1020  */
/* Generate a public emitter brw_<OP>() for a one-source instruction. */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Generate a public emitter brw_<OP>() for a two-source instruction. */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Generate a public emitter brw_<OP>() for a three-source instruction.
 * In Align16 mode a <0,...> vstride means "scalar": force an XXXX swizzle
 * so the replicated component is explicit in the encoding.
 */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{                                                       \
   if (p->current->access_mode == BRW_ALIGN_16) {       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)        \
         src0.swizzle = BRW_SWIZZLE_XXXX;               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)        \
         src1.swizzle = BRW_SWIZZLE_XXXX;               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)        \
         src2.swizzle = BRW_SWIZZLE_XXXX;               \
   }                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Like ALU3, but additionally asserts that all operands are F (or all DF),
 * for float-only three-source instructions such as LRP.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1085 
/* Instantiate the simple opcode emitters.  Opcodes needing extra checks
 * or workarounds (MOV, ADD, MUL, ...) are hand-written below instead.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)
1123 
1124 brw_inst *
1125 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1126 {
1127    const struct intel_device_info *devinfo = p->devinfo;
1128 
1129    /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1130     * To avoid the problems that causes, we use an <X,2,0> source region to
1131     * read each element twice.
1132     */
1133    if (devinfo->verx10 == 70 &&
1134        brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1135        dest.type == BRW_REGISTER_TYPE_DF &&
1136        (src0.type == BRW_REGISTER_TYPE_F ||
1137         src0.type == BRW_REGISTER_TYPE_D ||
1138         src0.type == BRW_REGISTER_TYPE_UD) &&
1139        !has_scalar_region(src0)) {
1140       assert(src0.vstride == src0.width + src0.hstride);
1141       src0.vstride = src0.hstride;
1142       src0.width = BRW_WIDTH_2;
1143       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1144    }
1145 
1146    return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1147 }
1148 
1149 brw_inst *
brw_ADD(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1150 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1151         struct brw_reg src0, struct brw_reg src1)
1152 {
1153    /* 6.2.2: add */
1154    if (src0.type == BRW_REGISTER_TYPE_F ||
1155        (src0.file == BRW_IMMEDIATE_VALUE &&
1156 	src0.type == BRW_REGISTER_TYPE_VF)) {
1157       assert(src1.type != BRW_REGISTER_TYPE_UD);
1158       assert(src1.type != BRW_REGISTER_TYPE_D);
1159    }
1160 
1161    if (src1.type == BRW_REGISTER_TYPE_F ||
1162        (src1.file == BRW_IMMEDIATE_VALUE &&
1163 	src1.type == BRW_REGISTER_TYPE_VF)) {
1164       assert(src0.type != BRW_REGISTER_TYPE_UD);
1165       assert(src0.type != BRW_REGISTER_TYPE_D);
1166    }
1167 
1168    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1169 }
1170 
1171 brw_inst *
brw_AVG(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1172 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1173         struct brw_reg src0, struct brw_reg src1)
1174 {
1175    assert(dest.type == src0.type);
1176    assert(src0.type == src1.type);
1177    switch (src0.type) {
1178    case BRW_REGISTER_TYPE_B:
1179    case BRW_REGISTER_TYPE_UB:
1180    case BRW_REGISTER_TYPE_W:
1181    case BRW_REGISTER_TYPE_UW:
1182    case BRW_REGISTER_TYPE_D:
1183    case BRW_REGISTER_TYPE_UD:
1184       break;
1185    default:
1186       unreachable("Bad type for brw_AVG");
1187    }
1188 
1189    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1190 }
1191 
1192 brw_inst *
brw_MUL(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1193 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1194         struct brw_reg src0, struct brw_reg src1)
1195 {
1196    /* 6.32.38: mul */
1197    if (src0.type == BRW_REGISTER_TYPE_D ||
1198        src0.type == BRW_REGISTER_TYPE_UD ||
1199        src1.type == BRW_REGISTER_TYPE_D ||
1200        src1.type == BRW_REGISTER_TYPE_UD) {
1201       assert(dest.type != BRW_REGISTER_TYPE_F);
1202    }
1203 
1204    if (src0.type == BRW_REGISTER_TYPE_F ||
1205        (src0.file == BRW_IMMEDIATE_VALUE &&
1206 	src0.type == BRW_REGISTER_TYPE_VF)) {
1207       assert(src1.type != BRW_REGISTER_TYPE_UD);
1208       assert(src1.type != BRW_REGISTER_TYPE_D);
1209    }
1210 
1211    if (src1.type == BRW_REGISTER_TYPE_F ||
1212        (src1.file == BRW_IMMEDIATE_VALUE &&
1213 	src1.type == BRW_REGISTER_TYPE_VF)) {
1214       assert(src0.type != BRW_REGISTER_TYPE_UD);
1215       assert(src0.type != BRW_REGISTER_TYPE_D);
1216    }
1217 
1218    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1219 	  src0.nr != BRW_ARF_ACCUMULATOR);
1220    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1221 	  src1.nr != BRW_ARF_ACCUMULATOR);
1222 
1223    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1224 }
1225 
1226 brw_inst *
brw_LINE(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1227 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1228          struct brw_reg src0, struct brw_reg src1)
1229 {
1230    src0.vstride = BRW_VERTICAL_STRIDE_0;
1231    src0.width = BRW_WIDTH_1;
1232    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1233    return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1234 }
1235 
1236 brw_inst *
brw_PLN(struct brw_codegen * p,struct brw_reg dest,struct brw_reg src0,struct brw_reg src1)1237 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1238         struct brw_reg src0, struct brw_reg src1)
1239 {
1240    src0.vstride = BRW_VERTICAL_STRIDE_0;
1241    src0.width = BRW_WIDTH_1;
1242    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1243    src1.vstride = BRW_VERTICAL_STRIDE_8;
1244    src1.width = BRW_WIDTH_8;
1245    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1246    return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1247 }
1248 
/**
 * Emit a float-to-half conversion: a real F32TO16 on Gfx7, a converting
 * MOV on Gfx8+.  When the destination is UD, the high 16 bits of each
 * dword are explicitly zero-filled with a second MOV (except on Gfx7
 * Align16, which zeroes them as an undocumented hardware behavior).
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gfx8 implementation in terms of a
    * converting MOV.  Gfx7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->ver >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      if (devinfo->ver <= 7) {
         assert(dst.type == BRW_REGISTER_TYPE_W ||
                dst.type == BRW_REGISTER_TYPE_UW);
      } else {
         assert(dst.type == BRW_REGISTER_TYPE_HF);
      }
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Rewrite the UD destination as a strided W region so the
       * conversion lands in the low word of each dword.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->ver >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->ver == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Pair the two MOVs with no_dd_clear/no_dd_check (pre-Gfx12) since
       * they write disjoint halves of the same dwords.
       */
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1300 
1301 brw_inst *
brw_F16TO32(struct brw_codegen * p,struct brw_reg dst,struct brw_reg src)1302 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1303 {
1304    const struct intel_device_info *devinfo = p->devinfo;
1305    bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1306 
1307    if (align16) {
1308       assert(src.type == BRW_REGISTER_TYPE_UD);
1309    } else {
1310       /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1311        *
1312        *   Because this instruction does not have a 16-bit floating-point
1313        *   type, the source data type must be Word (W). The destination type
1314        *   must be F (Float).
1315        */
1316       if (src.type == BRW_REGISTER_TYPE_UD)
1317          src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1318 
1319       assert(src.type == BRW_REGISTER_TYPE_W ||
1320              src.type == BRW_REGISTER_TYPE_UW ||
1321              src.type == BRW_REGISTER_TYPE_HF);
1322    }
1323 
1324    if (devinfo->ver >= 8) {
1325       return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1326    } else {
1327       assert(devinfo->ver == 7);
1328       return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1329    }
1330 }
1331 
1332 
/* Emit a NOP.  next_insn() stamps the current default state onto the
 * instruction, which a NOP must not carry, so the instruction is
 * re-zeroed here and the opcode set again.
 */
void brw_NOP(struct brw_codegen *p)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(p->isa, insn, BRW_OPCODE_NOP);
}
1339 
brw_SYNC(struct brw_codegen * p,enum tgl_sync_function func)1340 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1341 {
1342    brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1343    brw_inst_set_cond_modifier(p->devinfo, insn, func);
1344 }
1345 
1346 /***********************************************************************
1347  * Comparisons, if/else/endif
1348  */
1349 
1350 brw_inst *
brw_JMPI(struct brw_codegen * p,struct brw_reg index,unsigned predicate_control)1351 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1352          unsigned predicate_control)
1353 {
1354    const struct intel_device_info *devinfo = p->devinfo;
1355    struct brw_reg ip = brw_ip_reg();
1356    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1357 
1358    brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1359    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1360    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1361    brw_inst_set_pred_control(devinfo, inst, predicate_control);
1362 
1363    return inst;
1364 }
1365 
1366 static void
push_if_stack(struct brw_codegen * p,brw_inst * inst)1367 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1368 {
1369    p->if_stack[p->if_stack_depth] = inst - p->store;
1370 
1371    p->if_stack_depth++;
1372    if (p->if_stack_array_size <= p->if_stack_depth) {
1373       p->if_stack_array_size *= 2;
1374       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1375 			     p->if_stack_array_size);
1376    }
1377 }
1378 
1379 static brw_inst *
pop_if_stack(struct brw_codegen * p)1380 pop_if_stack(struct brw_codegen *p)
1381 {
1382    p->if_stack_depth--;
1383    return &p->store[p->if_stack[p->if_stack_depth]];
1384 }
1385 
1386 static void
push_loop_stack(struct brw_codegen * p,brw_inst * inst)1387 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1388 {
1389    if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1390       p->loop_stack_array_size *= 2;
1391       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1392 			       p->loop_stack_array_size);
1393       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1394 				     p->loop_stack_array_size);
1395    }
1396 
1397    p->loop_stack[p->loop_stack_depth] = inst - p->store;
1398    p->loop_stack_depth++;
1399    p->if_depth_in_loop[p->loop_stack_depth] = 0;
1400 }
1401 
1402 static brw_inst *
get_inner_do_insn(struct brw_codegen * p)1403 get_inner_do_insn(struct brw_codegen *p)
1404 {
1405    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1406 }
1407 
1408 /* EU takes the value from the flag register and pushes it onto some
1409  * sort of a stack (presumably merging with any flag value already on
1410  * the stack).  Within an if block, the flags at the top of the stack
1411  * control execution on each channel of the unit, eg. on each of the
1412  * 16 pixel values in our wm programs.
1413  *
1414  * When the matching 'else' instruction is reached (presumably by
1415  * countdown of the instruction count patched in by our ELSE/ENDIF
1416  * functions), the relevant flags are inverted.
1417  *
1418  * When the matching 'endif' instruction is reached, the flags are
1419  * popped off.  If the stack is now empty, normal execution resumes.
1420  */
/**
 * Emit an IF instruction with per-generation operand encoding and push
 * it onto the if-stack so the matching ELSE/ENDIF can patch its jump
 * targets later (they start out as 0 here).
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      /* Pre-Gfx6: IF operates on IP with an immediate jump offset. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      /* Gfx6: jump count lives in the destination field. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      /* Gfx7: JIP/UIP fields, src1 is an immediate placeholder. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gfx8+: JIP/UIP fields; Gfx12+ drops the src0 immediate. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1465 
1466 /* This function is only used for gfx6-style IF instructions with an
1467  * embedded comparison (conditional modifier).  It is not used on gfx7.
1468  */
brw_inst *
gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* On Gfx6 the jump count is encoded in the destination immediate; it
    * is zeroed here and patched when the matching ENDIF is emitted.
    */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The embedded comparison takes the place of predication, so the IF
    * itself must be neither compressed nor predicated.
    */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1491 
1492 /**
1493  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1494  */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(p->isa, else_inst, BRW_OPCODE_ADD);

      /* The *16 scales an instruction count to a byte offset for the
       * IP-relative ADD.
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1532 
1533 /**
1534  * Patch IF and ELSE instructions with appropriate jump targets.
1535  */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->ver < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(p->isa, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(p->isa, else_inst) == BRW_OPCODE_ELSE);

   /* Scale applied to instruction-count deltas to produce encoded jump
    * offsets for this generation.
    */
   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(p->isa, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->ver < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(p->isa, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
	 /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gfx7+: with no ELSE, both branch targets are the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->ver < 6) {
         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->ver == 6) {
         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->ver < 6) {
	 /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gfx4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         /* pop_count of 1: pop the IF's mask stack entry. */
         brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->ver == 6) {
	 /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
         brw_inst_set_gfx6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->ver >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1623 
void
brw_ELSE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->ver < 6) {
      /* Gfx4-5: IP-relative; the jump target is patched in later by
       * patch_IF_ELSE().
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      /* Gfx6: the jump count is encoded in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      /* Gfx7: JIP/UIP are zeroed here and patched at ENDIF time. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      /* Gfx12+ dropped src0 from branch instructions. */
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gfx6 flow control needs an explicit thread switch outside of
    * single-program-flow mode.
    */
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Pushed on top of the matching IF; both are popped by brw_ENDIF(). */
   push_if_stack(p, insn);
}
1662 
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(p->isa, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encodings for the ENDIF itself. */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-Gfx6 flow control requires an explicit thread switch. */
   if (devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
      brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Fill in the IF's (and ELSE's, if present) branch targets now that the
    * ENDIF's location is known.
    */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1743 
/* Emit a BREAK.  Jump targets are filled in later: on Gfx4-5 by
 * brw_patch_break_cont() when the matching WHILE is emitted.
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->ver >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      /* Gfx4-5: IP-relative; pop the mask stack entry of every IF block
       * we're jumping out of inside the current loop.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1770 
/* Emit a CONTINUE.  Jump targets are filled in later: on Gfx4-5 by
 * brw_patch_break_cont() when the matching WHILE is emitted.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->ver >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->ver < 6) {
      /* Pop the mask stack entry of every IF block we're jumping out of
       * inside the current loop.
       */
      brw_inst_set_gfx4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1794 
/* Emit a HALT.  The jump/exit targets are left zero here and updated by
 * the caller later (see the inline comments below).
 */
brw_inst *
brw_HALT(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->ver < 6) {
      /* From the Gfx4 PRM:
       *
       *    "IP register must be put (for example, by the assembler) at <dst>
       *    and <src0> locations.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
   } else if (devinfo->ver < 8) {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   } else if (devinfo->ver < 12) {
      /* Gfx8-11: only src0 carries the placeholder; Gfx12+ has no src0 on
       * branch instructions.
       */
      brw_set_src0(p, insn, brw_imm_d(0x0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1823 
1824 /* DO/WHILE loop:
1825  *
1826  * The DO/WHILE is just an unterminated loop -- break or continue are
1827  * used for control within the loop.  We have a few ways they can be
1828  * done.
1829  *
1830  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1831  * jip and no DO instruction.
1832  *
1833  * For non-uniform control flow pre-gfx6, there's a DO instruction to
1834  * push the mask, and a WHILE to jump back, and BREAK to get out and
1835  * pop the mask.
1836  *
1837  * For gfx6, there's no more mask stack, so no need for DO.  WHILE
1838  * just points back to the first instruction of the loop.
1839  */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (devinfo->ver >= 6 || p->single_program_flow) {
      /* No actual DO instruction is needed; just record where the loop
       * body starts so brw_WHILE()/get_inner_do_insn() can find it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1866 
1867 /**
1868  * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
1869  * instruction here.
1870  *
1871  * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1872  * nesting, since it can always just point to the end of the block/current loop.
1873  */
static void
brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *do_inst = get_inner_do_insn(p);
   brw_inst *inst;
   unsigned br = brw_jump_scale(devinfo);

   assert(devinfo->ver < 6);

   /* Scan backward from the WHILE to the innermost DO, patching every
    * not-yet-patched BREAK/CONTINUE in between.
    */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_BREAK &&
          brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         /* BREAK jumps just past the WHILE (+1); CONTINUE jumps to it. */
         brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
      } else if (brw_inst_opcode(p->isa, inst) == BRW_OPCODE_CONTINUE &&
                 brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
         brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
      }
   }
}
1898 
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->ver >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         /* Gfx12+ dropped src0 from branch instructions. */
         if (devinfo->ver < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         /* JIP is a backward jump to the top of the loop body. */
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->ver == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         /* Gfx6: the backward jump count lives in the destination
          * immediate.
          */
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
         /* SPF: a plain IP-relative ADD back to the loop top (byte offset,
          * 16 bytes per instruction).
          */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(p->isa, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         /* Jump back to just past the DO instruction. */
         brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gfx4_pop_count(devinfo, insn, 0);

         /* Now that the WHILE's position is known, patch the BREAK and
          * CONTINUE instructions inside the loop.
          */
	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1961 
1962 /* FORWARD JUMPS:
1963  */
brw_land_fwd_jump(struct brw_codegen * p,int jmp_insn_idx)1964 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1965 {
1966    const struct intel_device_info *devinfo = p->devinfo;
1967    brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1968    unsigned jmpi = 1;
1969 
1970    if (devinfo->ver >= 5)
1971       jmpi = 2;
1972 
1973    assert(brw_inst_opcode(p->isa, jmp_insn) == BRW_OPCODE_JMPI);
1974    assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1975 
1976    brw_inst_set_gfx4_jump_count(devinfo, jmp_insn,
1977                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1978 }
1979 
1980 /* To integrate with the above, it makes sense that the comparison
1981  * instruction should populate the flag register.  It might be simpler
1982  * just to use the flag reg for most WM tasks?
1983  */
brw_CMP(struct brw_codegen * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)1984 void brw_CMP(struct brw_codegen *p,
1985 	     struct brw_reg dest,
1986 	     unsigned conditional,
1987 	     struct brw_reg src0,
1988 	     struct brw_reg src1)
1989 {
1990    const struct intel_device_info *devinfo = p->devinfo;
1991    brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1992 
1993    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1994    brw_set_dest(p, insn, dest);
1995    brw_set_src0(p, insn, src0);
1996    brw_set_src1(p, insn, src1);
1997 
1998    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1999     * page says:
2000     *    "Any CMP instruction with a null destination must use a {switch}."
2001     *
2002     * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
2003     * mentioned on their work-arounds pages.
2004     */
2005    if (devinfo->ver == 7) {
2006       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2007           dest.nr == BRW_ARF_NULL) {
2008          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2009       }
2010    }
2011 }
2012 
brw_CMPN(struct brw_codegen * p,struct brw_reg dest,unsigned conditional,struct brw_reg src0,struct brw_reg src1)2013 void brw_CMPN(struct brw_codegen *p,
2014               struct brw_reg dest,
2015               unsigned conditional,
2016               struct brw_reg src0,
2017               struct brw_reg src1)
2018 {
2019    const struct intel_device_info *devinfo = p->devinfo;
2020    brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);
2021 
2022    brw_inst_set_cond_modifier(devinfo, insn, conditional);
2023    brw_set_dest(p, insn, dest);
2024    brw_set_src0(p, insn, src0);
2025    brw_set_src1(p, insn, src1);
2026 
2027    /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
2028     * says:
2029     *
2030     *    If the destination is the null register, the {Switch} instruction
2031     *    option must be used.
2032     *
2033     * Page 77 of the Haswell PRM Volume 2b contains the same text.
2034     */
2035    if (devinfo->ver == 7) {
2036       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2037           dest.nr == BRW_ARF_NULL) {
2038          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2039       }
2040    }
2041 }
2042 
2043 /***********************************************************************
2044  * Helpers for the various SEND message types:
2045  */
2046 
2047 /** Extended math function, float[8].
2048  */
gfx4_math(struct brw_codegen * p,struct brw_reg dest,unsigned function,unsigned msg_reg_nr,struct brw_reg src,unsigned precision)2049 void gfx4_math(struct brw_codegen *p,
2050 	       struct brw_reg dest,
2051 	       unsigned function,
2052 	       unsigned msg_reg_nr,
2053 	       struct brw_reg src,
2054 	       unsigned precision )
2055 {
2056    const struct intel_device_info *devinfo = p->devinfo;
2057    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2058    unsigned data_type;
2059    if (has_scalar_region(src)) {
2060       data_type = BRW_MATH_DATA_SCALAR;
2061    } else {
2062       data_type = BRW_MATH_DATA_VECTOR;
2063    }
2064 
2065    assert(devinfo->ver < 6);
2066 
2067    /* Example code doesn't set predicate_control for send
2068     * instructions.
2069     */
2070    brw_inst_set_pred_control(devinfo, insn, 0);
2071    brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2072 
2073    brw_set_dest(p, insn, dest);
2074    brw_set_src0(p, insn, src);
2075    brw_set_math_message(p,
2076                         insn,
2077                         function,
2078                         src.type == BRW_REGISTER_TYPE_D,
2079                         precision,
2080                         data_type);
2081 }
2082 
gfx6_math(struct brw_codegen * p,struct brw_reg dest,unsigned function,struct brw_reg src0,struct brw_reg src1)2083 void gfx6_math(struct brw_codegen *p,
2084 	       struct brw_reg dest,
2085 	       unsigned function,
2086 	       struct brw_reg src0,
2087 	       struct brw_reg src1)
2088 {
2089    const struct intel_device_info *devinfo = p->devinfo;
2090    brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2091 
2092    assert(devinfo->ver >= 6);
2093 
2094    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2095           (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2096 
2097    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2098    if (devinfo->ver == 6) {
2099       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2100       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2101    }
2102 
2103    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2104        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2105        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2106       assert(src0.type != BRW_REGISTER_TYPE_F);
2107       assert(src1.type != BRW_REGISTER_TYPE_F);
2108       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2109              (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2110       /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
2111        *     INT DIV function does not support source modifiers.
2112        */
2113       assert(!src0.negate);
2114       assert(!src0.abs);
2115       assert(!src1.negate);
2116       assert(!src1.abs);
2117    } else {
2118       assert(src0.type == BRW_REGISTER_TYPE_F ||
2119              (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
2120       assert(src1.type == BRW_REGISTER_TYPE_F ||
2121              (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
2122    }
2123 
2124    /* Source modifiers are ignored for extended math instructions on Gfx6. */
2125    if (devinfo->ver == 6) {
2126       assert(!src0.negate);
2127       assert(!src0.abs);
2128       assert(!src1.negate);
2129       assert(!src1.abs);
2130    }
2131 
2132    brw_inst_set_math_function(devinfo, insn, function);
2133 
2134    brw_set_dest(p, insn, dest);
2135    brw_set_src0(p, insn, src0);
2136    brw_set_src1(p, insn, src1);
2137 }
2138 
2139 /**
2140  * Return the right surface index to access the thread scratch space using
2141  * stateless dataport messages.
2142  */
2143 unsigned
brw_scratch_surface_idx(const struct brw_codegen * p)2144 brw_scratch_surface_idx(const struct brw_codegen *p)
2145 {
2146    /* The scratch space is thread-local so IA coherency is unnecessary. */
2147    if (p->devinfo->ver >= 8)
2148       return GFX8_BTI_STATELESS_NON_COHERENT;
2149    else
2150       return BRW_BTI_STATELESS;
2151 }
2152 
/**
 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   /* SFID of the data port unit that services scratch writes on this
    * hardware generation.
    */
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);
   uint32_t msg_type;

   /* From gfx6 on, the message header wants the global offset in owords
    * (16 bytes) rather than bytes.
    */
   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message length: one header register plus the data payload. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Copy g0 into the message register to serve as the header. */
      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
	 src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->ver < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gfx6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gfx6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->ver >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->ver >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->ver >= 6)
	 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      /* send_commit_msg doubles as both the response length and the
       * write-commit bit in the descriptor (pre-gfx6 only).
       */
      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, send_commit_msg));
   }
}
2261 
2262 
/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* From gfx6 on, the message header wants the global offset in owords
    * (16 bytes) rather than bytes.
    */
   if (devinfo->ver >= 6)
      offset /= 16;

   if (p->devinfo->ver >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   /* Response length: one GRF per pair of owords read. */
   const unsigned rlen = num_regs;
   /* SFID of the data port unit that services scratch reads on this
    * hardware generation.
    */
   const unsigned target_cache =
      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: a copy of g0 with the scratch offset
    * written into element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_swsb(p, tgl_swsb_null());
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->ver >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2344 
2345 void
gfx7_block_read_scratch(struct brw_codegen * p,struct brw_reg dest,int num_regs,unsigned offset)2346 gfx7_block_read_scratch(struct brw_codegen *p,
2347                         struct brw_reg dest,
2348                         int num_regs,
2349                         unsigned offset)
2350 {
2351    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2352    assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2353 
2354    brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2355 
2356    /* The HW requires that the header is present; this is to get the g0.5
2357     * scratch offset.
2358     */
2359    brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2360 
2361    /* According to the docs, offset is "A 12-bit HWord offset into the memory
2362     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2363     * is 32 bytes, which happens to be the size of a register.
2364     */
2365    offset /= REG_SIZE;
2366    assert(offset < (1 << 12));
2367 
2368    gfx7_set_dp_scratch_message(p, insn,
2369                                false, /* scratch read */
2370                                false, /* OWords */
2371                                false, /* invalidate after read */
2372                                num_regs,
2373                                offset,
2374                                1,        /* mlen: just g0 */
2375                                num_regs, /* rlen */
2376                                true);    /* header present */
2377 }
2378 
/**
 * Read float[4] vectors from the data port constant cache.
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct intel_device_info *devinfo = p->devinfo;
   /* Gfx6+ has a dedicated constant cache SFID; earlier generations go
    * through the generic data port read unit.
    */
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const struct tgl_swsb swsb = brw_get_default_swsb(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->ver >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Build the message header: a copy of g0 with the buffer offset
    * written into element 2.
    */
   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_set_default_swsb(p, tgl_swsb_null());
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->ver >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   /* mlen is 1 (header only); rlen covers one GRF per 8 channels. */
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2449 
2450 brw_inst *
brw_fb_WRITE(struct brw_codegen * p,struct brw_reg payload,struct brw_reg implied_header,unsigned msg_control,unsigned binding_table_index,unsigned msg_length,unsigned response_length,bool eot,bool last_render_target,bool header_present)2451 brw_fb_WRITE(struct brw_codegen *p,
2452              struct brw_reg payload,
2453              struct brw_reg implied_header,
2454              unsigned msg_control,
2455              unsigned binding_table_index,
2456              unsigned msg_length,
2457              unsigned response_length,
2458              bool eot,
2459              bool last_render_target,
2460              bool header_present)
2461 {
2462    const struct intel_device_info *devinfo = p->devinfo;
2463    const unsigned target_cache =
2464       (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2465        BRW_SFID_DATAPORT_WRITE);
2466    brw_inst *insn;
2467    struct brw_reg dest, src0;
2468 
2469    if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2470       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2471    else
2472       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2473 
2474    if (devinfo->ver >= 6) {
2475       insn = next_insn(p, BRW_OPCODE_SENDC);
2476    } else {
2477       insn = next_insn(p, BRW_OPCODE_SEND);
2478    }
2479    brw_inst_set_sfid(devinfo, insn, target_cache);
2480    brw_inst_set_compression(devinfo, insn, false);
2481 
2482    if (devinfo->ver >= 6) {
2483       /* headerless version, just submit color payload */
2484       src0 = payload;
2485    } else {
2486       assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2487       brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2488       src0 = implied_header;
2489    }
2490 
2491    brw_set_dest(p, insn, dest);
2492    brw_set_src0(p, insn, src0);
2493    brw_set_desc(p, insn,
2494                 brw_message_desc(devinfo, msg_length, response_length,
2495                                  header_present) |
2496                 brw_fb_write_desc(devinfo, binding_table_index, msg_control,
2497                                   last_render_target,
2498                                   false /* coarse_write */));
2499    brw_inst_set_eot(devinfo, insn, eot);
2500 
2501    return insn;
2502 }
2503 
2504 brw_inst *
gfx9_fb_READ(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,unsigned binding_table_index,unsigned msg_length,unsigned response_length,bool per_sample)2505 gfx9_fb_READ(struct brw_codegen *p,
2506              struct brw_reg dst,
2507              struct brw_reg payload,
2508              unsigned binding_table_index,
2509              unsigned msg_length,
2510              unsigned response_length,
2511              bool per_sample)
2512 {
2513    const struct intel_device_info *devinfo = p->devinfo;
2514    assert(devinfo->ver >= 9);
2515    brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2516 
2517    brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
2518    brw_set_dest(p, insn, dst);
2519    brw_set_src0(p, insn, payload);
2520    brw_set_desc(
2521       p, insn,
2522       brw_message_desc(devinfo, msg_length, response_length, true) |
2523       brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
2524                        1 << brw_get_default_exec_size(p), per_sample));
2525    brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2526 
2527    return insn;
2528 }
2529 
/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* msg_reg_nr == (unsigned)-1 is a sentinel meaning no implied
    * source-to-MRF move is needed.
    */
   if (msg_reg_nr != -1)
      gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2583 
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct intel_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->verx10 >= 75);
         /* Offset header element 3 (the sampler state pointer, copied
          * from g0.3) by the byte size of the whole groups of 16
          * samplers below the requested index.
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->verx10 <= 70) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      brw_push_insn_state(p);
      /* (index & 0xf0) << 4 == 16 * (index / 16) * 16, i.e. the same
       * byte offset computed in the immediate case above.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
2632 
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* Gfx6+ SEND takes no implicit MRF source; move src0 explicitly. */
   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      /* OR 0xff00 into m0.5 (seeded from g0.5) to enable all channels. */
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->ver));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   /* Fill in the URB message descriptor (flags, lengths, offset and
    * swizzle control).
    */
   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2684 
/**
 * Emit a SEND whose message descriptor is either an immediate
 * (desc.file == BRW_IMMEDIATE_VALUE) or comes from a register, in which
 * case it is OR'ed with desc_imm into address register a0.0 first.
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Immediate descriptor: fold desc_imm straight in. */
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      /* Gfx12+ selects the register descriptor via an instruction bit
       * rather than an explicit src1 operand.
       */
      if (devinfo->ver >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2738 
/**
 * Emit a split send (SENDS, or SEND on gfx12+) with two separate payload
 * sources.  Both the descriptor and the extended descriptor may be
 * immediate or indirect; indirect ones are staged through the address
 * register (a0.0 for desc, a0.2 for ex_desc) before the send.
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Immediate descriptor: fold desc_imm straight in. */
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   /* Pre-gfx12 the instruction encoding cannot hold ex_desc bits 15:12,
    * so an immediate with those bits set must fall back to the indirect
    * path below.
    */
   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (devinfo->ver >= 12 ||
        ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   /* Encode the descriptor: either inline or selected from a0.0. */
   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   /* Encode the extended descriptor: inline or from an a0 subregister. */
   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2855 
/**
 * Emit a send with a possibly-indirect surface (binding table) index.
 * A non-immediate surface index is first masked and loaded into a0.0,
 * then passed to brw_send_indirect_message() as the descriptor source.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
2891 
2892 static bool
while_jumps_before_offset(const struct intel_device_info * devinfo,brw_inst * insn,int while_offset,int start_offset)2893 while_jumps_before_offset(const struct intel_device_info *devinfo,
2894                           brw_inst *insn, int while_offset, int start_offset)
2895 {
2896    int scale = 16 / brw_jump_scale(devinfo);
2897    int jip = devinfo->ver == 6 ? brw_inst_gfx6_jump_count(devinfo, insn)
2898                                : brw_inst_jip(devinfo, insn);
2899    assert(jip < 0);
2900    return while_offset + jip * scale <= start_offset;
2901 }
2902 
/* Scan forward from start_offset for the instruction that ends the
 * current control-flow block: an ENDIF/ELSE/HALT at the same nesting
 * depth, or the WHILE closing the enclosing loop.  Returns its byte
 * offset, or 0 if none is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct intel_device_info *devinfo = p->devinfo;

   /* IF/ENDIF nesting depth relative to start_offset. */
   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(p->isa, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         FALLTHROUGH;
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
         break;
      default:
         break;
      }
   }

   /* No block end found. */
   return 0;
}
2946 
2947 /* There is no DO instruction on gfx6, so to find the end of the loop
2948  * we have to see if the loop is jumping back before our start
2949  * instruction.
2950  */
2951 static int
brw_find_loop_end(struct brw_codegen * p,int start_offset)2952 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2953 {
2954    const struct intel_device_info *devinfo = p->devinfo;
2955    int offset;
2956    void *store = p->store;
2957 
2958    assert(devinfo->ver >= 6);
2959 
2960    /* Always start after the instruction (such as a WHILE) we're trying to fix
2961     * up.
2962     */
2963    for (offset = next_offset(devinfo, store, start_offset);
2964         offset < p->next_insn_offset;
2965         offset = next_offset(devinfo, store, offset)) {
2966       brw_inst *insn = store + offset;
2967 
2968       if (brw_inst_opcode(p->isa, insn) == BRW_OPCODE_WHILE) {
2969 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2970 	    return offset;
2971       }
2972    }
2973    assert(!"not reached");
2974    return start_offset;
2975 }
2976 
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * Forward jump targets aren't known while instructions are emitted, so
 * this pass walks the finished program and patches the jump fields.
 * Offsets are converted from bytes to the generation's jump units via
 * brw_jump_scale().
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;   /* bytes of instruction per jump unit */
   void *store = p->store;

   /* Gfx4/5 use a different control-flow scheme; nothing to patch. */
   if (devinfo->ver < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* Compacted instructions are 8 bytes and have no JIP/UIP fields, so
       * compaction must not have happened yet.
       */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      switch (brw_inst_opcode(p->isa, insn)) {
      case BRW_OPCODE_BREAK: {
         /* JIP: end of the innermost enclosing block; UIP: end of loop. */
         int block_end_offset = brw_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset +
             (devinfo->ver == 6 ? 16 : 0)) / scale);
         break;
      }

      case BRW_OPCODE_CONTINUE: {
         int block_end_offset = brw_find_next_block_end(p, offset);
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end just falls through to the
          * next instruction: a jump of one instruction (1 * br units).
          */
         int block_end_offset = brw_find_next_block_end(p, offset);
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->ver >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gfx6_jump_count(devinfo, insn, jump);
         break;
      }

      case BRW_OPCODE_HALT: {
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same. In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         int block_end_offset = brw_find_next_block_end(p, offset);
         if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
         } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
         break;
      }

      default:
         break;
      }
   }
}
3059 
brw_ff_sync(struct brw_codegen * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,bool allocate,unsigned response_length,bool eot)3060 void brw_ff_sync(struct brw_codegen *p,
3061 		   struct brw_reg dest,
3062 		   unsigned msg_reg_nr,
3063 		   struct brw_reg src0,
3064 		   bool allocate,
3065 		   unsigned response_length,
3066 		   bool eot)
3067 {
3068    const struct intel_device_info *devinfo = p->devinfo;
3069    brw_inst *insn;
3070 
3071    gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
3072 
3073    insn = next_insn(p, BRW_OPCODE_SEND);
3074    brw_set_dest(p, insn, dest);
3075    brw_set_src0(p, insn, src0);
3076    brw_set_src1(p, insn, brw_imm_d(0));
3077 
3078    if (devinfo->ver < 6)
3079       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
3080 
3081    brw_set_ff_sync_message(p,
3082 			   insn,
3083 			   allocate,
3084 			   response_length,
3085 			   eot);
3086 }
3087 
3088 /**
3089  * Emit the SEND instruction necessary to generate stream output data on Gfx6
3090  * (for transform feedback).
3091  *
3092  * If send_commit_msg is true, this is the last piece of stream output data
3093  * from this thread, so send the data as a committed write.  According to the
3094  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
3095  *
3096  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3097  *   writes are complete by sending the final write as a committed write."
3098  */
3099 void
brw_svb_write(struct brw_codegen * p,struct brw_reg dest,unsigned msg_reg_nr,struct brw_reg src0,unsigned binding_table_index,bool send_commit_msg)3100 brw_svb_write(struct brw_codegen *p,
3101               struct brw_reg dest,
3102               unsigned msg_reg_nr,
3103               struct brw_reg src0,
3104               unsigned binding_table_index,
3105               bool   send_commit_msg)
3106 {
3107    const struct intel_device_info *devinfo = p->devinfo;
3108    assert(devinfo->ver == 6);
3109    const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
3110    brw_inst *insn;
3111 
3112    gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
3113 
3114    insn = next_insn(p, BRW_OPCODE_SEND);
3115    brw_inst_set_sfid(devinfo, insn, target_cache);
3116    brw_set_dest(p, insn, dest);
3117    brw_set_src0(p, insn, src0);
3118    brw_set_desc(p, insn,
3119                 brw_message_desc(devinfo, 1, send_commit_msg, true) |
3120                 brw_dp_write_desc(devinfo, binding_table_index,
3121                                   0, /* msg_control: ignored */
3122                                   GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3123                                   send_commit_msg)); /* send_commit_msg */
3124 }
3125 
/* Number of registers needed for one payload component of a surface
 * message, given the execution size (0 meaning SIMD4x2).
 */
static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   /* SIMD4x2 packs everything into a single register. */
   if (exec_size == 0)
      return 1;

   /* One register per channel up to SIMD8, two per channel for SIMD16. */
   const unsigned regs_per_channel = (exec_size <= 8) ? 1 : 2;
   return regs_per_channel * num_channels;
}
3137 
3138 void
brw_untyped_atomic(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned atomic_op,unsigned msg_length,bool response_expected,bool header_present)3139 brw_untyped_atomic(struct brw_codegen *p,
3140                    struct brw_reg dst,
3141                    struct brw_reg payload,
3142                    struct brw_reg surface,
3143                    unsigned atomic_op,
3144                    unsigned msg_length,
3145                    bool response_expected,
3146                    bool header_present)
3147 {
3148    const struct intel_device_info *devinfo = p->devinfo;
3149    const unsigned sfid = (devinfo->verx10 >= 75 ?
3150                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3151                           GFX7_SFID_DATAPORT_DATA_CACHE);
3152    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3153    /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3154    const bool has_simd4x2 = devinfo->verx10 >= 75;
3155    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3156                               has_simd4x2 ? 0 : 8;
3157    const unsigned response_length =
3158       brw_surface_payload_size(response_expected, exec_size);
3159    const unsigned desc =
3160       brw_message_desc(devinfo, msg_length, response_length, header_present) |
3161       brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3162                                  response_expected);
3163    /* Mask out unused components -- This is especially important in Align16
3164     * mode on generations that don't have native support for SIMD4x2 atomics,
3165     * because unused but enabled components will cause the dataport to perform
3166     * additional atomic operations on the addresses that happen to be in the
3167     * uninitialized Y, Z and W coordinates of the payload.
3168     */
3169    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3170 
3171    brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3172                                      payload, surface, desc);
3173 }
3174 
3175 void
brw_untyped_surface_read(struct brw_codegen * p,struct brw_reg dst,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels)3176 brw_untyped_surface_read(struct brw_codegen *p,
3177                          struct brw_reg dst,
3178                          struct brw_reg payload,
3179                          struct brw_reg surface,
3180                          unsigned msg_length,
3181                          unsigned num_channels)
3182 {
3183    const struct intel_device_info *devinfo = p->devinfo;
3184    const unsigned sfid = (devinfo->verx10 >= 75 ?
3185                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3186                           GFX7_SFID_DATAPORT_DATA_CACHE);
3187    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3188    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3189    const unsigned response_length =
3190       brw_surface_payload_size(num_channels, exec_size);
3191    const unsigned desc =
3192       brw_message_desc(devinfo, msg_length, response_length, false) |
3193       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3194 
3195    brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3196 }
3197 
3198 void
brw_untyped_surface_write(struct brw_codegen * p,struct brw_reg payload,struct brw_reg surface,unsigned msg_length,unsigned num_channels,bool header_present)3199 brw_untyped_surface_write(struct brw_codegen *p,
3200                           struct brw_reg payload,
3201                           struct brw_reg surface,
3202                           unsigned msg_length,
3203                           unsigned num_channels,
3204                           bool header_present)
3205 {
3206    const struct intel_device_info *devinfo = p->devinfo;
3207    const unsigned sfid = (devinfo->verx10 >= 75 ?
3208                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3209                           GFX7_SFID_DATAPORT_DATA_CACHE);
3210    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3211    /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3212    const bool has_simd4x2 = devinfo->verx10 >= 75;
3213    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3214                               has_simd4x2 ? 0 : 8;
3215    const unsigned desc =
3216       brw_message_desc(devinfo, msg_length, 0, header_present) |
3217       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3218    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3219    const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3220 
3221    brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3222                                      payload, surface, desc);
3223 }
3224 
3225 static void
brw_set_memory_fence_message(struct brw_codegen * p,struct brw_inst * insn,enum brw_message_target sfid,bool commit_enable,unsigned bti)3226 brw_set_memory_fence_message(struct brw_codegen *p,
3227                              struct brw_inst *insn,
3228                              enum brw_message_target sfid,
3229                              bool commit_enable,
3230                              unsigned bti)
3231 {
3232    const struct intel_device_info *devinfo = p->devinfo;
3233 
3234    brw_set_desc(p, insn, brw_message_desc(
3235                    devinfo, 1, (commit_enable ? 1 : 0), true));
3236 
3237    brw_inst_set_sfid(devinfo, insn, sfid);
3238 
3239    switch (sfid) {
3240    case GFX6_SFID_DATAPORT_RENDER_CACHE:
3241       brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
3242       break;
3243    case GFX7_SFID_DATAPORT_DATA_CACHE:
3244       brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
3245       break;
3246    default:
3247       unreachable("Not reached");
3248    }
3249 
3250    if (commit_enable)
3251       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3252 
3253    assert(devinfo->ver >= 11 || bti == 0);
3254    brw_inst_set_binding_table_index(devinfo, insn, bti);
3255 }
3256 
/* Fill out a memory fence SEND for LSC-capable hardware.
 *
 * URB fences use the legacy URB fence descriptor; all other targets use LSC
 * fence messages, whose scope and flush type are decoded from \p desc.
 */
static void
gfx12_set_memory_fence_message(struct brw_codegen *p,
                               struct brw_inst *insn,
                               enum brw_message_target sfid,
                               uint32_t desc)
{
   const unsigned mlen = 1; /* g0 header */
   /* Completion signaled by write to register. No data returned. */
   const unsigned rlen = 1;

   brw_inst_set_sfid(p->devinfo, insn, sfid);

   if (sfid == BRW_SFID_URB) {
      brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
                            brw_message_desc(p->devinfo, mlen, rlen, true));
   } else {
      enum lsc_fence_scope scope = lsc_fence_msg_desc_scope(p->devinfo, desc);
      enum lsc_flush_type flush_type = lsc_fence_msg_desc_flush_type(p->devinfo, desc);

      /* Typed (tile) memory fences always use tile scope with an evict
       * flush, overriding whatever the caller's descriptor requested.
       */
      if (sfid == GFX12_SFID_TGM) {
         scope = LSC_FENCE_TILE;
         flush_type = LSC_FLUSH_TYPE_EVICT;
      }

      /* Wa_14014435656:
       *
       *   "For any fence greater than local scope, always set flush type to
       *    at least invalidate so that fence goes on properly."
       *
       *   "The bug is if flush_type is 'None', the scope is always downgraded
       *    to 'local'."
       *
       * Here set scope to NONE_6 instead of NONE, which has the same effect
       * as NONE but avoids the downgrade to scope LOCAL.
       */
      if (intel_device_info_is_dg2(p->devinfo) &&
          scope > LSC_FENCE_LOCAL &&
          flush_type == LSC_FLUSH_TYPE_NONE) {
         flush_type = LSC_FLUSH_TYPE_NONE_6;
      }

      brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
                                               flush_type, false) |
                            brw_message_desc(p->devinfo, mlen, rlen, false));
   }
}
3303 
3304 void
brw_memory_fence(struct brw_codegen * p,struct brw_reg dst,struct brw_reg src,enum opcode send_op,enum brw_message_target sfid,uint32_t desc,bool commit_enable,unsigned bti)3305 brw_memory_fence(struct brw_codegen *p,
3306                  struct brw_reg dst,
3307                  struct brw_reg src,
3308                  enum opcode send_op,
3309                  enum brw_message_target sfid,
3310                  uint32_t desc,
3311                  bool commit_enable,
3312                  unsigned bti)
3313 {
3314    const struct intel_device_info *devinfo = p->devinfo;
3315 
3316    dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3317    src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3318 
3319    /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3320     * message doesn't write anything back.
3321     */
3322    struct brw_inst *insn = next_insn(p, send_op);
3323    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3324    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3325    brw_set_dest(p, insn, dst);
3326    brw_set_src0(p, insn, src);
3327 
3328    /* All DG2 hardware requires LSC for fence messages, even A-step */
3329    if (devinfo->has_lsc)
3330       gfx12_set_memory_fence_message(p, insn, sfid, desc);
3331    else
3332       brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
3333 }
3334 
3335 void
brw_pixel_interpolator_query(struct brw_codegen * p,struct brw_reg dest,struct brw_reg mrf,bool noperspective,bool coarse_pixel_rate,unsigned mode,struct brw_reg data,unsigned msg_length,unsigned response_length)3336 brw_pixel_interpolator_query(struct brw_codegen *p,
3337                              struct brw_reg dest,
3338                              struct brw_reg mrf,
3339                              bool noperspective,
3340                              bool coarse_pixel_rate,
3341                              unsigned mode,
3342                              struct brw_reg data,
3343                              unsigned msg_length,
3344                              unsigned response_length)
3345 {
3346    const struct intel_device_info *devinfo = p->devinfo;
3347    const uint16_t exec_size = brw_get_default_exec_size(p);
3348    const unsigned slot_group = brw_get_default_group(p) / 16;
3349    const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3350    const unsigned desc =
3351       brw_message_desc(devinfo, msg_length, response_length, false) |
3352       brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate,
3353                             simd_mode, slot_group);
3354 
3355    /* brw_send_indirect_message will automatically use a direct send message
3356     * if data is actually immediate.
3357     */
3358    brw_send_indirect_message(p,
3359                              GFX7_SFID_PIXEL_INTERPOLATOR,
3360                              dest,
3361                              mrf,
3362                              vec1(data),
3363                              desc,
3364                              false);
3365 }
3366 
/**
 * Emit code that writes into \p dst the index of the first (or, if \p last
 * is set, the last) enabled channel of the current execution mask.
 *
 * Gfx7-only path (asserted below); later generations handle this elsewhere.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst, bool last)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->ver == 7);

   brw_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      const struct brw_reg flag = brw_flag_subreg(flag_subreg);

      /* Clear the flag before accumulating the execution mask into it. */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

      /* Run enough instructions returning zero with execution masking and
       * a conditional modifier enabled in order to get the full execution
       * mask in f1.0.  We could use a single 32-wide move here if it
       * weren't because of the hardware bug that causes channel enables to
       * be applied incorrectly to the second half of 32-wide instructions
       * on Gfx7.
       */
      const unsigned lower_size = MIN2(16, exec_size);
      for (unsigned i = 0; i < exec_size / lower_size; i++) {
         inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                        brw_imm_uw(0));
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
         brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
         brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
         brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
         brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
      }

      /* Find the first bit set in the exec_size-wide portion of the flag
       * register that was updated by the last sequence of MOV
       * instructions.
       */
      const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      if (!last) {
         inst = brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      } else {
         /* LZD counts zeros from the MSB, so the index of the last set bit
          * is 31 minus the LZD result; compute that with a negated ADD.
          */
         inst = brw_LZD(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
         struct brw_reg neg = vec1(dst);
         neg.negate = true;
         inst = brw_ADD(p, vec1(dst), neg, brw_imm_uw(31));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      /* Overwrite the destination without and with execution masking to
       * find out which of the channels is active.
       */
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_4);
      brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
              brw_imm_ud(1));

      inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                     brw_imm_ud(0));
      brw_pop_insn_state(p);
      brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
   }

   brw_pop_insn_state(p);
}
3447 
/**
 * Emit code copying the value of channel \p idx of \p src into every channel
 * of \p dst.  \p idx may be an immediate or a (dynamically uniform) register;
 * in the register case the channel is fetched with indirect addressing
 * (Align1) or a predicated SEL (Align16 / SIMD4x2).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
                     stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         /* No 64-bit float MOVs on this hardware: split the copy into two
          * 32-bit halves.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->platform == INTEL_PLATFORM_CHV || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3581 
3582 
3583 /**
3584  * Emit the SEND message for a barrier
3585  */
3586 void
brw_barrier(struct brw_codegen * p,struct brw_reg src)3587 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3588 {
3589    const struct intel_device_info *devinfo = p->devinfo;
3590    struct brw_inst *inst;
3591 
3592    assert(devinfo->ver >= 7);
3593 
3594    brw_push_insn_state(p);
3595    brw_set_default_access_mode(p, BRW_ALIGN_1);
3596    inst = next_insn(p, BRW_OPCODE_SEND);
3597    brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3598    brw_set_src0(p, inst, src);
3599    brw_set_src1(p, inst, brw_null_reg());
3600    brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3601 
3602    brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3603    brw_inst_set_gateway_subfuncid(devinfo, inst,
3604                                   BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3605 
3606    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3607    brw_pop_insn_state(p);
3608 }
3609 
3610 
3611 /**
3612  * Emit the wait instruction for a barrier
3613  */
3614 void
brw_WAIT(struct brw_codegen * p)3615 brw_WAIT(struct brw_codegen *p)
3616 {
3617    const struct intel_device_info *devinfo = p->devinfo;
3618    struct brw_inst *insn;
3619 
3620    struct brw_reg src = brw_notification_reg();
3621 
3622    insn = next_insn(p, BRW_OPCODE_WAIT);
3623    brw_set_dest(p, insn, src);
3624    brw_set_src0(p, insn, src);
3625    brw_set_src1(p, insn, brw_null_reg());
3626 
3627    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3628    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3629 }
3630 
3631 void
brw_float_controls_mode(struct brw_codegen * p,unsigned mode,unsigned mask)3632 brw_float_controls_mode(struct brw_codegen *p,
3633                         unsigned mode, unsigned mask)
3634 {
3635    /* From the Skylake PRM, Volume 7, page 760:
3636     *  "Implementation Restriction on Register Access: When the control
3637     *   register is used as an explicit source and/or destination, hardware
3638     *   does not ensure execution pipeline coherency. Software must set the
3639     *   thread control field to ‘switch’ for an instruction that uses
3640     *   control register as an explicit operand."
3641     *
3642     * On Gfx12+ this is implemented in terms of SWSB annotations instead.
3643     */
3644    brw_set_default_swsb(p, tgl_swsb_regdist(1));
3645 
3646    brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3647                             brw_imm_ud(~mask));
3648    brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3649    if (p->devinfo->ver < 12)
3650       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3651 
3652    if (mode) {
3653       brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3654                                  brw_imm_ud(mode));
3655       brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3656       if (p->devinfo->ver < 12)
3657          brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3658    }
3659 
3660    if (p->devinfo->ver >= 12)
3661       brw_SYNC(p, TGL_SYNC_NOP);
3662 }
3663 
3664 void
brw_update_reloc_imm(const struct brw_isa_info * isa,brw_inst * inst,uint32_t value)3665 brw_update_reloc_imm(const struct brw_isa_info *isa,
3666                      brw_inst *inst,
3667                      uint32_t value)
3668 {
3669    const struct intel_device_info *devinfo = isa->devinfo;
3670 
3671    /* Sanity check that the instruction is a MOV of an immediate */
3672    assert(brw_inst_opcode(isa, inst) == BRW_OPCODE_MOV);
3673    assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);
3674 
3675    /* If it was compacted, we can't safely rewrite */
3676    assert(brw_inst_cmpt_control(devinfo, inst) == 0);
3677 
3678    brw_inst_set_imm_ud(devinfo, inst, value);
3679 }
3680 
3681 /* A default value for constants that will be patched at run-time.
3682  * We pick an arbitrary value that prevents instruction compaction.
3683  */
3684 #define DEFAULT_PATCH_IMM 0x4a7cc037
3685 
3686 void
brw_MOV_reloc_imm(struct brw_codegen * p,struct brw_reg dst,enum brw_reg_type src_type,uint32_t id)3687 brw_MOV_reloc_imm(struct brw_codegen *p,
3688                   struct brw_reg dst,
3689                   enum brw_reg_type src_type,
3690                   uint32_t id)
3691 {
3692    assert(type_sz(src_type) == 4);
3693    assert(type_sz(dst.type) == 4);
3694 
3695    brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
3696                  p->next_insn_offset, 0);
3697 
3698    brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
3699 }
3700