1 /*
2  Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3  Intel funded Tungsten Graphics to
4  develop this 3D driver.
5 
6  Permission is hereby granted, free of charge, to any person obtaining
7  a copy of this software and associated documentation files (the
8  "Software"), to deal in the Software without restriction, including
9  without limitation the rights to use, copy, modify, merge, publish,
10  distribute, sublicense, and/or sell copies of the Software, and to
11  permit persons to whom the Software is furnished to do so, subject to
12  the following conditions:
13 
14  The above copyright notice and this permission notice (including the
15  next paragraph) shall be included in all copies or substantial
16  portions of the Software.
17 
18  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 
26  **********************************************************************/
27  /*
28   * Authors:
29   *   Keith Whitwell <keithw@vmware.com>
30   */
31 
32 
33 #include "brw_eu_defines.h"
34 #include "brw_eu.h"
35 
36 #include "util/ralloc.h"
37 
38 /**
39  * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40  * registers, implicitly moving the operand to a message register.
41  *
42  * On Sandybridge, this is no longer the case.  This function performs the
43  * explicit move; it should be called before emitting a SEND instruction.
44  */
45 void
46 gen6_resolve_implied_move(struct brw_codegen *p,
47 			  struct brw_reg *src,
48 			  unsigned msg_reg_nr)
49 {
50    const struct gen_device_info *devinfo = p->devinfo;
51    if (devinfo->gen < 6)
52       return;
53 
54    if (src->file == BRW_MESSAGE_REGISTER_FILE)
55       return;
56 
57    if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58       assert(devinfo->gen < 12);
59       brw_push_insn_state(p);
60       brw_set_default_exec_size(p, BRW_EXECUTE_8);
61       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63       brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64 	      retype(*src, BRW_REGISTER_TYPE_UD));
65       brw_pop_insn_state(p);
66    }
67    *src = brw_message_reg(msg_reg_nr);
68 }
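
/* Illustrative usage sketch (not from the original file): a generator that
 * built a SEND payload in a GRF on Gen6+ would resolve the implied move
 * before emitting the SEND, e.g.
 *
 *    struct brw_reg payload = brw_vec8_grf(2, 0);   // hypothetical payload GRF
 *    gen6_resolve_implied_move(p, &payload, 1);     // copies g2 into m1 and
 *                                                   // rewrites payload to m1
 *    // ... emit the SEND with payload as src0 ...
 */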
69 
70 static void
71 gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72 {
73    /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74     * "The send with EOT should use register space R112-R127 for <src>. This is
75     *  to enable loading of a new thread into the same slot while the message
76     *  with EOT for current thread is pending dispatch."
77     *
78     * Since we're pretending to have 16 MRFs anyway, we may as well use the
79     * registers required for messages with EOT.
80     */
81    const struct gen_device_info *devinfo = p->devinfo;
82    if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83       reg->file = BRW_GENERAL_REGISTER_FILE;
84       reg->nr += GEN7_MRF_HACK_START;
85    }
86 }
87 
88 void
89 brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
90 {
91    const struct gen_device_info *devinfo = p->devinfo;
92 
93    if (dest.file == BRW_MESSAGE_REGISTER_FILE)
94       assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
95    else if (dest.file == BRW_GENERAL_REGISTER_FILE)
96       assert(dest.nr < 128);
97 
98    /* The hardware has a restriction where a destination of size Byte with
99     * a stride of 1 is only allowed for a packed byte MOV. For any other
100     * instruction, the stride must be at least 2, even when the destination
101     * is the NULL register.
102     */
103    if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
104        dest.nr == BRW_ARF_NULL &&
105        type_sz(dest.type) == 1 &&
106        dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
107       dest.hstride = BRW_HORIZONTAL_STRIDE_2;
108    }
109 
110    gen7_convert_mrf_to_grf(p, &dest);
111 
112    if (devinfo->gen >= 12 &&
113        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
114         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
115       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
116              dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
117       assert(dest.address_mode == BRW_ADDRESS_DIRECT);
118       assert(dest.subnr == 0);
119       assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
120              (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
121               dest.vstride == dest.width + 1));
122       assert(!dest.negate && !dest.abs);
123       brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
124       brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
125 
126    } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
127               brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
128       assert(devinfo->gen < 12);
129       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
130              dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
131       assert(dest.address_mode == BRW_ADDRESS_DIRECT);
132       assert(dest.subnr % 16 == 0);
133       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
134              dest.vstride == dest.width + 1);
135       assert(!dest.negate && !dest.abs);
136       brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
137       brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
138       brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
139    } else {
140       brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
141       brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
142 
143       if (dest.address_mode == BRW_ADDRESS_DIRECT) {
144          brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
145 
146          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
147             brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
148             if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
149                dest.hstride = BRW_HORIZONTAL_STRIDE_1;
150             brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
151          } else {
152             brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
153             brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
154             if (dest.file == BRW_GENERAL_REGISTER_FILE ||
155                 dest.file == BRW_MESSAGE_REGISTER_FILE) {
156                assert(dest.writemask != 0);
157             }
158             /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
159              *    Although Dst.HorzStride is a don't care for Align16, HW needs
160              *    this to be programmed as "01".
161              */
162             brw_inst_set_dst_hstride(devinfo, inst, 1);
163          }
164       } else {
165          brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
166 
167          /* These are different sizes in align1 vs align16:
168           */
169          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
170             brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
171                                           dest.indirect_offset);
172             if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
173                dest.hstride = BRW_HORIZONTAL_STRIDE_1;
174             brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
175          } else {
176             brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
177                                            dest.indirect_offset);
178             /* Even though this is ignored in da16, it still needs to be set to '01'. */
179             brw_inst_set_dst_hstride(devinfo, inst, 1);
180          }
181       }
182    }
183 
184    /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
185     * or 16 (SIMD16), as that's normally correct.  However, when dealing with
186     * small registers, it can be useful for us to automatically reduce it to
187     * match the register size.
188     */
189    if (p->automatic_exec_sizes) {
190       /*
191        * In platforms that support fp64 we can emit instructions with a width
192        * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
193        * these cases we need to make sure that these instructions have their
194        * exec sizes set properly when they are emitted and we can't rely on
195        * this code to fix it.
196        */
197       bool fix_exec_size;
198       if (devinfo->gen >= 6)
199          fix_exec_size = dest.width < BRW_EXECUTE_4;
200       else
201          fix_exec_size = dest.width < BRW_EXECUTE_8;
202 
203       if (fix_exec_size)
204          brw_inst_set_exec_size(devinfo, inst, dest.width);
205    }
206 }
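
/* Worked example of the automatic exec-size reduction above (a sketch,
 * assuming p->automatic_exec_sizes is set): on Gen6+, emitting an
 * instruction whose destination region has width 2 (BRW_WIDTH_2, which is
 * below BRW_EXECUTE_4 in the encoding) rewrites the instruction's exec
 * size down to 2 so it matches the small destination register region.
 */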
207 
208 void
209 brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
210 {
211    const struct gen_device_info *devinfo = p->devinfo;
212 
213    if (reg.file == BRW_MESSAGE_REGISTER_FILE)
214       assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
215    else if (reg.file == BRW_GENERAL_REGISTER_FILE)
216       assert(reg.nr < 128);
217 
218    gen7_convert_mrf_to_grf(p, &reg);
219 
220    if (devinfo->gen >= 6 &&
221        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
222         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
223         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
224         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
225       /* Any source modifiers or regions will be ignored, since this just
226        * identifies the MRF/GRF to start reading the message contents from.
227        * Check for some likely failures.
228        */
229       assert(!reg.negate);
230       assert(!reg.abs);
231       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
232    }
233 
234    if (devinfo->gen >= 12 &&
235        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
236         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
237       assert(reg.file != BRW_IMMEDIATE_VALUE);
238       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
239       assert(reg.subnr == 0);
240       assert(has_scalar_region(reg) ||
241              (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
242               reg.vstride == reg.width + 1));
243       assert(!reg.negate && !reg.abs);
244       brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
245       brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
246 
247    } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
248               brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
249       assert(reg.file == BRW_GENERAL_REGISTER_FILE);
250       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
251       assert(reg.subnr % 16 == 0);
252       assert(has_scalar_region(reg) ||
253              (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
254               reg.vstride == reg.width + 1));
255       assert(!reg.negate && !reg.abs);
256       brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
257       brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
258    } else {
259       brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
260       brw_inst_set_src0_abs(devinfo, inst, reg.abs);
261       brw_inst_set_src0_negate(devinfo, inst, reg.negate);
262       brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
263 
264       if (reg.file == BRW_IMMEDIATE_VALUE) {
265          if (reg.type == BRW_REGISTER_TYPE_DF ||
266              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
267             brw_inst_set_imm_df(devinfo, inst, reg.df);
268          else if (reg.type == BRW_REGISTER_TYPE_UQ ||
269                   reg.type == BRW_REGISTER_TYPE_Q)
270             brw_inst_set_imm_uq(devinfo, inst, reg.u64);
271          else
272             brw_inst_set_imm_ud(devinfo, inst, reg.ud);
273 
274          if (devinfo->gen < 12 && type_sz(reg.type) < 8) {
275             brw_inst_set_src1_reg_file(devinfo, inst,
276                                        BRW_ARCHITECTURE_REGISTER_FILE);
277             brw_inst_set_src1_reg_hw_type(devinfo, inst,
278                                           brw_inst_src0_reg_hw_type(devinfo, inst));
279          }
280       } else {
281          if (reg.address_mode == BRW_ADDRESS_DIRECT) {
282             brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
283             if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
284                 brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
285             } else {
286                brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
287             }
288          } else {
289             brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
290 
291             if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
292                brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
293             } else {
294                brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
295             }
296          }
297 
298          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
299             if (reg.width == BRW_WIDTH_1 &&
300                 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
301                brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
302                brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
303                brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
304             } else {
305                brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
306                brw_inst_set_src0_width(devinfo, inst, reg.width);
307                brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
308             }
309          } else {
310             brw_inst_set_src0_da16_swiz_x(devinfo, inst,
311                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
312             brw_inst_set_src0_da16_swiz_y(devinfo, inst,
313                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
314             brw_inst_set_src0_da16_swiz_z(devinfo, inst,
315                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
316             brw_inst_set_src0_da16_swiz_w(devinfo, inst,
317                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
318 
319             if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
320                /* This is an oddity arising from the fact that we use the
321                 * same register descriptions for align_16 as for align_1:
322                 */
323                brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
324             } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
325                        reg.type == BRW_REGISTER_TYPE_DF &&
326                        reg.vstride == BRW_VERTICAL_STRIDE_2) {
327                /* From SNB PRM:
328                 *
329                 * "For Align16 access mode, only encodings of 0000 and 0011
330                 *  are allowed. Other codes are reserved."
331                 *
332                 * Presumably the DevSNB behavior applies to IVB as well.
333                 */
334                brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
335             } else {
336                brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
337             }
338          }
339       }
340    }
341 }
342 
343 
344 void
345 brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
346 {
347    const struct gen_device_info *devinfo = p->devinfo;
348 
349    if (reg.file == BRW_GENERAL_REGISTER_FILE)
350       assert(reg.nr < 128);
351 
352    if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
353        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
354        (devinfo->gen >= 12 &&
355         (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
356          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
357       assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
358              reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
359       assert(reg.address_mode == BRW_ADDRESS_DIRECT);
360       assert(reg.subnr == 0);
361       assert(has_scalar_region(reg) ||
362              (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
363               reg.vstride == reg.width + 1));
364       assert(!reg.negate && !reg.abs);
365       brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
366       brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
367    } else {
368       /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
369        *
370        *    "Accumulator registers may be accessed explicitly as src0
371        *    operands only."
372        */
373       assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
374              reg.nr != BRW_ARF_ACCUMULATOR);
375 
376       gen7_convert_mrf_to_grf(p, &reg);
377       assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
378 
379       brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
380       brw_inst_set_src1_abs(devinfo, inst, reg.abs);
381       brw_inst_set_src1_negate(devinfo, inst, reg.negate);
382 
383       /* Only src1 can be immediate in two-argument instructions.
384        */
385       assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
386 
387       if (reg.file == BRW_IMMEDIATE_VALUE) {
388          /* two-argument instructions can only use 32-bit immediates */
389          assert(type_sz(reg.type) < 8);
390          brw_inst_set_imm_ud(devinfo, inst, reg.ud);
391       } else {
392          /* This is a hardware restriction, which may or may not be lifted
393           * in the future:
394           */
395          assert (reg.address_mode == BRW_ADDRESS_DIRECT);
396          /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
397 
398          brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
399          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
400             brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
401          } else {
402             brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
403          }
404 
405          if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
406             if (reg.width == BRW_WIDTH_1 &&
407                 brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
408                brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
409                brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
410                brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
411             } else {
412                brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
413                brw_inst_set_src1_width(devinfo, inst, reg.width);
414                brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
415             }
416          } else {
417             brw_inst_set_src1_da16_swiz_x(devinfo, inst,
418                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
419             brw_inst_set_src1_da16_swiz_y(devinfo, inst,
420                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
421             brw_inst_set_src1_da16_swiz_z(devinfo, inst,
422                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
423             brw_inst_set_src1_da16_swiz_w(devinfo, inst,
424                BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
425 
426             if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
427                /* This is an oddity arising from the fact that we use the
428                 * same register descriptions for align_16 as for align_1:
429                 */
430                brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
431             } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
432                        reg.type == BRW_REGISTER_TYPE_DF &&
433                        reg.vstride == BRW_VERTICAL_STRIDE_2) {
434                /* From SNB PRM:
435                 *
436                 * "For Align16 access mode, only encodings of 0000 and 0011
437                 *  are allowed. Other codes are reserved."
438                 *
439                 * Presumably the DevSNB behavior applies to IVB as well.
440                 */
441                brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
442             } else {
443                brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
444             }
445          }
446       }
447    }
448 }
449 
450 /**
451  * Specify the descriptor and extended descriptor immediate for a SEND(C)
452  * message instruction.
453  */
454 void
455 brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
456                 unsigned desc, unsigned ex_desc)
457 {
458    const struct gen_device_info *devinfo = p->devinfo;
459    assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
460           brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
461    if (devinfo->gen < 12)
462       brw_inst_set_src1_file_type(devinfo, inst,
463                                   BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
464    brw_inst_set_send_desc(devinfo, inst, desc);
465    if (devinfo->gen >= 9)
466       brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
467 }
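
/* Note (assumption, based on how the message helpers below are written):
 * most callers reach this through the plain brw_set_desc() wrapper, which
 * passes an extended descriptor of 0, so only SENDs that need an immediate
 * ex_desc call brw_set_desc_ex() directly.
 */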
468 
469 static void brw_set_math_message( struct brw_codegen *p,
470 				  brw_inst *inst,
471 				  unsigned function,
472 				  unsigned integer_type,
473 				  bool low_precision,
474 				  unsigned dataType )
475 {
476    const struct gen_device_info *devinfo = p->devinfo;
477    unsigned msg_length;
478    unsigned response_length;
479 
480    /* Infer message length from the function */
481    switch (function) {
482    case BRW_MATH_FUNCTION_POW:
483    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
484    case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
485    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
486       msg_length = 2;
487       break;
488    default:
489       msg_length = 1;
490       break;
491    }
492 
493    /* Infer response length from the function */
494    switch (function) {
495    case BRW_MATH_FUNCTION_SINCOS:
496    case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
497       response_length = 2;
498       break;
499    default:
500       response_length = 1;
501       break;
502    }
503 
504    brw_set_desc(p, inst, brw_message_desc(
505                    devinfo, msg_length, response_length, false));
506 
507    brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
508    brw_inst_set_math_msg_function(devinfo, inst, function);
509    brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
510    brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
511    brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
512    brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
513    brw_inst_set_saturate(devinfo, inst, 0);
514 }
515 
516 
517 static void brw_set_ff_sync_message(struct brw_codegen *p,
518 				    brw_inst *insn,
519 				    bool allocate,
520 				    unsigned response_length,
521 				    bool end_of_thread)
522 {
523    const struct gen_device_info *devinfo = p->devinfo;
524 
525    brw_set_desc(p, insn, brw_message_desc(
526                    devinfo, 1, response_length, true));
527 
528    brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
529    brw_inst_set_eot(devinfo, insn, end_of_thread);
530    brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
531    brw_inst_set_urb_allocate(devinfo, insn, allocate);
532    /* The following fields are not used by FF_SYNC: */
533    brw_inst_set_urb_global_offset(devinfo, insn, 0);
534    brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
535    brw_inst_set_urb_used(devinfo, insn, 0);
536    brw_inst_set_urb_complete(devinfo, insn, 0);
537 }
538 
539 static void brw_set_urb_message( struct brw_codegen *p,
540 				 brw_inst *insn,
541                                  enum brw_urb_write_flags flags,
542 				 unsigned msg_length,
543 				 unsigned response_length,
544 				 unsigned offset,
545 				 unsigned swizzle_control )
546 {
547    const struct gen_device_info *devinfo = p->devinfo;
548 
549    assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
550    assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
551    assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
552 
553    brw_set_desc(p, insn, brw_message_desc(
554                    devinfo, msg_length, response_length, true));
555 
556    brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
557    brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));
558 
559    if (flags & BRW_URB_WRITE_OWORD) {
560       assert(msg_length == 2); /* header + one OWORD of data */
561       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
562    } else {
563       brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
564    }
565 
566    brw_inst_set_urb_global_offset(devinfo, insn, offset);
567    brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
568 
569    if (devinfo->gen < 8) {
570       brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
571    }
572 
573    if (devinfo->gen < 7) {
574       brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
575       brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
576    } else {
577       brw_inst_set_urb_per_slot_offset(devinfo, insn,
578          !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
579    }
580 }
581 
582 static void
583 gen7_set_dp_scratch_message(struct brw_codegen *p,
584                             brw_inst *inst,
585                             bool write,
586                             bool dword,
587                             bool invalidate_after_read,
588                             unsigned num_regs,
589                             unsigned addr_offset,
590                             unsigned mlen,
591                             unsigned rlen,
592                             bool header_present)
593 {
594    const struct gen_device_info *devinfo = p->devinfo;
595    assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
596           (devinfo->gen >= 8 && num_regs == 8));
597    const unsigned block_size = (devinfo->gen >= 8 ? util_logbase2(num_regs) :
598                                 num_regs - 1);
599 
600    brw_set_desc(p, inst, brw_message_desc(
601                    devinfo, mlen, rlen, header_present));
602 
603    brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
604    brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
605    brw_inst_set_scratch_read_write(devinfo, inst, write);
606    brw_inst_set_scratch_type(devinfo, inst, dword);
607    brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
608    brw_inst_set_scratch_block_size(devinfo, inst, block_size);
609    brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
610 }
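
/* Descriptive note: the scratch block size field is encoded differently per
 * generation, as handled above. Gen8+ stores log2(num_regs), so 1, 2, 4 or 8
 * registers encode as 0-3, while earlier parts store num_regs - 1.
 */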
611 
612 static void
613 brw_inst_set_state(const struct gen_device_info *devinfo,
614                    brw_inst *insn,
615                    const struct brw_insn_state *state)
616 {
617    brw_inst_set_exec_size(devinfo, insn, state->exec_size);
618    brw_inst_set_group(devinfo, insn, state->group);
619    brw_inst_set_compression(devinfo, insn, state->compressed);
620    brw_inst_set_access_mode(devinfo, insn, state->access_mode);
621    brw_inst_set_mask_control(devinfo, insn, state->mask_control);
622    if (devinfo->gen >= 12)
623       brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(state->swsb));
624    brw_inst_set_saturate(devinfo, insn, state->saturate);
625    brw_inst_set_pred_control(devinfo, insn, state->predicate);
626    brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);
627 
628    if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
629        state->access_mode == BRW_ALIGN_16) {
630       brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
631       if (devinfo->gen >= 7)
632          brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
633    } else {
634       brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
635       if (devinfo->gen >= 7)
636          brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
637    }
638 
639    if (devinfo->gen >= 6)
640       brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
641 }
642 
643 static brw_inst *
644 brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
645 {
646    assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
647    assert(util_is_power_of_two_or_zero(align));
648    const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
649    const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
650    const unsigned new_nr_insn = start_insn + nr_insn;
651 
652    if (p->store_size < new_nr_insn) {
653       p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
654       p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
655    }
656 
657    /* Zero out any padding introduced by the alignment.  We don't want to be
658     * hashing or caching a bunch of random bits we got from a memory allocation.
659     */
660    if (p->nr_insn < start_insn) {
661       memset(&p->store[p->nr_insn], 0,
662              (start_insn - p->nr_insn) * sizeof(brw_inst));
663    }
664 
665    assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
666    p->nr_insn = new_nr_insn;
667    p->next_insn_offset = new_nr_insn * sizeof(brw_inst);
668 
669    return &p->store[start_insn];
670 }
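
/* Descriptive note: brw_append_insns() grows p->store to the next power of
 * two when more room is needed, zero-fills any alignment padding, and
 * returns a pointer to the first of the newly reserved instruction slots;
 * brw_realign() below reuses it with nr_insn == 0 purely for its alignment
 * side effect.
 */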
671 
672 void
673 brw_realign(struct brw_codegen *p, unsigned align)
674 {
675    brw_append_insns(p, 0, align);
676 }
677 
678 int
679 brw_append_data(struct brw_codegen *p, void *data,
680                 unsigned size, unsigned align)
681 {
682    unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
683    void *dst = brw_append_insns(p, nr_insn, align);
684    memcpy(dst, data, size);
685 
686    /* If it's not a whole number of instructions, memset the end */
687    if (size < nr_insn * sizeof(brw_inst))
688       memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);
689 
690    return dst - (void *)p->store;
691 }
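
/* Descriptive note: the return value is the byte offset of the appended data
 * within p->store, so callers can refer back to it independently of any
 * later reallocation of the store.
 */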
692 
693 #define next_insn brw_next_insn
694 brw_inst *
695 brw_next_insn(struct brw_codegen *p, unsigned opcode)
696 {
697    const struct gen_device_info *devinfo = p->devinfo;
698    brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));
699 
700    memset(insn, 0, sizeof(*insn));
701    brw_inst_set_opcode(devinfo, insn, opcode);
702 
703    /* Apply the default instruction state */
704    brw_inst_set_state(devinfo, insn, p->current);
705 
706    return insn;
707 }
708 
709 static brw_inst *
710 brw_alu1(struct brw_codegen *p, unsigned opcode,
711          struct brw_reg dest, struct brw_reg src)
712 {
713    brw_inst *insn = next_insn(p, opcode);
714    brw_set_dest(p, insn, dest);
715    brw_set_src0(p, insn, src);
716    return insn;
717 }
718 
719 static brw_inst *
720 brw_alu2(struct brw_codegen *p, unsigned opcode,
721          struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
722 {
723    /* 64-bit immediates are only supported on 1-src instructions */
724    assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
725    assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
726 
727    brw_inst *insn = next_insn(p, opcode);
728    brw_set_dest(p, insn, dest);
729    brw_set_src0(p, insn, src0);
730    brw_set_src1(p, insn, src1);
731    return insn;
732 }
733 
734 static int
735 get_3src_subreg_nr(struct brw_reg reg)
736 {
737    /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
738     * use 32-bit units (components 0..7).  Since they only support F/D/UD
739     * types, this doesn't lose any flexibility, but uses fewer bits.
740     */
741    return reg.subnr / 4;
742 }
743 
744 static enum gen10_align1_3src_vertical_stride
745 to_3src_align1_vstride(const struct gen_device_info *devinfo,
746                        enum brw_vertical_stride vstride)
747 {
748    switch (vstride) {
749    case BRW_VERTICAL_STRIDE_0:
750       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
751    case BRW_VERTICAL_STRIDE_1:
752       assert(devinfo->gen >= 12);
753       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
754    case BRW_VERTICAL_STRIDE_2:
755       assert(devinfo->gen < 12);
756       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
757    case BRW_VERTICAL_STRIDE_4:
758       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
759    case BRW_VERTICAL_STRIDE_8:
760    case BRW_VERTICAL_STRIDE_16:
761       return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
762    default:
763       unreachable("invalid vstride");
764    }
765 }
766 
767 
768 static enum gen10_align1_3src_src_horizontal_stride
769 to_3src_align1_hstride(enum brw_horizontal_stride hstride)
770 {
771    switch (hstride) {
772    case BRW_HORIZONTAL_STRIDE_0:
773       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
774    case BRW_HORIZONTAL_STRIDE_1:
775       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
776    case BRW_HORIZONTAL_STRIDE_2:
777       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
778    case BRW_HORIZONTAL_STRIDE_4:
779       return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
780    default:
781       unreachable("invalid hstride");
782    }
783 }
784 
785 static brw_inst *
786 brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
787          struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
788 {
789    const struct gen_device_info *devinfo = p->devinfo;
790    brw_inst *inst = next_insn(p, opcode);
791 
792    gen7_convert_mrf_to_grf(p, &dest);
793 
794    assert(dest.nr < 128);
795 
796    if (devinfo->gen >= 10)
797       assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
798                src2.file == BRW_IMMEDIATE_VALUE));
799 
800    assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
801    assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
802    assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
803    assert(dest.address_mode == BRW_ADDRESS_DIRECT);
804    assert(src0.address_mode == BRW_ADDRESS_DIRECT);
805    assert(src1.address_mode == BRW_ADDRESS_DIRECT);
806    assert(src2.address_mode == BRW_ADDRESS_DIRECT);
807 
808    if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
809       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
810              dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
811 
812       if (devinfo->gen >= 12) {
813          brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
814          brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
815       } else {
816          if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
817             brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
818                                               BRW_ALIGN1_3SRC_ACCUMULATOR);
819             brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
820          } else {
821             brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
822                                               BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
823             brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
824          }
825       }
826       brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
827 
828       brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
829 
830       if (brw_reg_type_is_floating_point(dest.type)) {
831          brw_inst_set_3src_a1_exec_type(devinfo, inst,
832                                         BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
833       } else {
834          brw_inst_set_3src_a1_exec_type(devinfo, inst,
835                                         BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
836       }
837 
838       brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
839       brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
840       brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
841       brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
842 
843       if (src0.file == BRW_IMMEDIATE_VALUE) {
844          brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
845       } else {
846          brw_inst_set_3src_a1_src0_vstride(
847             devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
848          brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
849                                            to_3src_align1_hstride(src0.hstride));
850          brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
851          if (src0.type == BRW_REGISTER_TYPE_NF) {
852             brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
853          } else {
854             brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
855          }
856          brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
857          brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
858       }
859       brw_inst_set_3src_a1_src1_vstride(
860          devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
861       brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
862                                         to_3src_align1_hstride(src1.hstride));
863 
864       brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
865       if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
866          brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
867       } else {
868          brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
869       }
870       brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
871       brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
872 
873       if (src2.file == BRW_IMMEDIATE_VALUE) {
874          brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
875       } else {
876          brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
877                                            to_3src_align1_hstride(src2.hstride));
878          /* no vstride on src2 */
879          brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
880          brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
881          brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
882          brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
883       }
884 
885       assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
886              src0.file == BRW_IMMEDIATE_VALUE ||
887              (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
888               src0.type == BRW_REGISTER_TYPE_NF));
889       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
890              src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
891       assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
892              src2.file == BRW_IMMEDIATE_VALUE);
893 
894       if (devinfo->gen >= 12) {
895          if (src0.file == BRW_IMMEDIATE_VALUE) {
896             brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
897          } else {
898             brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
899          }
900 
901          brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);
902 
903          if (src2.file == BRW_IMMEDIATE_VALUE) {
904             brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
905          } else {
906             brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
907          }
908       } else {
909          brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
910                                             src0.file == BRW_GENERAL_REGISTER_FILE ?
911                                             BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
912                                             BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
913          brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
914                                             src1.file == BRW_GENERAL_REGISTER_FILE ?
915                                             BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
916                                             BRW_ALIGN1_3SRC_ACCUMULATOR);
917          brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
918                                             src2.file == BRW_GENERAL_REGISTER_FILE ?
919                                             BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
920                                             BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
921       }
922 
923    } else {
924       assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
925              dest.file == BRW_MESSAGE_REGISTER_FILE);
926       assert(dest.type == BRW_REGISTER_TYPE_F  ||
927              dest.type == BRW_REGISTER_TYPE_DF ||
928              dest.type == BRW_REGISTER_TYPE_D  ||
929              dest.type == BRW_REGISTER_TYPE_UD ||
930              (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
931       if (devinfo->gen == 6) {
932          brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
933                                             dest.file == BRW_MESSAGE_REGISTER_FILE);
934       }
935       brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
936       brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
937       brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
938 
939       assert(src0.file == BRW_GENERAL_REGISTER_FILE);
940       brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
941       brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
942       brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
943       brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
944       brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
945       brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
946                                           src0.vstride == BRW_VERTICAL_STRIDE_0);
947 
948       assert(src1.file == BRW_GENERAL_REGISTER_FILE);
949       brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
950       brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
951       brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
952       brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
953       brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
954       brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
955                                           src1.vstride == BRW_VERTICAL_STRIDE_0);
956 
957       assert(src2.file == BRW_GENERAL_REGISTER_FILE);
958       brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
959       brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
960       brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
961       brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
962       brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
963       brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
964                                           src2.vstride == BRW_VERTICAL_STRIDE_0);
965 
966       if (devinfo->gen >= 7) {
967          /* Set both the source and destination types based on dest.type,
968           * ignoring the source register types.  The MAD and LRP emitters ensure
969           * that all four types are float.  The BFE and BFI2 emitters, however,
970           * may send us mixed D and UD types and want us to ignore that and use
971           * the destination type.
972           */
973          brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
974          brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
975 
976          /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
977           *
978           *    "Three source instructions can use operands with mixed-mode
979           *     precision. When SrcType field is set to :f or :hf it defines
980           *     precision for source 0 only, and fields Src1Type and Src2Type
981           *     define precision for other source operands:
982           *
983           *     0b = :f. Single precision Float (32-bit).
984           *     1b = :hf. Half precision Float (16-bit)."
985           */
986          if (src1.type == BRW_REGISTER_TYPE_HF)
987             brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
988 
989          if (src2.type == BRW_REGISTER_TYPE_HF)
990             brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
991       }
992    }
993 
994    return inst;
995 }
996 
997 
998 /***********************************************************************
999  * Convenience routines.
1000  */
1001 #define ALU1(OP)					\
1002 brw_inst *brw_##OP(struct brw_codegen *p,		\
1003 	      struct brw_reg dest,			\
1004 	      struct brw_reg src0)   			\
1005 {							\
1006    return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
1007 }
1008 
1009 #define ALU2(OP)					\
1010 brw_inst *brw_##OP(struct brw_codegen *p,		\
1011 	      struct brw_reg dest,			\
1012 	      struct brw_reg src0,			\
1013 	      struct brw_reg src1)   			\
1014 {							\
1015    return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
1016 }
1017 
1018 #define ALU3(OP)					\
1019 brw_inst *brw_##OP(struct brw_codegen *p,		\
1020 	      struct brw_reg dest,			\
1021 	      struct brw_reg src0,			\
1022 	      struct brw_reg src1,			\
1023 	      struct brw_reg src2)   			\
1024 {                                                       \
1025    if (p->current->access_mode == BRW_ALIGN_16) {       \
1026       if (src0.vstride == BRW_VERTICAL_STRIDE_0)        \
1027          src0.swizzle = BRW_SWIZZLE_XXXX;               \
1028       if (src1.vstride == BRW_VERTICAL_STRIDE_0)        \
1029          src1.swizzle = BRW_SWIZZLE_XXXX;               \
1030       if (src2.vstride == BRW_VERTICAL_STRIDE_0)        \
1031          src2.swizzle = BRW_SWIZZLE_XXXX;               \
1032    }                                                    \
1033    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
1034 }
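
/* Descriptive note on the ALU3 wrapper above: in Align16 mode a scalar
 * source (vertical stride 0) is forced to a .xxxx swizzle so that every
 * channel replicates component X of the scalar, which is what a uniform
 * operand is expected to look like in that access mode.
 */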
1035 
1036 #define ALU3F(OP)                                               \
1037 brw_inst *brw_##OP(struct brw_codegen *p,         \
1038                                  struct brw_reg dest,           \
1039                                  struct brw_reg src0,           \
1040                                  struct brw_reg src1,           \
1041                                  struct brw_reg src2)           \
1042 {                                                               \
1043    assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
1044           dest.type == BRW_REGISTER_TYPE_DF);                   \
1045    if (dest.type == BRW_REGISTER_TYPE_F) {                      \
1046       assert(src0.type == BRW_REGISTER_TYPE_F);                 \
1047       assert(src1.type == BRW_REGISTER_TYPE_F);                 \
1048       assert(src2.type == BRW_REGISTER_TYPE_F);                 \
1049    } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
1050       assert(src0.type == BRW_REGISTER_TYPE_DF);                \
1051       assert(src1.type == BRW_REGISTER_TYPE_DF);                \
1052       assert(src2.type == BRW_REGISTER_TYPE_DF);                \
1053    }                                                            \
1054                                                                 \
1055    if (p->current->access_mode == BRW_ALIGN_16) {               \
1056       if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
1057          src0.swizzle = BRW_SWIZZLE_XXXX;                       \
1058       if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
1059          src1.swizzle = BRW_SWIZZLE_XXXX;                       \
1060       if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
1061          src2.swizzle = BRW_SWIZZLE_XXXX;                       \
1062    }                                                            \
1063    return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
1064 }
1065 
1066 ALU2(SEL)
1067 ALU1(NOT)
1068 ALU2(AND)
1069 ALU2(OR)
1070 ALU2(XOR)
1071 ALU2(SHR)
1072 ALU2(SHL)
1073 ALU1(DIM)
1074 ALU2(ASR)
1075 ALU2(ROL)
1076 ALU2(ROR)
1077 ALU3(CSEL)
1078 ALU1(FRC)
1079 ALU1(RNDD)
1080 ALU1(RNDE)
1081 ALU1(RNDU)
1082 ALU1(RNDZ)
1083 ALU2(MAC)
1084 ALU2(MACH)
1085 ALU1(LZD)
1086 ALU2(DP4)
1087 ALU2(DPH)
1088 ALU2(DP3)
1089 ALU2(DP2)
1090 ALU3(MAD)
1091 ALU3F(LRP)
1092 ALU1(BFREV)
1093 ALU3(BFE)
1094 ALU2(BFI1)
1095 ALU3(BFI2)
1096 ALU1(FBH)
1097 ALU1(FBL)
1098 ALU1(CBIT)
1099 ALU2(ADDC)
1100 ALU2(SUBB)
1101 
1102 brw_inst *
1103 brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1104 {
1105    const struct gen_device_info *devinfo = p->devinfo;
1106 
1107    /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1108     * To avoid the problems that causes, we use an <X,2,0> source region to
1109     * read each element twice.
1110     */
1111    if (devinfo->gen == 7 && !devinfo->is_haswell &&
1112        brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1113        dest.type == BRW_REGISTER_TYPE_DF &&
1114        (src0.type == BRW_REGISTER_TYPE_F ||
1115         src0.type == BRW_REGISTER_TYPE_D ||
1116         src0.type == BRW_REGISTER_TYPE_UD) &&
1117        !has_scalar_region(src0)) {
1118       assert(src0.vstride == src0.width + src0.hstride);
1119       src0.vstride = src0.hstride;
1120       src0.width = BRW_WIDTH_2;
1121       src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1122    }
1123 
1124    return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1125 }
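
/* Worked example of the F->DF region fix-up above (IVB/BYT only, a sketch):
 * a source such as g2.0<8;8,1>:F is rewritten to g2.0<1;2,0>:F, so each
 * 32-bit element is fetched twice and the conversion still sees every
 * channel even though the hardware ignores odd source channels.
 */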
1126 
1127 brw_inst *
1128 brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1129         struct brw_reg src0, struct brw_reg src1)
1130 {
1131    /* 6.2.2: add */
1132    if (src0.type == BRW_REGISTER_TYPE_F ||
1133        (src0.file == BRW_IMMEDIATE_VALUE &&
1134 	src0.type == BRW_REGISTER_TYPE_VF)) {
1135       assert(src1.type != BRW_REGISTER_TYPE_UD);
1136       assert(src1.type != BRW_REGISTER_TYPE_D);
1137    }
1138 
1139    if (src1.type == BRW_REGISTER_TYPE_F ||
1140        (src1.file == BRW_IMMEDIATE_VALUE &&
1141 	src1.type == BRW_REGISTER_TYPE_VF)) {
1142       assert(src0.type != BRW_REGISTER_TYPE_UD);
1143       assert(src0.type != BRW_REGISTER_TYPE_D);
1144    }
1145 
1146    return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1147 }
1148 
1149 brw_inst *
1150 brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1151         struct brw_reg src0, struct brw_reg src1)
1152 {
1153    assert(dest.type == src0.type);
1154    assert(src0.type == src1.type);
1155    switch (src0.type) {
1156    case BRW_REGISTER_TYPE_B:
1157    case BRW_REGISTER_TYPE_UB:
1158    case BRW_REGISTER_TYPE_W:
1159    case BRW_REGISTER_TYPE_UW:
1160    case BRW_REGISTER_TYPE_D:
1161    case BRW_REGISTER_TYPE_UD:
1162       break;
1163    default:
1164       unreachable("Bad type for brw_AVG");
1165    }
1166 
1167    return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1168 }
1169 
1170 brw_inst *
1171 brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1172         struct brw_reg src0, struct brw_reg src1)
1173 {
1174    /* 6.32.38: mul */
1175    if (src0.type == BRW_REGISTER_TYPE_D ||
1176        src0.type == BRW_REGISTER_TYPE_UD ||
1177        src1.type == BRW_REGISTER_TYPE_D ||
1178        src1.type == BRW_REGISTER_TYPE_UD) {
1179       assert(dest.type != BRW_REGISTER_TYPE_F);
1180    }
1181 
1182    if (src0.type == BRW_REGISTER_TYPE_F ||
1183        (src0.file == BRW_IMMEDIATE_VALUE &&
1184 	src0.type == BRW_REGISTER_TYPE_VF)) {
1185       assert(src1.type != BRW_REGISTER_TYPE_UD);
1186       assert(src1.type != BRW_REGISTER_TYPE_D);
1187    }
1188 
1189    if (src1.type == BRW_REGISTER_TYPE_F ||
1190        (src1.file == BRW_IMMEDIATE_VALUE &&
1191 	src1.type == BRW_REGISTER_TYPE_VF)) {
1192       assert(src0.type != BRW_REGISTER_TYPE_UD);
1193       assert(src0.type != BRW_REGISTER_TYPE_D);
1194    }
1195 
1196    assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1197 	  src0.nr != BRW_ARF_ACCUMULATOR);
1198    assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1199 	  src1.nr != BRW_ARF_ACCUMULATOR);
1200 
1201    return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1202 }
1203 
1204 brw_inst *
1205 brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1206          struct brw_reg src0, struct brw_reg src1)
1207 {
1208    src0.vstride = BRW_VERTICAL_STRIDE_0;
1209    src0.width = BRW_WIDTH_1;
1210    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1211    return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1212 }
1213 
1214 brw_inst *
1215 brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1216         struct brw_reg src0, struct brw_reg src1)
1217 {
1218    src0.vstride = BRW_VERTICAL_STRIDE_0;
1219    src0.width = BRW_WIDTH_1;
1220    src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1221    src1.vstride = BRW_VERTICAL_STRIDE_8;
1222    src1.width = BRW_WIDTH_8;
1223    src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1224    return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1225 }
1226 
1227 brw_inst *
1228 brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1229 {
1230    const struct gen_device_info *devinfo = p->devinfo;
1231    const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1232    /* The F32TO16 instruction doesn't support 32-bit destination types in
1233     * Align1 mode, and neither does the Gen8 implementation in terms of a
1234     * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
1235     * an undocumented feature.
1236     */
1237    const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
1238                                  (!align16 || devinfo->gen >= 8));
1239    brw_inst *inst;
1240 
1241    if (align16) {
1242       assert(dst.type == BRW_REGISTER_TYPE_UD);
1243    } else {
1244       assert(dst.type == BRW_REGISTER_TYPE_UD ||
1245              dst.type == BRW_REGISTER_TYPE_W ||
1246              dst.type == BRW_REGISTER_TYPE_UW ||
1247              dst.type == BRW_REGISTER_TYPE_HF);
1248    }
1249 
1250    brw_push_insn_state(p);
1251 
1252    if (needs_zero_fill) {
1253       brw_set_default_access_mode(p, BRW_ALIGN_1);
1254       dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1255    }
1256 
1257    if (devinfo->gen >= 8) {
1258       inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
1259    } else {
1260       assert(devinfo->gen == 7);
1261       inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
1262    }
1263 
1264    if (needs_zero_fill) {
1265       if (devinfo->gen < 12)
1266          brw_inst_set_no_dd_clear(devinfo, inst, true);
1267       brw_set_default_swsb(p, tgl_swsb_null());
1268       inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
1269       if (devinfo->gen < 12)
1270          brw_inst_set_no_dd_check(devinfo, inst, true);
1271    }
1272 
1273    brw_pop_insn_state(p);
1274    return inst;
1275 }
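
/* Illustrative usage (an assumption for clarity, not from the original
 * file): packing a float GRF into the low halves of a UD destination in
 * Align1 mode, where the zero-fill path above clears the upper 16 bits of
 * each channel:
 *
 *    brw_F32TO16(p, retype(brw_vec8_grf(10, 0), BRW_REGISTER_TYPE_UD),
 *                   brw_vec8_grf(2, 0));
 */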
1276 
1277 brw_inst *
1278 brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1279 {
1280    const struct gen_device_info *devinfo = p->devinfo;
1281    bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1282 
1283    if (align16) {
1284       assert(src.type == BRW_REGISTER_TYPE_UD);
1285    } else {
1286       /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1287        *
1288        *   Because this instruction does not have a 16-bit floating-point
1289        *   type, the source data type must be Word (W). The destination type
1290        *   must be F (Float).
1291        */
1292       if (src.type == BRW_REGISTER_TYPE_UD)
1293          src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1294 
1295       assert(src.type == BRW_REGISTER_TYPE_W ||
1296              src.type == BRW_REGISTER_TYPE_UW ||
1297              src.type == BRW_REGISTER_TYPE_HF);
1298    }
1299 
1300    if (devinfo->gen >= 8) {
1301       return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1302    } else {
1303       assert(devinfo->gen == 7);
1304       return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1305    }
1306 }
1307 
1308 
1309 void brw_NOP(struct brw_codegen *p)
1310 {
1311    brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1312    memset(insn, 0, sizeof(*insn));
1313    brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1314 }
1315 
1316 void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1317 {
1318    brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1319    brw_inst_set_cond_modifier(p->devinfo, insn, func);
1320 }
1321 
1322 /***********************************************************************
1323  * Comparisons, if/else/endif
1324  */
1325 
1326 brw_inst *
1327 brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1328          unsigned predicate_control)
1329 {
1330    const struct gen_device_info *devinfo = p->devinfo;
1331    struct brw_reg ip = brw_ip_reg();
1332    brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1333 
1334    brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1335    brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1336    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1337    brw_inst_set_pred_control(devinfo, inst, predicate_control);
1338 
1339    return inst;
1340 }
1341 
1342 static void
1343 push_if_stack(struct brw_codegen *p, brw_inst *inst)
1344 {
1345    p->if_stack[p->if_stack_depth] = inst - p->store;
1346 
1347    p->if_stack_depth++;
1348    if (p->if_stack_array_size <= p->if_stack_depth) {
1349       p->if_stack_array_size *= 2;
1350       p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1351 			     p->if_stack_array_size);
1352    }
1353 }
1354 
1355 static brw_inst *
1356 pop_if_stack(struct brw_codegen *p)
1357 {
1358    p->if_stack_depth--;
1359    return &p->store[p->if_stack[p->if_stack_depth]];
1360 }
1361 
1362 static void
1363 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1364 {
1365    if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1366       p->loop_stack_array_size *= 2;
1367       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1368 			       p->loop_stack_array_size);
1369       p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1370 				     p->loop_stack_array_size);
1371    }
1372 
1373    p->loop_stack[p->loop_stack_depth] = inst - p->store;
1374    p->loop_stack_depth++;
1375    p->if_depth_in_loop[p->loop_stack_depth] = 0;
1376 }
1377 
1378 static brw_inst *
1379 get_inner_do_insn(struct brw_codegen *p)
1380 {
1381    return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1382 }
1383 
1384 /* EU takes the value from the flag register and pushes it onto some
1385  * sort of a stack (presumably merging with any flag value already on
1386  * the stack).  Within an if block, the flags at the top of the stack
1387  * control execution on each channel of the unit, e.g. on each of the
1388  * 16 pixel values in our wm programs.
1389  *
1390  * When the matching 'else' instruction is reached (presumably by
1391  * countdown of the instruction count patched in by our ELSE/ENDIF
1392  * functions), the relevant flags are inverted.
1393  *
1394  * When the matching 'endif' instruction is reached, the flags are
1395  * popped off.  If the stack is now empty, normal execution resumes.
1396  */
1397 brw_inst *
1398 brw_IF(struct brw_codegen *p, unsigned execute_size)
1399 {
1400    const struct gen_device_info *devinfo = p->devinfo;
1401    brw_inst *insn;
1402 
1403    insn = next_insn(p, BRW_OPCODE_IF);
1404 
1405    /* Override the defaults for this instruction:
1406     */
1407    if (devinfo->gen < 6) {
1408       brw_set_dest(p, insn, brw_ip_reg());
1409       brw_set_src0(p, insn, brw_ip_reg());
1410       brw_set_src1(p, insn, brw_imm_d(0x0));
1411    } else if (devinfo->gen == 6) {
1412       brw_set_dest(p, insn, brw_imm_w(0));
1413       brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1414       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1415       brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1416    } else if (devinfo->gen == 7) {
1417       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1418       brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1419       brw_set_src1(p, insn, brw_imm_w(0));
1420       brw_inst_set_jip(devinfo, insn, 0);
1421       brw_inst_set_uip(devinfo, insn, 0);
1422    } else {
1423       brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
1424       if (devinfo->gen < 12)
1425          brw_set_src0(p, insn, brw_imm_d(0));
1426       brw_inst_set_jip(devinfo, insn, 0);
1427       brw_inst_set_uip(devinfo, insn, 0);
1428    }
1429 
1430    brw_inst_set_exec_size(devinfo, insn, execute_size);
1431    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1432    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
1433    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1434    if (!p->single_program_flow && devinfo->gen < 6)
1435       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1436 
1437    push_if_stack(p, insn);
1438    p->if_depth_in_loop[p->loop_stack_depth]++;
1439    return insn;
1440 }
1441 
1442 /* This function is only used for gen6-style IF instructions with an
1443  * embedded comparison (conditional modifier).  It is not used on gen7.
1444  */
1445 brw_inst *
1446 gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1447 	struct brw_reg src0, struct brw_reg src1)
1448 {
1449    const struct gen_device_info *devinfo = p->devinfo;
1450    brw_inst *insn;
1451 
1452    insn = next_insn(p, BRW_OPCODE_IF);
1453 
1454    brw_set_dest(p, insn, brw_imm_w(0));
1455    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1456    brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1457    brw_set_src0(p, insn, src0);
1458    brw_set_src1(p, insn, src1);
1459 
1460    assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1461    assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1462    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1463 
1464    push_if_stack(p, insn);
1465    return insn;
1466 }
1467 
1468 /**
1469  * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1470  */
1471 static void
1472 convert_IF_ELSE_to_ADD(struct brw_codegen *p,
1473                        brw_inst *if_inst, brw_inst *else_inst)
1474 {
1475    const struct gen_device_info *devinfo = p->devinfo;
1476 
1477    /* The next instruction (where the ENDIF would be, if it existed) */
1478    brw_inst *next_inst = &p->store[p->nr_insn];
1479 
1480    assert(p->single_program_flow);
1481    assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1482    assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1483    assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
1484 
1485    /* Convert IF to an ADD instruction that moves the instruction pointer
1486     * to the first instruction of the ELSE block.  If there is no ELSE
1487     * block, point to where ENDIF would be.  Reverse the predicate.
1488     *
1489     * There's no need to execute an ENDIF since we don't need to do any
1490     * stack operations, and if we're currently executing, we just want to
1491     * continue normally.
1492     */
1493    brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
1494    brw_inst_set_pred_inv(devinfo, if_inst, true);
1495 
1496    if (else_inst != NULL) {
1497       /* Convert ELSE to an ADD instruction that points where the ENDIF
1498        * would be.
1499        */
1500       brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
1501 
1502       brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
1503       brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
1504    } else {
1505       brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
1506    }
1507 }
1508 
1509 /**
1510  * Patch IF and ELSE instructions with appropriate jump targets.
1511  */
1512 static void
1513 patch_IF_ELSE(struct brw_codegen *p,
1514               brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1515 {
1516    const struct gen_device_info *devinfo = p->devinfo;
1517 
1518    /* We shouldn't be patching IF and ELSE instructions in single program flow
1519     * mode when gen < 6, because in single program flow mode on those
1520     * platforms, we convert flow control instructions to conditional ADDs that
1521     * operate on IP (see brw_ENDIF).
1522     *
1523     * However, on Gen6, writing to IP doesn't work in single program flow mode
1524     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1525     * not be updated by non-flow control instructions.").  And on later
1526     * platforms, there is no significant benefit to converting control flow
1527     * instructions to conditional ADDs.  So we do patch IF and ELSE
1528     * instructions in single program flow mode on those platforms.
1529     */
1530    if (devinfo->gen < 6)
1531       assert(!p->single_program_flow);
1532 
1533    assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1534    assert(endif_inst != NULL);
1535    assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1536 
1537    unsigned br = brw_jump_scale(devinfo);
1538 
1539    assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1540    brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1541 
1542    if (else_inst == NULL) {
1543       /* Patch IF -> ENDIF */
1544       if (devinfo->gen < 6) {
1545 	 /* Turn it into an IFF, which means no mask stack operations for
1546 	  * all-false and jumping past the ENDIF.
1547 	  */
1548          brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1549          brw_inst_set_gen4_jump_count(devinfo, if_inst,
1550                                       br * (endif_inst - if_inst + 1));
1551          brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1552       } else if (devinfo->gen == 6) {
1553 	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
1554          brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1555       } else {
1556          brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1557          brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1558       }
1559    } else {
1560       brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1561 
1562       /* Patch IF -> ELSE */
1563       if (devinfo->gen < 6) {
1564          brw_inst_set_gen4_jump_count(devinfo, if_inst,
1565                                       br * (else_inst - if_inst));
1566          brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
1567       } else if (devinfo->gen == 6) {
1568          brw_inst_set_gen6_jump_count(devinfo, if_inst,
1569                                       br * (else_inst - if_inst + 1));
1570       }
1571 
1572       /* Patch ELSE -> ENDIF */
1573       if (devinfo->gen < 6) {
1574 	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
1575 	  * matching ENDIF.
1576 	  */
1577          brw_inst_set_gen4_jump_count(devinfo, else_inst,
1578                                       br * (endif_inst - else_inst + 1));
1579          brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
1580       } else if (devinfo->gen == 6) {
1581 	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
1582          brw_inst_set_gen6_jump_count(devinfo, else_inst,
1583                                       br * (endif_inst - else_inst));
1584       } else {
1585 	 /* The IF instruction's JIP should point just past the ELSE */
1586          brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1587 	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1588          brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1589          brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1590          if (devinfo->gen >= 8) {
1591             /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1592              * should point to ENDIF.
1593              */
1594             brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1595          }
1596       }
1597    }
1598 }
1599 
1600 void
1601 brw_ELSE(struct brw_codegen *p)
1602 {
1603    const struct gen_device_info *devinfo = p->devinfo;
1604    brw_inst *insn;
1605 
1606    insn = next_insn(p, BRW_OPCODE_ELSE);
1607 
1608    if (devinfo->gen < 6) {
1609       brw_set_dest(p, insn, brw_ip_reg());
1610       brw_set_src0(p, insn, brw_ip_reg());
1611       brw_set_src1(p, insn, brw_imm_d(0x0));
1612    } else if (devinfo->gen == 6) {
1613       brw_set_dest(p, insn, brw_imm_w(0));
1614       brw_inst_set_gen6_jump_count(devinfo, insn, 0);
1615       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1616       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1617    } else if (devinfo->gen == 7) {
1618       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1619       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1620       brw_set_src1(p, insn, brw_imm_w(0));
1621       brw_inst_set_jip(devinfo, insn, 0);
1622       brw_inst_set_uip(devinfo, insn, 0);
1623    } else {
1624       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1625       if (devinfo->gen < 12)
1626          brw_set_src0(p, insn, brw_imm_d(0));
1627       brw_inst_set_jip(devinfo, insn, 0);
1628       brw_inst_set_uip(devinfo, insn, 0);
1629    }
1630 
1631    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1632    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1633    if (!p->single_program_flow && devinfo->gen < 6)
1634       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1635 
1636    push_if_stack(p, insn);
1637 }
1638 
1639 void
1640 brw_ENDIF(struct brw_codegen *p)
1641 {
1642    const struct gen_device_info *devinfo = p->devinfo;
1643    brw_inst *insn = NULL;
1644    brw_inst *else_inst = NULL;
1645    brw_inst *if_inst = NULL;
1646    brw_inst *tmp;
1647    bool emit_endif = true;
1648 
1649    /* In single program flow mode, we can express IF and ELSE instructions
1650     * equivalently as ADD instructions that operate on IP.  On platforms prior
1651     * to Gen6, flow control instructions cause an implied thread switch, so
1652     * this is a significant savings.
1653     *
1654     * However, on Gen6, writing to IP doesn't work in single program flow mode
1655     * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1656     * not be updated by non-flow control instructions.").  And on later
1657     * platforms, there is no significant benefit to converting control flow
1658     * instructions to conditional ADDs.  So we only do this trick on Gen4 and
1659     * Gen5.
1660     */
1661    if (devinfo->gen < 6 && p->single_program_flow)
1662       emit_endif = false;
1663 
1664    /*
1665     * A single next_insn() may change the base address of instruction store
1666     * memory (p->store), so call it first, before converting a stored index
1667     * back into an instruction pointer.
1668     */
1669    if (emit_endif)
1670       insn = next_insn(p, BRW_OPCODE_ENDIF);
1671 
1672    /* Pop the IF and (optional) ELSE instructions from the stack */
1673    p->if_depth_in_loop[p->loop_stack_depth]--;
1674    tmp = pop_if_stack(p);
1675    if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
1676       else_inst = tmp;
1677       tmp = pop_if_stack(p);
1678    }
1679    if_inst = tmp;
1680 
1681    if (!emit_endif) {
1682       /* ENDIF is useless; don't bother emitting it. */
1683       convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
1684       return;
1685    }
1686 
1687    if (devinfo->gen < 6) {
1688       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1689       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1690       brw_set_src1(p, insn, brw_imm_d(0x0));
1691    } else if (devinfo->gen == 6) {
1692       brw_set_dest(p, insn, brw_imm_w(0));
1693       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1694       brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1695    } else if (devinfo->gen == 7) {
1696       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1697       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1698       brw_set_src1(p, insn, brw_imm_w(0));
1699    } else {
1700       brw_set_src0(p, insn, brw_imm_d(0));
1701    }
1702 
1703    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1704    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1705    if (devinfo->gen < 6)
1706       brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1707 
1708    /* Also pop item off the stack in the endif instruction: */
1709    if (devinfo->gen < 6) {
1710       brw_inst_set_gen4_jump_count(devinfo, insn, 0);
1711       brw_inst_set_gen4_pop_count(devinfo, insn, 1);
1712    } else if (devinfo->gen == 6) {
1713       brw_inst_set_gen6_jump_count(devinfo, insn, 2);
1714    } else {
1715       brw_inst_set_jip(devinfo, insn, 2);
1716    }
1717    patch_IF_ELSE(p, if_inst, else_inst, insn);
1718 }
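
/* Illustrative emission sequence (a sketch, not part of the original file):
 * a divergent if/else is typically built by setting the flag with a CMP and
 * bracketing the two halves with IF/ELSE/ENDIF:
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
 *            brw_vec8_grf(2, 0), brw_imm_f(0.0f));
 *    brw_IF(p, BRW_EXECUTE_8);
 *       ... emit the "then" instructions ...
 *    brw_ELSE(p);
 *       ... emit the "else" instructions ...
 *    brw_ENDIF(p);
 *
 * brw_ENDIF() pops the IF (and optional ELSE) off the if-stack and patches
 * their jump targets via patch_IF_ELSE().
 */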
1719 
1720 brw_inst *
1721 brw_BREAK(struct brw_codegen *p)
1722 {
1723    const struct gen_device_info *devinfo = p->devinfo;
1724    brw_inst *insn;
1725 
1726    insn = next_insn(p, BRW_OPCODE_BREAK);
1727    if (devinfo->gen >= 8) {
1728       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1729       brw_set_src0(p, insn, brw_imm_d(0x0));
1730    } else if (devinfo->gen >= 6) {
1731       brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1732       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1733       brw_set_src1(p, insn, brw_imm_d(0x0));
1734    } else {
1735       brw_set_dest(p, insn, brw_ip_reg());
1736       brw_set_src0(p, insn, brw_ip_reg());
1737       brw_set_src1(p, insn, brw_imm_d(0x0));
1738       brw_inst_set_gen4_pop_count(devinfo, insn,
1739                                   p->if_depth_in_loop[p->loop_stack_depth]);
1740    }
1741    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1742    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1743 
1744    return insn;
1745 }
1746 
1747 brw_inst *
1748 brw_CONT(struct brw_codegen *p)
1749 {
1750    const struct gen_device_info *devinfo = p->devinfo;
1751    brw_inst *insn;
1752 
1753    insn = next_insn(p, BRW_OPCODE_CONTINUE);
1754    brw_set_dest(p, insn, brw_ip_reg());
1755    if (devinfo->gen >= 8) {
1756       brw_set_src0(p, insn, brw_imm_d(0x0));
1757    } else {
1758       brw_set_src0(p, insn, brw_ip_reg());
1759       brw_set_src1(p, insn, brw_imm_d(0x0));
1760    }
1761 
1762    if (devinfo->gen < 6) {
1763       brw_inst_set_gen4_pop_count(devinfo, insn,
1764                                   p->if_depth_in_loop[p->loop_stack_depth]);
1765    }
1766    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1767    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1768    return insn;
1769 }
1770 
1771 brw_inst *
1772 brw_HALT(struct brw_codegen *p)
1773 {
1774    const struct gen_device_info *devinfo = p->devinfo;
1775    brw_inst *insn;
1776 
1777    insn = next_insn(p, BRW_OPCODE_HALT);
1778    brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1779    if (devinfo->gen < 6) {
1780       /* From the Gen4 PRM:
1781        *
1782        *    "IP register must be put (for example, by the assembler) at <dst>
1783        *    and <src0> locations."
1784        */
1785       brw_set_dest(p, insn, brw_ip_reg());
1786       brw_set_src0(p, insn, brw_ip_reg());
1787       brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
1788    } else if (devinfo->gen < 8) {
1789       brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1790       brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1791    } else if (devinfo->gen < 12) {
1792       brw_set_src0(p, insn, brw_imm_d(0x0));
1793    }
1794 
1795    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1796    brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1797    return insn;
1798 }
1799 
1800 /* DO/WHILE loop:
1801  *
1802  * The DO/WHILE is just an unterminated loop -- break or continue are
1803  * used for control within the loop.  We have a few ways they can be
1804  * done.
1805  *
1806  * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1807  * jip and no DO instruction.
1808  *
1809  * For non-uniform control flow pre-gen6, there's a DO instruction to
1810  * push the mask, and a WHILE to jump back, and BREAK to get out and
1811  * pop the mask.
1812  *
1813  * For gen6, there's no more mask stack, so no need for DO.  WHILE
1814  * just points back to the first instruction of the loop.
1815  */
1816 brw_inst *
1817 brw_DO(struct brw_codegen *p, unsigned execute_size)
1818 {
1819    const struct gen_device_info *devinfo = p->devinfo;
1820 
1821    if (devinfo->gen >= 6 || p->single_program_flow) {
1822       push_loop_stack(p, &p->store[p->nr_insn]);
1823       return &p->store[p->nr_insn];
1824    } else {
1825       brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1826 
1827       push_loop_stack(p, insn);
1828 
1829       /* Override the defaults for this instruction:
1830        */
1831       brw_set_dest(p, insn, brw_null_reg());
1832       brw_set_src0(p, insn, brw_null_reg());
1833       brw_set_src1(p, insn, brw_null_reg());
1834 
1835       brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1836       brw_inst_set_exec_size(devinfo, insn, execute_size);
1837       brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1838 
1839       return insn;
1840    }
1841 }
1842 
1843 /**
1844  * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1845  * instruction here.
1846  *
1847  * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1848  * nesting, since it can always just point to the end of the block/current loop.
1849  */
1850 static void
1851 brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1852 {
1853    const struct gen_device_info *devinfo = p->devinfo;
1854    brw_inst *do_inst = get_inner_do_insn(p);
1855    brw_inst *inst;
1856    unsigned br = brw_jump_scale(devinfo);
1857 
1858    assert(devinfo->gen < 6);
1859 
1860    for (inst = while_inst - 1; inst != do_inst; inst--) {
1861       /* If the jump count is != 0, that means that this instruction has already
1862        * been patched because it's part of a loop inside of the one we're
1863        * patching.
1864        */
1865       if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1866           brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1867          brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1868       } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1869                  brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1870          brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1871       }
1872    }
1873 }
1874 
1875 brw_inst *
1876 brw_WHILE(struct brw_codegen *p)
1877 {
1878    const struct gen_device_info *devinfo = p->devinfo;
1879    brw_inst *insn, *do_insn;
1880    unsigned br = brw_jump_scale(devinfo);
1881 
1882    if (devinfo->gen >= 6) {
1883       insn = next_insn(p, BRW_OPCODE_WHILE);
1884       do_insn = get_inner_do_insn(p);
1885 
1886       if (devinfo->gen >= 8) {
1887          brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1888          if (devinfo->gen < 12)
1889             brw_set_src0(p, insn, brw_imm_d(0));
1890          brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1891       } else if (devinfo->gen == 7) {
1892          brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1893          brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1894          brw_set_src1(p, insn, brw_imm_w(0));
1895          brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
1896       } else {
1897          brw_set_dest(p, insn, brw_imm_w(0));
1898          brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
1899          brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1900          brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1901       }
1902 
1903       brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1904 
1905    } else {
1906       if (p->single_program_flow) {
1907 	 insn = next_insn(p, BRW_OPCODE_ADD);
1908          do_insn = get_inner_do_insn(p);
1909 
1910 	 brw_set_dest(p, insn, brw_ip_reg());
1911 	 brw_set_src0(p, insn, brw_ip_reg());
1912 	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
1913          brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
1914       } else {
1915 	 insn = next_insn(p, BRW_OPCODE_WHILE);
1916          do_insn = get_inner_do_insn(p);
1917 
1918          assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
1919 
1920 	 brw_set_dest(p, insn, brw_ip_reg());
1921 	 brw_set_src0(p, insn, brw_ip_reg());
1922 	 brw_set_src1(p, insn, brw_imm_d(0));
1923 
1924          brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
1925          brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
1926          brw_inst_set_gen4_pop_count(devinfo, insn, 0);
1927 
1928 	 brw_patch_break_cont(p, insn);
1929       }
1930    }
1931    brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1932 
1933    p->loop_stack_depth--;
1934 
1935    return insn;
1936 }
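
/* Illustrative loop emission (a sketch, not from the original file);
 * "loop_counter" stands for whatever register holds the loop condition:
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ... loop body; brw_BREAK(p) / brw_CONT(p) where needed ...
 *       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ,
 *               loop_counter, brw_imm_d(0));
 *       brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
 *    brw_WHILE(p);
 *    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 *
 * On gen6+ the WHILE simply jumps back to the instruction recorded by
 * brw_DO(); pre-gen6, the BREAK/CONT jump counts are patched by
 * brw_patch_break_cont() above.
 */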
1937 
1938 /* FORWARD JUMPS:
1939  */
1940 void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1941 {
1942    const struct gen_device_info *devinfo = p->devinfo;
1943    brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1944    unsigned jmpi = 1;
1945 
1946    if (devinfo->gen >= 5)
1947       jmpi = 2;
1948 
1949    assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1950    assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1951 
1952    brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1953                                 jmpi * (p->nr_insn - jmp_insn_idx - 1));
1954 }
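
/* Illustrative pairing (a sketch, not part of the original file): a forward
 * JMPI is emitted with a zero offset, its index remembered, and the jump
 * count filled in once the target is known:
 *
 *    brw_inst *jmp = brw_JMPI(p, brw_imm_d(0), BRW_PREDICATE_NORMAL);
 *       ... instructions skipped when the predicate is true ...
 *    brw_land_fwd_jump(p, jmp - p->store);
 */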
1955 
1956 /* To integrate with the above, it makes sense that the comparison
1957  * instruction should populate the flag register.  It might be simpler
1958  * just to use the flag reg for most WM tasks?
1959  */
1960 void brw_CMP(struct brw_codegen *p,
1961 	     struct brw_reg dest,
1962 	     unsigned conditional,
1963 	     struct brw_reg src0,
1964 	     struct brw_reg src1)
1965 {
1966    const struct gen_device_info *devinfo = p->devinfo;
1967    brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1968 
1969    brw_inst_set_cond_modifier(devinfo, insn, conditional);
1970    brw_set_dest(p, insn, dest);
1971    brw_set_src0(p, insn, src0);
1972    brw_set_src1(p, insn, src1);
1973 
1974    /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1975     * page says:
1976     *    "Any CMP instruction with a null destination must use a {switch}."
1977     *
1978     * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1979     * mentioned on their work-arounds pages.
1980     */
1981    if (devinfo->gen == 7) {
1982       if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1983           dest.nr == BRW_ARF_NULL) {
1984          brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1985       }
1986    }
1987 }
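
/* Illustrative usage (a sketch, not from the original file): comparing into
 * the null register only to set f0, then predicating the next instruction
 * on the result:
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE,
 *            brw_vec8_grf(2, 0), brw_imm_f(0.0f));
 *    brw_set_default_predicate_control(p, BRW_PREDICATE_NORMAL);
 *    brw_MOV(p, brw_vec8_grf(4, 0), brw_imm_f(1.0f));
 *    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 */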
1988 
1989 /***********************************************************************
1990  * Helpers for the various SEND message types:
1991  */
1992 
1993 /** Extended math function, float[8].
1994  */
1995 void gen4_math(struct brw_codegen *p,
1996 	       struct brw_reg dest,
1997 	       unsigned function,
1998 	       unsigned msg_reg_nr,
1999 	       struct brw_reg src,
2000 	       unsigned precision )
2001 {
2002    const struct gen_device_info *devinfo = p->devinfo;
2003    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2004    unsigned data_type;
2005    if (has_scalar_region(src)) {
2006       data_type = BRW_MATH_DATA_SCALAR;
2007    } else {
2008       data_type = BRW_MATH_DATA_VECTOR;
2009    }
2010 
2011    assert(devinfo->gen < 6);
2012 
2013    /* Example code doesn't set predicate_control for send
2014     * instructions.
2015     */
2016    brw_inst_set_pred_control(devinfo, insn, 0);
2017    brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2018 
2019    brw_set_dest(p, insn, dest);
2020    brw_set_src0(p, insn, src);
2021    brw_set_math_message(p,
2022                         insn,
2023                         function,
2024                         src.type == BRW_REGISTER_TYPE_D,
2025                         precision,
2026                         data_type);
2027 }
2028 
2029 void gen6_math(struct brw_codegen *p,
2030 	       struct brw_reg dest,
2031 	       unsigned function,
2032 	       struct brw_reg src0,
2033 	       struct brw_reg src1)
2034 {
2035    const struct gen_device_info *devinfo = p->devinfo;
2036    brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
2037 
2038    assert(devinfo->gen >= 6);
2039 
2040    assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
2041           (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
2042 
2043    assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
2044    if (devinfo->gen == 6) {
2045       assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
2046       assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
2047    }
2048 
2049    if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
2050        function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
2051        function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
2052       assert(src0.type != BRW_REGISTER_TYPE_F);
2053       assert(src1.type != BRW_REGISTER_TYPE_F);
2054       assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
2055              (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
2056    } else {
2057       assert(src0.type == BRW_REGISTER_TYPE_F ||
2058              (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2059       assert(src1.type == BRW_REGISTER_TYPE_F ||
2060              (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
2061    }
2062 
2063    /* Source modifiers are ignored for extended math instructions on Gen6. */
2064    if (devinfo->gen == 6) {
2065       assert(!src0.negate);
2066       assert(!src0.abs);
2067       assert(!src1.negate);
2068       assert(!src1.abs);
2069    }
2070 
2071    brw_inst_set_math_function(devinfo, insn, function);
2072 
2073    brw_set_dest(p, insn, dest);
2074    brw_set_src0(p, insn, src0);
2075    brw_set_src1(p, insn, src1);
2076 }
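
/* Illustrative usage (a sketch, not part of the original file): a gen6+
 * reciprocal square root over a float GRF.  src1 must still be supplied but
 * is ignored for single-source functions, so the null register is passed:
 *
 *    gen6_math(p, brw_vec8_grf(8, 0), BRW_MATH_FUNCTION_RSQ,
 *              brw_vec8_grf(2, 0), brw_null_reg());
 */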
2077 
2078 /**
2079  * Return the right surface index to access the thread scratch space using
2080  * stateless dataport messages.
2081  */
2082 unsigned
2083 brw_scratch_surface_idx(const struct brw_codegen *p)
2084 {
2085    /* The scratch space is thread-local so IA coherency is unnecessary. */
2086    if (p->devinfo->gen >= 8)
2087       return GEN8_BTI_STATELESS_NON_COHERENT;
2088    else
2089       return BRW_BTI_STATELESS;
2090 }
2091 
2092 /**
2093  * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2094  * using a constant offset per channel.
2095  *
2096  * The offset must be aligned to oword size (16 bytes).  Used for
2097  * register spilling.
2098  */
2099 void brw_oword_block_write_scratch(struct brw_codegen *p,
2100 				   struct brw_reg mrf,
2101 				   int num_regs,
2102 				   unsigned offset)
2103 {
2104    const struct gen_device_info *devinfo = p->devinfo;
2105    const unsigned target_cache =
2106       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2107        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2108        BRW_SFID_DATAPORT_WRITE);
2109    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2110    uint32_t msg_type;
2111 
2112    if (devinfo->gen >= 6)
2113       offset /= 16;
2114 
2115    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2116 
2117    const unsigned mlen = 1 + num_regs;
2118 
2119    /* Set up the message header.  This is g0, with g0.2 filled with
2120     * the offset.  We don't want to leave our offset around in g0 or
2121     * it'll screw up texture samples, so set it up inside the message
2122     * reg.
2123     */
2124    {
2125       brw_push_insn_state(p);
2126       brw_set_default_exec_size(p, BRW_EXECUTE_8);
2127       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2128       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2129       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2130 
2131       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2132 
2133       /* set message header global offset field (reg 0, element 2) */
2134       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2135       brw_set_default_swsb(p, tgl_swsb_null());
2136       brw_MOV(p,
2137 	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2138 				  mrf.nr,
2139 				  2), BRW_REGISTER_TYPE_UD),
2140 	      brw_imm_ud(offset));
2141 
2142       brw_pop_insn_state(p);
2143       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2144    }
2145 
2146    {
2147       struct brw_reg dest;
2148       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2149       int send_commit_msg;
2150       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2151 					 BRW_REGISTER_TYPE_UW);
2152 
2153       brw_inst_set_sfid(devinfo, insn, target_cache);
2154       brw_inst_set_compression(devinfo, insn, false);
2155 
2156       if (brw_inst_exec_size(devinfo, insn) >= 16)
2157 	 src_header = vec16(src_header);
2158 
2159       assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2160       if (devinfo->gen < 6)
2161          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2162 
2163       /* Until gen6, writes followed by reads from the same location
2164        * are not guaranteed to be ordered unless write_commit is set.
2165        * If set, then a no-op write is issued to the destination
2166        * register to set a dependency, and a read from the destination
2167        * can be used to ensure the ordering.
2168        *
2169        * For gen6, only writes between different threads need ordering
2170        * protection.  Our use of DP writes is all about register
2171        * spilling within a thread.
2172        */
2173       if (devinfo->gen >= 6) {
2174 	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2175 	 send_commit_msg = 0;
2176       } else {
2177 	 dest = src_header;
2178 	 send_commit_msg = 1;
2179       }
2180 
2181       brw_set_dest(p, insn, dest);
2182       if (devinfo->gen >= 6) {
2183 	 brw_set_src0(p, insn, mrf);
2184       } else {
2185 	 brw_set_src0(p, insn, brw_null_reg());
2186       }
2187 
2188       if (devinfo->gen >= 6)
2189 	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2190       else
2191 	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2192 
2193       brw_set_desc(p, insn,
2194                    brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2195                    brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2196                                      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2197                                      msg_type, 0, /* not a render target */
2198                                      send_commit_msg));
2199    }
2200 }
2201 
2202 
2203 /**
2204  * Read a block of owords (half a GRF each) from the scratch buffer
2205  * using a constant index per channel.
2206  *
2207  * Offset must be aligned to oword size (16 bytes).  Used for register
2208  * spilling.
2209  */
2210 void
2211 brw_oword_block_read_scratch(struct brw_codegen *p,
2212 			     struct brw_reg dest,
2213 			     struct brw_reg mrf,
2214 			     int num_regs,
2215 			     unsigned offset)
2216 {
2217    const struct gen_device_info *devinfo = p->devinfo;
2218    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2219 
2220    if (devinfo->gen >= 6)
2221       offset /= 16;
2222 
2223    if (p->devinfo->gen >= 7) {
2224       /* On gen 7 and above, we no longer have message registers and we can
2225        * send from any register we want.  By using the destination register
2226        * for the message, we guarantee that the implied message write won't
2227        * accidentally overwrite anything.  This has been a problem because
2228        * the MRF registers and source for the final FB write are both fixed
2229        * and may overlap.
2230        */
2231       mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2232    } else {
2233       mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2234    }
2235    dest = retype(dest, BRW_REGISTER_TYPE_UW);
2236 
2237    const unsigned rlen = num_regs;
2238    const unsigned target_cache =
2239       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
2240        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2241        BRW_SFID_DATAPORT_READ);
2242 
2243    {
2244       brw_push_insn_state(p);
2245       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2246       brw_set_default_exec_size(p, BRW_EXECUTE_8);
2247       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2248       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2249 
2250       brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2251 
2252       /* set message header global offset field (reg 0, element 2) */
2253       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2254       brw_set_default_swsb(p, tgl_swsb_null());
2255       brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2256 
2257       brw_pop_insn_state(p);
2258       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2259    }
2260 
2261    {
2262       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2263 
2264       brw_inst_set_sfid(devinfo, insn, target_cache);
2265       assert(brw_inst_pred_control(devinfo, insn) == 0);
2266       brw_inst_set_compression(devinfo, insn, false);
2267 
2268       brw_set_dest(p, insn, dest);	/* UW? */
2269       if (devinfo->gen >= 6) {
2270 	 brw_set_src0(p, insn, mrf);
2271       } else {
2272 	 brw_set_src0(p, insn, brw_null_reg());
2273          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2274       }
2275 
2276       brw_set_desc(p, insn,
2277                    brw_message_desc(devinfo, 1, rlen, true) |
2278                    brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2279                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2280                                     BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2281                                     BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2282    }
2283 }
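
/* Illustrative pairing (a pre-gen7-flavoured sketch, not part of the
 * original file): spilling one GRF to scratch and filling it back.  The
 * data to spill is assumed to already sit in the message register(s)
 * following the header, and "spill_offset" stands for a caller-chosen,
 * 16-byte-aligned scratch offset:
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(10), 1, spill_offset);
 *    ...
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(20, 0),
 *                                 brw_message_reg(10), 1, spill_offset);
 */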
2284 
2285 void
2286 gen7_block_read_scratch(struct brw_codegen *p,
2287                         struct brw_reg dest,
2288                         int num_regs,
2289                         unsigned offset)
2290 {
2291    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2292    assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);
2293 
2294    brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));
2295 
2296    /* The HW requires that the header is present; this is to get the g0.5
2297     * scratch offset.
2298     */
2299    brw_set_src0(p, insn, brw_vec8_grf(0, 0));
2300 
2301    /* According to the docs, offset is "A 12-bit HWord offset into the memory
2302     * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
2303     * is 32 bytes, which happens to be the size of a register.
2304     */
2305    offset /= REG_SIZE;
2306    assert(offset < (1 << 12));
2307 
2308    gen7_set_dp_scratch_message(p, insn,
2309                                false, /* scratch read */
2310                                false, /* OWords */
2311                                false, /* invalidate after read */
2312                                num_regs,
2313                                offset,
2314                                1,        /* mlen: just g0 */
2315                                num_regs, /* rlen */
2316                                true);    /* header present */
2317 }
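
/* Worked example (illustrative): reading two registers from byte offset 256
 * becomes an HWord offset of 256 / REG_SIZE = 8, comfortably within the
 * 12-bit limit asserted above:
 *
 *    gen7_block_read_scratch(p, brw_vec8_grf(30, 0), 2, 256);
 */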
2318 
2319 /**
2320  * Read float[4] vectors from the data port constant cache.
2321  * Location (in buffer) should be a multiple of 16.
2322  * Used for fetching shader constants.
2323  */
2324 void brw_oword_block_read(struct brw_codegen *p,
2325 			  struct brw_reg dest,
2326 			  struct brw_reg mrf,
2327 			  uint32_t offset,
2328 			  uint32_t bind_table_index)
2329 {
2330    const struct gen_device_info *devinfo = p->devinfo;
2331    const unsigned target_cache =
2332       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
2333        BRW_SFID_DATAPORT_READ);
2334    const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2335    const struct tgl_swsb swsb = brw_get_default_swsb(p);
2336 
2337    /* On newer hardware, offset is in units of owords. */
2338    if (devinfo->gen >= 6)
2339       offset /= 16;
2340 
2341    mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2342 
2343    brw_push_insn_state(p);
2344    brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2345    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2346    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2347 
2348    brw_push_insn_state(p);
2349    brw_set_default_exec_size(p, BRW_EXECUTE_8);
2350    brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2351    brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2352 
2353    /* set message header global offset field (reg 0, element 2) */
2354    brw_set_default_exec_size(p, BRW_EXECUTE_1);
2355    brw_set_default_swsb(p, tgl_swsb_null());
2356    brw_MOV(p,
2357 	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2358 			       mrf.nr,
2359 			       2), BRW_REGISTER_TYPE_UD),
2360 	   brw_imm_ud(offset));
2361    brw_pop_insn_state(p);
2362 
2363    brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2364 
2365    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2366 
2367    brw_inst_set_sfid(devinfo, insn, target_cache);
2368 
2369    /* cast dest to a uword[8] vector */
2370    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2371 
2372    brw_set_dest(p, insn, dest);
2373    if (devinfo->gen >= 6) {
2374       brw_set_src0(p, insn, mrf);
2375    } else {
2376       brw_set_src0(p, insn, brw_null_reg());
2377       brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2378    }
2379 
2380    brw_set_desc(p, insn,
2381                 brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2382                 brw_dp_read_desc(devinfo, bind_table_index,
2383                                  BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2384                                  BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2385                                  BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2386 
2387    brw_pop_insn_state(p);
2388 }
2389 
2390 brw_inst *
2391 brw_fb_WRITE(struct brw_codegen *p,
2392              struct brw_reg payload,
2393              struct brw_reg implied_header,
2394              unsigned msg_control,
2395              unsigned binding_table_index,
2396              unsigned msg_length,
2397              unsigned response_length,
2398              bool eot,
2399              bool last_render_target,
2400              bool header_present)
2401 {
2402    const struct gen_device_info *devinfo = p->devinfo;
2403    const unsigned target_cache =
2404       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
2405        BRW_SFID_DATAPORT_WRITE);
2406    brw_inst *insn;
2407    unsigned msg_type;
2408    struct brw_reg dest, src0;
2409 
2410    if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
2411       dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2412    else
2413       dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2414 
2415    if (devinfo->gen >= 6) {
2416       insn = next_insn(p, BRW_OPCODE_SENDC);
2417    } else {
2418       insn = next_insn(p, BRW_OPCODE_SEND);
2419    }
2420    brw_inst_set_sfid(devinfo, insn, target_cache);
2421    brw_inst_set_compression(devinfo, insn, false);
2422 
2423    if (devinfo->gen >= 6) {
2424       /* headerless version, just submit color payload */
2425       src0 = payload;
2426 
2427       msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2428    } else {
2429       assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
2430       brw_inst_set_base_mrf(devinfo, insn, payload.nr);
2431       src0 = implied_header;
2432 
2433       msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2434    }
2435 
2436    brw_set_dest(p, insn, dest);
2437    brw_set_src0(p, insn, src0);
2438    brw_set_desc(p, insn,
2439                 brw_message_desc(devinfo, msg_length, response_length,
2440                                  header_present) |
2441                 brw_dp_write_desc(devinfo, binding_table_index, msg_control,
2442                                   msg_type, last_render_target,
2443                                   0 /* send_commit_msg */));
2444    brw_inst_set_eot(devinfo, insn, eot);
2445 
2446    return insn;
2447 }
2448 
2449 brw_inst *
2450 gen9_fb_READ(struct brw_codegen *p,
2451              struct brw_reg dst,
2452              struct brw_reg payload,
2453              unsigned binding_table_index,
2454              unsigned msg_length,
2455              unsigned response_length,
2456              bool per_sample)
2457 {
2458    const struct gen_device_info *devinfo = p->devinfo;
2459    assert(devinfo->gen >= 9);
2460    const unsigned msg_subtype =
2461       brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
2462    brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
2463 
2464    brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
2465    brw_set_dest(p, insn, dst);
2466    brw_set_src0(p, insn, payload);
2467    brw_set_desc(
2468       p, insn,
2469       brw_message_desc(devinfo, msg_length, response_length, true) |
2470       brw_dp_read_desc(devinfo, binding_table_index,
2471                        per_sample << 5 | msg_subtype,
2472                        GEN9_DATAPORT_RC_RENDER_TARGET_READ,
2473                        BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2474    brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
2475 
2476    return insn;
2477 }
2478 
2479 /**
2480  * Texture sample instruction.
2481  * Note: the msg_type plus msg_length values determine exactly what kind
2482  * of sampling operation is performed.  See volume 4, page 161 of docs.
2483  */
2484 void brw_SAMPLE(struct brw_codegen *p,
2485 		struct brw_reg dest,
2486 		unsigned msg_reg_nr,
2487 		struct brw_reg src0,
2488 		unsigned binding_table_index,
2489 		unsigned sampler,
2490 		unsigned msg_type,
2491 		unsigned response_length,
2492 		unsigned msg_length,
2493 		unsigned header_present,
2494 		unsigned simd_mode,
2495 		unsigned return_format)
2496 {
2497    const struct gen_device_info *devinfo = p->devinfo;
2498    brw_inst *insn;
2499 
2500    if (msg_reg_nr != -1)
2501       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2502 
2503    insn = next_insn(p, BRW_OPCODE_SEND);
2504    brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2505    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2506 
2507    /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2508     *
2509     *    "Instruction compression is not allowed for this instruction (that
2510     *     is, send). The hardware behavior is undefined if this instruction is
2511     *     set as compressed. However, compress control can be set to "SecHalf"
2512     *     to affect the EMask generation."
2513     *
2514     * No similar wording is found in later PRMs, but there are examples
2515     * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2516     * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2517     * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2518     */
2519    brw_inst_set_compression(devinfo, insn, false);
2520 
2521    if (devinfo->gen < 6)
2522       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2523 
2524    brw_set_dest(p, insn, dest);
2525    brw_set_src0(p, insn, src0);
2526    brw_set_desc(p, insn,
2527                 brw_message_desc(devinfo, msg_length, response_length,
2528                                  header_present) |
2529                 brw_sampler_desc(devinfo, binding_table_index, sampler,
2530                                  msg_type, simd_mode, return_format));
2531 }
2532 
2533 /* Adjust the message header's sampler state pointer to
2534  * select the correct group of 16 samplers.
2535  */
2536 void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
2537                                       struct brw_reg header,
2538                                       struct brw_reg sampler_index)
2539 {
2540    /* The "Sampler Index" field can only store values between 0 and 15.
2541     * However, we can add an offset to the "Sampler State Pointer"
2542     * field, effectively selecting a different set of 16 samplers.
2543     *
2544     * The "Sampler State Pointer" needs to be aligned to a 32-byte
2545     * offset, and each sampler state is only 16-bytes, so we can't
2546     * exclusively use the offset - we have to use both.
2547     */
2548 
2549    const struct gen_device_info *devinfo = p->devinfo;
2550 
2551    if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
2552       const int sampler_state_size = 16; /* 16 bytes */
2553       uint32_t sampler = sampler_index.ud;
2554 
2555       if (sampler >= 16) {
2556          assert(devinfo->is_haswell || devinfo->gen >= 8);
2557          brw_ADD(p,
2558                  get_element_ud(header, 3),
2559                  get_element_ud(brw_vec8_grf(0, 0), 3),
2560                  brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
2561       }
2562    } else {
2563       /* Non-const sampler array indexing case */
2564       if (devinfo->gen < 8 && !devinfo->is_haswell) {
2565          return;
2566       }
2567 
2568       struct brw_reg temp = get_element_ud(header, 3);
2569 
2570       brw_push_insn_state(p);
2571       brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
2572       brw_set_default_swsb(p, tgl_swsb_regdist(1));
2573       brw_SHL(p, temp, temp, brw_imm_ud(4));
2574       brw_ADD(p,
2575               get_element_ud(header, 3),
2576               get_element_ud(brw_vec8_grf(0, 0), 3),
2577               temp);
2578       brw_pop_insn_state(p);
2579    }
2580 }
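
/* Worked example (illustrative): for an immediate sampler_index of 20, the
 * code above adds 16 * (20 / 16) * 16 = 256 bytes to the Sampler State
 * Pointer in the header, skipping one full group of 16 sampler states; the
 * 4-bit "Sampler Index" field in the message descriptor can then hold the
 * remaining 20 % 16 = 4 within that group.
 */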
2581 
2582 /* All these variables are pretty confusing - we might be better off
2583  * using bitmasks and macros for this, in the old style.  Or perhaps
2584  * just having the caller instantiate the fields in dword3 itself.
2585  */
2586 void brw_urb_WRITE(struct brw_codegen *p,
2587 		   struct brw_reg dest,
2588 		   unsigned msg_reg_nr,
2589 		   struct brw_reg src0,
2590                    enum brw_urb_write_flags flags,
2591 		   unsigned msg_length,
2592 		   unsigned response_length,
2593 		   unsigned offset,
2594 		   unsigned swizzle)
2595 {
2596    const struct gen_device_info *devinfo = p->devinfo;
2597    brw_inst *insn;
2598 
2599    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2600 
2601    if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
2602       /* Enable Channel Masks in the URB_WRITE_HWORD message header */
2603       brw_push_insn_state(p);
2604       brw_set_default_access_mode(p, BRW_ALIGN_1);
2605       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2606       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2607       brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
2608 		       BRW_REGISTER_TYPE_UD),
2609 	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
2610 		brw_imm_ud(0xff00));
2611       brw_pop_insn_state(p);
2612    }
2613 
2614    insn = next_insn(p, BRW_OPCODE_SEND);
2615 
2616    assert(msg_length < BRW_MAX_MRF(devinfo->gen));
2617 
2618    brw_set_dest(p, insn, dest);
2619    brw_set_src0(p, insn, src0);
2620    brw_set_src1(p, insn, brw_imm_d(0));
2621 
2622    if (devinfo->gen < 6)
2623       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2624 
2625    brw_set_urb_message(p,
2626 		       insn,
2627 		       flags,
2628 		       msg_length,
2629 		       response_length,
2630 		       offset,
2631 		       swizzle);
2632 }
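/* Illustrative example of the channel-mask fixup emitted above for gen7+
 * (values are made up): if g0.5 holds 0x0000001f, the single OR writes
 * 0x0000001f | 0x0000ff00 == 0x0000ff1f into DWord 5 of the message
 * register, turning on bits 15:8, which is what "Enable Channel Masks"
 * refers to for the URB_WRITE_HWORD header.
 */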
2633 
2634 void
2635 brw_send_indirect_message(struct brw_codegen *p,
2636                           unsigned sfid,
2637                           struct brw_reg dst,
2638                           struct brw_reg payload,
2639                           struct brw_reg desc,
2640                           unsigned desc_imm,
2641                           bool eot)
2642 {
2643    const struct gen_device_info *devinfo = p->devinfo;
2644    struct brw_inst *send;
2645 
2646    dst = retype(dst, BRW_REGISTER_TYPE_UW);
2647 
2648    assert(desc.type == BRW_REGISTER_TYPE_UD);
2649 
2650    if (desc.file == BRW_IMMEDIATE_VALUE) {
2651       send = next_insn(p, BRW_OPCODE_SEND);
2652       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2653       brw_set_desc(p, send, desc.ud | desc_imm);
2654    } else {
2655       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2656       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2657 
2658       brw_push_insn_state(p);
2659       brw_set_default_access_mode(p, BRW_ALIGN_1);
2660       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2661       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2662       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2663       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2664 
2665       /* Load the indirect descriptor to an address register using OR so the
2666        * caller can specify additional descriptor bits with the desc_imm
2667        * immediate.
2668        */
2669       brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2670 
2671       brw_pop_insn_state(p);
2672 
2673       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2674       send = next_insn(p, BRW_OPCODE_SEND);
2675       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
2676 
2677       if (devinfo->gen >= 12)
2678          brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
2679       else
2680          brw_set_src1(p, send, addr);
2681    }
2682 
2683    brw_set_dest(p, send, dst);
2684    brw_inst_set_sfid(devinfo, send, sfid);
2685    brw_inst_set_eot(devinfo, send, eot);
2686 }
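/* Hypothetical usage sketch (not a call site in this file; "payload",
 * "desc_grf" and "mlen" are placeholder names):
 *
 *    brw_send_indirect_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
 *                              brw_null_reg(), payload,
 *                              retype(desc_grf, BRW_REGISTER_TYPE_UD),
 *                              brw_message_desc(devinfo, mlen, 0, true),
 *                              false);
 *
 * Because desc is not an immediate in this sketch, the function emits the OR
 * into a0.0 and makes the SEND take its descriptor from the address register
 * (or sets the reg32 descriptor select on gen12+).
 */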
2687 
2688 void
2689 brw_send_indirect_split_message(struct brw_codegen *p,
2690                                 unsigned sfid,
2691                                 struct brw_reg dst,
2692                                 struct brw_reg payload0,
2693                                 struct brw_reg payload1,
2694                                 struct brw_reg desc,
2695                                 unsigned desc_imm,
2696                                 struct brw_reg ex_desc,
2697                                 unsigned ex_desc_imm,
2698                                 bool eot)
2699 {
2700    const struct gen_device_info *devinfo = p->devinfo;
2701    struct brw_inst *send;
2702 
2703    dst = retype(dst, BRW_REGISTER_TYPE_UW);
2704 
2705    assert(desc.type == BRW_REGISTER_TYPE_UD);
2706 
2707    if (desc.file == BRW_IMMEDIATE_VALUE) {
2708       desc.ud |= desc_imm;
2709    } else {
2710       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2711       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2712 
2713       brw_push_insn_state(p);
2714       brw_set_default_access_mode(p, BRW_ALIGN_1);
2715       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2716       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2717       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2718       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2719 
2720       /* Load the indirect descriptor to an address register using OR so the
2721        * caller can specify additional descriptor bits with the desc_imm
2722        * immediate.
2723        */
2724       brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
2725 
2726       brw_pop_insn_state(p);
2727       desc = addr;
2728 
2729       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2730    }
2731 
2732    if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
2733        (devinfo->gen >= 12 || (ex_desc.ud & INTEL_MASK(15, 12)) == 0)) {
2734       ex_desc.ud |= ex_desc_imm;
2735    } else {
2736       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2737       struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);
2738 
2739       brw_push_insn_state(p);
2740       brw_set_default_access_mode(p, BRW_ALIGN_1);
2741       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2742       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2743       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2744       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2745 
2746       /* Load the indirect extended descriptor to an address register using OR
2747        * so the caller can specify additional descriptor bits with the
2748        * desc_imm immediate.
2749        *
2750        * Even though the instruction dispatcher always pulls the SFID and EOT
2751        * fields from the instruction itself, the actual external unit which
2752        * processes the message gets the SFID and EOT from the extended
2753        * descriptor, which comes from the address register.  If we don't OR
2754        * those two fields in, the external unit may get confused and hang.
2755        */
2756       unsigned imm_part = ex_desc_imm | sfid | eot << 5;
2757 
2758       if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2759          /* ex_desc bits 15:12 don't exist in the instruction encoding prior
2760           * to Gen12, so we may have fallen back to an indirect extended
2761           * descriptor.
2762           */
2763          brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
2764       } else {
2765          brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
2766       }
2767 
2768       brw_pop_insn_state(p);
2769       ex_desc = addr;
2770 
2771       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2772    }
2773 
2774    send = next_insn(p, devinfo->gen >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
2775    brw_set_dest(p, send, dst);
2776    brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
2777    brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));
2778 
2779    if (desc.file == BRW_IMMEDIATE_VALUE) {
2780       brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
2781       brw_inst_set_send_desc(devinfo, send, desc.ud);
2782    } else {
2783       assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2784       assert(desc.nr == BRW_ARF_ADDRESS);
2785       assert(desc.subnr == 0);
2786       brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
2787    }
2788 
2789    if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
2790       brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
2791       brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
2792    } else {
2793       assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
2794       assert(ex_desc.nr == BRW_ARF_ADDRESS);
2795       assert((ex_desc.subnr & 0x3) == 0);
2796       brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
2797       brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
2798    }
2799 
2800    brw_inst_set_sfid(devinfo, send, sfid);
2801    brw_inst_set_eot(devinfo, send, eot);
2802 }
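/* Illustrative note on the extended-descriptor fixup above: with eot == true
 * the expression ex_desc_imm | sfid | eot << 5 ORs the SFID into the low bits
 * of a0.2 and sets bit 5 for EOT, so the value the external unit reads from
 * the address register agrees with the fields encoded in the instruction.
 */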
2803 
2804 static void
2805 brw_send_indirect_surface_message(struct brw_codegen *p,
2806                                   unsigned sfid,
2807                                   struct brw_reg dst,
2808                                   struct brw_reg payload,
2809                                   struct brw_reg surface,
2810                                   unsigned desc_imm)
2811 {
2812    if (surface.file != BRW_IMMEDIATE_VALUE) {
2813       const struct tgl_swsb swsb = brw_get_default_swsb(p);
2814       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
2815 
2816       brw_push_insn_state(p);
2817       brw_set_default_access_mode(p, BRW_ALIGN_1);
2818       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2819       brw_set_default_exec_size(p, BRW_EXECUTE_1);
2820       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2821       brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2822 
2823       /* Mask out invalid bits from the surface index to avoid hangs e.g. when
2824        * some surface array is accessed out of bounds.
2825        */
2826       brw_AND(p, addr,
2827               suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
2828                         BRW_GET_SWZ(surface.swizzle, 0)),
2829               brw_imm_ud(0xff));
2830 
2831       brw_pop_insn_state(p);
2832 
2833       surface = addr;
2834       brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2835    }
2836 
2837    brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
2838 }
2839 
2840 static bool
2841 while_jumps_before_offset(const struct gen_device_info *devinfo,
2842                           brw_inst *insn, int while_offset, int start_offset)
2843 {
2844    int scale = 16 / brw_jump_scale(devinfo);
2845    int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2846                                : brw_inst_jip(devinfo, insn);
2847    assert(jip < 0);
2848    return while_offset + jip * scale <= start_offset;
2849 }
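/* Worked example (illustrative, assuming a gen7 part where the jump scale is
 * 2, so scale == 8 here): a WHILE at byte offset 0x80 with a stored JIP of -8
 * jumps back 8 * 8 == 64 bytes to 0x40, so it "jumps before" any start_offset
 * of 0x40 or later.
 */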
2850 
2851 
2852 static int
2853 brw_find_next_block_end(struct brw_codegen *p, int start_offset)
2854 {
2855    int offset;
2856    void *store = p->store;
2857    const struct gen_device_info *devinfo = p->devinfo;
2858 
2859    int depth = 0;
2860 
2861    for (offset = next_offset(devinfo, store, start_offset);
2862         offset < p->next_insn_offset;
2863         offset = next_offset(devinfo, store, offset)) {
2864       brw_inst *insn = store + offset;
2865 
2866       switch (brw_inst_opcode(devinfo, insn)) {
2867       case BRW_OPCODE_IF:
2868          depth++;
2869          break;
2870       case BRW_OPCODE_ENDIF:
2871          if (depth == 0)
2872             return offset;
2873          depth--;
2874          break;
2875       case BRW_OPCODE_WHILE:
2876          /* If the while doesn't jump before our instruction, it's the end
2877           * of a sibling do...while loop.  Ignore it.
2878           */
2879          if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
2880             continue;
2881          /* fallthrough */
2882       case BRW_OPCODE_ELSE:
2883       case BRW_OPCODE_HALT:
2884          if (depth == 0)
2885             return offset;
2886       default:
2887          break;
2888       }
2889    }
2890 
2891    return 0;
2892 }
2893 
2894 /* There is no DO instruction on gen6, so to find the end of the loop
2895  * we have to see if the loop is jumping back before our start
2896  * instruction.
2897  */
2898 static int
2899 brw_find_loop_end(struct brw_codegen *p, int start_offset)
2900 {
2901    const struct gen_device_info *devinfo = p->devinfo;
2902    int offset;
2903    void *store = p->store;
2904 
2905    assert(devinfo->gen >= 6);
2906 
2907    /* Always start after the instruction (such as a WHILE) we're trying to fix
2908     * up.
2909     */
2910    for (offset = next_offset(devinfo, store, start_offset);
2911         offset < p->next_insn_offset;
2912         offset = next_offset(devinfo, store, offset)) {
2913       brw_inst *insn = store + offset;
2914 
2915       if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2916 	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2917 	    return offset;
2918       }
2919    }
2920    assert(!"not reached");
2921    return start_offset;
2922 }
2923 
2924 /* After program generation, go back and update the UIP and JIP of
2925  * BREAK, CONT, and HALT instructions to their correct locations.
2926  */
2927 void
2928 brw_set_uip_jip(struct brw_codegen *p, int start_offset)
2929 {
2930    const struct gen_device_info *devinfo = p->devinfo;
2931    int offset;
2932    int br = brw_jump_scale(devinfo);
2933    int scale = 16 / br;
2934    void *store = p->store;
2935 
2936    if (devinfo->gen < 6)
2937       return;
2938 
2939    for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
2940       brw_inst *insn = store + offset;
2941       assert(brw_inst_cmpt_control(devinfo, insn) == 0);
2942 
2943       int block_end_offset = brw_find_next_block_end(p, offset);
2944       switch (brw_inst_opcode(devinfo, insn)) {
2945       case BRW_OPCODE_BREAK:
2946          assert(block_end_offset != 0);
2947          brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2948 	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
2949          brw_inst_set_uip(devinfo, insn,
2950 	    (brw_find_loop_end(p, offset) - offset +
2951              (devinfo->gen == 6 ? 16 : 0)) / scale);
2952 	 break;
2953       case BRW_OPCODE_CONTINUE:
2954          assert(block_end_offset != 0);
2955          brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2956          brw_inst_set_uip(devinfo, insn,
2957             (brw_find_loop_end(p, offset) - offset) / scale);
2958 
2959          assert(brw_inst_uip(devinfo, insn) != 0);
2960          assert(brw_inst_jip(devinfo, insn) != 0);
2961 	 break;
2962 
2963       case BRW_OPCODE_ENDIF: {
2964          int32_t jump = (block_end_offset == 0) ?
2965                         1 * br : (block_end_offset - offset) / scale;
2966          if (devinfo->gen >= 7)
2967             brw_inst_set_jip(devinfo, insn, jump);
2968          else
2969             brw_inst_set_gen6_jump_count(devinfo, insn, jump);
2970 	 break;
2971       }
2972 
2973       case BRW_OPCODE_HALT:
2974 	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
2975 	  *
2976 	  *    "In case of the halt instruction not inside any conditional
2977 	  *     code block, the value of <JIP> and <UIP> should be the
2978 	  *     same. In case of the halt instruction inside conditional code
2979 	  *     block, the <UIP> should be the end of the program, and the
2980 	  *     <JIP> should be end of the most inner conditional code block."
2981 	  *
2982 	  * The uip will have already been set by whoever set up the
2983 	  * instruction.
2984 	  */
2985 	 if (block_end_offset == 0) {
2986             brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
2987 	 } else {
2988             brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
2989 	 }
2990          assert(brw_inst_uip(devinfo, insn) != 0);
2991          assert(brw_inst_jip(devinfo, insn) != 0);
2992 	 break;
2993 
2994       default:
2995          break;
2996       }
2997    }
2998 }
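/* Worked example (illustrative byte offsets, assuming gen7 where scale == 8):
 * a BREAK at offset 0x40 whose innermost block ends at 0x80 and whose loop's
 * WHILE sits at 0xa0 gets JIP == (0x80 - 0x40) / 8 == 8 and UIP ==
 * (0xa0 - 0x40) / 8 == 12; on gen6 the UIP computation adds 16 bytes so that
 * it points just past the WHILE.
 */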
2999 
3000 void brw_ff_sync(struct brw_codegen *p,
3001 		   struct brw_reg dest,
3002 		   unsigned msg_reg_nr,
3003 		   struct brw_reg src0,
3004 		   bool allocate,
3005 		   unsigned response_length,
3006 		   bool eot)
3007 {
3008    const struct gen_device_info *devinfo = p->devinfo;
3009    brw_inst *insn;
3010 
3011    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
3012 
3013    insn = next_insn(p, BRW_OPCODE_SEND);
3014    brw_set_dest(p, insn, dest);
3015    brw_set_src0(p, insn, src0);
3016    brw_set_src1(p, insn, brw_imm_d(0));
3017 
3018    if (devinfo->gen < 6)
3019       brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
3020 
3021    brw_set_ff_sync_message(p,
3022 			   insn,
3023 			   allocate,
3024 			   response_length,
3025 			   eot);
3026 }
3027 
3028 /**
3029  * Emit the SEND instruction necessary to generate stream output data on Gen6
3030  * (for transform feedback).
3031  *
3032  * If send_commit_msg is true, this is the last piece of stream output data
3033  * from this thread, so send the data as a committed write.  According to the
3034  * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
3035  *
3036  *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3037  *   writes are complete by sending the final write as a committed write."
3038  */
3039 void
3040 brw_svb_write(struct brw_codegen *p,
3041               struct brw_reg dest,
3042               unsigned msg_reg_nr,
3043               struct brw_reg src0,
3044               unsigned binding_table_index,
3045               bool   send_commit_msg)
3046 {
3047    const struct gen_device_info *devinfo = p->devinfo;
3048    const unsigned target_cache =
3049       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
3050        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
3051        BRW_SFID_DATAPORT_WRITE);
3052    brw_inst *insn;
3053 
3054    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
3055 
3056    insn = next_insn(p, BRW_OPCODE_SEND);
3057    brw_inst_set_sfid(devinfo, insn, target_cache);
3058    brw_set_dest(p, insn, dest);
3059    brw_set_src0(p, insn, src0);
3060    brw_set_desc(p, insn,
3061                 brw_message_desc(devinfo, 1, send_commit_msg, true) |
3062                 brw_dp_write_desc(devinfo, binding_table_index,
3063                                   0, /* msg_control: ignored */
3064                                   GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3065                                   0, /* last_render_target: ignored */
3066                                   send_commit_msg)); /* send_commit_msg */
3067 }
3068 
3069 static unsigned
3070 brw_surface_payload_size(unsigned num_channels,
3071                          unsigned exec_size /**< 0 for SIMD4x2 */)
3072 {
3073    if (exec_size == 0)
3074       return 1; /* SIMD4x2 */
3075    else if (exec_size <= 8)
3076       return num_channels;
3077    else
3078       return 2 * num_channels;
3079 }
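/* Examples (derived directly from the cases above): a four-channel SIMD16
 * access needs brw_surface_payload_size(4, 16) == 8 registers, SIMD8 needs
 * brw_surface_payload_size(4, 8) == 4, and the SIMD4x2 case
 * brw_surface_payload_size(1, 0) == 1.
 */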
3080 
3081 void
3082 brw_untyped_atomic(struct brw_codegen *p,
3083                    struct brw_reg dst,
3084                    struct brw_reg payload,
3085                    struct brw_reg surface,
3086                    unsigned atomic_op,
3087                    unsigned msg_length,
3088                    bool response_expected,
3089                    bool header_present)
3090 {
3091    const struct gen_device_info *devinfo = p->devinfo;
3092    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3093                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3094                           GEN7_SFID_DATAPORT_DATA_CACHE);
3095    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3096    /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
3097    const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3098    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3099                               has_simd4x2 ? 0 : 8;
3100    const unsigned response_length =
3101       brw_surface_payload_size(response_expected, exec_size);
3102    const unsigned desc =
3103       brw_message_desc(devinfo, msg_length, response_length, header_present) |
3104       brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
3105                                  response_expected);
3106    /* Mask out unused components -- This is especially important in Align16
3107     * mode on generations that don't have native support for SIMD4x2 atomics,
3108     * because unused but enabled components will cause the dataport to perform
3109     * additional atomic operations on the addresses that happen to be in the
3110     * uninitialized Y, Z and W coordinates of the payload.
3111     */
3112    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
3113 
3114    brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
3115                                      payload, surface, desc);
3116 }
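/* Hypothetical call sketch (illustrative only; "dst", "payload" and
 * "surf_index" are placeholder names): an atomic add that wants the old
 * value back, with a header and a msg_length of 2, assuming the default
 * access mode and execution size are ALIGN_1 and 8-wide:
 *
 *    brw_untyped_atomic(p, dst, payload, brw_imm_ud(surf_index),
 *                       BRW_AOP_ADD, 2, true, true);
 *
 * With those defaults this yields a response_length of 1 and keeps the full
 * XYZW writemask on dst.
 */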
3117 
3118 void
3119 brw_untyped_surface_read(struct brw_codegen *p,
3120                          struct brw_reg dst,
3121                          struct brw_reg payload,
3122                          struct brw_reg surface,
3123                          unsigned msg_length,
3124                          unsigned num_channels)
3125 {
3126    const struct gen_device_info *devinfo = p->devinfo;
3127    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3128                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3129                           GEN7_SFID_DATAPORT_DATA_CACHE);
3130    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3131    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3132    const unsigned response_length =
3133       brw_surface_payload_size(num_channels, exec_size);
3134    const unsigned desc =
3135       brw_message_desc(devinfo, msg_length, response_length, false) |
3136       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3137 
3138    brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3139 }
3140 
3141 void
3142 brw_untyped_surface_write(struct brw_codegen *p,
3143                           struct brw_reg payload,
3144                           struct brw_reg surface,
3145                           unsigned msg_length,
3146                           unsigned num_channels,
3147                           bool header_present)
3148 {
3149    const struct gen_device_info *devinfo = p->devinfo;
3150    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3151                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3152                           GEN7_SFID_DATAPORT_DATA_CACHE);
3153    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3154    /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
3155    const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
3156    const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
3157                               has_simd4x2 ? 0 : 8;
3158    const unsigned desc =
3159       brw_message_desc(devinfo, msg_length, 0, header_present) |
3160       brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
3161    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
3162    const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;
3163 
3164    brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
3165                                      payload, surface, desc);
3166 }
3167 
3168 static void
3169 brw_set_memory_fence_message(struct brw_codegen *p,
3170                              struct brw_inst *insn,
3171                              enum brw_message_target sfid,
3172                              bool commit_enable,
3173                              unsigned bti)
3174 {
3175    const struct gen_device_info *devinfo = p->devinfo;
3176 
3177    brw_set_desc(p, insn, brw_message_desc(
3178                    devinfo, 1, (commit_enable ? 1 : 0), true));
3179 
3180    brw_inst_set_sfid(devinfo, insn, sfid);
3181 
3182    switch (sfid) {
3183    case GEN6_SFID_DATAPORT_RENDER_CACHE:
3184       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
3185       break;
3186    case GEN7_SFID_DATAPORT_DATA_CACHE:
3187       brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
3188       break;
3189    default:
3190       unreachable("Not reached");
3191    }
3192 
3193    if (commit_enable)
3194       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3195 
3196    assert(devinfo->gen >= 11 || bti == 0);
3197    brw_inst_set_binding_table_index(devinfo, insn, bti);
3198 }
3199 
3200 void
3201 brw_memory_fence(struct brw_codegen *p,
3202                  struct brw_reg dst,
3203                  struct brw_reg src,
3204                  enum opcode send_op,
3205                  enum brw_message_target sfid,
3206                  bool commit_enable,
3207                  unsigned bti)
3208 {
3209    const struct gen_device_info *devinfo = p->devinfo;
3210 
3211    dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3212    src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3213 
3214    /* Set dst as the destination for dependency tracking; the MEMORY_FENCE
3215     * message doesn't write anything back.
3216     */
3217    struct brw_inst *insn = next_insn(p, send_op);
3218    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3219    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3220    brw_set_dest(p, insn, dst);
3221    brw_set_src0(p, insn, src);
3222    brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
3223 }
3224 
3225 void
3226 brw_pixel_interpolator_query(struct brw_codegen *p,
3227                              struct brw_reg dest,
3228                              struct brw_reg mrf,
3229                              bool noperspective,
3230                              unsigned mode,
3231                              struct brw_reg data,
3232                              unsigned msg_length,
3233                              unsigned response_length)
3234 {
3235    const struct gen_device_info *devinfo = p->devinfo;
3236    const uint16_t exec_size = brw_get_default_exec_size(p);
3237    const unsigned slot_group = brw_get_default_group(p) / 16;
3238    const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3239    const unsigned desc =
3240       brw_message_desc(devinfo, msg_length, response_length, false) |
3241       brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3242                             slot_group);
3243 
3244    /* brw_send_indirect_message will automatically use a direct send message
3245     * if data is actually immediate.
3246     */
3247    brw_send_indirect_message(p,
3248                              GEN7_SFID_PIXEL_INTERPOLATOR,
3249                              dest,
3250                              mrf,
3251                              vec1(data),
3252                              desc,
3253                              false);
3254 }
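/* Illustrative note: with the default execution size BRW_EXECUTE_16 and a
 * default group of 16, the code above passes simd_mode == 1 and
 * slot_group == 1 to brw_pixel_interp_desc().
 */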
3255 
3256 void
3257 brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3258                       struct brw_reg mask)
3259 {
3260    const struct gen_device_info *devinfo = p->devinfo;
3261    const unsigned exec_size = 1 << brw_get_default_exec_size(p);
3262    const unsigned qtr_control = brw_get_default_group(p) / 8;
3263    brw_inst *inst;
3264 
3265    assert(devinfo->gen >= 7);
3266    assert(mask.type == BRW_REGISTER_TYPE_UD);
3267 
3268    brw_push_insn_state(p);
3269 
3270    /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3271     * unnecessary bits in the instruction words, get the information we need
3272     * and reset the default flag register. This allows more instructions to be
3273     * compacted.
3274     */
3275    const unsigned flag_subreg = p->current->flag_subreg;
3276    brw_set_default_flag_reg(p, 0, 0);
3277 
3278    if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3279       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3280 
3281       if (devinfo->gen >= 8) {
3282          /* Getting the first active channel index is easy on Gen8: Just find
3283           * the first bit set in the execution mask.  The register exists on
3284           * HSW already but it reads back as all ones when the current
3285           * instruction has execution masking disabled, so it's kind of
3286           * useless.
3287           */
3288          struct brw_reg exec_mask =
3289             retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3290 
3291          brw_set_default_exec_size(p, BRW_EXECUTE_1);
3292          if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3293             /* Unfortunately, ce0 does not take into account the thread
3294              * dispatch mask, which may be a problem in cases where it's not
3295              * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3296              * some n).  Combine ce0 with the given dispatch (or vector) mask
3297              * to mask off those channels which were never dispatched by the
3298              * hardware.
3299              */
3300             brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3301             brw_set_default_swsb(p, tgl_swsb_regdist(1));
3302             brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3303             exec_mask = vec1(dst);
3304          }
3305 
3306          /* Quarter control has the effect of magically shifting the value of
3307           * ce0 so you'll get the first active channel relative to the
3308           * specified quarter control as the result.
3309           */
3310          inst = brw_FBL(p, vec1(dst), exec_mask);
3311       } else {
3312          const struct brw_reg flag = brw_flag_subreg(flag_subreg);
3313 
3314          brw_set_default_exec_size(p, BRW_EXECUTE_1);
3315          brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3316 
3317          /* Run enough instructions returning zero with execution masking and
3318           * a conditional modifier enabled in order to get the full execution
3319           * mask in f1.0.  We could use a single 32-wide move here if it
3320           * weren't for the hardware bug that causes channel enables to
3321           * be applied incorrectly to the second half of 32-wide instructions
3322           * on Gen7.
3323           */
3324          const unsigned lower_size = MIN2(16, exec_size);
3325          for (unsigned i = 0; i < exec_size / lower_size; i++) {
3326             inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
3327                            brw_imm_uw(0));
3328             brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3329             brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3330             brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
3331             brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3332             brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
3333             brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
3334          }
3335 
3336          /* Find the first bit set in the exec_size-wide portion of the flag
3337           * register that was updated by the last sequence of MOV
3338           * instructions.
3339           */
3340          const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
3341          brw_set_default_exec_size(p, BRW_EXECUTE_1);
3342          brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3343       }
3344    } else {
3345       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3346 
3347       if (devinfo->gen >= 8 &&
3348           mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
3349          /* In SIMD4x2 mode the first active channel index is just the
3350           * negation of the first bit of the mask register.  Note that ce0
3351           * doesn't take into account the dispatch mask, so the Gen7 path
3352           * should be used instead unless you have the guarantee that the
3353           * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3354           * for some n).
3355           */
3356          inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
3357                         negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
3358                         brw_imm_ud(1));
3359 
3360       } else {
3361          /* Overwrite the destination without and with execution masking to
3362           * find out which of the channels is active.
3363           */
3364          brw_push_insn_state(p);
3365          brw_set_default_exec_size(p, BRW_EXECUTE_4);
3366          brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3367                  brw_imm_ud(1));
3368 
3369          inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3370                         brw_imm_ud(0));
3371          brw_pop_insn_state(p);
3372          brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3373       }
3374    }
3375 
3376    brw_pop_insn_state(p);
3377 }
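/* Illustrative example of the align1 path above: if only channels 4..7 of a
 * SIMD8 invocation are active, the execution mask (after ANDing with the
 * dispatch mask where needed) is 0xf0, and the FBL returns 4 as the first
 * live channel index.
 */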
3378 
3379 void
3380 brw_broadcast(struct brw_codegen *p,
3381               struct brw_reg dst,
3382               struct brw_reg src,
3383               struct brw_reg idx)
3384 {
3385    const struct gen_device_info *devinfo = p->devinfo;
3386    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3387    brw_inst *inst;
3388 
3389    brw_push_insn_state(p);
3390    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3391    brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
3392 
3393    assert(src.file == BRW_GENERAL_REGISTER_FILE &&
3394           src.address_mode == BRW_ADDRESS_DIRECT);
3395    assert(!src.abs && !src.negate);
3396    assert(src.type == dst.type);
3397 
3398    if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
3399        idx.file == BRW_IMMEDIATE_VALUE) {
3400       /* Trivial, the source is already uniform or the index is a constant.
3401        * We will typically not get here if the optimizer is doing its job, but
3402        * asserting would be mean.
3403        */
3404       const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
3405       src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
3406                      stride(suboffset(src, 4 * i), 0, 4, 1);
3407 
3408       if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
3409          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
3410                     subscript(src, BRW_REGISTER_TYPE_D, 0));
3411          brw_set_default_swsb(p, tgl_swsb_null());
3412          brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
3413                     subscript(src, BRW_REGISTER_TYPE_D, 1));
3414       } else {
3415          brw_MOV(p, dst, src);
3416       }
3417    } else {
3418       /* From the Haswell PRM section "Register Region Restrictions":
3419        *
3420        *    "The lower bits of the AddressImmediate must not overflow to
3421        *    change the register address.  The lower 5 bits of Address
3422        *    Immediate when added to lower 5 bits of address register gives
3423        *    the sub-register offset. The upper bits of Address Immediate
3424        *    when added to upper bits of address register gives the register
3425        *    address. Any overflow from sub-register offset is dropped."
3426        *
3427        * Fortunately, for broadcast, we never have a sub-register offset so
3428        * this isn't an issue.
3429        */
3430       assert(src.subnr == 0);
3431 
3432       if (align1) {
3433          const struct brw_reg addr =
3434             retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
3435          unsigned offset = src.nr * REG_SIZE + src.subnr;
3436          /* Limit in bytes of the signed indirect addressing immediate. */
3437          const unsigned limit = 512;
3438 
3439          brw_push_insn_state(p);
3440          brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3441          brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
3442 
3443          /* Take into account the component size and horizontal stride. */
3444          assert(src.vstride == src.hstride + src.width);
3445          brw_SHL(p, addr, vec1(idx),
3446                  brw_imm_ud(util_logbase2(type_sz(src.type)) +
3447                             src.hstride - 1));
3448 
3449          /* We can only address up to limit bytes using the indirect
3450           * addressing immediate, account for the difference if the source
3451           * register is above this limit.
3452           */
3453          if (offset >= limit) {
3454             brw_set_default_swsb(p, tgl_swsb_regdist(1));
3455             brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
3456             offset = offset % limit;
3457          }
3458 
3459          brw_pop_insn_state(p);
3460 
3461          brw_set_default_swsb(p, tgl_swsb_regdist(1));
3462 
3463          /* Use indirect addressing to fetch the specified component. */
3464          if (type_sz(src.type) > 4 &&
3465              (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) ||
3466               !devinfo->has_64bit_float)) {
3467             /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
3468              *
3469              *    "When source or destination datatype is 64b or operation is
3470              *    integer DWord multiply, indirect addressing must not be
3471              *    used."
3472              *
3473              * To work around both of these issues, we do two integer MOVs
3474              * instead of one 64-bit MOV.  Because no double value should ever
3475              * cross a register boundary, it's safe to use the immediate
3476              * offset in the indirect here to handle adding 4 bytes to the
3477              * offset and avoid the extra ADD to the register file.
3478              */
3479             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
3480                        retype(brw_vec1_indirect(addr.subnr, offset),
3481                               BRW_REGISTER_TYPE_D));
3482             brw_set_default_swsb(p, tgl_swsb_null());
3483             brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
3484                        retype(brw_vec1_indirect(addr.subnr, offset + 4),
3485                               BRW_REGISTER_TYPE_D));
3486          } else {
3487             brw_MOV(p, dst,
3488                     retype(brw_vec1_indirect(addr.subnr, offset), src.type));
3489          }
3490       } else {
3491          /* In SIMD4x2 mode the index can be either zero or one; replicate it
3492           * to all bits of a flag register,
3493           */
3494          inst = brw_MOV(p,
3495                         brw_null_reg(),
3496                         stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
3497          brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
3498          brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
3499          brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3500 
3501          /* and use predicated SEL to pick the right channel. */
3502          inst = brw_SEL(p, dst,
3503                         stride(suboffset(src, 4), 4, 4, 1),
3504                         stride(src, 4, 4, 1));
3505          brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
3506          brw_inst_set_flag_reg_nr(devinfo, inst, 1);
3507       }
3508    }
3509 
3510    brw_pop_insn_state(p);
3511 }
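/* Worked example of the align1 indirect path above (illustrative): for a
 * 32-bit source with src.hstride == 1 the shift count is
 * util_logbase2(4) + 1 - 1 == 2, so a0.0 ends up holding idx * 4, the byte
 * offset of the selected component within the source region.
 */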
3512 
3513 /**
3514  * This instruction is generated as a single-channel align1 instruction by
3515  * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3516  *
3517  * We can't use the typed atomic op in the FS because that has the execution
3518  * mask ANDed with the pixel mask, but we just want to write the one dword for
3519  * all the pixels.
3520  *
3521  * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
3522  * one u32.  So we use the same untyped atomic write message as the pixel
3523  * shader.
3524  *
3525  * The untyped atomic operation requires a BUFFER surface type with RAW
3526  * format, and is only accessible through the legacy DATA_CACHE dataport
3527  * messages.
3528  */
3529 void brw_shader_time_add(struct brw_codegen *p,
3530                          struct brw_reg payload,
3531                          uint32_t surf_index)
3532 {
3533    const struct gen_device_info *devinfo = p->devinfo;
3534    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3535                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
3536                           GEN7_SFID_DATAPORT_DATA_CACHE);
3537    assert(devinfo->gen >= 7);
3538 
3539    brw_push_insn_state(p);
3540    brw_set_default_access_mode(p, BRW_ALIGN_1);
3541    brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3542    brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3543    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3544 
3545    /* We use brw_vec1_reg and unmasked because we want to increment the given
3546     * offset only once.
3547     */
3548    brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3549                                       BRW_ARF_NULL, 0));
3550    brw_set_src0(p, send, brw_vec1_reg(payload.file,
3551                                       payload.nr, 0));
3552    brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
3553                           brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
3554                                                      false)));
3555 
3556    brw_inst_set_sfid(devinfo, send, sfid);
3557    brw_inst_set_binding_table_index(devinfo, send, surf_index);
3558 
3559    brw_pop_insn_state(p);
3560 }
3561 
3562 
3563 /**
3564  * Emit the SEND message for a barrier
3565  */
3566 void
3567 brw_barrier(struct brw_codegen *p, struct brw_reg src)
3568 {
3569    const struct gen_device_info *devinfo = p->devinfo;
3570    struct brw_inst *inst;
3571 
3572    assert(devinfo->gen >= 7);
3573 
3574    brw_push_insn_state(p);
3575    brw_set_default_access_mode(p, BRW_ALIGN_1);
3576    inst = next_insn(p, BRW_OPCODE_SEND);
3577    brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3578    brw_set_src0(p, inst, src);
3579    brw_set_src1(p, inst, brw_null_reg());
3580    brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3581 
3582    brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3583    brw_inst_set_gateway_subfuncid(devinfo, inst,
3584                                   BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3585 
3586    brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3587    brw_pop_insn_state(p);
3588 }
3589 
3590 
3591 /**
3592  * Emit the wait instruction for a barrier
3593  */
3594 void
3595 brw_WAIT(struct brw_codegen *p)
3596 {
3597    const struct gen_device_info *devinfo = p->devinfo;
3598    struct brw_inst *insn;
3599 
3600    struct brw_reg src = brw_notification_reg();
3601 
3602    insn = next_insn(p, BRW_OPCODE_WAIT);
3603    brw_set_dest(p, insn, src);
3604    brw_set_src0(p, insn, src);
3605    brw_set_src1(p, insn, brw_null_reg());
3606 
3607    brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3608    brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3609 }
3610 
3611 void
3612 brw_float_controls_mode(struct brw_codegen *p,
3613                         unsigned mode, unsigned mask)
3614 {
3615    /* From the Skylake PRM, Volume 7, page 760:
3616     *  "Implementation Restriction on Register Access: When the control
3617     *   register is used as an explicit source and/or destination, hardware
3618     *   does not ensure execution pipeline coherency. Software must set the
3619     *   thread control field to ‘switch’ for an instruction that uses
3620     *   control register as an explicit operand."
3621     *
3622     * On Gen12+ this is implemented in terms of SWSB annotations instead.
3623     */
3624    brw_set_default_swsb(p, tgl_swsb_regdist(1));
3625 
3626    brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3627                             brw_imm_ud(~mask));
3628    brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3629    if (p->devinfo->gen < 12)
3630       brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3631 
3632    if (mode) {
3633       brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3634                                  brw_imm_ud(mode));
3635       brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3636       if (p->devinfo->gen < 12)
3637          brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3638    }
3639 
3640    if (p->devinfo->gen >= 12)
3641       brw_SYNC(p, TGL_SYNC_NOP);
3642 }
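/* Illustrative example of the net effect above (mask and mode values are made
 * up): with mask == 0x30 and mode == 0x10, cr0.0 becomes
 * (cr0.0 & ~0x30) | 0x10, i.e. bits 5:4 are forced to 01 while every other
 * control bit is preserved.
 */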
3643 
3644 void
3645 brw_update_reloc_imm(const struct gen_device_info *devinfo,
3646                      brw_inst *inst,
3647                      uint32_t value)
3648 {
3649    /* Sanity check that the instruction is a MOV of an immediate */
3650    assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV);
3651    assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);
3652 
3653    /* If it was compacted, we can't safely rewrite */
3654    assert(brw_inst_cmpt_control(devinfo, inst) == 0);
3655 
3656    brw_inst_set_imm_ud(devinfo, inst, value);
3657 }
3658 
3659 /* A default value for constants that will be patched at run-time.
3660  * We pick an arbitrary value that prevents instruction compaction.
3661  */
3662 #define DEFAULT_PATCH_IMM 0x4a7cc037
3663 
3664 void
3665 brw_MOV_reloc_imm(struct brw_codegen *p,
3666                   struct brw_reg dst,
3667                   enum brw_reg_type src_type,
3668                   uint32_t id)
3669 {
3670    assert(type_sz(src_type) == 4);
3671    assert(type_sz(dst.type) == 4);
3672 
3673    if (p->num_relocs + 1 > p->reloc_array_size) {
3674       p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
3675       p->relocs = reralloc(p->mem_ctx, p->relocs,
3676                            struct brw_shader_reloc, p->reloc_array_size);
3677    }
3678 
3679    p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
3680       .id = id,
3681       .offset = p->next_insn_offset,
3682    };
3683 
3684    brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
3685 }
3686 }
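/* Hypothetical usage sketch (illustrative; "code", "MY_RELOC_ID" and
 * "real_value" are placeholder names): emit a patchable constant now, then
 * rewrite it once the real value is known:
 *
 *    brw_MOV_reloc_imm(p, dst, BRW_REGISTER_TYPE_UD, MY_RELOC_ID);
 *    ...
 *    brw_inst *mov = (brw_inst *)((char *)code + p->relocs[i].offset);
 *    brw_update_reloc_imm(devinfo, mov, real_value);
 */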